Remove S3 + encryption
All checks were successful
CI / Typos (push) Successful in 20s
CI / Clippy (push) Successful in 2m44s
CI / Build and test (push) Successful in 3m10s
Docker / build-and-push (push) Successful in 5m6s
CI / Build and test (all features) (push) Successful in 6m51s

This commit is contained in:
2026-03-26 14:37:18 -07:00
parent ec7326a55e
commit 80f4ebdbe6
24 changed files with 42 additions and 2915 deletions

View File

@@ -21,8 +21,6 @@ toml = { workspace = true }
smartstring = { workspace = true }
regex = { workspace = true }
blake3 = { workspace = true }
chacha20poly1305 = { workspace = true }
base64 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }

View File

@@ -1,9 +1,6 @@
mod dir;
pub use dir::*;
mod s3;
pub use s3::*;
pub mod misc;
/// A read-only set of [Item]s.

View File

@@ -1,322 +0,0 @@
use chrono::{DateTime, Utc};
use pile_config::{
Label,
pattern::{GroupPattern, GroupSegment},
};
use pile_io::S3Client;
use smartstring::{LazyCompact, SmartString};
use std::{
collections::{BTreeMap, HashMap, HashSet},
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::ExtractState,
source::DataSource,
value::{Item, PileValue},
};
/// A data source backed by an S3 bucket, optionally with client-side
/// encryption of object keys (logical paths).
#[derive(Debug)]
pub struct S3DataSource {
    /// Label identifying this source (used in config and lookups).
    pub name: Label,
    /// Shared S3 client used for listing and fetching objects.
    pub client: Arc<S3Client>,
    /// Optional key prefix; stripped from object keys when indexing.
    pub prefix: Option<SmartString<LazyCompact>>,
    /// Pattern used to resolve grouped (related) items for each key.
    pub pattern: GroupPattern,
    /// When set, object keys stored in the bucket are encrypted forms of
    /// the logical paths (see `encrypt_path` / `decrypt_path`).
    pub encryption_key: Option<[u8; 32]>,
    /// Index of logical key -> item, built once in [`S3DataSource::new`].
    pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
}
impl S3DataSource {
    /// Create an S3-backed data source and eagerly build its item index.
    ///
    /// Pages through `list_objects_v2` to collect every key under `prefix`
    /// (decrypting key names when `encryption_key` is set), resolves group
    /// membership via `pattern`, and stores the finished index in `index`.
    ///
    /// # Errors
    /// Returns an I/O error when a bucket listing request fails.
    pub async fn new(
        name: &Label,
        bucket: &str,
        prefix: Option<&str>,
        endpoint: Option<&str>,
        region: &str,
        access_key_id: &str,
        secret_access_key: &str,
        cache_limit_bytes: usize,
        pattern: GroupPattern,
        encryption_key: Option<[u8; 32]>,
    ) -> Result<Arc<Self>, std::io::Error> {
        let client = S3Client::new(
            bucket,
            endpoint,
            region,
            access_key_id,
            secret_access_key,
            cache_limit_bytes,
        )
        .await;
        let source = Arc::new(Self {
            name: name.clone(),
            client,
            prefix: prefix.map(|x| x.into()),
            pattern,
            encryption_key,
            index: OnceLock::new(),
        });

        //
        // MARK: list keys
        //
        // Collect the logical key of every object under the prefix,
        // following continuation tokens until the listing is complete.
        let mut all_keys: HashSet<SmartString<LazyCompact>> = HashSet::new();
        let mut continuation_token: Option<String> = None;
        loop {
            let mut req = source
                .client
                .client
                .list_objects_v2()
                .bucket(source.client.bucket());
            if let Some(prefix) = &source.prefix {
                req = req.prefix(prefix.as_str());
            }
            if let Some(token) = continuation_token {
                req = req.continuation_token(token);
            }
            let resp = req.send().await.map_err(std::io::Error::other)?;
            let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
            let is_truncated = resp.is_truncated().unwrap_or(false);
            for obj in resp.contents() {
                let Some(full_key) = obj.key() else { continue };
                let raw_key = strip_prefix(full_key, source.prefix.as_deref());
                // With encryption enabled, the stored key is an encrypted form
                // of the logical path; objects that fail to decrypt (e.g.
                // foreign objects in the bucket) are skipped entirely.
                let key = match &source.encryption_key {
                    None => raw_key.into(),
                    Some(enc_key) => match decrypt_path(enc_key, raw_key) {
                        Some(decrypted) => decrypted.into(),
                        None => continue,
                    },
                };
                all_keys.insert(key);
            }
            if !is_truncated {
                break;
            }
            continuation_token = next_token;
        }

        //
        // MARK: resolve groups
        //
        // A key that appears as another key's group target is "grouped" and
        // does not become a top-level index entry of its own.
        let mut keys_grouped: HashSet<SmartString<LazyCompact>> = HashSet::new();
        for key in &all_keys {
            let groups = resolve_groups(&source.pattern, key).await;
            for group_key in groups.into_values() {
                if all_keys.contains(&group_key) {
                    keys_grouped.insert(group_key);
                }
            }
        }

        // Build index entries for the remaining (ungrouped) keys, attaching
        // each key's resolved group members as child items.
        let mut index = BTreeMap::new();
        for key in all_keys.difference(&keys_grouped) {
            let groups = resolve_groups(&source.pattern, key).await;
            let group = groups
                .into_iter()
                // Only keep group targets that actually exist in the bucket.
                .filter(|(_, gk)| all_keys.contains(gk))
                .map(|(label, gk)| {
                    (
                        label,
                        Box::new(Item::S3 {
                            source: Arc::clone(&source),
                            mime: mime_guess::from_path(gk.as_str()).first_or_octet_stream(),
                            key: gk,
                            group: Arc::new(HashMap::new()),
                        }),
                    )
                })
                .collect::<HashMap<_, _>>();
            let item = Item::S3 {
                source: Arc::clone(&source),
                mime: mime_guess::from_path(key.as_str()).first_or_octet_stream(),
                key: key.clone(),
                group: Arc::new(group),
            };
            index.insert(item.key(), item);
        }
        source.index.get_or_init(|| index);
        Ok(source)
    }
}
impl DataSource for Arc<S3DataSource> {
    /// Number of indexed items.
    ///
    /// # Panics
    /// Panics if the index was not initialized; `S3DataSource::new` always
    /// initializes it before returning, so this cannot happen through the
    /// public constructor.
    #[expect(clippy::expect_used)]
    fn len(&self) -> usize {
        self.index.get().expect("index should be initialized").len()
    }

    /// Look up an item by its logical key.
    #[expect(clippy::expect_used)]
    async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
        return Ok(self
            .index
            .get()
            .expect("index should be initialized")
            .get(key)
            .cloned());
    }

    /// Iterate over all indexed items (in key order, since the index is a
    /// `BTreeMap`).
    #[expect(clippy::expect_used)]
    fn iter(&self) -> impl Iterator<Item = &Item> {
        self.index
            .get()
            .expect("index should be initialized")
            .values()
    }

    /// Most recent `last_modified` timestamp among all objects under the
    /// prefix, or `None` when no object has a usable timestamp.
    ///
    /// NOTE(review): listing errors are swallowed and reported as `Ok(None)`
    /// — presumably deliberate best-effort behavior; confirm with callers.
    async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
        let mut ts: Option<DateTime<Utc>> = None;
        let mut continuation_token: Option<String> = None;
        loop {
            let mut req = self
                .client
                .client
                .list_objects_v2()
                .bucket(self.client.bucket());
            if let Some(prefix) = &self.prefix {
                req = req.prefix(prefix.as_str());
            }
            if let Some(token) = continuation_token {
                req = req.continuation_token(token);
            }
            let resp = match req.send().await {
                Err(_) => return Ok(None),
                Ok(resp) => resp,
            };
            let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
            let is_truncated = resp.is_truncated().unwrap_or(false);
            for obj in resp.contents() {
                if let Some(last_modified) = obj.last_modified() {
                    let dt = DateTime::from_timestamp(
                        last_modified.secs(),
                        last_modified.subsec_nanos(),
                    );
                    // Keep the maximum timestamp seen so far.
                    if let Some(dt) = dt {
                        ts = Some(match ts {
                            None => dt,
                            Some(prev) => prev.max(dt),
                        });
                    }
                }
            }
            if !is_truncated {
                break;
            }
            continuation_token = next_token;
        }
        Ok(ts)
    }
}
/// Derive an encryption key from a password.
///
/// Uses BLAKE3 key derivation with a fixed context string, so the same
/// password always yields the same 32-byte key.
pub fn string_to_key(password: &str) -> [u8; 32] {
    blake3::derive_key("pile s3 encryption", password.as_bytes())
}
/// Encrypt a logical path into a URL-safe base64 S3 key.
///
/// The XChaCha20 nonce is derived deterministically from a keyed BLAKE3
/// hash of the path, so a given (key, path) pair always maps to the same
/// S3 object key. The output is `nonce || ciphertext`, base64-encoded.
pub fn encrypt_path(enc_key: &[u8; 32], path: &str) -> String {
    use base64::Engine;
    use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
    // First 24 bytes of the keyed hash serve as the nonce.
    let digest = blake3::keyed_hash(enc_key, path.as_bytes());
    let nonce_part = &digest.as_bytes()[..24];
    let cipher = XChaCha20Poly1305::new(chacha20poly1305::Key::from_slice(enc_key));
    #[expect(clippy::expect_used)]
    let sealed = cipher
        .encrypt(XNonce::from_slice(nonce_part), path.as_bytes())
        .expect("path encryption should not fail");
    // Prepend the nonce so decryption can recover it.
    let mut out = Vec::with_capacity(nonce_part.len() + sealed.len());
    out.extend_from_slice(nonce_part);
    out.extend_from_slice(&sealed);
    base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(out)
}
/// Decrypt a URL-safe base64 S3 key back to its logical path.
///
/// Returns `None` when the input is not valid base64, is too short to
/// contain a nonce and authentication tag, fails authentication, or does
/// not decrypt to valid UTF-8.
fn decrypt_path(enc_key: &[u8; 32], encrypted: &str) -> Option<String> {
    use base64::Engine;
    use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
    // 24-byte XChaCha20 nonce, 16-byte Poly1305 tag.
    const NONCE_LEN: usize = 24;
    const TAG_LEN: usize = 16;
    let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD
        .decode(encrypted)
        .ok()?;
    if decoded.len() < NONCE_LEN + TAG_LEN {
        return None;
    }
    let (nonce_part, payload) = decoded.split_at(NONCE_LEN);
    let cipher = XChaCha20Poly1305::new(chacha20poly1305::Key::from_slice(enc_key));
    let plain = cipher
        .decrypt(XNonce::from_slice(nonce_part), payload)
        .ok()?;
    String::from_utf8(plain).ok()
}
/// Strip `prefix` (given with or without a trailing `'/'`) plus the
/// separating slash from `key`.
///
/// Returns `key` unchanged when there is no prefix or it doesn't match.
fn strip_prefix<'a>(key: &'a str, prefix: Option<&str>) -> &'a str {
    let Some(p) = prefix else { return key };
    let stripped = if p.ends_with('/') {
        key.strip_prefix(p)
    } else {
        // Strip the prefix, then the separator — equivalent to stripping
        // "{p}/" but without allocating a temporary String per call.
        key.strip_prefix(p).and_then(|rest| rest.strip_prefix('/'))
    };
    stripped.unwrap_or(key)
}
/// Resolve the group targets of `key` according to `pattern`.
///
/// For each labeled pattern, builds the related key by concatenating the
/// pattern's segments: literals verbatim, and path segments by running the
/// extraction query against the key string. A label whose query fails or
/// yields a non-string value is skipped entirely.
async fn resolve_groups(
    pattern: &GroupPattern,
    key: &str,
) -> HashMap<Label, SmartString<LazyCompact>> {
    let state = ExtractState { ignore_mime: false };
    let mut group = HashMap::new();
    'pattern: for (l, pat) in &pattern.pattern {
        let item = PileValue::String(Arc::new(key.into()));
        let mut target = String::new();
        for p in pat {
            match p {
                GroupSegment::Literal(x) => target.push_str(x),
                GroupSegment::Path(op) => {
                    // Any query failure abandons this label's whole target.
                    let res = match item.query(&state, op).await {
                        Ok(Some(x)) => x,
                        _ => continue 'pattern,
                    };
                    let res = match res.as_str() {
                        Some(x) => x,
                        None => continue 'pattern,
                    };
                    target.push_str(res);
                }
            }
        }
        group.insert(l.clone(), target.into());
    }
    return group;
}

View File

@@ -1,158 +0,0 @@
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
use mime::Mime;
use std::io::{Error as IoError, Seek, SeekFrom, Write};
use thiserror::Error;
use super::S3Client;
use crate::retry;
/// Errors that can occur while reading an S3 object.
#[derive(Debug, Error)]
#[expect(clippy::large_enum_variant)]
pub enum S3ReaderError {
    /// The AWS SDK `GetObject` call failed.
    #[error("sdk error")]
    SdkError(#[from] SdkError<GetObjectError>),
    /// Collecting the response body stream failed.
    #[error("byte stream error")]
    ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
    /// A local write (e.g. to the download target) failed.
    #[error("i/o error")]
    IoError(#[from] IoError),
}
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
///
/// Also implements [`std::io::Seek`]
pub struct S3Reader {
    /// Client used for the ranged `GetObject` requests.
    pub(super) client: S3Client,
    /// Bucket containing the object.
    pub(super) bucket: String,
    /// Full object key within the bucket.
    pub(super) key: String,
    /// Current read position, in bytes from the start of the object.
    pub(super) cursor: u64,
    /// Total object size in bytes.
    pub(super) size: u64,
    /// MIME type of the object.
    pub(super) mime: Mime,
}
impl S3Reader {
    /// Read up to `buf.len()` bytes at the current cursor using a ranged
    /// `GetObject` request, advancing the cursor by the bytes read.
    ///
    /// Returns `Ok(0)` when the cursor is at the end of the object or
    /// `buf` is empty.
    pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
        let len_left = self.size - self.cursor;
        if len_left == 0 || buf.is_empty() {
            return Ok(0);
        }
        #[expect(clippy::unwrap_used)] // TODO: probably fits?
        let start_byte = usize::try_from(self.cursor).unwrap();
        #[expect(clippy::unwrap_used)] // usize fits in u64
        let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
        #[expect(clippy::unwrap_used)] // must fit, we called min()
        let len_to_read = usize::try_from(len_to_read).unwrap();
        // HTTP byte ranges are inclusive on both ends.
        let end_byte = start_byte + len_to_read - 1;
        let b = retry!(
            self.client.retries,
            self.client
                .client
                .get_object()
                .bucket(self.bucket.as_str())
                .key(self.key.as_str())
                .range(format!("bytes={start_byte}-{end_byte}"))
                .send()
                .await
        )?;
        // Looks like `bytes 31000000-31999999/33921176`
        // println!("{:?}", b.content_range);
        let mut bytes = b.body.collect().await?.into_bytes();
        // Defensive: never hand back more than requested even if the
        // server returned a longer range.
        bytes.truncate(len_to_read);
        let l = bytes.len();
        // Memory to memory writes are infallible
        #[expect(clippy::unwrap_used)]
        buf.write_all(&bytes).unwrap();
        // Cannot fail, usize should always fit into u64
        #[expect(clippy::unwrap_used)]
        {
            self.cursor += u64::try_from(l).unwrap();
        }
        return Ok(len_to_read);
    }

    /// Whether the cursor has reached the end of the object.
    pub fn is_done(&self) -> bool {
        return self.cursor == self.size;
    }

    /// MIME type of the object.
    pub fn mime(&self) -> &Mime {
        &self.mime
    }

    /// Write the entire contents of this reader to `r`.
    ///
    /// This method always downloads the whole object,
    /// and always preserves `self.cursor`.
    pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
        let pos = self.stream_position()?;
        // Download in ~10 MB chunks.
        const BUF_LEN: usize = 10_000_000;
        #[expect(clippy::unwrap_used)] // Cannot fail
        let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
        while !self.is_done() {
            let b = self.read(&mut buf[..]).await?;
            r.write_all(&buf[0..b])?;
        }
        // Restore the cursor the caller had before the download.
        self.seek(SeekFrom::Start(pos))?;
        Ok(())
    }
}
impl Seek for S3Reader {
    /// Seek within the object. The cursor is clamped to the last byte of
    /// the object (`size - 1`), preserving the original design where the
    /// cursor never lands past the final byte via `seek`.
    ///
    /// # Errors
    /// Returns `InvalidInput` when seeking before the start of the object.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        // Last addressable byte. `saturating_sub` prevents the underflow
        // the old `self.size - 1` had for a zero-byte object.
        let last = self.size.saturating_sub(1);
        match pos {
            SeekFrom::Start(x) => self.cursor = x.min(last),
            SeekFrom::Current(x) => {
                // `unsigned_abs` cannot panic, unlike `abs()` on i64::MIN.
                let delta = x.unsigned_abs();
                if x < 0 {
                    if delta > self.cursor {
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::InvalidInput,
                            "cannot seek past start",
                        ));
                    }
                    self.cursor -= delta;
                } else {
                    // Clamped below; saturate instead of overflowing.
                    self.cursor = self.cursor.saturating_add(delta);
                }
            }
            SeekFrom::End(x) => {
                let delta = x.unsigned_abs();
                if x < 0 {
                    if delta > self.size {
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::InvalidInput,
                            "cannot seek past start",
                        ));
                    }
                    self.cursor = self.size - delta;
                } else {
                    self.cursor = self.size.saturating_add(delta);
                }
            }
        }
        self.cursor = self.cursor.min(last);
        Ok(self.cursor)
    }
}

View File

@@ -1,13 +1,10 @@
use mime::Mime;
use pile_config::Label;
use pile_io::{SyncReadBridge, chacha::ChaChaReaderv1Async};
use pile_io::SyncReadBridge;
use smartstring::{LazyCompact, SmartString};
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
use crate::{
source::{DirDataSource, S3DataSource, encrypt_path},
value::ItemReader,
};
use crate::{source::DirDataSource, value::ItemReader};
//
// MARK: item
@@ -23,58 +20,19 @@ pub enum Item {
path: PathBuf,
group: Arc<HashMap<Label, Box<Item>>>,
},
S3 {
source: Arc<S3DataSource>,
mime: Mime,
key: SmartString<LazyCompact>,
group: Arc<HashMap<Label, Box<Item>>>,
},
}
impl Item {
/// Open the item for reading. For S3, performs a HEAD request to determine
/// the object size.
/// Open the item for reading.
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
Ok(match self {
Self::File { path, .. } => ItemReader::File(File::open(path)?),
Self::S3 { source, key, .. } => {
let logical_key = key.as_str();
let s3_key_part: SmartString<LazyCompact> = match &source.encryption_key {
None => logical_key.into(),
Some(enc_key) => encrypt_path(enc_key, logical_key).into(),
};
let full_key: SmartString<LazyCompact> = match &source.prefix {
None => s3_key_part,
Some(p) => {
if p.ends_with('/') {
format!("{p}{s3_key_part}").into()
} else {
format!("{p}/{s3_key_part}").into()
}
}
};
let reader = source.client.get(&full_key).await?;
match source.encryption_key {
None => ItemReader::S3(reader),
Some(enc_key) => {
ItemReader::EncryptedS3(ChaChaReaderv1Async::new(reader, enc_key).await?)
}
}
}
})
}
pub fn source_name(&self) -> &pile_config::Label {
match self {
Self::File { source, .. } => &source.name,
Self::S3 { source, .. } => &source.name,
}
}
@@ -87,7 +45,6 @@ impl Item {
.to_str()
.expect("path is not utf-8")
.into(),
Self::S3 { key, .. } => key.clone(),
}
}
@@ -106,14 +63,12 @@ impl Item {
pub fn mime(&self) -> &Mime {
match self {
Self::File { mime, .. } => mime,
Self::S3 { mime, .. } => mime,
}
}
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
match self {
Self::File { group, .. } => group,
Self::S3 { group, .. } => group,
}
}
}

View File

@@ -1,4 +1,4 @@
use pile_io::{AsyncReader, AsyncSeekReader, S3Reader, chacha::ChaChaReaderv1Async};
use pile_io::{AsyncReader, AsyncSeekReader};
use std::{fs::File, io::Seek};
//
@@ -7,16 +7,12 @@ use std::{fs::File, io::Seek};
pub enum ItemReader {
File(File),
S3(S3Reader),
EncryptedS3(ChaChaReaderv1Async<S3Reader>),
}
impl AsyncReader for ItemReader {
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
match self {
Self::File(x) => std::io::Read::read(x, buf),
Self::S3(x) => x.read(buf).await,
Self::EncryptedS3(x) => x.read(buf).await,
}
}
}
@@ -25,8 +21,6 @@ impl AsyncSeekReader for ItemReader {
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
match self {
Self::File(x) => x.seek(pos),
Self::S3(x) => x.seek(pos).await,
Self::EncryptedS3(x) => x.seek(pos).await,
}
}
}