Refactor sidecars

This commit is contained in:
2026-03-16 22:24:30 -07:00
parent f2f5726d7b
commit 053459f340
25 changed files with 674 additions and 530 deletions

View File

@@ -0,0 +1,56 @@
use std::sync::Arc;
use pile_config::Label;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
pub struct GroupExtractor {
item: Item,
}
impl GroupExtractor {
pub fn new(item: &Item) -> Self {
Self { item: item.clone() }
}
}
#[async_trait::async_trait]
impl ObjectExtractor for GroupExtractor {
    /// Looks up the group member labelled `name`.
    ///
    /// Yields `Ok(None)` when `args` is supplied or when the group has no
    /// member under `name`. Otherwise the member is wrapped in an
    /// `ItemExtractor` and returned as an object value.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // Any argument short-circuits the lookup to `None`.
        let member = match args {
            Some(_) => None,
            None => self.item.group().get(name),
        };

        let Some(member) = member else {
            return Ok(None);
        };

        Ok(Some(PileValue::ObjectExtractor(Arc::new(
            super::ItemExtractor::new(member),
        ))))
    }

    /// Lists the labels of every member in the item's group.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let group = self.item.group();
        let mut labels = Vec::with_capacity(group.len());
        labels.extend(group.keys().cloned());
        Ok(labels)
    }

    /// Renders the group as a JSON object mapping each label to a
    /// placeholder string that carries the member's key.
    async fn to_json(&self, _state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let mut object = serde_json::Map::new();
        for (label, member) in self.item.group() {
            let placeholder = format!("<GroupItem ({})>", member.key());
            object.insert(label.to_string(), serde_json::Value::String(placeholder));
        }
        Ok(serde_json::Value::Object(object))
    }
}

View File

@@ -25,8 +25,8 @@ mod toml;
use pile_config::Label;
pub use toml::*;
mod sidecar;
pub use sidecar::*;
mod group;
pub use group::*;
use crate::{
extract::{
@@ -78,8 +78,8 @@ impl ItemExtractor {
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
Label::new("groups").unwrap(),
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
),
]),
};
@@ -109,7 +109,8 @@ impl ObjectExtractor for ItemExtractor {
Label::new("exif").unwrap(),
Label::new("pdf").unwrap(),
Label::new("json").unwrap(),
Label::new("sidecar").unwrap(),
Label::new("toml").unwrap(),
Label::new("groups").unwrap(),
]);
}
}

View File

@@ -1,57 +0,0 @@
use pile_config::Label;
use std::sync::OnceLock;
use tracing::trace;
use super::TomlExtractor;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
pub struct SidecarExtractor {
item: Item,
output: OnceLock<Option<TomlExtractor>>,
}
impl SidecarExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for SidecarExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
key = self.item.key().as_str(),
"Getting field {name:?} from SidecarExtractor",
);
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(state, name, args).await?),
None => Ok(Some(PileValue::Null)),
}
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.fields().await?),
None => Ok(Vec::new()),
}
}
}

View File

@@ -1,143 +1,176 @@
use chrono::{DateTime, Utc};
use pile_config::Label;
use std::{path::PathBuf, sync::Arc};
use tokio_stream::wrappers::ReceiverStream;
use pile_config::{
Label,
pattern::{GroupPattern, GroupSegment},
};
use smartstring::{LazyCompact, SmartString};
use std::{
collections::{HashMap, HashSet},
path::PathBuf,
sync::{Arc, OnceLock},
};
use walkdir::WalkDir;
use crate::{
extract::traits::ExtractState,
source::{DataSource, misc::path_ts_latest},
value::Item,
value::{Item, PileValue},
};
// NOTE(review): this listing is a rendered diff whose +/- markers were stripped,
// so fields from before and after the change appear side by side. `sidecars`
// looks like the field being replaced by `pattern` and `index` — confirm
// against the repository before relying on this.
/// A data source backed by the files found under a directory.
#[derive(Debug)]
pub struct DirDataSource {
/// Label for this source, cloned from the `name` passed to `new`.
pub name: Label,
/// Root directory that `new` walks to discover files.
pub dir: PathBuf,
pub sidecars: bool,
/// Group patterns handed to `resolve_groups` for every discovered file.
pub pattern: GroupPattern,
/// Items keyed by `Item::key()`; set once at the end of `new`.
pub index: OnceLock<HashMap<SmartString<LazyCompact>, Item>>,
}
impl DirDataSource {
// NOTE(review): diff markers were stripped from this view. The synchronous
// `new(.., sidecars: bool) -> Self` header and its `Self {` opener, plus the
// `sidecars,` initializer below, appear to be the pre-change lines; the async
// `new(.., pattern: GroupPattern)` is presumably the replacement — confirm.
pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
Self {
/// Builds the source and eagerly indexes every file under `dir`.
///
/// Two passes are made: the first walks `dir` and records every file path
/// together with every path that some file names as a group member; the
/// second creates one indexed item per file that is not itself a group
/// member, attaching its resolved members.
///
/// Returns an error as soon as the directory walk reports one.
pub async fn new(
name: &Label,
dir: PathBuf,
pattern: GroupPattern,
) -> Result<Arc<Self>, std::io::Error> {
// The Arc is created first because every item stores a clone of it.
let source = Arc::new(Self {
name: name.clone(),
dir,
sidecars,
pattern,
index: OnceLock::new(),
});
//
// MARK: list paths
//
// `paths_items`: every file seen. `paths_grouped_items`: every path that
// was resolved as a group member of some file.
let mut paths_items = HashSet::new();
let mut paths_grouped_items = HashSet::new();
'entry: for entry in WalkDir::new(&source.dir) {
let entry = match entry {
Err(e) => {
// Prefer the underlying io::Error; otherwise wrap the debug text.
let msg = format!("walkdir error: {e:?}");
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
return Err(err);
}
Ok(e) => e,
};
if entry.file_type().is_dir() {
continue;
}
let path = entry.into_path();
// Paths that are not valid UTF-8 are skipped and never indexed.
let path_str = match path.to_str() {
Some(x) => x,
None => continue 'entry,
};
let groups = resolve_groups(&source.pattern, path_str).await;
paths_grouped_items.extend(groups.into_values());
paths_items.insert(path);
}
//
// MARK: resolve groups
//
let mut index = HashMap::new();
// Only files that are not a group member of another file become top-level items.
'entry: for path in paths_items.difference(&paths_grouped_items) {
let path_str = match path.to_str() {
Some(x) => x,
None => continue 'entry,
};
// Groups are resolved a second time here; the first pass kept only the paths.
let group = resolve_groups(&source.pattern, path_str).await;
// NOTE(review): member paths are whatever the pattern produced; nothing
// here checks that they lie under `source.dir` — confirm `Item::key()`
// cannot panic for them.
let group = group
.into_iter()
.map(|(k, group_path)| {
(
k,
Box::new(Item::File {
source: Arc::clone(&source),
mime: mime_guess::from_path(&group_path).first_or_octet_stream(),
path: group_path.clone(),
// Members carry no nested group of their own.
group: Arc::new(HashMap::new()),
}),
)
})
.collect::<HashMap<_, _>>();
let item = Item::File {
source: Arc::clone(&source),
mime: mime_guess::from_path(path).first_or_octet_stream(),
path: path.into(),
group: Arc::new(group),
};
index.insert(item.key(), item);
}
// The lock was created empty above, so this stores the freshly built index.
source.index.get_or_init(|| index);
Ok(source)
}
}
// NOTE(review): this impl is shown as a rendered diff with the +/- markers
// stripped, so each method contains both its old and new body back to back.
// The comments below mark which part appears to be which — confirm against
// the repository.
impl DataSource for Arc<DirDataSource> {
/// Fetches a single item by key.
#[expect(clippy::expect_used)]
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
// Apparent pre-change body: resolves `key` under `self.dir`, checks the
// filesystem, and builds an item with an optional `.toml` sidecar.
let key = match key.parse::<PathBuf>() {
Ok(x) => self.dir.join(x),
Err(_) => return Ok(None),
};
if !key.is_file() {
return Ok(None);
}
// Ignore toml files if sidecars are enabled
if self.sidecars && key.extension().and_then(|x| x.to_str()) == Some("toml") {
return Ok(None);
}
return Ok(Some(Item::File {
source: Arc::clone(self),
mime: mime_guess::from_path(&key).first_or_octet_stream(),
path: key.clone(),
sidecar: self
.sidecars
.then(|| {
let sidecar_path = key.with_extension("toml");
sidecar_path.is_file().then(|| {
Box::new(Item::File {
source: Arc::clone(self),
mime: mime_guess::from_path(&sidecar_path).first_or_octet_stream(),
path: sidecar_path,
sidecar: None,
})
})
})
.flatten(),
}));
// Apparent post-change body: a clone of the indexed item, or `None` when
// the key is absent. Panics if the index was never initialized.
return Ok(self
.index
.get()
.expect("index should be initialized")
.get(key)
.cloned());
}
// Apparent pre-change `iter`: walks the directory on a blocking task and
// streams items through a channel holding up to 64 entries.
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
let (tx, rx) = tokio::sync::mpsc::channel(64);
let source = Arc::clone(self);
let dir = self.dir.clone();
tokio::task::spawn_blocking(move || {
for entry in WalkDir::new(dir) {
let entry = match entry {
Err(e) => {
let msg = format!("walkdir error: {e:?}");
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
// A failed send means the receiver is gone; stop walking.
if tx.blocking_send(Err(err)).is_err() {
return;
}
continue;
}
Ok(e) => e,
};
if entry.file_type().is_dir() {
continue;
}
let path = entry.into_path();
// Files without an extension are skipped, as are `.toml` files when
// sidecars are enabled.
let item = match path.extension().and_then(|x| x.to_str()) {
None => continue,
Some("toml") if source.sidecars => continue,
Some(_) => Item::File {
source: Arc::clone(&source),
mime: mime_guess::from_path(&path).first_or_octet_stream(),
path: path.clone(),
sidecar: source
.sidecars
.then(|| {
let sidecar_path = path.with_extension("toml");
sidecar_path.is_file().then(|| {
Box::new(Item::File {
source: Arc::clone(&source),
mime: mime_guess::from_path(&sidecar_path)
.first_or_octet_stream(),
path: sidecar_path,
sidecar: None,
})
})
})
.flatten(),
},
};
if tx.blocking_send(Ok(item)).is_err() {
return;
}
}
});
ReceiverStream::new(rx)
// Apparent post-change `iter`: borrows every item straight from the index.
// Panics if the index was never initialized.
#[expect(clippy::expect_used)]
fn iter(&self) -> impl Iterator<Item = &Item> {
self.index
.get()
.expect("index should be initialized")
.values()
}
/// Reports the most recent change under `self.dir`.
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
// Apparent pre-change body: `ts` starts as `None`, so the match below
// reduces to returning whatever `path_ts_latest` reports.
let mut ts: Option<DateTime<Utc>> = None;
if !self.dir.exists() {
return Ok(None);
}
let new = path_ts_latest(&self.dir)?;
match (ts, new) {
(_, None) => {}
(None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)),
};
return Ok(ts);
// Apparent post-change body: delegates directly to `path_ts_latest`.
path_ts_latest(&self.dir)
}
}
/// Resolves every group pattern against `path_str` and returns the member
/// paths that exist.
///
/// For each `(label, segments)` entry of `pattern.pattern` the segments are
/// concatenated into a target path: literal segments are appended verbatim and
/// path segments are evaluated as queries against `path_str` wrapped in a
/// string value. A pattern is dropped when a query fails, yields nothing, or
/// yields a value without a string form, and when the assembled path does not
/// exist.
///
/// NOTE(review): `Path::exists` is a synchronous filesystem call made from an
/// async function — confirm that is acceptable for the callers' runtime.
async fn resolve_groups(pattern: &GroupPattern, path_str: &str) -> HashMap<Label, PathBuf> {
    let state = ExtractState { ignore_mime: false };
    // The queried value is identical for every pattern, so build it once.
    let item = PileValue::String(Arc::new(path_str.into()));

    let mut group = HashMap::new();
    'pattern: for (label, segments) in &pattern.pattern {
        let mut target = String::new();
        for segment in segments {
            match segment {
                GroupSegment::Literal(text) => target.push_str(text),
                GroupSegment::Path(op) => {
                    let Ok(Some(value)) = item.query(&state, op).await else {
                        continue 'pattern;
                    };
                    let Some(text) = value.as_str() else {
                        continue 'pattern;
                    };
                    target.push_str(text);
                }
            }
        }

        // Converting a `String` into a `PathBuf` cannot fail.
        let group_path = PathBuf::from(target);
        if group_path.exists() {
            group.insert(label.clone(), group_path);
        }
    }

    group
}

View File

@@ -6,9 +6,6 @@ pub use s3::*;
pub mod misc;
use chrono::{DateTime, Utc};
use tokio_stream::wrappers::ReceiverStream;
/// A read-only set of [Item]s.
pub trait DataSource {
/// Get an item from this datasource
@@ -18,10 +15,10 @@ pub trait DataSource {
) -> impl Future<Output = Result<Option<crate::value::Item>, std::io::Error>> + Send;
/// Iterate over all items in this source in an arbitrary order
fn iter(&self) -> ReceiverStream<Result<crate::value::Item, std::io::Error>>;
fn iter(&self) -> impl Iterator<Item = &crate::value::Item>;
/// Return the time of the latest change to the data in this source
fn latest_change(
&self,
) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
) -> impl Future<Output = Result<Option<chrono::DateTime<chrono::Utc>>, std::io::Error>> + Send;
}

View File

@@ -1,31 +1,41 @@
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
use chrono::{DateTime, Utc};
use pile_config::{Label, S3Credentials};
use pile_config::{
Label, S3Credentials,
pattern::{GroupPattern, GroupSegment},
};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use tokio_stream::wrappers::ReceiverStream;
use std::{
collections::{HashMap, HashSet},
sync::{Arc, OnceLock},
};
use crate::{source::DataSource, value::Item};
use crate::{
extract::traits::ExtractState,
source::DataSource,
value::{Item, PileValue},
};
// NOTE(review): this listing is a rendered diff whose +/- markers were stripped,
// so fields from before and after the change appear side by side. `sidecars`
// looks like the field being replaced by `pattern` and `index` — confirm
// against the repository before relying on this.
/// A data source backed by the objects of an S3 bucket.
#[derive(Debug)]
pub struct S3DataSource {
/// Label for this source, cloned from the `name` passed to `new`.
pub name: Label,
/// Bucket that is listed and queried.
pub bucket: SmartString<LazyCompact>,
/// Optional key prefix; when set, listing is restricted to it.
pub prefix: Option<SmartString<LazyCompact>>,
pub sidecars: bool,
/// Shared S3 client used for every request.
pub client: Arc<aws_sdk_s3::Client>,
/// Group patterns handed to `resolve_groups` for every listed key.
pub pattern: GroupPattern,
/// Items keyed by `Item::key()`; set once at the end of `new`.
pub index: OnceLock<HashMap<SmartString<LazyCompact>, Item>>,
}
impl S3DataSource {
pub fn new(
pub async fn new(
name: &Label,
bucket: String,
prefix: Option<String>,
endpoint: Option<String>,
region: String,
credentials: &S3Credentials,
sidecars: bool,
) -> Result<Self, std::io::Error> {
pattern: GroupPattern,
) -> Result<Arc<Self>, std::io::Error> {
let client = {
let creds = Credentials::new(
&credentials.access_key_id,
@@ -47,174 +57,118 @@ impl S3DataSource {
aws_sdk_s3::Client::from_conf(s3_config.build())
};
Ok(Self {
let source = Arc::new(Self {
name: name.clone(),
bucket: bucket.into(),
prefix: prefix.map(|x| x.into()),
sidecars,
client: Arc::new(client),
})
}
pattern,
index: OnceLock::new(),
});
async fn find_sidecar_key(&self, key: &str) -> Option<SmartString<LazyCompact>> {
// First try {key}.toml
let full_toml = format!("{key}.toml");
if self
.client
.head_object()
.bucket(self.bucket.as_str())
.key(&full_toml)
.send()
.await
.is_ok()
{
return Some(full_toml.into());
}
//
// MARK: list keys
//
// Then try {key-with-extension-stripped}.toml
let stripped = std::path::Path::new(key).with_extension("toml");
if let Some(stripped_str) = stripped.to_str()
&& stripped_str != full_toml.as_str()
&& self
let mut all_keys: HashSet<SmartString<LazyCompact>> = HashSet::new();
let mut continuation_token: Option<String> = None;
loop {
let mut req = source
.client
.head_object()
.bucket(self.bucket.as_str())
.key(stripped_str)
.send()
.await
.is_ok()
{
return Some(stripped_str.into());
.list_objects_v2()
.bucket(source.bucket.as_str());
if let Some(prefix) = &source.prefix {
req = req.prefix(prefix.as_str());
}
if let Some(token) = continuation_token {
req = req.continuation_token(token);
}
let resp = req.send().await.map_err(std::io::Error::other)?;
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
let is_truncated = resp.is_truncated().unwrap_or(false);
for obj in resp.contents() {
let Some(full_key) = obj.key() else { continue };
let key = strip_prefix(full_key, source.prefix.as_deref());
all_keys.insert(key.into());
}
if !is_truncated {
break;
}
continuation_token = next_token;
}
None
}
//
// MARK: resolve groups
//
async fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
let key: SmartString<LazyCompact> = key.into();
let object_path = match &self.prefix {
Some(x) => format!("{x}/{key}").into(),
None => key.clone(),
};
let mut keys_grouped: HashSet<SmartString<LazyCompact>> = HashSet::new();
for key in &all_keys {
let groups = resolve_groups(&source.pattern, key).await;
for group_key in groups.into_values() {
if all_keys.contains(&group_key) {
keys_grouped.insert(group_key);
}
}
}
let mime = mime_guess::from_path(object_path.as_str()).first_or_octet_stream();
let sidecar = if self.sidecars {
self.find_sidecar_key(object_path.as_str())
.await
.map(|sidecar_key| {
Box::new(Item::S3 {
source: Arc::clone(self),
mime: mime_guess::from_path(sidecar_key.as_str()).first_or_octet_stream(),
key: sidecar_key,
sidecar: None,
})
let mut index = HashMap::new();
for key in all_keys.difference(&keys_grouped) {
let groups = resolve_groups(&source.pattern, key).await;
let group = groups
.into_iter()
.filter(|(_, gk)| all_keys.contains(gk))
.map(|(label, gk)| {
(
label,
Box::new(Item::S3 {
source: Arc::clone(&source),
mime: mime_guess::from_path(gk.as_str()).first_or_octet_stream(),
key: gk,
group: Arc::new(HashMap::new()),
}),
)
})
} else {
None
};
.collect::<HashMap<_, _>>();
Item::S3 {
source: Arc::clone(self),
mime,
key,
sidecar,
let item = Item::S3 {
source: Arc::clone(&source),
mime: mime_guess::from_path(key.as_str()).first_or_octet_stream(),
key: key.clone(),
group: Arc::new(group),
};
index.insert(item.key(), item);
}
source.index.get_or_init(|| index);
Ok(source)
}
}
impl DataSource for Arc<S3DataSource> {
// NOTE(review): diff markers were stripped from this view, so the old and new
// bodies of `get` appear back to back — confirm against the repository.
/// Fetches a single item by key.
#[expect(clippy::expect_used)]
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
// Apparent pre-change body: hides `.toml` keys when sidecars are enabled,
// prepends the prefix, and issues a HEAD request for the object.
if self.sidecars && key.ends_with(".toml") {
return Ok(None);
}
let key: SmartString<LazyCompact> = key.into();
let key = match &self.prefix {
Some(x) => format!("{x}/{key}").into(),
None => key,
};
let result = self
.client
.head_object()
.bucket(self.bucket.as_str())
.key(key.as_str())
.send()
.await;
match result {
Err(sdk_err) => {
// A "not found" service error means the item is absent; any other
// failure is surfaced as an io::Error.
let not_found = sdk_err
.as_service_error()
.map(|e| e.is_not_found())
.unwrap_or(false);
if not_found {
return Ok(None);
}
Err(std::io::Error::other(sdk_err))
}
Ok(_) => Ok(Some(self.make_item(key).await)),
}
// Apparent post-change body: a clone of the indexed item, or `None` when the
// key is absent. Panics if the index was never initialized.
return Ok(self
.index
.get()
.expect("index should be initialized")
.get(key)
.cloned());
}
// NOTE(review): diff markers were stripped from this view, so the old and new
// versions of `iter` appear back to back — confirm against the repository.
// Apparent pre-change `iter`: lists the bucket page by page on a spawned task
// and streams items through a channel holding up to 64 entries.
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
let (tx, rx) = tokio::sync::mpsc::channel(64);
let source = Arc::clone(self);
tokio::spawn(async move {
let mut continuation_token: Option<String> = None;
loop {
let mut req = source
.client
.list_objects_v2()
.bucket(source.bucket.as_str());
if let Some(prefix) = &source.prefix {
req = req.prefix(prefix.as_str());
}
if let Some(token) = continuation_token {
req = req.continuation_token(token);
}
let resp = match req.send().await {
Err(e) => {
// Report the listing failure to the receiver, then stop.
let _ = tx.send(Err(std::io::Error::other(e))).await;
break;
}
Ok(resp) => resp,
};
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
let is_truncated = resp.is_truncated().unwrap_or(false);
for obj in resp.contents() {
let key = match obj.key() {
Some(k) => k.to_owned(),
None => continue,
};
if source.sidecars && key.ends_with(".toml") {
continue;
}
let item = source.make_item(key).await;
// A failed send means the receiver is gone; stop listing.
if tx.send(Ok(item)).await.is_err() {
return;
}
}
if !is_truncated {
break;
}
continuation_token = next_token;
}
});
ReceiverStream::new(rx)
// Apparent post-change `iter`: borrows every item straight from the index.
// Panics if the index was never initialized.
#[expect(clippy::expect_used)]
fn iter(&self) -> impl Iterator<Item = &Item> {
self.index
.get()
.expect("index should be initialized")
.values()
}
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
@@ -264,3 +218,51 @@ impl DataSource for Arc<S3DataSource> {
Ok(ts)
}
}
/// Removes `prefix` and the `/` that separates it from the rest of `key`.
///
/// A prefix that already ends in `/` is removed as-is; otherwise the prefix
/// must be followed by a `/`, which is removed too. `key` is returned
/// untouched when there is no prefix or when it does not start with the
/// prefix-plus-separator.
fn strip_prefix<'a>(key: &'a str, prefix: Option<&str>) -> &'a str {
    let Some(p) = prefix else {
        return key;
    };

    let remainder = key.strip_prefix(p).and_then(|rest| {
        if p.ends_with('/') {
            Some(rest)
        } else {
            // The separator is not part of `p`, so it must lead the remainder.
            rest.strip_prefix('/')
        }
    });

    remainder.unwrap_or(key)
}
/// Resolves every group pattern against `key` and returns the resulting
/// member keys.
///
/// For each `(label, segments)` entry of `pattern.pattern` the segments are
/// concatenated into a target key: literal segments are appended verbatim and
/// path segments are evaluated as queries against `key` wrapped in a string
/// value. A pattern is dropped when a query fails, yields nothing, or yields a
/// value without a string form.
///
/// The returned keys are not checked for existence here.
async fn resolve_groups(
    pattern: &GroupPattern,
    key: &str,
) -> HashMap<Label, SmartString<LazyCompact>> {
    let state = ExtractState { ignore_mime: false };
    // The queried value is identical for every pattern, so build it once.
    let item = PileValue::String(Arc::new(key.into()));

    let mut group = HashMap::new();
    'pattern: for (label, segments) in &pattern.pattern {
        let mut target = String::new();
        for segment in segments {
            match segment {
                GroupSegment::Literal(text) => target.push_str(text),
                GroupSegment::Path(op) => {
                    let Ok(Some(value)) = item.query(&state, op).await else {
                        continue 'pattern;
                    };
                    let Some(text) = value.as_str() else {
                        continue 'pattern;
                    };
                    target.push_str(text);
                }
            }
        }

        group.insert(label.clone(), target.into());
    }

    group
}

View File

@@ -1,6 +1,7 @@
use mime::Mime;
use pile_config::Label;
use smartstring::{LazyCompact, SmartString};
use std::{fs::File, path::PathBuf, sync::Arc};
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
use crate::{
source::{DirDataSource, S3DataSource},
@@ -19,7 +20,7 @@ pub enum Item {
mime: Mime,
path: PathBuf,
sidecar: Option<Box<Item>>,
group: Arc<HashMap<Label, Box<Item>>>,
},
S3 {
@@ -27,7 +28,7 @@ pub enum Item {
mime: Mime,
key: SmartString<LazyCompact>,
sidecar: Option<Box<Item>>,
group: Arc<HashMap<Label, Box<Item>>>,
},
}
@@ -71,7 +72,12 @@ impl Item {
#[expect(clippy::expect_used)]
pub fn key(&self) -> SmartString<LazyCompact> {
match self {
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
Self::File { source, path, .. } => path
.strip_prefix(&source.dir)
.unwrap()
.to_str()
.expect("path is not utf-8")
.into(),
Self::S3 { key, .. } => key.clone(),
}
}
@@ -96,10 +102,10 @@ impl Item {
}
}
pub fn sidecar(&self) -> Option<&Self> {
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
match self {
Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
Self::File { group, .. } => group,
Self::S3 { group, .. } => group,
}
}
}