Extractor refactor, S3 support
Some checks failed
CI / Typos (push) Successful in 1m5s
CI / Clippy (push) Failing after 1m50s
CI / Build and test (push) Successful in 3m1s

This commit is contained in:
2026-03-06 17:49:12 -08:00
parent 77b3125af4
commit aecc84233b
31 changed files with 2676 additions and 675 deletions

1502
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -73,6 +73,10 @@ tantivy = "0.25.0"
# Async & Parallelism # Async & Parallelism
tokio = { version = "1.49.0", features = ["full"] } tokio = { version = "1.49.0", features = ["full"] }
tokio-stream = "0.1"
async-trait = "0.1"
aws-sdk-s3 = "1"
aws-config = "1"
# CLI & logging # CLI & logging
tracing = "0.1.44" tracing = "0.1.44"

View File

@@ -9,8 +9,7 @@ name = "dataset"
# working_dir = ".pile" # working_dir = ".pile"
# Data sources available in this dataset # Data sources available in this dataset
source."music" = { type = "flac", path = ["music", "music-2"] } source."music" = { type = "filesystem", path = "music" }
# This dataset's schema. # This dataset's schema.
# Defines normalized fields that are extracted from source entries on-demand. # Defines normalized fields that are extracted from source entries on-demand.

View File

@@ -46,16 +46,21 @@ pub struct DatasetConfig {
pub post: Vec<FieldSpecPost>, pub post: Vec<FieldSpecPost>,
} }
/// Static credentials used to authenticate against an S3-compatible object store.
///
/// `Debug` is implemented by hand instead of being derived so that the secret
/// key never ends up in logs, panic messages, or debug dumps of the config.
#[derive(Clone, Deserialize)]
pub struct S3Credentials {
    /// Access key id.
    pub access_key_id: String,
    /// Secret access key. Always redacted in `Debug` output.
    pub secret_access_key: String,
}

impl std::fmt::Debug for S3Credentials {
    /// Formats the credentials like the derived impl would, but replaces the
    /// secret key with a fixed placeholder.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Credentials")
            .field("access_key_id", &self.access_key_id)
            .field("secret_access_key", &"<redacted>")
            .finish()
    }
}
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type")] #[serde(tag = "type")]
#[serde(rename_all = "lowercase")] #[serde(rename_all = "lowercase")]
pub enum Source { pub enum Source {
/// A directory files /// A directory of files
Filesystem { Filesystem {
/// The directories to scan. /// The directories to scan.
/// Must be relative. /// Must be relative.
#[serde(alias = "paths")] path: PathBuf,
path: OneOrMany<PathBuf>,
/// If true, all toml files are ignored. /// If true, all toml files are ignored.
/// Metadata can be added to any file using a {filename}.toml. /// Metadata can be added to any file using a {filename}.toml.
@@ -65,6 +70,23 @@ pub enum Source {
#[serde(default = "default_true")] #[serde(default = "default_true")]
sidecars: bool, sidecars: bool,
}, },
/// An S3-compatible object store bucket
S3 {
bucket: String,
prefix: Option<String>,
/// Custom endpoint URL (for MinIO, etc.)
endpoint: Option<String>,
region: String,
credentials: S3Credentials,
/// If true, all .toml objects are treated as sidecar metadata files.
#[serde(default = "default_true")]
sidecars: bool,
},
} }
// //

View File

@@ -20,9 +20,11 @@ tracing = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
toml = { workspace = true } toml = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
rayon = { workspace = true }
smartstring = { workspace = true } smartstring = { workspace = true }
blake3 = { workspace = true } blake3 = { workspace = true }
toml_edit = { workspace = true }
pdf = { workspace = true } pdf = { workspace = true }
id3 = { workspace = true } id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
async-trait = { workspace = true }
aws-sdk-s3 = { workspace = true }

View File

@@ -1,30 +1,17 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use pile_config::{ConfigToml, Label, Source}; use pile_config::{ConfigToml, Label, Source};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use rayon::{ use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
ThreadPoolBuilder,
iter::{IntoParallelIterator, ParallelIterator},
};
use std::{
io::ErrorKind,
path::PathBuf,
sync::{
Arc,
atomic::{AtomicU64, Ordering},
mpsc::Receiver,
},
thread::JoinHandle,
time::Instant,
};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs}; use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
use thiserror::Error; use thiserror::Error;
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
use tracing::{debug, info, trace, warn}; use tracing::{debug, info, trace, warn};
use crate::{ use crate::{
DataSource, FileItem, DataSource, Item,
index::{DbFtsIndex, FtsLookupResult}, index::{DbFtsIndex, FtsLookupResult},
path_ts_earliest, path_ts_earliest,
source::DirDataSource, source::{DirDataSource, S3DataSource},
}; };
#[derive(Debug, Error)] #[derive(Debug, Error)]
@@ -39,15 +26,54 @@ pub enum DatasetError {
NoFtsIndex, NoFtsIndex,
} }
pub struct Dataset { //
// MARK: Dataset enum
//
/// An opened data source — either a local filesystem directory or an S3 bucket.
///
/// Each variant holds its source behind an [`Arc`].
pub enum Dataset {
/// A source backed by a [`DirDataSource`].
Dir(Arc<DirDataSource>),
/// A source backed by an [`S3DataSource`].
S3(Arc<S3DataSource>),
}
impl Dataset {
/// Look up the item stored under `key` in the wrapped source.
///
/// Returns `None` when the source has no item for `key`. Any error the
/// source returns is discarded (`.ok()`) and also yields `None`, so callers
/// cannot distinguish "not found" from "lookup failed".
pub async fn get(&self, key: &str) -> Option<Item> {
match self {
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
Self::S3(ds) => ds.get(key).await.ok().flatten(),
}
}
/// Stream the items of the wrapped source.
///
/// Each element is either an [`Item`] or the I/O error the source produced
/// in its place.
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
match self {
Self::Dir(ds) => ds.iter(),
Self::S3(ds) => ds.iter(),
}
}
/// Forward to the wrapped source's `latest_change`.
///
/// NOTE(review): presumably the time of the most recent modification in
/// this source, with `None` when there is nothing to report — confirm
/// against the source implementations.
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
match self {
Self::Dir(ds) => ds.latest_change().await,
Self::S3(ds) => ds.latest_change().await,
}
}
}
//
// MARK: Datasets collection
//
/// An opened dataset: config, working directory, and all opened sources.
pub struct Datasets {
pub path_config: PathBuf, pub path_config: PathBuf,
pub path_parent: PathBuf, pub path_parent: PathBuf,
pub path_workdir: PathBuf, pub path_workdir: PathBuf,
pub config: ConfigToml, pub config: ConfigToml,
pub sources: HashMap<Label, Dataset>,
} }
impl Dataset { impl Datasets {
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> { pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
let path_config = config.into(); let path_config = config.into();
let path_parent = path_config let path_parent = path_config
@@ -84,11 +110,54 @@ impl Dataset {
.unwrap_or(path_parent.join(".pile")) .unwrap_or(path_parent.join(".pile"))
.join(config.dataset.name.as_str()); .join(config.dataset.name.as_str());
let mut sources = HashMap::new();
for (label, source) in &config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
sources.insert(
label.clone(),
Dataset::Dir(Arc::new(DirDataSource::new(
label,
path_parent.join(path),
*sidecars,
))),
);
}
Source::S3 {
bucket,
prefix,
endpoint,
region,
credentials,
sidecars,
} => {
match S3DataSource::new(
label,
bucket.clone(),
prefix.clone(),
endpoint.clone(),
region.clone(),
credentials,
*sidecars,
) {
Ok(ds) => {
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
}
Err(err) => {
warn!("Could not open S3 source {label}: {err}");
}
}
}
}
}
return Ok(Self { return Ok(Self {
path_config, path_config,
path_parent, path_parent,
path_workdir, path_workdir,
config, config,
sources,
}); });
} }
@@ -96,15 +165,8 @@ impl Dataset {
// MARK: get // MARK: get
// //
pub fn get(&self, source: &Label, key: &PathBuf) -> Option<FileItem> { pub async fn get(&self, source: &Label, key: &str) -> Option<Item> {
let s = self.config.dataset.source.get(source)?; self.sources.get(source)?.get(key).await
let s = match s {
Source::Filesystem { path, sidecars } => {
DirDataSource::new(source, path.clone().to_vec(), *sidecars)
}
};
s.get(key).ok().flatten()
} }
// //
@@ -112,9 +174,9 @@ impl Dataset {
// //
/// Refresh this dataset's fts index. /// Refresh this dataset's fts index.
pub fn fts_refresh( pub async fn fts_refresh(
&self, &self,
threads: usize, _threads: usize,
flag: Option<CancelFlag>, flag: Option<CancelFlag>,
) -> Result<(), CancelableTaskError<DatasetError>> { ) -> Result<(), CancelableTaskError<DatasetError>> {
let fts_tmp_dir = self.path_workdir.join(".tmp-fts"); let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
@@ -134,58 +196,40 @@ impl Dataset {
let mut index_writer: IndexWriter = let mut index_writer: IndexWriter =
index.writer(50 * 1024 * 1024).map_err(DatasetError::from)?; index.writer(50 * 1024 * 1024).map_err(DatasetError::from)?;
let batch_size = 1000;
let (_read_task, read_rx) = start_read_task(&self.config, batch_size);
#[expect(clippy::unwrap_used)]
let write_pool = ThreadPoolBuilder::new()
.num_threads(threads.max(1))
.thread_name(|x| format!("fts_refresh_thread_{x}"))
.build()
.unwrap();
let mut total = 0u64; let mut total = 0u64;
while let Ok(batch) = read_rx.recv() { let mut logged_at = Instant::now();
let batch = batch?;
for (name, dataset) in &self.sources {
info!("Loading source {name}");
let mut stream = dataset.iter();
while let Some(item_result) = stream.next().await {
if let Some(flag) = &flag if let Some(flag) = &flag
&& flag.is_cancelled() && flag.is_cancelled()
{ {
return Err(CancelableTaskError::Cancelled); return Err(CancelableTaskError::Cancelled);
} }
let this = AtomicU64::new(0); let item = item_result.map_err(DatasetError::from)?;
let start = Instant::now(); let key = item.key();
write_pool
.install(|| { match db_index.entry_to_document(&item).await {
batch Ok(Some(doc)) => {
.into_par_iter() index_writer.add_document(doc).map_err(DatasetError::from)?;
.filter_map(|(key, item)| match db_index.entry_to_document(&item) { total += 1;
Ok(Some(doc)) => Some((key, doc)), if logged_at.elapsed().as_secs() >= 5 {
debug!("Indexed {total} documents so far");
logged_at = Instant::now();
}
}
Ok(None) => { Ok(None) => {
warn!("Skipping {key:?}, document is empty"); warn!("Skipping {key:?}, document is empty");
None
} }
Err(err) => { Err(err) => {
warn!("Could not read {key:?}, skipping. {err}"); warn!("Could not read {key:?}, skipping. {err}");
None
} }
}) }
.map(|(key, doc)| { }
this.fetch_add(1, Ordering::Relaxed);
index_writer
.add_document(doc)
.map_err(|err| (key, err))
.map(|_| ())
})
.find_first(|x| x.is_err())
.unwrap_or(Ok(()))
})
.map_err(|(_key, err)| DatasetError::from(err))?;
let this = this.load(Ordering::Relaxed);
total += this;
let time_ms = start.elapsed().as_millis();
debug!("Added a batch of {this} in {time_ms} ms ({total} total)");
} }
if let Some(flag) = flag.as_ref() if let Some(flag) = flag.as_ref()
@@ -194,7 +238,7 @@ impl Dataset {
return Err(CancelableTaskError::Cancelled); return Err(CancelableTaskError::Cancelled);
} }
info!("Committing index"); info!("Committing {total} documents");
index_writer.commit().map_err(DatasetError::from)?; index_writer.commit().map_err(DatasetError::from)?;
if fts_dir.is_dir() { if fts_dir.is_dir() {
@@ -247,19 +291,14 @@ impl Dataset {
} }
/// Time at which data was last modified /// Time at which data was last modified
pub fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> { pub async fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let mut ts: Option<DateTime<Utc>> = None; let mut ts: Option<DateTime<Utc>> = None;
for (label, source) in &self.config.dataset.source { for dataset in self.sources.values() {
match source { match (ts, dataset.latest_change().await?) {
Source::Filesystem { path, sidecars } => {
let s = DirDataSource::new(label, path.clone().to_vec(), *sidecars);
match (ts, s.latest_change()?) {
(_, None) => continue, (_, None) => continue,
(None, Some(new)) => ts = Some(new), (None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)), (Some(old), Some(new)) => ts = Some(old.max(new)),
};
}
} }
} }
@@ -268,10 +307,10 @@ impl Dataset {
/// Returns true if we do not have an fts index, /// Returns true if we do not have an fts index,
/// or if our fts index is older than our data. /// or if our fts index is older than our data.
pub fn needs_fts(&self) -> Result<bool, std::io::Error> { pub async fn needs_fts(&self) -> Result<bool, std::io::Error> {
let start = Instant::now(); let start = Instant::now();
let ts_fts = self.ts_fts()?; let ts_fts = self.ts_fts()?;
let ts_data = self.ts_data()?; let ts_data = self.ts_data().await?;
let result = match (ts_fts, ts_data) { let result = match (ts_fts, ts_data) {
(None, Some(_)) => true, (None, Some(_)) => true,
@@ -292,59 +331,3 @@ impl Dataset {
return Ok(result); return Ok(result);
} }
} }
//
// MARK: read_task
//
/// Spawn a background thread that reads every source in `config` and sends
/// the items it finds to the caller in batches.
///
/// Returns the thread's join handle and the receiving end of a bounded
/// channel. Each message is either a batch of `(PathBuf, FileItem)` pairs or
/// the first error hit while iterating a source; after sending an error the
/// thread stops.
fn start_read_task(
config: &ConfigToml,
batch_size: usize,
) -> (
JoinHandle<()>,
Receiver<Result<Vec<(PathBuf, FileItem)>, DatasetError>>,
) {
// The spawned thread must own its data, so work on a copy of the config.
let config = config.clone();
// Bounded channel: at most two messages are queued before `send` blocks.
let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2);
let read_task = std::thread::spawn(move || {
let mut batch = Vec::with_capacity(batch_size);
for (name, source) in &config.dataset.source {
info!("Loading source {name}");
match source {
Source::Filesystem { path, sidecars } => {
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
for i in source.iter() {
match i {
Ok(x) => batch.push(x),
Err(err) => {
// Forward the error and stop reading. A failed send only means
// the receiver is gone, so its result is ignored.
let err = Err(DatasetError::from(err));
let _ = read_tx.send(err);
return;
}
}
// Ship a full batch and start a fresh one with the same capacity.
if batch.len() >= batch_size {
let b = std::mem::replace(&mut batch, Vec::with_capacity(batch_size));
match read_tx.send(Ok(b)) {
Ok(()) => {}
// Receiver dropped: nobody is listening, stop reading.
Err(_) => return,
};
}
}
}
}
}
// Flush whatever is left over after the last source.
if !batch.is_empty() {
match read_tx.send(Ok(batch)) {
Ok(()) => {}
Err(_) => return,
};
}
});
return (read_task, read_rx);
}

View File

@@ -1,34 +1,40 @@
use pile_config::Label; use pile_config::Label;
use pile_flac::{FlacBlock, FlacReader}; use pile_flac::{FlacBlock, FlacReader};
use std::{collections::HashMap, fs::File, io::BufReader, sync::OnceLock}; use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::Extractor};
pub struct FlacExtractor<'a> { pub struct FlacExtractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<HashMap<Label, PileValue<'a>>>,
} }
impl<'a> FlacExtractor<'a> { impl<'a> FlacExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> { async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
} }
// If this isn't a flac file, ignore it. let key = match self.item {
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") { Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
return Ok(self.output.get_or_init(|| HashMap::new())); Item::S3 { key, .. } => key.to_string(),
};
if !key.ends_with(".flac") {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
} }
let file = File::open(&self.item.path)?; let bytes = self.item.read().await?.read_to_end().await?;
let reader = FlacReader::new(BufReader::new(file)); let reader = FlacReader::new(BufReader::new(std::io::Cursor::new(bytes)));
let mut output: HashMap<Label, Vec<_>> = HashMap::new(); let mut output: HashMap<Label, Vec<_>> = HashMap::new();
for block in reader { for block in reader {
@@ -53,19 +59,22 @@ impl<'a> FlacExtractor<'a> {
.map(|(k, v)| (k, PileValue::Array(v))) .map(|(k, v)| (k, PileValue::Array(v)))
.collect(); .collect();
return Ok(self.output.get_or_init(|| output)); let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
} }
} }
impl Extractor<FileItem> for FlacExtractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for FlacExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) Ok(self.get_inner().await?.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) Ok(self.get_inner().await?.keys().cloned().collect())
} }
} }

View File

@@ -1,50 +1,48 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, path::Component, sync::OnceLock}; use std::{collections::HashMap, path::Component, sync::OnceLock};
use crate::{FileItem, Key, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::Extractor};
pub struct FsExtractor<'a> { pub struct FsExtractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<HashMap<Label, PileValue<'a>>>,
} }
impl<'a> FsExtractor<'a> { impl<'a> FsExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> { fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
} }
let Item::File { path, .. } = self.item else {
return Ok(self.output.get_or_init(HashMap::new));
};
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
let output = HashMap::from([ let output = HashMap::from([
( (
Label::new("extension").unwrap(), Label::new("extension").unwrap(),
self.item path.extension()
.path
.extension()
.and_then(|x| x.to_str()) .and_then(|x| x.to_str())
.map(|x| PileValue::String(x.into())) .map(|x| PileValue::String(x.into()))
.unwrap_or(PileValue::Null), .unwrap_or(PileValue::Null),
), ),
( (
Label::new("path").unwrap(), Label::new("path").unwrap(),
self.item path.to_str()
.path
.to_string()
.map(|x| PileValue::String(x.into())) .map(|x| PileValue::String(x.into()))
.unwrap_or(PileValue::Null), .unwrap_or(PileValue::Null),
), ),
( (
Label::new("segments").unwrap(), Label::new("segments").unwrap(),
self.item path.components()
.path
.components()
.map(|x| match x { .map(|x| match x {
Component::CurDir => Some(".".to_owned()), Component::CurDir => Some(".".to_owned()),
Component::Normal(x) => x.to_str().map(|x| x.to_owned()), Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
@@ -63,15 +61,16 @@ impl<'a> FsExtractor<'a> {
} }
} }
impl Extractor<FileItem> for FsExtractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for FsExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) Ok(self.get_inner()?.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) Ok(self.get_inner()?.keys().cloned().collect())
} }
} }

View File

@@ -1,38 +1,50 @@
use id3::Tag; use id3::Tag;
use pile_config::Label; use pile_config::Label;
use std::{borrow::Cow, collections::HashMap, sync::OnceLock}; use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::Extractor};
pub struct Id3Extractor<'a> { pub struct Id3Extractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<HashMap<Label, PileValue<'a>>>,
} }
impl<'a> Id3Extractor<'a> { impl<'a> Id3Extractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> { async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
} }
let ext = self.item.path.extension().and_then(|x| x.to_str()); let key = match self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
let ext = key.rsplit('.').next();
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) { if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
return Ok(self.output.get_or_init(HashMap::new)); let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
} }
let tag = match Tag::read_from_path(&self.item.path) { let bytes = self.item.read().await?.read_to_end().await?;
let tag = match Tag::read_from2(BufReader::new(std::io::Cursor::new(bytes))) {
Ok(tag) => tag, Ok(tag) => tag,
Err(id3::Error { Err(id3::Error {
kind: id3::ErrorKind::NoTag, kind: id3::ErrorKind::NoTag,
.. ..
}) => return Ok(self.output.get_or_init(HashMap::new)), }) => {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
Err(id3::Error { Err(id3::Error {
kind: id3::ErrorKind::Io(e), kind: id3::ErrorKind::Io(e),
.. ..
@@ -40,7 +52,7 @@ impl<'a> Id3Extractor<'a> {
Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)), Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
}; };
let mut output: HashMap<Label, Vec<PileValue<'a, FileItem>>> = HashMap::new(); let mut output: HashMap<Label, Vec<PileValue<'a>>> = HashMap::new();
for frame in tag.frames() { for frame in tag.frames() {
if let Some(text) = frame.content().text() { if let Some(text) = frame.content().text() {
let name = frame_id_to_field(frame.id()); let name = frame_id_to_field(frame.id());
@@ -58,7 +70,9 @@ impl<'a> Id3Extractor<'a> {
.map(|(k, v)| (k, PileValue::Array(v))) .map(|(k, v)| (k, PileValue::Array(v)))
.collect(); .collect();
return Ok(self.output.get_or_init(|| output)); let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
} }
} }
@@ -66,6 +80,7 @@ impl<'a> Id3Extractor<'a> {
/// Falls back to the lowercased frame ID if no mapping exists. /// Falls back to the lowercased frame ID if no mapping exists.
fn frame_id_to_field(id: &str) -> Cow<'static, str> { fn frame_id_to_field(id: &str) -> Cow<'static, str> {
match id { match id {
// spell:off
"TIT2" => Cow::Borrowed("title"), "TIT2" => Cow::Borrowed("title"),
"TIT1" => Cow::Borrowed("grouping"), "TIT1" => Cow::Borrowed("grouping"),
"TIT3" => Cow::Borrowed("subtitle"), "TIT3" => Cow::Borrowed("subtitle"),
@@ -98,18 +113,20 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
"MVNM" => Cow::Borrowed("movement"), "MVNM" => Cow::Borrowed("movement"),
"MVIN" => Cow::Borrowed("movementnumber"), "MVIN" => Cow::Borrowed("movementnumber"),
_ => Cow::Owned(id.to_lowercase()), _ => Cow::Owned(id.to_lowercase()),
// spell:on
} }
} }
impl Extractor<FileItem> for Id3Extractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for Id3Extractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) Ok(self.get_inner().await?.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) Ok(self.get_inner().await?.keys().cloned().collect())
} }
} }

View File

@@ -1,18 +1,22 @@
use pile_config::Label; use pile_config::Label;
use std::collections::HashMap; use std::collections::HashMap;
use crate::{Item, PileValue, extract::Extractor}; use crate::{PileValue, extract::Extractor};
pub struct MapExtractor<'a, I: Item> { pub struct MapExtractor<'a> {
pub(crate) inner: HashMap<Label, PileValue<'a, I>>, pub(crate) inner: HashMap<Label, PileValue<'a>>,
} }
impl<I: Item> Extractor<I> for MapExtractor<'_, I> { #[async_trait::async_trait]
fn field<'a>(&'a self, name: &Label) -> Result<Option<&'a PileValue<'a, I>>, std::io::Error> { impl Extractor for MapExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.inner.get(name)) Ok(self.inner.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.inner.keys().cloned().collect()) Ok(self.inner.keys().cloned().collect())
} }
} }

View File

@@ -1,5 +1,5 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, rc::Rc}; use std::{collections::HashMap, sync::Arc};
mod flac; mod flac;
pub use flac::*; pub use flac::*;
@@ -13,59 +13,73 @@ pub use fs::*;
mod pdf; mod pdf;
pub use pdf::*; pub use pdf::*;
mod sidecar; mod toml;
pub use sidecar::*; pub use toml::*;
mod map; mod map;
pub use map::*; pub use map::*;
mod sidecar;
pub use sidecar::*;
use crate::Item;
/// An attachment that extracts metadata from an [Item]. /// An attachment that extracts metadata from an [Item].
/// ///
/// Metadata is exposed as an immutable map of {label: value}, /// Metadata is exposed as an immutable map of {label: value},
/// much like a json object. /// much like a json object.
pub trait Extractor<I: crate::Item> { #[async_trait::async_trait]
pub trait Extractor: Send + Sync {
/// Get the field at `name` from `item`. /// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field /// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available /// - returns `Some(Null)` if `name` is not available
fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a, I>>, std::io::Error>; ) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
/// Return all fields in this extractor. /// Return all fields in this extractor.
/// `Self::field` must return [Some] for all these keys /// `Self::field` must return [Some] for all these keys
/// and [None] for all others. /// and [None] for all others.
fn fields(&self) -> Result<Vec<Label>, std::io::Error>; async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
} }
pub struct MetaExtractor<'a, I: crate::Item> { pub struct MetaExtractor<'a> {
inner: MapExtractor<'a, I>, inner: MapExtractor<'a>,
} }
impl<'a> MetaExtractor<'a, crate::FileItem> { //
// MARK: file
//
impl<'a> MetaExtractor<'a> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
pub fn new(item: &'a crate::FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor { let inner = MapExtractor {
inner: HashMap::from([ inner: HashMap::from([
( (
Label::new("flac").unwrap(), Label::new("flac").unwrap(),
crate::PileValue::Extractor(Rc::new(FlacExtractor::new(item))), crate::PileValue::Extractor(Arc::new(FlacExtractor::new(item))),
), ),
( (
Label::new("id3").unwrap(), Label::new("id3").unwrap(),
crate::PileValue::Extractor(Rc::new(Id3Extractor::new(item))), crate::PileValue::Extractor(Arc::new(Id3Extractor::new(item))),
), ),
( (
Label::new("fs").unwrap(), Label::new("fs").unwrap(),
crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))), crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
), ),
( (
Label::new("pdf").unwrap(), Label::new("pdf").unwrap(),
crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))), crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
crate::PileValue::Extractor(Arc::new(TomlExtractor::new(item))),
), ),
( (
Label::new("sidecar").unwrap(), Label::new("sidecar").unwrap(),
crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))), crate::PileValue::Extractor(Arc::new(SidecarExtractor::new(item))),
), ),
]), ]),
}; };
@@ -74,16 +88,17 @@ impl<'a> MetaExtractor<'a, crate::FileItem> {
} }
} }
impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for MetaExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a, crate::FileItem>>, std::io::Error> { ) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error> {
self.inner.field(name) self.inner.field(name).await
} }
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
return Ok(vec![ return Ok(vec![
Label::new("flac").unwrap(), Label::new("flac").unwrap(),
Label::new("id3").unwrap(), Label::new("id3").unwrap(),

View File

@@ -1,5 +1,5 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, rc::Rc}; use std::{collections::HashMap, sync::Arc};
mod pdf_meta; mod pdf_meta;
pub use pdf_meta::*; pub use pdf_meta::*;
@@ -8,26 +8,26 @@ mod pdf_text;
pub use pdf_text::*; pub use pdf_text::*;
use crate::{ use crate::{
FileItem, PileValue, Item, PileValue,
extract::{Extractor, MapExtractor}, extract::{Extractor, MapExtractor},
}; };
pub struct PdfExtractor<'a> { pub struct PdfExtractor<'a> {
inner: MapExtractor<'a, FileItem>, inner: MapExtractor<'a>,
} }
impl<'a> PdfExtractor<'a> { impl<'a> PdfExtractor<'a> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor { let inner = MapExtractor {
inner: HashMap::from([ inner: HashMap::from([
( (
Label::new("text").unwrap(), Label::new("text").unwrap(),
PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))), PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
), ),
( (
Label::new("meta").unwrap(), Label::new("meta").unwrap(),
PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))), PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
), ),
]), ]),
}; };
@@ -36,23 +36,25 @@ impl<'a> PdfExtractor<'a> {
} }
} }
impl Extractor<FileItem> for PdfExtractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for PdfExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
#[expect(clippy::unwrap_used)]
if name.as_str() == "text" { if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() { match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name), PileValue::Extractor(x) => return x.field(name).await,
_ => unreachable!(), _ => unreachable!(),
}; };
} }
self.inner.field(name) self.inner.field(name).await
} }
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![ Ok(vec![
Label::new("text").unwrap(), Label::new("text").unwrap(),
Label::new("meta").unwrap(), Label::new("meta").unwrap(),

View File

@@ -1,40 +1,44 @@
use pdf::file::FileOptions; use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel}; use pdf::primitive::{Date, PdfString, TimeRel};
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{FileItem, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::Extractor};
pub struct PdfMetaExtractor<'a> { pub struct PdfMetaExtractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<HashMap<Label, PileValue<'a>>>,
} }
impl<'a> PdfMetaExtractor<'a> { impl<'a> PdfMetaExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> { async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
} }
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") { let bytes = self.item.read().await?.read_to_end().await?;
return Ok(self.output.get_or_init(|| HashMap::new()));
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
} }
};
let file = FileOptions::cached() let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
.open(&self.item.path)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
if let Some(info) = &file.trailer.info_dict { if let Some(info) = &file.trailer.info_dict {
let fields: &[(&str, Option<&_>)] = &[ let fields: &[(&str, Option<&PdfString>)] = &[
("title", info.title.as_ref()), ("title", info.title.as_ref()),
("author", info.author.as_ref()), ("author", info.author.as_ref()),
("subject", info.subject.as_ref()), ("subject", info.subject.as_ref()),
@@ -88,15 +92,16 @@ fn format_date(d: &Date) -> String {
) )
} }
impl Extractor<FileItem> for PdfMetaExtractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for PdfMetaExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) Ok(self.get_inner().await?.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) Ok(self.get_inner().await?.keys().cloned().collect())
} }
} }

View File

@@ -2,34 +2,38 @@ use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions; use pdf::file::FileOptions;
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{FileItem, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::Extractor};
pub struct PdfTextExtractor<'a> { pub struct PdfTextExtractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<HashMap<Label, PileValue<'a>>>,
} }
impl<'a> PdfTextExtractor<'a> { impl<'a> PdfTextExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> { async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
} }
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") { let bytes = self.item.read().await?.read_to_end().await?;
return Ok(self.output.get_or_init(|| HashMap::new()));
}
let file = FileOptions::cached() let file = match FileOptions::cached().load(bytes) {
.open(&self.item.path) Ok(x) => x,
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut text_parts: Vec<String> = Vec::new(); let mut text_parts: Vec<String> = Vec::new();
@@ -65,19 +69,22 @@ impl<'a> PdfTextExtractor<'a> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]); let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
return Ok(self.output.get_or_init(|| output)); let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
} }
} }
impl Extractor<FileItem> for PdfTextExtractor<'_> { #[async_trait::async_trait]
fn field<'a>( impl Extractor for PdfTextExtractor<'_> {
async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) Ok(self.get_inner().await?.get(name))
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) Ok(self.get_inner().await?.keys().cloned().collect())
} }
} }

View File

@@ -1,71 +1,47 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::sync::OnceLock;
use crate::{FileItem, Item, PileValue, extract::Extractor}; use crate::{
Item, PileValue,
fn toml_to_pile<I: Item>(value: toml::Value) -> PileValue<'static, I> { extract::{Extractor, TomlExtractor},
match value { };
toml::Value::String(s) => PileValue::String(s.into()),
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
toml::Value::Table(_) => PileValue::Null,
}
}
pub struct SidecarExtractor<'a> { pub struct SidecarExtractor<'a> {
item: &'a FileItem, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>, output: OnceLock<Option<TomlExtractor<'a>>>,
} }
impl<'a> SidecarExtractor<'a> { impl<'a> SidecarExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
output: OnceLock::new(), output: OnceLock::new(),
} }
} }
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
} }
let sidecar_file = self.item.path.with_extension("toml"); #[async_trait::async_trait]
impl Extractor for SidecarExtractor<'_> {
if !(sidecar_file.is_file() && self.item.sidecar) { async fn field<'a>(
return Ok(self.output.get_or_init(HashMap::new));
}
let sidecar = std::fs::read(&sidecar_file)?;
let sidecar: toml::Value = match toml::from_slice(&sidecar) {
Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar {
toml::Value::Table(t) => t
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
.collect(),
_ => HashMap::new(),
};
return Ok(self.output.get_or_init(|| output));
}
}
impl Extractor<FileItem> for SidecarExtractor<'_> {
fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name)) match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name).await?),
None => Ok(Some(&PileValue::Null)),
}
} }
fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect()) match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.fields().await?),
None => Ok(Vec::new()),
}
} }
} }

View File

@@ -0,0 +1,66 @@
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{Item, PileValue, extract::Extractor};
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
match value {
toml::Value::String(s) => PileValue::String(s.into()),
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
toml::Value::Table(_) => PileValue::Null,
}
}
pub struct TomlExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> TomlExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let bytes = self.item.read().await?.read_to_end().await?;
let toml: toml::Value = match toml::from_slice(&bytes) {
Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue<'_>> = match toml {
toml::Value::Table(t) => t
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
.collect(),
_ => HashMap::new(),
};
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl Extractor for TomlExtractor<'_> {
    /// Look up one top-level key of the TOML document.
    /// Returns `Ok(None)` when the key is not present.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let parsed = self.get_inner().await?;
        Ok(parsed.get(name))
    }

    /// List every top-level key that was kept from the TOML document.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let parsed = self.get_inner().await?;
        let labels: Vec<Label> = parsed.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -1,6 +1,9 @@
use itertools::Itertools; use itertools::Itertools;
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label}; use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
use std::{path::PathBuf, rc::Rc, sync::LazyLock}; use std::{
path::PathBuf,
sync::{Arc, LazyLock},
};
use tantivy::{ use tantivy::{
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError, DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
collector::Collector, collector::Collector,
@@ -9,7 +12,7 @@ use tantivy::{
}; };
use tracing::{debug, trace, warn}; use tracing::{debug, trace, warn};
use crate::{Item, Key, PileValue, extract::MetaExtractor}; use crate::{Item, PileValue, extract::MetaExtractor};
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct FtsLookupResult { pub struct FtsLookupResult {
@@ -63,37 +66,21 @@ impl DbFtsIndex {
// //
/// Turn an entry into a tantivy document /// Turn an entry into a tantivy document
pub fn entry_to_document<K: Key, I: Item<Key = K>>( pub async fn entry_to_document(
&self, &self,
item: &I, item: &Item,
) -> Result<Option<TantivyDocument>, TantivyError> { ) -> Result<Option<TantivyDocument>, TantivyError> {
let mut doc = TantivyDocument::default(); let mut doc = TantivyDocument::default();
let key = item.key();
let key = match item.key().to_string() {
Some(x) => x,
None => {
warn!(
message = "Item key cannot be converted to a string, skipping",
key = ?item.key(),
);
return Ok(None);
}
};
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name()); doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
doc.add_text(self.schema.get_field("_meta_key")?, key); doc.add_text(self.schema.get_field("_meta_key")?, key);
let item = match item.as_file() { let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item)));
Some(x) => x,
None => return Ok(None),
};
let extractor = MetaExtractor::new(item);
let extractor = PileValue::Extractor(Rc::new(extractor));
let mut empty = true; let mut empty = true;
for name in self.fts_cfg().fields.keys() { for name in self.fts_cfg().fields.keys() {
let x = self.get_field(&extractor, name)?; let x = self.get_field(&extractor, name).await?;
let val = match x { let val = match x {
Some(x) => x, Some(x) => x,
@@ -115,9 +102,9 @@ impl DbFtsIndex {
// MARK: read // MARK: read
// //
pub fn get_field<I: Item>( pub async fn get_field(
&self, &self,
extractor: &PileValue<'_, I>, extractor: &PileValue<'_>,
field_name: &Label, field_name: &Label,
) -> Result<Option<String>, std::io::Error> { ) -> Result<Option<String>, std::io::Error> {
let field = match self.cfg.schema.get(field_name) { let field = match self.cfg.schema.get(field_name) {
@@ -130,7 +117,7 @@ impl DbFtsIndex {
// Try paths in order, using the first value we find // Try paths in order, using the first value we find
'outer: for path in field.path.as_slice() { 'outer: for path in field.path.as_slice() {
let val = match extractor.query(path)? { let val = match extractor.query(path).await? {
Some(x) => x, Some(x) => x,
None => return Ok(None), None => return Ok(None),
}; };
@@ -292,10 +279,7 @@ impl DbFtsIndex {
} }
} }
pub fn apply<'a, I: Item>( pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<'a>> {
post: &FieldSpecPost,
val: &PileValue<'a, I>,
) -> Option<PileValue<'a, I>> {
Some(match post { Some(match post {
FieldSpecPost::NotEmpty { notempty: false } => val.clone(), FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
FieldSpecPost::NotEmpty { notempty: true } => match val { FieldSpecPost::NotEmpty { notempty: true } => match val {

View File

@@ -1,178 +1,222 @@
use pile_config::Label; use smartstring::{LazyCompact, SmartString};
use std::{fmt::Debug, path::PathBuf, rc::Rc}; use std::{fs::File, io::Seek, path::PathBuf, sync::Arc};
use crate::{ use crate::source::{DirDataSource, S3DataSource};
PileValue,
extract::{Extractor, SidecarExtractor},
};
//
// MARK: key
//
pub trait Key: Debug + Clone + Send + Sync + 'static {
/// Convert this key to a string, returning `None`
/// if we encounter any kind of error.
fn to_string(&self) -> Option<String>;
fn from_string(str: &str) -> Option<Self>;
}
impl Key for PathBuf {
fn from_string(str: &str) -> Option<Self> {
str.parse().ok()
}
fn to_string(&self) -> Option<String> {
self.to_str().map(|x| x.to_owned())
}
}
// //
// MARK: item // MARK: item
// //
/// A pointer to raw data #[derive(Debug, Clone)]
pub trait Item: Debug + Send + Sync + 'static + Sized { pub enum Item {
type Key: Key; File {
source: Arc<DirDataSource>,
fn source_name(&self) -> &str; path: PathBuf,
fn key(&self) -> &Self::Key; sidecar: Option<Box<Item>>,
},
/// Get this item's sidecar metadata S3 {
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error>; source: Arc<S3DataSource>,
/// Set this file's sidecar metadata, key: SmartString<LazyCompact>,
/// overwriting any existing file. sidecar: Option<Box<Item>>,
fn write_sidecar( },
&self,
path: Vec<Label>,
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error>;
fn hash(&self) -> Result<blake3::Hash, std::io::Error>;
/// Item conversion, downcast to specific type.
/// Returns `None` if this is not a [FileItem]
fn as_file(&self) -> Option<&FileItem>;
} }
#[derive(Clone, Debug)] impl Item {
pub struct FileItem { /// Open the item for reading. For S3, performs a HEAD request to determine
/// Path to this file. /// the object size.
/// Must be relative to source root dir. pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
pub path: PathBuf, Ok(match self {
pub source_name: Label, Self::File { path, .. } => ItemReader::File(File::open(path)?),
/// If true, look for a sidecar file Self::S3 { source, key, .. } => {
pub sidecar: bool, let head = source
.client
.head_object()
.bucket(source.bucket.as_str())
.key(key.as_str())
.send()
.await
.map_err(std::io::Error::other)?;
let size = head.content_length().unwrap_or(0) as u64;
ItemReader::S3(S3Reader {
client: source.client.clone(),
bucket: source.bucket.clone(),
key: key.to_owned(),
cursor: 0,
size,
})
}
})
} }
impl Item for FileItem { pub fn source_name(&self) -> &pile_config::Label {
type Key = PathBuf; match self {
Self::File { source, .. } => &source.name,
fn source_name(&self) -> &str { Self::S3 { source, .. } => &source.name,
&self.source_name }
} }
fn key(&self) -> &Self::Key { #[expect(clippy::expect_used)]
&self.path pub fn key(&self) -> SmartString<LazyCompact> {
match self {
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
Self::S3 { key, .. } => key.clone(),
}
} }
fn as_file(&self) -> Option<&FileItem> { pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
Some(self) match self {
} Self::File { path, .. } => {
fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
let mut hasher = blake3::Hasher::new(); let mut hasher = blake3::Hasher::new();
let mut file = std::fs::File::open(&self.path)?; let mut file = std::fs::File::open(path)?;
std::io::copy(&mut file, &mut hasher)?; std::io::copy(&mut file, &mut hasher)?;
return Ok(hasher.finalize()); return Ok(hasher.finalize());
} }
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error> { Self::S3 { .. } => todo!(),
if !self.sidecar { }
return Ok(None);
} }
// TODO: use a generic tomlextractor instead? pub fn sidecar(&self) -> Option<&Self> {
// you'll need a fake _ref_ to the toml file, though. match self {
return Ok(Some(Rc::new(SidecarExtractor::new(self)))); Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
}
}
} }
fn write_sidecar( pub enum ItemReader {
&self, File(File),
path: Vec<Label>, S3(S3Reader),
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error> {
if !self.sidecar {
return Ok(());
} }
let sidecar_path = self.path.with_extension("toml"); impl ItemReader {
/// Read a chunk of bytes.
pub async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
match self {
Self::File(x) => std::io::Read::read(x, buf),
Self::S3(x) => x.read(buf).await,
}
}
let mut doc: toml_edit::DocumentMut = if sidecar_path.is_file() { /// Read all remaining bytes into a `Vec`.
let content = std::fs::read_to_string(&sidecar_path)?; pub async fn read_to_end(mut self) -> std::io::Result<Vec<u8>> {
content.parse().unwrap_or_default() match self {
Self::File(mut f) => {
let mut buf = Vec::new();
std::io::Read::read_to_end(&mut f, &mut buf)?;
Ok(buf)
}
Self::S3(ref mut r) => {
let mut buf = Vec::new();
let mut chunk = vec![0u8; 65536];
loop {
let n = r.read(&mut chunk).await?;
if n == 0 {
break;
}
buf.extend_from_slice(&chunk[..n]);
}
Ok(buf)
}
}
}
pub fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
match self {
Self::File(x) => x.seek(pos),
Self::S3(x) => x.seek(pos),
}
}
}
//
// MARK: S3Reader
//
pub struct S3Reader {
client: Arc<aws_sdk_s3::Client>,
bucket: SmartString<LazyCompact>,
key: SmartString<LazyCompact>,
cursor: u64,
size: u64,
}
impl S3Reader {
async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
let len_left = self.size.saturating_sub(self.cursor);
if len_left == 0 || buf.is_empty() {
return Ok(0);
}
let start_byte = self.cursor;
let len_to_read = (buf.len() as u64).min(len_left);
let end_byte = start_byte + len_to_read - 1;
let resp = self
.client
.get_object()
.bucket(self.bucket.as_str())
.key(self.key.as_str())
.range(format!("bytes={start_byte}-{end_byte}"))
.send()
.await
.map_err(std::io::Error::other)?;
let bytes = resp
.body
.collect()
.await
.map(|x| x.into_bytes())
.map_err(std::io::Error::other)?;
let n = bytes.len().min(buf.len());
buf[..n].copy_from_slice(&bytes[..n]);
self.cursor += n as u64;
Ok(n)
}
fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
match pos {
std::io::SeekFrom::Start(x) => self.cursor = x.min(self.size),
std::io::SeekFrom::Current(x) => {
if x < 0 {
let abs = x.unsigned_abs();
if abs > self.cursor {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"cannot seek past start",
));
}
self.cursor -= abs;
} else { } else {
toml_edit::DocumentMut::new() self.cursor += x as u64;
}; }
}
fn to_edit_item(v: toml::Value) -> toml_edit::Item { std::io::SeekFrom::End(x) => {
match v { if x < 0 {
toml::Value::String(s) => toml_edit::value(s), let abs = x.unsigned_abs();
toml::Value::Integer(i) => toml_edit::value(i), if abs > self.size {
toml::Value::Float(f) => toml_edit::value(f), return Err(std::io::Error::new(
toml::Value::Boolean(b) => toml_edit::value(b), std::io::ErrorKind::InvalidInput,
toml::Value::Datetime(d) => toml_edit::value(d.to_string()), "cannot seek past start",
toml::Value::Array(arr) => { ));
let mut array = toml_edit::Array::new();
for item in arr {
if let toml_edit::Item::Value(v) = to_edit_item(item) {
array.push_formatted(v);
} }
} self.cursor = self.size - abs;
toml_edit::Item::Value(toml_edit::Value::Array(array)) } else {
} self.cursor = self.size + x as u64;
toml::Value::Table(t) => {
let mut table = toml_edit::Table::new();
for (k, v) in t {
table.insert(&k, to_edit_item(v));
}
toml_edit::Item::Table(table)
} }
} }
} }
let json_value = value.to_json()?; self.cursor = self.cursor.min(self.size);
let toml_value: toml::Value = serde_json::from_value(json_value) Ok(self.cursor)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
let item = to_edit_item(toml_value);
let Some((path_last, path_init)) = path.split_last() else {
return Ok(());
};
let mut table = doc.as_table_mut();
for label in path_init {
let key = label.as_str();
if !table.contains_key(key) {
table.insert(key, toml_edit::Item::Table(toml_edit::Table::new()));
}
table = table
.get_mut(key)
.and_then(|item| item.as_table_mut())
.ok_or_else(|| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
"path element is not a table",
)
})?;
}
table.insert(path_last.as_str(), item);
std::fs::write(&sidecar_path, doc.to_string())?;
Ok(())
} }
} }

View File

@@ -5,7 +5,7 @@ mod misc;
pub use misc::*; pub use misc::*;
mod dataset; mod dataset;
pub use dataset::*; pub use dataset::{Dataset, DatasetError, Datasets};
mod item; mod item;
pub use item::*; pub use item::*;

View File

@@ -1,35 +1,36 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use itertools::Itertools;
use pile_config::Label; use pile_config::Label;
use std::path::PathBuf; use std::{path::PathBuf, sync::Arc};
use tokio_stream::wrappers::ReceiverStream;
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::{DataSource, item::FileItem, path_ts_latest}; use crate::{DataSource, Item, path_ts_latest};
#[derive(Debug)] #[derive(Debug)]
pub struct DirDataSource { pub struct DirDataSource {
pub name: Label, pub name: Label,
pub dirs: Vec<PathBuf>, pub dir: PathBuf,
pub sidecars: bool, pub sidecars: bool,
} }
impl DirDataSource { impl DirDataSource {
pub fn new(name: &Label, dirs: Vec<PathBuf>, sidecars: bool) -> Self { pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
Self { Self {
name: name.clone(), name: name.clone(),
dirs, dir,
sidecars, sidecars,
} }
} }
} }
impl DataSource for DirDataSource { impl DataSource for Arc<DirDataSource> {
type Key = PathBuf; async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
type Item = FileItem; let key = match key.parse::<PathBuf>() {
type Error = std::io::Error; Ok(x) => self.dir.join(x),
Err(_) => return Ok(None),
};
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
if !key.is_file() { if !key.is_file() {
return Ok(None); return Ok(None);
} }
@@ -39,63 +40,83 @@ impl DataSource for DirDataSource {
return Ok(None); return Ok(None);
} }
return Ok(Some(FileItem { return Ok(Some(Item::File {
source_name: self.name.clone(), source: Arc::clone(self),
path: key.to_owned(), path: key.clone(),
sidecar: self.sidecars, sidecar: self.sidecars.then(|| {
Box::new(Item::File {
source: Arc::clone(self),
path: key.with_extension("toml"),
sidecar: None,
})
}),
})); }));
} }
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> { fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
return self let (tx, rx) = tokio::sync::mpsc::channel(64);
.dirs let source = Arc::clone(self);
.iter()
.flat_map(|x| WalkDir::new(x).into_iter().map_ok(move |d| (x, d))) let dir = self.dir.clone();
.filter_ok(|(_, entry)| !entry.file_type().is_dir()) tokio::task::spawn_blocking(move || {
.filter_map(|x| match x { for entry in WalkDir::new(dir) {
Err(err) => { let entry = match entry {
let msg = format!("other walkdir error: {err:?}"); Err(e) => {
Some(Err(err let msg = format!("walkdir error: {e:?}");
.into_io_error() let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
.unwrap_or(std::io::Error::other(msg)))) if tx.blocking_send(Err(err)).is_err() {
return;
} }
continue;
Ok((_, entry)) => { }
let path = entry.into_path(); Ok(e) => e,
let item = match path.extension().and_then(|x| x.to_str()) {
None => return None,
// Ignore toml if sidecars are enabled
Some("toml") if self.sidecars => return None,
Some(_) => FileItem {
source_name: self.name.clone(),
path: path.clone(),
sidecar: self.sidecars,
},
}; };
Some(Ok((path, item))) if entry.file_type().is_dir() {
}
});
}
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error> {
let mut ts: Option<DateTime<Utc>> = None;
for path in &self.dirs {
if !path.exists() {
continue; continue;
} }
let new = path_ts_latest(path)?; let path = entry.into_path();
let item = match path.extension().and_then(|x| x.to_str()) {
None => continue,
Some("toml") if source.sidecars => continue,
Some(_) => Item::File {
source: Arc::clone(&source),
path: path.clone(),
sidecar: source.sidecars.then(|| {
Box::new(Item::File {
source: Arc::clone(&source),
path: path.with_extension("toml"),
sidecar: None,
})
}),
},
};
if tx.blocking_send(Ok(item)).is_err() {
return;
}
}
});
ReceiverStream::new(rx)
}
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let mut ts: Option<DateTime<Utc>> = None;
if !self.dir.exists() {
return Ok(None);
}
let new = path_ts_latest(&self.dir)?;
match (ts, new) { match (ts, new) {
(_, None) => continue, (_, None) => {}
(None, Some(new)) => ts = Some(new), (None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)), (Some(old), Some(new)) => ts = Some(old.max(new)),
}; };
}
return Ok(ts); return Ok(ts);
} }

View File

@@ -1,2 +1,5 @@
mod dir; mod dir;
pub use dir::*; pub use dir::*;
mod s3;
pub use s3::*;

View File

@@ -0,0 +1,206 @@
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
use chrono::{DateTime, Utc};
use pile_config::{Label, S3Credentials};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use tokio_stream::wrappers::ReceiverStream;
use crate::{DataSource, Item};
/// A data source whose items are objects in an S3 bucket.
#[derive(Debug)]
pub struct S3DataSource {
    /// Name of this source, as given by the caller of [`S3DataSource::new`].
    pub name: Label,

    /// Bucket that holds the objects.
    pub bucket: SmartString<LazyCompact>,

    /// Optional key prefix applied when listing objects.
    pub prefix: Option<SmartString<LazyCompact>>,

    /// When true, keys ending in `.toml` are not returned as items.
    pub sidecars: bool,

    /// Shared S3 client used for every request made by this source.
    pub client: Arc<aws_sdk_s3::Client>,
}

impl S3DataSource {
    /// Build a source for `bucket` using static credentials.
    ///
    /// When `endpoint` is provided, the client targets that URL and uses
    /// path-style addressing. Otherwise the SDK's default endpoint
    /// resolution for `region` applies.
    ///
    /// This constructor performs no network requests and currently always
    /// returns `Ok`.
    pub fn new(
        name: &Label,
        bucket: String,
        prefix: Option<String>,
        endpoint: Option<String>,
        region: String,
        credentials: &S3Credentials,
        sidecars: bool,
    ) -> Result<Self, std::io::Error> {
        let provider = Credentials::new(
            &credentials.access_key_id,
            &credentials.secret_access_key,
            None,
            None,
            "pile",
        );

        let base = aws_sdk_s3::config::Builder::new()
            .behavior_version(BehaviorVersion::latest())
            .region(Region::new(region))
            .credentials_provider(provider);

        // A custom endpoint also switches the client to path-style URLs.
        let builder = match endpoint {
            Some(url) => base.endpoint_url(url).force_path_style(true),
            None => base,
        };

        let client = Arc::new(aws_sdk_s3::Client::from_conf(builder.build()));

        Ok(Self {
            name: name.clone(),
            bucket: bucket.into(),
            prefix: prefix.map(Into::into),
            sidecars,
            client,
        })
    }

    /// Wrap an object key in an [`Item`] that points back at this source.
    fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
        let key = key.into();
        let source = Arc::clone(self);

        Item::S3 {
            source,
            key,
            sidecar: None, // TODO: add sidecars
        }
    }
}
impl DataSource for Arc<S3DataSource> {
    /// Look up a single object by its full key.
    ///
    /// Returns `Ok(None)` when:
    /// - sidecars are enabled and `key` ends in `.toml`,
    /// - a prefix is configured and `key` does not start with it
    ///   (so `get` exposes the same set of objects as `iter`), or
    /// - the object does not exist.
    ///
    /// # Errors
    /// Any other failure of the `HeadObject` request.
    async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
        if self.sidecars && key.ends_with(".toml") {
            return Ok(None);
        }

        let outside_prefix = self
            .prefix
            .as_deref()
            .is_some_and(|prefix| !key.starts_with(prefix));
        if outside_prefix {
            return Ok(None);
        }

        let result = self
            .client
            .head_object()
            .bucket(self.bucket.as_str())
            .key(key)
            .send()
            .await;

        match result {
            Err(sdk_err) => {
                let not_found = sdk_err
                    .as_service_error()
                    .map(|e| e.is_not_found())
                    .unwrap_or(false);
                if not_found {
                    return Ok(None);
                }
                Err(std::io::Error::other(sdk_err))
            }
            Ok(_) => Ok(Some(self.make_item(key))),
        }
    }

    /// Stream every object under the configured prefix.
    ///
    /// Listing runs in a spawned task and pages through `ListObjectsV2`.
    /// A listing error is sent as the final element of the stream.
    /// The task stops early once the receiving side is dropped.
    fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
        let (tx, rx) = tokio::sync::mpsc::channel(64);
        let source = Arc::clone(self);

        tokio::spawn(async move {
            let mut continuation_token: Option<String> = None;

            loop {
                let mut req = source
                    .client
                    .list_objects_v2()
                    .bucket(source.bucket.as_str());

                if let Some(prefix) = &source.prefix {
                    req = req.prefix(prefix.as_str());
                }

                if let Some(token) = continuation_token.take() {
                    req = req.continuation_token(token);
                }

                let resp = match req.send().await {
                    Err(e) => {
                        let _ = tx.send(Err(std::io::Error::other(e))).await;
                        break;
                    }
                    Ok(resp) => resp,
                };

                let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
                let is_truncated = resp.is_truncated().unwrap_or(false);

                for obj in resp.contents() {
                    let Some(key) = obj.key() else {
                        continue;
                    };

                    if source.sidecars && key.ends_with(".toml") {
                        continue;
                    }

                    if tx.send(Ok(source.make_item(key))).await.is_err() {
                        return;
                    }
                }

                // Only continue when there is a token to continue with.
                // A truncated response without one would otherwise restart
                // the listing from the first page and never terminate.
                match next_token {
                    Some(token) if is_truncated => continuation_token = Some(token),
                    _ => break,
                }
            }
        });

        ReceiverStream::new(rx)
    }

    /// Find the most recent `LastModified` timestamp under the configured prefix.
    ///
    /// Returns `Ok(None)` when the bucket does not exist or no listed object
    /// carries a usable timestamp.
    ///
    /// # Errors
    /// Any other failure of a `ListObjectsV2` request. These are reported
    /// and not treated as "no changes", so a transient failure cannot
    /// be mistaken for an empty or unchanged bucket.
    async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
        let mut ts: Option<DateTime<Utc>> = None;
        let mut continuation_token: Option<String> = None;

        loop {
            let mut req = self.client.list_objects_v2().bucket(self.bucket.as_str());

            if let Some(prefix) = &self.prefix {
                req = req.prefix(prefix.as_str());
            }

            if let Some(token) = continuation_token.take() {
                req = req.continuation_token(token);
            }

            let resp = match req.send().await {
                Ok(resp) => resp,
                Err(sdk_err) => {
                    let missing_bucket = sdk_err
                        .as_service_error()
                        .map(|e| e.is_no_such_bucket())
                        .unwrap_or(false);
                    if missing_bucket {
                        return Ok(None);
                    }
                    return Err(std::io::Error::other(sdk_err));
                }
            };

            let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
            let is_truncated = resp.is_truncated().unwrap_or(false);

            for obj in resp.contents() {
                if let Some(last_modified) = obj.last_modified() {
                    let dt = DateTime::from_timestamp(
                        last_modified.secs(),
                        last_modified.subsec_nanos(),
                    );

                    if let Some(dt) = dt {
                        ts = Some(match ts {
                            None => dt,
                            Some(prev) => prev.max(dt),
                        });
                    }
                }
            }

            // See `iter`: never loop without a continuation token.
            match next_token {
                Some(token) if is_truncated => continuation_token = Some(token),
                _ => break,
            }
        }

        Ok(ts)
    }
}

View File

@@ -0,0 +1,158 @@
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
use mime::Mime;
use std::io::{Error as IoError, Seek, SeekFrom, Write};
use thiserror::Error;
use super::S3Client;
use crate::retry;
/// Errors produced while reading from an [`S3Reader`].
#[derive(Debug, Error)]
#[expect(clippy::large_enum_variant)]
pub enum S3ReaderError {
    /// The `GetObject` request failed.
    #[error("sdk error")]
    SdkError(#[from] SdkError<GetObjectError>),
    /// The response body could not be collected.
    #[error("byte stream error")]
    ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
    /// A local I/O operation failed, e.g. writing to the download target.
    #[error("i/o error")]
    IoError(#[from] IoError),
}
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
///
/// Also implements [`std::io::Seek`]
pub struct S3Reader {
pub(super) client: S3Client,
pub(super) bucket: String,
pub(super) key: String,
pub(super) cursor: u64,
pub(super) size: u64,
pub(super) mime: Mime,
}
impl S3Reader {
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
let len_left = self.size - self.cursor;
if len_left == 0 || buf.is_empty() {
return Ok(0);
}
#[expect(clippy::unwrap_used)] // TODO: probably fits?
let start_byte = usize::try_from(self.cursor).unwrap();
#[expect(clippy::unwrap_used)] // usize fits in u64
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
#[expect(clippy::unwrap_used)] // must fit, we called min()
let len_to_read = usize::try_from(len_to_read).unwrap();
let end_byte = start_byte + len_to_read - 1;
let b = retry!(
self.client.retries,
self.client
.client
.get_object()
.bucket(self.bucket.as_str())
.key(self.key.as_str())
.range(format!("bytes={start_byte}-{end_byte}"))
.send()
.await
)?;
// Looks like `bytes 31000000-31999999/33921176``
// println!("{:?}", b.content_range);
let mut bytes = b.body.collect().await?.into_bytes();
bytes.truncate(len_to_read);
let l = bytes.len();
// Memory to memory writes are infallible
#[expect(clippy::unwrap_used)]
buf.write_all(&bytes).unwrap();
// Cannot fail, usize should always fit into u64
#[expect(clippy::unwrap_used)]
{
self.cursor += u64::try_from(l).unwrap();
}
return Ok(len_to_read);
}
pub fn is_done(&self) -> bool {
return self.cursor == self.size;
}
pub fn mime(&self) -> &Mime {
&self.mime
}
/// Write the entire contents of this reader to `r`.
///
/// This method always downloads the whole object,
/// and always preserves `self.cursor`.
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
let pos = self.stream_position()?;
const BUF_LEN: usize = 10_000_000;
#[expect(clippy::unwrap_used)] // Cannot fail
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
while !self.is_done() {
let b = self.read(&mut buf[..]).await?;
r.write_all(&buf[0..b])?;
}
self.seek(SeekFrom::Start(pos))?;
Ok(())
}
}
impl Seek for S3Reader {
    /// Move the cursor and return its new position.
    ///
    /// Positions past the end of the object are clamped to `size`, so the
    /// cursor may rest exactly at end-of-object (where reads return 0 bytes)
    /// and `seek(SeekFrom::End(0))` reports the object's length.
    ///
    /// # Errors
    /// Returns [`std::io::ErrorKind::InvalidInput`] if the target position
    /// would be before the start of the object. The cursor is left
    /// unchanged in that case.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        let before_start = || {
            std::io::Error::new(std::io::ErrorKind::InvalidInput, "cannot seek past start")
        };

        // `unsigned_abs` is used instead of `abs` so `i64::MIN` cannot overflow.
        // Forward seeks saturate; the result is clamped to `size` below anyway.
        let target = match pos {
            SeekFrom::Start(x) => x,
            SeekFrom::Current(x) if x < 0 => self
                .cursor
                .checked_sub(x.unsigned_abs())
                .ok_or_else(before_start)?,
            SeekFrom::Current(x) => self.cursor.saturating_add(x.unsigned_abs()),
            SeekFrom::End(x) if x < 0 => self
                .size
                .checked_sub(x.unsigned_abs())
                .ok_or_else(before_start)?,
            SeekFrom::End(x) => self.size.saturating_add(x.unsigned_abs()),
        };

        // Clamp to `size`, not `size - 1`: end-of-object is a valid position,
        // and `size - 1` would underflow for empty objects.
        self.cursor = target.min(self.size);
        Ok(self.cursor)
    }
}

View File

@@ -1,23 +1,18 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use std::error::Error; use tokio_stream::wrappers::ReceiverStream;
use crate::{Item, Key}; use crate::Item;
/// A read-only set of [Item]s. /// A read-only set of [Item]s.
pub trait DataSource { pub trait DataSource {
/// The type used to retrieve items from this source
/// (e.g, a PathBuf or a primary key)
type Key: Key;
type Item: Item<Key = Self::Key>;
type Error: Error + Sync + Send;
/// Get an item from this datasource /// Get an item from this datasource
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>; fn get(&self, key: &str) -> impl Future<Output = Result<Option<Item>, std::io::Error>> + Send;
/// Iterate over all items in this source in an arbitrary order /// Iterate over all items in this source in an arbitrary order
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>; fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>>;
/// Return the time of the latest change to the data in this source /// Return the time of the latest change to the data in this source
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>; fn latest_change(
&self,
) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
} }

View File

@@ -1,26 +1,25 @@
use std::rc::Rc;
use pile_config::objectpath::{ObjectPath, PathSegment}; use pile_config::objectpath::{ObjectPath, PathSegment};
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::{Item, extract::Extractor}; use crate::extract::Extractor;
/// An immutable, lazily-computed value similar to [serde_json::Value]. /// An immutable, lazily-computed value similar to [serde_json::Value].
pub enum PileValue<'a, I: crate::Item> { pub enum PileValue<'a> {
Null, Null,
/// A string /// A string
String(SmartString<LazyCompact>), String(SmartString<LazyCompact>),
/// An array of values /// An array of values
Array(Vec<PileValue<'a, I>>), Array(Vec<PileValue<'a>>),
/// A lazily-computed map of {label: value} /// A lazily-computed map of {label: value}
Extractor(Rc<dyn Extractor<I> + 'a>), Extractor(Arc<dyn Extractor + 'a>),
} }
impl<I: Item> Clone for PileValue<'_, I> { impl Clone for PileValue<'_> {
fn clone(&self) -> Self { fn clone(&self) -> Self {
match self { match self {
Self::Null => Self::Null, Self::Null => Self::Null,
@@ -31,8 +30,8 @@ impl<I: Item> Clone for PileValue<'_, I> {
} }
} }
impl<'a, I: Item> PileValue<'a, I> { impl<'a> PileValue<'a> {
pub fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> { pub async fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
let mut out = Some(self); let mut out = Some(self);
for s in &query.segments { for s in &query.segments {
@@ -44,7 +43,7 @@ impl<'a, I: Item> PileValue<'a, I> {
Some(Self::Null) => None, Some(Self::Null) => None,
Some(Self::Array(_)) => None, Some(Self::Array(_)) => None,
Some(Self::String(_)) => None, Some(Self::String(_)) => None,
Some(Self::Extractor(e)) => e.field(field)?, Some(Self::Extractor(e)) => e.field(field).await?,
} }
} }
@@ -78,30 +77,29 @@ impl<'a, I: Item> PileValue<'a, I> {
} }
} }
pub fn to_json(&self) -> Result<Value, std::io::Error> { pub async fn to_json(&self) -> Result<Value, std::io::Error> {
Ok(match self { Ok(match self {
Self::Null => Value::Null, Self::Null => Value::Null,
Self::String(x) => Value::String(x.to_string()), Self::String(x) => Value::String(x.to_string()),
Self::Array(x) => Value::Array( Self::Array(x) => {
x.iter() let mut arr = Vec::new();
.map(|x| x.to_json()) for item in x {
.collect::<Result<Vec<_>, _>>()?, arr.push(Box::pin(item.to_json()).await?);
), }
Value::Array(arr)
}
Self::Extractor(e) => { Self::Extractor(e) => {
let keys = e.fields()?; let keys = e.fields().await?;
let map = keys let mut map = Map::new();
.iter() for k in &keys {
.map(|k| { let v = match e.field(k).await? {
#[expect(clippy::expect_used)] Some(x) => x,
let v = e.field(k)?.expect("key must be valid"); None => continue,
};
let v = v.to_json()?; map.insert(k.to_string(), Box::pin(v.to_json()).await?);
Ok((k.to_string(), v)) }
})
.collect::<Result<Map<String, Value>, std::io::Error>>()?;
Value::Object(map) Value::Object(map)
} }
}) })

View File

@@ -15,6 +15,7 @@ pile-config = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
tracing-subscriber = { workspace = true } tracing-subscriber = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
tokio-stream = { workspace = true }
clap = { workspace = true } clap = { workspace = true }
#clap_complete = { workspace = true } #clap_complete = { workspace = true }
serde = { workspace = true } serde = { workspace = true }

View File

@@ -3,9 +3,10 @@ use clap::Args;
use pile_config::{Label, Source}; use pile_config::{Label, Source};
use pile_dataset::index::DbFtsIndex; use pile_dataset::index::DbFtsIndex;
use pile_dataset::source::DirDataSource; use pile_dataset::source::DirDataSource;
use pile_dataset::{DataSource, Dataset, FileItem, Item, PileValue, extract::MetaExtractor}; use pile_dataset::{DataSource, Datasets, Item, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{path::PathBuf, rc::Rc}; use std::{path::PathBuf, sync::Arc};
use tokio_stream::StreamExt;
use tracing::{info, warn}; use tracing::{info, warn};
use crate::{CliCmd, GlobalContext}; use crate::{CliCmd, GlobalContext};
@@ -43,7 +44,7 @@ impl CliCmd for AnnotateCommand {
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?; .ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
let dest_path = Self::parse_dest(&self.dest)?; let dest_path = Self::parse_dest(&self.dest)?;
let ds = Dataset::open(&self.config) let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?; .with_context(|| format!("while opening dataset for {}", self.config.display()))?;
if !ds.config.schema.contains_key(&field) { if !ds.config.schema.contains_key(&field) {
@@ -51,7 +52,7 @@ impl CliCmd for AnnotateCommand {
} }
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config); let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
let mut count = 0u64; let count = 0u64;
for (name, source) in &ds.config.dataset.source { for (name, source) in &ds.config.dataset.source {
match source { match source {
@@ -61,32 +62,41 @@ impl CliCmd for AnnotateCommand {
continue; continue;
} }
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars); let source = Arc::new(DirDataSource::new(name, path.clone(), *sidecars));
for res in source.iter() { let mut stream = source.iter();
let (_key, item) = while let Some(res) = stream.next().await {
res.with_context(|| format!("while reading source {name}"))?; let item = res.with_context(|| format!("while reading source {name}"))?;
let Item::File { path, .. } = &item else {
continue;
};
let meta = MetaExtractor::new(&item); let meta = MetaExtractor::new(&item);
let extractor = PileValue::<FileItem>::Extractor(Rc::new(meta)); let extractor = PileValue::Extractor(Arc::new(meta));
let Some(value) = let Some(value) =
index.get_field(&extractor, &field).with_context(|| { index.get_field(&extractor, &field).await.with_context(|| {
format!("while extracting field from {}", item.path.display()) format!("while extracting field from {}", path.display())
})? })?
else { else {
continue; continue;
}; };
item.write_sidecar(dest_path.clone(), PileValue::String(value.into())) // TODO: implement sidecar writing
.with_context(|| { let _ = (&dest_path, &value);
format!("while writing sidecar for {}", item.path.display()) todo!("write_sidecar not yet implemented");
})?;
#[expect(unreachable_code)]
{
count += 1; count += 1;
} }
} }
} }
Source::S3 { .. } => {
warn!("Source {name} is an S3 source; sidecar annotation is not yet supported");
}
}
} }
info!("Annotated {count} items"); info!("Annotated {count} items");

View File

@@ -1,7 +1,7 @@
use anyhow::{Context, Result, anyhow}; use anyhow::{Context, Result, anyhow};
use clap::Args; use clap::Args;
use pile_config::ConfigToml; use pile_config::ConfigToml;
use pile_dataset::Dataset; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf}; use std::{fmt::Debug, path::PathBuf};
use tracing::{debug, error, info, warn}; use tracing::{debug, error, info, warn};
@@ -43,11 +43,11 @@ impl CliCmd for CheckCommand {
} }
} }
let ds = Dataset::open(&self.config) let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?; .with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let ts_fts = ds.ts_fts().context("while determining fts age")?; let ts_fts = ds.ts_fts().context("while determining fts age")?;
let ts_data = ds.ts_data().context("while determining data age")?; let ts_data = ds.ts_data().await.context("while determining data age")?;
match (ts_fts, ts_data) { match (ts_fts, ts_data) {
(None, Some(_)) => warn!("Could not determine fts age"), (None, Some(_)) => warn!("Could not determine fts age"),

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_dataset::Dataset; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf}; use std::{fmt::Debug, path::PathBuf};
@@ -23,10 +23,10 @@ impl CliCmd for IndexCommand {
_ctx: GlobalContext, _ctx: GlobalContext,
flag: CancelFlag, flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> { ) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Dataset::open(&self.config) let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?; .with_context(|| format!("while opening dataset for {}", self.config.display()))?;
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| { ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
x.map_err(|x| { x.map_err(|x| {
anyhow::Error::from(x).context(format!( anyhow::Error::from(x).context(format!(
"while refreshing fts for {}", "while refreshing fts for {}",

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_dataset::Dataset; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf}; use std::{fmt::Debug, path::PathBuf};
use tracing::info; use tracing::info;
@@ -39,12 +39,12 @@ impl CliCmd for LookupCommand {
_ctx: GlobalContext, _ctx: GlobalContext,
flag: CancelFlag, flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> { ) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Dataset::open(&self.config) let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?; .with_context(|| format!("while opening dataset for {}", self.config.display()))?;
if self.refresh && ds.needs_fts().context("while checking dataset fts")? { if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
info!("FTS index is missing or out-of-date, regenerating"); info!("FTS index is missing or out-of-date, regenerating");
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| { ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
x.map_err(|x| { x.map_err(|x| {
anyhow::Error::from(x).context(format!( anyhow::Error::from(x).context(format!(
"while refreshing fts for {}", "while refreshing fts for {}",

View File

@@ -1,16 +1,23 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_config::Label; use pile_config::Label;
use pile_dataset::{FileItem, PileValue, extract::MetaExtractor}; use pile_dataset::{Datasets, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf, rc::Rc}; use std::{path::PathBuf, sync::Arc};
use crate::{CliCmd, GlobalContext}; use crate::{CliCmd, GlobalContext};
#[derive(Debug, Args)] #[derive(Debug, Args)]
pub struct ProbeCommand { pub struct ProbeCommand {
/// The file to probe /// Source name (as defined in pile.toml)
file: PathBuf, source: String,
/// Item key within the source
key: String,
/// Path to dataset config
#[arg(long, short = 'c', default_value = "./pile.toml")]
config: PathBuf,
} }
impl CliCmd for ProbeCommand { impl CliCmd for ProbeCommand {
@@ -21,19 +28,23 @@ impl CliCmd for ProbeCommand {
_ctx: GlobalContext, _ctx: GlobalContext,
_flag: CancelFlag, _flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> { ) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let item = FileItem { let source = Label::new(&self.source)
path: self.file.clone(), .ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
source_name: Label::new("probe-source").unwrap(),
sidecar: true,
};
let value = PileValue::Extractor(Rc::new(MetaExtractor::new(&item))); let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let item = ds.get(&source, &self.key).await.ok_or_else(|| {
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
})?;
let value = PileValue::Extractor(Arc::new(MetaExtractor::new(&item)));
let json = value let json = value
.to_json() .to_json()
.with_context(|| format!("while extracting {}", self.file.display()))?; .await
.with_context(|| format!("while extracting {}", self.key))?;
let json = serde_json::to_string_pretty(&json).unwrap(); let json = serde_json::to_string_pretty(&json).unwrap();
println!("{json}"); println!("{json}");
return Ok(0); return Ok(0);
} }