Extractor refactor, S3 support
Some checks failed
CI / Typos (push) Successful in 1m5s
CI / Clippy (push) Failing after 1m50s
CI / Build and test (push) Successful in 3m1s

This commit is contained in:
2026-03-06 17:49:12 -08:00
parent 77b3125af4
commit aecc84233b
31 changed files with 2676 additions and 675 deletions

1502
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -73,6 +73,10 @@ tantivy = "0.25.0"
# Async & Parallelism
tokio = { version = "1.49.0", features = ["full"] }
tokio-stream = "0.1"
async-trait = "0.1"
aws-sdk-s3 = "1"
aws-config = "1"
# CLI & logging
tracing = "0.1.44"

View File

@@ -9,8 +9,7 @@ name = "dataset"
# working_dir = ".pile"
# Data sources available in this dataset
source."music" = { type = "flac", path = ["music", "music-2"] }
source."music" = { type = "filesystem", path = "music" }
# This dataset's schema.
# Defines normalized fields that are extracted from source entries on-demand.

View File

@@ -46,16 +46,21 @@ pub struct DatasetConfig {
pub post: Vec<FieldSpecPost>,
}
/// Static access-key credentials for an S3 source.
///
/// `Debug` is implemented by hand so that the secret key is never printed
/// when a config or source is logged with `{:?}`.
#[derive(Clone, Deserialize)]
pub struct S3Credentials {
    /// Access key id. Not secret, so it is safe to show in diagnostics.
    pub access_key_id: String,
    /// Secret access key. Redacted in `Debug` output.
    pub secret_access_key: String,
}

impl std::fmt::Debug for S3Credentials {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Credentials")
            .field("access_key_id", &self.access_key_id)
            // Deliberately hide the secret; only show that a value is present.
            .field("secret_access_key", &"<redacted>")
            .finish()
    }
}
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type")]
#[serde(rename_all = "lowercase")]
pub enum Source {
/// A directory files
/// A directory of files
Filesystem {
/// The directory to scan.
/// Must be relative.
#[serde(alias = "paths")]
path: OneOrMany<PathBuf>,
path: PathBuf,
/// If true, all toml files are ignored.
/// Metadata can be added to any file using a {filename}.toml.
@@ -65,6 +70,23 @@ pub enum Source {
#[serde(default = "default_true")]
sidecars: bool,
},
/// An S3-compatible object store bucket
S3 {
bucket: String,
prefix: Option<String>,
/// Custom endpoint URL (for MinIO, etc.)
endpoint: Option<String>,
region: String,
credentials: S3Credentials,
/// If true, all .toml objects are treated as sidecar metadata files.
#[serde(default = "default_true")]
sidecars: bool,
},
}
//

View File

@@ -20,9 +20,11 @@ tracing = { workspace = true }
chrono = { workspace = true }
toml = { workspace = true }
thiserror = { workspace = true }
rayon = { workspace = true }
smartstring = { workspace = true }
blake3 = { workspace = true }
toml_edit = { workspace = true }
pdf = { workspace = true }
id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
async-trait = { workspace = true }
aws-sdk-s3 = { workspace = true }

View File

@@ -1,30 +1,17 @@
use chrono::{DateTime, Utc};
use pile_config::{ConfigToml, Label, Source};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use rayon::{
ThreadPoolBuilder,
iter::{IntoParallelIterator, ParallelIterator},
};
use std::{
io::ErrorKind,
path::PathBuf,
sync::{
Arc,
atomic::{AtomicU64, Ordering},
mpsc::Receiver,
},
thread::JoinHandle,
time::Instant,
};
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
use thiserror::Error;
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
use tracing::{debug, info, trace, warn};
use crate::{
DataSource, FileItem,
DataSource, Item,
index::{DbFtsIndex, FtsLookupResult},
path_ts_earliest,
source::DirDataSource,
source::{DirDataSource, S3DataSource},
};
#[derive(Debug, Error)]
@@ -39,15 +26,54 @@ pub enum DatasetError {
NoFtsIndex,
}
pub struct Dataset {
//
// MARK: Dataset enum
//
/// An opened data source — either a local filesystem directory or an S3 bucket.
///
/// Each variant holds its source behind an [`Arc`].
pub enum Dataset {
/// A source backed by a [`DirDataSource`].
Dir(Arc<DirDataSource>),
/// A source backed by an [`S3DataSource`].
S3(Arc<S3DataSource>),
}
impl Dataset {
pub async fn get(&self, key: &str) -> Option<Item> {
match self {
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
Self::S3(ds) => ds.get(key).await.ok().flatten(),
}
}
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
match self {
Self::Dir(ds) => ds.iter(),
Self::S3(ds) => ds.iter(),
}
}
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
match self {
Self::Dir(ds) => ds.latest_change().await,
Self::S3(ds) => ds.latest_change().await,
}
}
}
//
// MARK: Datasets collection
//
/// An opened dataset: config, working directory, and all opened sources.
pub struct Datasets {
/// Path to the config file, as passed to `open`.
pub path_config: PathBuf,
/// Derived from `path_config` in `open`; relative filesystem source paths
/// are joined onto it. Presumably the config file's parent directory —
/// TODO confirm against the full `open` body.
pub path_parent: PathBuf,
/// Working directory for this dataset. `open` falls back to
/// `<path_parent>/.pile` and then appends the dataset name.
pub path_workdir: PathBuf,
/// The dataset configuration.
pub config: ConfigToml,
/// Opened sources, keyed by their label in the config.
/// An S3 source that fails to open is logged and left out of this map.
pub sources: HashMap<Label, Dataset>,
}
impl Dataset {
impl Datasets {
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
let path_config = config.into();
let path_parent = path_config
@@ -84,11 +110,54 @@ impl Dataset {
.unwrap_or(path_parent.join(".pile"))
.join(config.dataset.name.as_str());
let mut sources = HashMap::new();
for (label, source) in &config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
sources.insert(
label.clone(),
Dataset::Dir(Arc::new(DirDataSource::new(
label,
path_parent.join(path),
*sidecars,
))),
);
}
Source::S3 {
bucket,
prefix,
endpoint,
region,
credentials,
sidecars,
} => {
match S3DataSource::new(
label,
bucket.clone(),
prefix.clone(),
endpoint.clone(),
region.clone(),
credentials,
*sidecars,
) {
Ok(ds) => {
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
}
Err(err) => {
warn!("Could not open S3 source {label}: {err}");
}
}
}
}
}
return Ok(Self {
path_config,
path_parent,
path_workdir,
config,
sources,
});
}
@@ -96,15 +165,8 @@ impl Dataset {
// MARK: get
//
pub fn get(&self, source: &Label, key: &PathBuf) -> Option<FileItem> {
let s = self.config.dataset.source.get(source)?;
let s = match s {
Source::Filesystem { path, sidecars } => {
DirDataSource::new(source, path.clone().to_vec(), *sidecars)
}
};
s.get(key).ok().flatten()
/// Look up `key` in the source registered under the label `source`.
///
/// Returns `None` when no such source is open, or when the source itself
/// yields nothing for `key`.
pub async fn get(&self, source: &Label, key: &str) -> Option<Item> {
    let dataset = self.sources.get(source)?;
    dataset.get(key).await
}
//
@@ -112,9 +174,9 @@ impl Dataset {
//
/// Refresh this dataset's fts index.
pub fn fts_refresh(
pub async fn fts_refresh(
&self,
threads: usize,
_threads: usize,
flag: Option<CancelFlag>,
) -> Result<(), CancelableTaskError<DatasetError>> {
let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
@@ -134,58 +196,40 @@ impl Dataset {
let mut index_writer: IndexWriter =
index.writer(50 * 1024 * 1024).map_err(DatasetError::from)?;
let batch_size = 1000;
let (_read_task, read_rx) = start_read_task(&self.config, batch_size);
#[expect(clippy::unwrap_used)]
let write_pool = ThreadPoolBuilder::new()
.num_threads(threads.max(1))
.thread_name(|x| format!("fts_refresh_thread_{x}"))
.build()
.unwrap();
let mut total = 0u64;
while let Ok(batch) = read_rx.recv() {
let batch = batch?;
if let Some(flag) = &flag
&& flag.is_cancelled()
{
return Err(CancelableTaskError::Cancelled);
let mut logged_at = Instant::now();
for (name, dataset) in &self.sources {
info!("Loading source {name}");
let mut stream = dataset.iter();
while let Some(item_result) = stream.next().await {
if let Some(flag) = &flag
&& flag.is_cancelled()
{
return Err(CancelableTaskError::Cancelled);
}
let item = item_result.map_err(DatasetError::from)?;
let key = item.key();
match db_index.entry_to_document(&item).await {
Ok(Some(doc)) => {
index_writer.add_document(doc).map_err(DatasetError::from)?;
total += 1;
if logged_at.elapsed().as_secs() >= 5 {
debug!("Indexed {total} documents so far");
logged_at = Instant::now();
}
}
Ok(None) => {
warn!("Skipping {key:?}, document is empty");
}
Err(err) => {
warn!("Could not read {key:?}, skipping. {err}");
}
}
}
let this = AtomicU64::new(0);
let start = Instant::now();
write_pool
.install(|| {
batch
.into_par_iter()
.filter_map(|(key, item)| match db_index.entry_to_document(&item) {
Ok(Some(doc)) => Some((key, doc)),
Ok(None) => {
warn!("Skipping {key:?}, document is empty");
None
}
Err(err) => {
warn!("Could not read {key:?}, skipping. {err}");
None
}
})
.map(|(key, doc)| {
this.fetch_add(1, Ordering::Relaxed);
index_writer
.add_document(doc)
.map_err(|err| (key, err))
.map(|_| ())
})
.find_first(|x| x.is_err())
.unwrap_or(Ok(()))
})
.map_err(|(_key, err)| DatasetError::from(err))?;
let this = this.load(Ordering::Relaxed);
total += this;
let time_ms = start.elapsed().as_millis();
debug!("Added a batch of {this} in {time_ms} ms ({total} total)");
}
if let Some(flag) = flag.as_ref()
@@ -194,7 +238,7 @@ impl Dataset {
return Err(CancelableTaskError::Cancelled);
}
info!("Committing index");
info!("Committing {total} documents");
index_writer.commit().map_err(DatasetError::from)?;
if fts_dir.is_dir() {
@@ -247,19 +291,14 @@ impl Dataset {
}
/// Time at which data was last modified
pub fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
pub async fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let mut ts: Option<DateTime<Utc>> = None;
for (label, source) in &self.config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
let s = DirDataSource::new(label, path.clone().to_vec(), *sidecars);
match (ts, s.latest_change()?) {
(_, None) => continue,
(None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)),
};
}
for dataset in self.sources.values() {
match (ts, dataset.latest_change().await?) {
(_, None) => continue,
(None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)),
}
}
@@ -268,10 +307,10 @@ impl Dataset {
/// Returns true if we do not have an fts index,
/// or if our fts index is older than our data.
pub fn needs_fts(&self) -> Result<bool, std::io::Error> {
pub async fn needs_fts(&self) -> Result<bool, std::io::Error> {
let start = Instant::now();
let ts_fts = self.ts_fts()?;
let ts_data = self.ts_data()?;
let ts_data = self.ts_data().await?;
let result = match (ts_fts, ts_data) {
(None, Some(_)) => true,
@@ -292,59 +331,3 @@ impl Dataset {
return Ok(result);
}
}
//
// MARK: read_task
//
fn start_read_task(
config: &ConfigToml,
batch_size: usize,
) -> (
JoinHandle<()>,
Receiver<Result<Vec<(PathBuf, FileItem)>, DatasetError>>,
) {
let config = config.clone();
let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2);
let read_task = std::thread::spawn(move || {
let mut batch = Vec::with_capacity(batch_size);
for (name, source) in &config.dataset.source {
info!("Loading source {name}");
match source {
Source::Filesystem { path, sidecars } => {
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
for i in source.iter() {
match i {
Ok(x) => batch.push(x),
Err(err) => {
let err = Err(DatasetError::from(err));
let _ = read_tx.send(err);
return;
}
}
if batch.len() >= batch_size {
let b = std::mem::replace(&mut batch, Vec::with_capacity(batch_size));
match read_tx.send(Ok(b)) {
Ok(()) => {}
Err(_) => return,
};
}
}
}
}
}
if !batch.is_empty() {
match read_tx.send(Ok(batch)) {
Ok(()) => {}
Err(_) => return,
};
}
});
return (read_task, read_rx);
}

View File

@@ -1,34 +1,40 @@
use pile_config::Label;
use pile_flac::{FlacBlock, FlacReader};
use std::{collections::HashMap, fs::File, io::BufReader, sync::OnceLock};
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
pub struct FlacExtractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> FlacExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
// If this isn't a flac file, ignore it.
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") {
return Ok(self.output.get_or_init(|| HashMap::new()));
let key = match self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
if !key.ends_with(".flac") {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
let file = File::open(&self.item.path)?;
let reader = FlacReader::new(BufReader::new(file));
let bytes = self.item.read().await?.read_to_end().await?;
let reader = FlacReader::new(BufReader::new(std::io::Cursor::new(bytes)));
let mut output: HashMap<Label, Vec<_>> = HashMap::new();
for block in reader {
@@ -53,19 +59,22 @@ impl<'a> FlacExtractor<'a> {
.map(|(k, v)| (k, PileValue::Array(v)))
.collect();
return Ok(self.output.get_or_init(|| output));
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
}
impl Extractor<FileItem> for FlacExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for FlacExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -1,50 +1,48 @@
use pile_config::Label;
use std::{collections::HashMap, path::Component, sync::OnceLock};
use crate::{FileItem, Key, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
pub struct FsExtractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> FsExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let Item::File { path, .. } = self.item else {
return Ok(self.output.get_or_init(HashMap::new));
};
#[expect(clippy::unwrap_used)]
let output = HashMap::from([
(
Label::new("extension").unwrap(),
self.item
.path
.extension()
path.extension()
.and_then(|x| x.to_str())
.map(|x| PileValue::String(x.into()))
.unwrap_or(PileValue::Null),
),
(
Label::new("path").unwrap(),
self.item
.path
.to_string()
path.to_str()
.map(|x| PileValue::String(x.into()))
.unwrap_or(PileValue::Null),
),
(
Label::new("segments").unwrap(),
self.item
.path
.components()
path.components()
.map(|x| match x {
Component::CurDir => Some(".".to_owned()),
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
@@ -63,15 +61,16 @@ impl<'a> FsExtractor<'a> {
}
}
impl Extractor<FileItem> for FsExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for FsExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
}
}

View File

@@ -1,38 +1,50 @@
use id3::Tag;
use pile_config::Label;
use std::{borrow::Cow, collections::HashMap, sync::OnceLock};
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
pub struct Id3Extractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> Id3Extractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let ext = self.item.path.extension().and_then(|x| x.to_str());
let key = match self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
let ext = key.rsplit('.').next();
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
return Ok(self.output.get_or_init(HashMap::new));
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
let tag = match Tag::read_from_path(&self.item.path) {
let bytes = self.item.read().await?.read_to_end().await?;
let tag = match Tag::read_from2(BufReader::new(std::io::Cursor::new(bytes))) {
Ok(tag) => tag,
Err(id3::Error {
kind: id3::ErrorKind::NoTag,
..
}) => return Ok(self.output.get_or_init(HashMap::new)),
}) => {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
@@ -40,7 +52,7 @@ impl<'a> Id3Extractor<'a> {
Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
};
let mut output: HashMap<Label, Vec<PileValue<'a, FileItem>>> = HashMap::new();
let mut output: HashMap<Label, Vec<PileValue<'a>>> = HashMap::new();
for frame in tag.frames() {
if let Some(text) = frame.content().text() {
let name = frame_id_to_field(frame.id());
@@ -58,7 +70,9 @@ impl<'a> Id3Extractor<'a> {
.map(|(k, v)| (k, PileValue::Array(v)))
.collect();
return Ok(self.output.get_or_init(|| output));
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
}
@@ -66,6 +80,7 @@ impl<'a> Id3Extractor<'a> {
/// Falls back to the lowercased frame ID if no mapping exists.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
match id {
// spell:off
"TIT2" => Cow::Borrowed("title"),
"TIT1" => Cow::Borrowed("grouping"),
"TIT3" => Cow::Borrowed("subtitle"),
@@ -98,18 +113,20 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
"MVNM" => Cow::Borrowed("movement"),
"MVIN" => Cow::Borrowed("movementnumber"),
_ => Cow::Owned(id.to_lowercase()),
// spell:on
}
}
impl Extractor<FileItem> for Id3Extractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for Id3Extractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -1,18 +1,22 @@
use pile_config::Label;
use std::collections::HashMap;
use crate::{Item, PileValue, extract::Extractor};
use crate::{PileValue, extract::Extractor};
pub struct MapExtractor<'a, I: Item> {
pub(crate) inner: HashMap<Label, PileValue<'a, I>>,
pub struct MapExtractor<'a> {
pub(crate) inner: HashMap<Label, PileValue<'a>>,
}
impl<I: Item> Extractor<I> for MapExtractor<'_, I> {
fn field<'a>(&'a self, name: &Label) -> Result<Option<&'a PileValue<'a, I>>, std::io::Error> {
#[async_trait::async_trait]
impl Extractor for MapExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.inner.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.inner.keys().cloned().collect())
}
}

View File

@@ -1,5 +1,5 @@
use pile_config::Label;
use std::{collections::HashMap, rc::Rc};
use std::{collections::HashMap, sync::Arc};
mod flac;
pub use flac::*;
@@ -13,59 +13,73 @@ pub use fs::*;
mod pdf;
pub use pdf::*;
mod sidecar;
pub use sidecar::*;
mod toml;
pub use toml::*;
mod map;
pub use map::*;
mod sidecar;
pub use sidecar::*;
use crate::Item;
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
pub trait Extractor<I: crate::Item> {
#[async_trait::async_trait]
pub trait Extractor: Send + Sync {
/// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available
fn field<'a>(
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a, I>>, std::io::Error>;
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
/// Return all fields in this extractor.
/// `Self::field` must return [Some] for all these keys
/// and [None] for all others.
fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
}
pub struct MetaExtractor<'a, I: crate::Item> {
inner: MapExtractor<'a, I>,
pub struct MetaExtractor<'a> {
inner: MapExtractor<'a>,
}
impl<'a> MetaExtractor<'a, crate::FileItem> {
//
// MARK: file
//
impl<'a> MetaExtractor<'a> {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a crate::FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("flac").unwrap(),
crate::PileValue::Extractor(Rc::new(FlacExtractor::new(item))),
crate::PileValue::Extractor(Arc::new(FlacExtractor::new(item))),
),
(
Label::new("id3").unwrap(),
crate::PileValue::Extractor(Rc::new(Id3Extractor::new(item))),
crate::PileValue::Extractor(Arc::new(Id3Extractor::new(item))),
),
(
Label::new("fs").unwrap(),
crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))),
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))),
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
crate::PileValue::Extractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))),
crate::PileValue::Extractor(Arc::new(SidecarExtractor::new(item))),
),
]),
};
@@ -74,16 +88,17 @@ impl<'a> MetaExtractor<'a, crate::FileItem> {
}
}
impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for MetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a, crate::FileItem>>, std::io::Error> {
self.inner.field(name)
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error> {
self.inner.field(name).await
}
#[expect(clippy::unwrap_used)]
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
return Ok(vec![
Label::new("flac").unwrap(),
Label::new("id3").unwrap(),
@@ -92,4 +107,4 @@ impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> {
Label::new("sidecar").unwrap(),
]);
}
}
}

View File

@@ -1,5 +1,5 @@
use pile_config::Label;
use std::{collections::HashMap, rc::Rc};
use std::{collections::HashMap, sync::Arc};
mod pdf_meta;
pub use pdf_meta::*;
@@ -8,26 +8,26 @@ mod pdf_text;
pub use pdf_text::*;
use crate::{
FileItem, PileValue,
Item, PileValue,
extract::{Extractor, MapExtractor},
};
pub struct PdfExtractor<'a> {
inner: MapExtractor<'a, FileItem>,
inner: MapExtractor<'a>,
}
impl<'a> PdfExtractor<'a> {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("text").unwrap(),
PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))),
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
),
(
Label::new("meta").unwrap(),
PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))),
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
),
]),
};
@@ -36,23 +36,25 @@ impl<'a> PdfExtractor<'a> {
}
}
impl Extractor<FileItem> for PdfExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for PdfExtractor<'_> {
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
#[expect(clippy::unwrap_used)]
if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name),
PileValue::Extractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
self.inner.field(name)
self.inner.field(name).await
}
#[expect(clippy::unwrap_used)]
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),

View File

@@ -1,40 +1,44 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pdf::primitive::{Date, PdfString, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{FileItem, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
pub struct PdfMetaExtractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfMetaExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
return Ok(self.output.get_or_init(|| HashMap::new()));
}
let bytes = self.item.read().await?.read_to_end().await?;
let file = FileOptions::cached()
.open(&self.item.path)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
if let Some(info) = &file.trailer.info_dict {
let fields: &[(&str, Option<&_>)] = &[
let fields: &[(&str, Option<&PdfString>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
@@ -88,15 +92,16 @@ fn format_date(d: &Date) -> String {
)
}
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for PdfMetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -2,34 +2,38 @@ use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{FileItem, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
pub struct PdfTextExtractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfTextExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
return Ok(self.output.get_or_init(|| HashMap::new()));
}
let bytes = self.item.read().await?.read_to_end().await?;
let file = FileOptions::cached()
.open(&self.item.path)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut text_parts: Vec<String> = Vec::new();
@@ -65,19 +69,22 @@ impl<'a> PdfTextExtractor<'a> {
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
return Ok(self.output.get_or_init(|| output));
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
}
impl Extractor<FileItem> for PdfTextExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for PdfTextExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -1,71 +1,47 @@
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use std::sync::OnceLock;
use crate::{FileItem, Item, PileValue, extract::Extractor};
fn toml_to_pile<I: Item>(value: toml::Value) -> PileValue<'static, I> {
match value {
toml::Value::String(s) => PileValue::String(s.into()),
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
toml::Value::Table(_) => PileValue::Null,
}
}
use crate::{
Item, PileValue,
extract::{Extractor, TomlExtractor},
};
pub struct SidecarExtractor<'a> {
item: &'a FileItem,
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
item: &'a Item,
output: OnceLock<Option<TomlExtractor<'a>>>,
}
impl<'a> SidecarExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let sidecar_file = self.item.path.with_extension("toml");
if !(sidecar_file.is_file() && self.item.sidecar) {
return Ok(self.output.get_or_init(HashMap::new));
}
let sidecar = std::fs::read(&sidecar_file)?;
let sidecar: toml::Value = match toml::from_slice(&sidecar) {
Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar {
toml::Value::Table(t) => t
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
.collect(),
_ => HashMap::new(),
};
return Ok(self.output.get_or_init(|| output));
}
}
impl Extractor<FileItem> for SidecarExtractor<'_> {
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for SidecarExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name).await?),
None => Ok(Some(&PileValue::Null)),
}
}
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.fields().await?),
None => Ok(Vec::new()),
}
}
}

View File

@@ -0,0 +1,66 @@
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{Item, PileValue, extract::Extractor};
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
match value {
toml::Value::String(s) => PileValue::String(s.into()),
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
toml::Value::Table(_) => PileValue::Null,
}
}
/// An [`Extractor`] that exposes the top-level keys of a TOML document
/// stored in an [`Item`].
///
/// The item is read and parsed lazily, the first time a field is requested.
pub struct TomlExtractor<'a> {
/// The item whose contents are parsed as TOML.
item: &'a Item,
/// Parsed top-level fields, populated on first access by `get_inner`.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> TomlExtractor<'a> {
    /// Create an extractor over `item`. Nothing is read until a field is requested.
    pub fn new(item: &'a Item) -> Self {
        let output = OnceLock::new();
        Self { item, output }
    }

    /// Return the parsed field map, reading and parsing the item on first use.
    ///
    /// Read errors are propagated. A document that fails to parse, or whose
    /// root is not a table, yields an empty map. Keys that are not valid
    /// [`Label`]s are skipped.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let raw = self.item.read().await?.read_to_end().await?;
        let parsed: Result<toml::Value, _> = toml::from_slice(&raw);

        let fields: HashMap<Label, PileValue<'a>> = match parsed {
            Ok(toml::Value::Table(table)) => table
                .into_iter()
                .filter_map(|(key, value)| {
                    let label = Label::new(&key)?;
                    Some((label, toml_to_pile(value)))
                })
                .collect(),
            // Invalid TOML and non-table roots both produce no fields.
            Ok(_) | Err(_) => HashMap::new(),
        };

        Ok(self.output.get_or_init(|| fields))
    }
}
#[async_trait::async_trait]
impl Extractor for TomlExtractor<'_> {
/// Look up a single top-level TOML key, parsing the item on first use.
/// Returns `Ok(None)` when the key is not present in the parsed map.
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
/// List every top-level key that was successfully converted to a [`Label`].
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -1,6 +1,9 @@
use itertools::Itertools;
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
use std::{path::PathBuf, rc::Rc, sync::LazyLock};
use std::{
path::PathBuf,
sync::{Arc, LazyLock},
};
use tantivy::{
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
collector::Collector,
@@ -9,7 +12,7 @@ use tantivy::{
};
use tracing::{debug, trace, warn};
use crate::{Item, Key, PileValue, extract::MetaExtractor};
use crate::{Item, PileValue, extract::MetaExtractor};
#[derive(Debug, Clone)]
pub struct FtsLookupResult {
@@ -63,37 +66,21 @@ impl DbFtsIndex {
//
/// Turn an entry into a tantivy document
pub fn entry_to_document<K: Key, I: Item<Key = K>>(
pub async fn entry_to_document(
&self,
item: &I,
item: &Item,
) -> Result<Option<TantivyDocument>, TantivyError> {
let mut doc = TantivyDocument::default();
let key = match item.key().to_string() {
Some(x) => x,
None => {
warn!(
message = "Item key cannot be converted to a string, skipping",
key = ?item.key(),
);
return Ok(None);
}
};
let key = item.key();
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
doc.add_text(self.schema.get_field("_meta_key")?, key);
let item = match item.as_file() {
Some(x) => x,
None => return Ok(None),
};
let extractor = MetaExtractor::new(item);
let extractor = PileValue::Extractor(Rc::new(extractor));
let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item)));
let mut empty = true;
for name in self.fts_cfg().fields.keys() {
let x = self.get_field(&extractor, name)?;
let x = self.get_field(&extractor, name).await?;
let val = match x {
Some(x) => x,
@@ -115,9 +102,9 @@ impl DbFtsIndex {
// MARK: read
//
pub fn get_field<I: Item>(
pub async fn get_field(
&self,
extractor: &PileValue<'_, I>,
extractor: &PileValue<'_>,
field_name: &Label,
) -> Result<Option<String>, std::io::Error> {
let field = match self.cfg.schema.get(field_name) {
@@ -130,7 +117,7 @@ impl DbFtsIndex {
// Try paths in order, using the first value we find
'outer: for path in field.path.as_slice() {
let val = match extractor.query(path)? {
let val = match extractor.query(path).await? {
Some(x) => x,
None => return Ok(None),
};
@@ -292,10 +279,7 @@ impl DbFtsIndex {
}
}
pub fn apply<'a, I: Item>(
post: &FieldSpecPost,
val: &PileValue<'a, I>,
) -> Option<PileValue<'a, I>> {
pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<'a>> {
Some(match post {
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
FieldSpecPost::NotEmpty { notempty: true } => match val {

View File

@@ -1,178 +1,222 @@
use pile_config::Label;
use std::{fmt::Debug, path::PathBuf, rc::Rc};
use smartstring::{LazyCompact, SmartString};
use std::{fs::File, io::Seek, path::PathBuf, sync::Arc};
use crate::{
PileValue,
extract::{Extractor, SidecarExtractor},
};
//
// MARK: key
//
pub trait Key: Debug + Clone + Send + Sync + 'static {
/// Convert this key to a string, returning `None`
/// if we encounter any kind of error.
fn to_string(&self) -> Option<String>;
fn from_string(str: &str) -> Option<Self>;
}
impl Key for PathBuf {
fn from_string(str: &str) -> Option<Self> {
str.parse().ok()
}
fn to_string(&self) -> Option<String> {
self.to_str().map(|x| x.to_owned())
}
}
use crate::source::{DirDataSource, S3DataSource};
//
// MARK: item
//
/// A pointer to raw data
pub trait Item: Debug + Send + Sync + 'static + Sized {
type Key: Key;
#[derive(Debug, Clone)]
pub enum Item {
File {
source: Arc<DirDataSource>,
fn source_name(&self) -> &str;
fn key(&self) -> &Self::Key;
path: PathBuf,
sidecar: Option<Box<Item>>,
},
/// Get this item's sidecar metadata
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error>;
S3 {
source: Arc<S3DataSource>,
/// Set this file's sidecar metadata,
/// overwriting any existing file.
fn write_sidecar(
&self,
path: Vec<Label>,
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error>;
fn hash(&self) -> Result<blake3::Hash, std::io::Error>;
/// Item conversion, downcast to specific type.
/// Returns `None` if this is not a [FileItem]
fn as_file(&self) -> Option<&FileItem>;
key: SmartString<LazyCompact>,
sidecar: Option<Box<Item>>,
},
}
#[derive(Clone, Debug)]
pub struct FileItem {
/// Path to this file.
/// Must be relative to source root dir.
pub path: PathBuf,
pub source_name: Label,
impl Item {
/// Open the item for reading. For S3, performs a HEAD request to determine
/// the object size.
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
Ok(match self {
Self::File { path, .. } => ItemReader::File(File::open(path)?),
/// If true, look for a sidecar file
pub sidecar: bool,
Self::S3 { source, key, .. } => {
let head = source
.client
.head_object()
.bucket(source.bucket.as_str())
.key(key.as_str())
.send()
.await
.map_err(std::io::Error::other)?;
let size = head.content_length().unwrap_or(0) as u64;
ItemReader::S3(S3Reader {
client: source.client.clone(),
bucket: source.bucket.clone(),
key: key.to_owned(),
cursor: 0,
size,
})
}
})
}
/// Return the name of the data source this item belongs to.
pub fn source_name(&self) -> &pile_config::Label {
match self {
Self::File { source, .. } => &source.name,
Self::S3 { source, .. } => &source.name,
}
}
/// Return this item's key as a string.
///
/// For files this is the path rendered as UTF-8; for S3 objects it is a
/// clone of the object key.
///
/// # Panics
/// Panics with "path is not utf-8" if a file path is not valid UTF-8.
#[expect(clippy::expect_used)]
pub fn key(&self) -> SmartString<LazyCompact> {
match self {
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
Self::S3 { key, .. } => key.clone(),
}
}
pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
match self {
Self::File { path, .. } => {
let mut hasher = blake3::Hasher::new();
let mut file = std::fs::File::open(path)?;
std::io::copy(&mut file, &mut hasher)?;
return Ok(hasher.finalize());
}
Self::S3 { .. } => todo!(),
}
}
/// Borrow the item that holds this item's sidecar metadata, if it has one.
pub fn sidecar(&self) -> Option<&Self> {
    // Both variants carry the same `Option<Box<Item>>` field.
    let (Self::File { sidecar, .. } | Self::S3 { sidecar, .. }) = self;
    sidecar.as_deref()
}
}
impl Item for FileItem {
type Key = PathBuf;
/// An open handle on an item's bytes, backed either by a local file
/// or by ranged requests against an S3 object.
pub enum ItemReader {
/// A local file opened with [`File::open`].
File(File),
/// A cursor over an S3 object.
S3(S3Reader),
}
fn source_name(&self) -> &str {
&self.source_name
}
fn key(&self) -> &Self::Key {
&self.path
}
fn as_file(&self) -> Option<&FileItem> {
Some(self)
}
fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
let mut hasher = blake3::Hasher::new();
let mut file = std::fs::File::open(&self.path)?;
std::io::copy(&mut file, &mut hasher)?;
return Ok(hasher.finalize());
}
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error> {
if !self.sidecar {
return Ok(None);
impl ItemReader {
/// Read a chunk of bytes.
pub async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
match self {
Self::File(x) => std::io::Read::read(x, buf),
Self::S3(x) => x.read(buf).await,
}
// TODO: use a generic tomlextractor instead?
// you'll need a fake _ref_ to the toml file, though.
return Ok(Some(Rc::new(SidecarExtractor::new(self))));
}
fn write_sidecar(
&self,
path: Vec<Label>,
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error> {
if !self.sidecar {
return Ok(());
}
let sidecar_path = self.path.with_extension("toml");
let mut doc: toml_edit::DocumentMut = if sidecar_path.is_file() {
let content = std::fs::read_to_string(&sidecar_path)?;
content.parse().unwrap_or_default()
} else {
toml_edit::DocumentMut::new()
};
fn to_edit_item(v: toml::Value) -> toml_edit::Item {
match v {
toml::Value::String(s) => toml_edit::value(s),
toml::Value::Integer(i) => toml_edit::value(i),
toml::Value::Float(f) => toml_edit::value(f),
toml::Value::Boolean(b) => toml_edit::value(b),
toml::Value::Datetime(d) => toml_edit::value(d.to_string()),
toml::Value::Array(arr) => {
let mut array = toml_edit::Array::new();
for item in arr {
if let toml_edit::Item::Value(v) = to_edit_item(item) {
array.push_formatted(v);
}
/// Read all remaining bytes into a `Vec`.
pub async fn read_to_end(mut self) -> std::io::Result<Vec<u8>> {
match self {
Self::File(mut f) => {
let mut buf = Vec::new();
std::io::Read::read_to_end(&mut f, &mut buf)?;
Ok(buf)
}
Self::S3(ref mut r) => {
let mut buf = Vec::new();
let mut chunk = vec![0u8; 65536];
loop {
let n = r.read(&mut chunk).await?;
if n == 0 {
break;
}
toml_edit::Item::Value(toml_edit::Value::Array(array))
buf.extend_from_slice(&chunk[..n]);
}
toml::Value::Table(t) => {
let mut table = toml_edit::Table::new();
for (k, v) in t {
table.insert(&k, to_edit_item(v));
Ok(buf)
}
}
}
/// Move the read position, delegating to the underlying file or S3 cursor.
/// Returns the new position as reported by the underlying reader.
pub fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
match self {
Self::File(x) => x.seek(pos),
Self::S3(x) => x.seek(pos),
}
}
}
//
// MARK: S3Reader
//
/// A seekable cursor over a single S3 object.
/// Each `read` issues a ranged `GetObject` request starting at `cursor`.
pub struct S3Reader {
/// Client used to issue the ranged requests.
client: Arc<aws_sdk_s3::Client>,
/// Bucket containing the object.
bucket: SmartString<LazyCompact>,
/// Key of the object inside `bucket`.
key: SmartString<LazyCompact>,
/// Byte offset at which the next read starts.
cursor: u64,
/// Total object size in bytes, as reported when the reader was opened.
size: u64,
}
impl S3Reader {
/// Fill `buf` with bytes starting at the cursor, using one ranged `GetObject` call.
///
/// Returns the number of bytes copied and advances the cursor by that amount.
/// Returns `Ok(0)` when `buf` is empty or the cursor is at (or past) the end.
async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
    let remaining = self.size.saturating_sub(self.cursor);
    if buf.is_empty() || remaining == 0 {
        return Ok(0);
    }

    // HTTP byte ranges are inclusive on both ends, hence the `- 1`.
    let wanted = remaining.min(buf.len() as u64);
    let first = self.cursor;
    let last = first + wanted - 1;
    let range = format!("bytes={first}-{last}");

    let response = self
        .client
        .get_object()
        .bucket(self.bucket.as_str())
        .key(self.key.as_str())
        .range(range)
        .send()
        .await
        .map_err(std::io::Error::other)?;

    let payload = response
        .body
        .collect()
        .await
        .map_err(std::io::Error::other)?
        .into_bytes();

    // Never copy more than the caller's buffer can hold.
    let copied = buf.len().min(payload.len());
    buf[..copied].copy_from_slice(&payload[..copied]);
    self.cursor += copied as u64;

    Ok(copied)
}
fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
match pos {
std::io::SeekFrom::Start(x) => self.cursor = x.min(self.size),
std::io::SeekFrom::Current(x) => {
if x < 0 {
let abs = x.unsigned_abs();
if abs > self.cursor {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"cannot seek past start",
));
}
toml_edit::Item::Table(table)
self.cursor -= abs;
} else {
self.cursor += x as u64;
}
}
std::io::SeekFrom::End(x) => {
if x < 0 {
let abs = x.unsigned_abs();
if abs > self.size {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"cannot seek past start",
));
}
self.cursor = self.size - abs;
} else {
self.cursor = self.size + x as u64;
}
}
}
let json_value = value.to_json()?;
let toml_value: toml::Value = serde_json::from_value(json_value)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
let item = to_edit_item(toml_value);
let Some((path_last, path_init)) = path.split_last() else {
return Ok(());
};
let mut table = doc.as_table_mut();
for label in path_init {
let key = label.as_str();
if !table.contains_key(key) {
table.insert(key, toml_edit::Item::Table(toml_edit::Table::new()));
}
table = table
.get_mut(key)
.and_then(|item| item.as_table_mut())
.ok_or_else(|| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
"path element is not a table",
)
})?;
}
table.insert(path_last.as_str(), item);
std::fs::write(&sidecar_path, doc.to_string())?;
Ok(())
self.cursor = self.cursor.min(self.size);
Ok(self.cursor)
}
}

View File

@@ -5,7 +5,7 @@ mod misc;
pub use misc::*;
mod dataset;
pub use dataset::*;
pub use dataset::{Dataset, DatasetError, Datasets};
mod item;
pub use item::*;

View File

@@ -1,35 +1,36 @@
use chrono::{DateTime, Utc};
use itertools::Itertools;
use pile_config::Label;
use std::path::PathBuf;
use std::{path::PathBuf, sync::Arc};
use tokio_stream::wrappers::ReceiverStream;
use walkdir::WalkDir;
use crate::{DataSource, item::FileItem, path_ts_latest};
use crate::{DataSource, Item, path_ts_latest};
#[derive(Debug)]
pub struct DirDataSource {
pub name: Label,
pub dirs: Vec<PathBuf>,
pub dir: PathBuf,
pub sidecars: bool,
}
impl DirDataSource {
pub fn new(name: &Label, dirs: Vec<PathBuf>, sidecars: bool) -> Self {
pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
Self {
name: name.clone(),
dirs,
dir,
sidecars,
}
}
}
impl DataSource for DirDataSource {
type Key = PathBuf;
type Item = FileItem;
type Error = std::io::Error;
impl DataSource for Arc<DirDataSource> {
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
let key = match key.parse::<PathBuf>() {
Ok(x) => self.dir.join(x),
Err(_) => return Ok(None),
};
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
if !key.is_file() {
return Ok(None);
}
@@ -39,64 +40,84 @@ impl DataSource for DirDataSource {
return Ok(None);
}
return Ok(Some(FileItem {
source_name: self.name.clone(),
path: key.to_owned(),
sidecar: self.sidecars,
return Ok(Some(Item::File {
source: Arc::clone(self),
path: key.clone(),
sidecar: self.sidecars.then(|| {
Box::new(Item::File {
source: Arc::clone(self),
path: key.with_extension("toml"),
sidecar: None,
})
}),
}));
}
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
return self
.dirs
.iter()
.flat_map(|x| WalkDir::new(x).into_iter().map_ok(move |d| (x, d)))
.filter_ok(|(_, entry)| !entry.file_type().is_dir())
.filter_map(|x| match x {
Err(err) => {
let msg = format!("other walkdir error: {err:?}");
Some(Err(err
.into_io_error()
.unwrap_or(std::io::Error::other(msg))))
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
let (tx, rx) = tokio::sync::mpsc::channel(64);
let source = Arc::clone(self);
let dir = self.dir.clone();
tokio::task::spawn_blocking(move || {
for entry in WalkDir::new(dir) {
let entry = match entry {
Err(e) => {
let msg = format!("walkdir error: {e:?}");
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
if tx.blocking_send(Err(err)).is_err() {
return;
}
continue;
}
Ok(e) => e,
};
if entry.file_type().is_dir() {
continue;
}
Ok((_, entry)) => {
let path = entry.into_path();
let path = entry.into_path();
let item = match path.extension().and_then(|x| x.to_str()) {
None => return None,
let item = match path.extension().and_then(|x| x.to_str()) {
None => continue,
Some("toml") if source.sidecars => continue,
Some(_) => Item::File {
source: Arc::clone(&source),
path: path.clone(),
// Ignore toml if sidecars are enabled
Some("toml") if self.sidecars => return None,
sidecar: source.sidecars.then(|| {
Box::new(Item::File {
source: Arc::clone(&source),
path: path.with_extension("toml"),
sidecar: None,
})
}),
},
};
Some(_) => FileItem {
source_name: self.name.clone(),
path: path.clone(),
sidecar: self.sidecars,
},
};
Some(Ok((path, item)))
if tx.blocking_send(Ok(item)).is_err() {
return;
}
});
}
});
ReceiverStream::new(rx)
}
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error> {
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let mut ts: Option<DateTime<Utc>> = None;
for path in &self.dirs {
if !path.exists() {
continue;
}
let new = path_ts_latest(path)?;
match (ts, new) {
(_, None) => continue,
(None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)),
};
if !self.dir.exists() {
return Ok(None);
}
let new = path_ts_latest(&self.dir)?;
match (ts, new) {
(_, None) => {}
(None, Some(new)) => ts = Some(new),
(Some(old), Some(new)) => ts = Some(old.max(new)),
};
return Ok(ts);
}
}

View File

@@ -1,2 +1,5 @@
mod dir;
pub use dir::*;
mod s3;
pub use s3::*;

View File

@@ -0,0 +1,206 @@
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
use chrono::{DateTime, Utc};
use pile_config::{Label, S3Credentials};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use tokio_stream::wrappers::ReceiverStream;
use crate::{DataSource, Item};
/// A data source backed by a bucket in an S3-compatible object store.
#[derive(Debug)]
pub struct S3DataSource {
/// Name of this source.
pub name: Label,
/// Bucket to list and read objects from.
pub bucket: SmartString<LazyCompact>,
/// Optional key prefix; when set, listings are restricted to it.
pub prefix: Option<SmartString<LazyCompact>>,
/// If true, keys ending in `.toml` are not returned as items.
pub sidecars: bool,
/// Shared S3 client, also handed to readers created for this source's items.
pub client: Arc<aws_sdk_s3::Client>,
}
impl S3DataSource {
pub fn new(
name: &Label,
bucket: String,
prefix: Option<String>,
endpoint: Option<String>,
region: String,
credentials: &S3Credentials,
sidecars: bool,
) -> Result<Self, std::io::Error> {
let client = {
let creds = Credentials::new(
&credentials.access_key_id,
&credentials.secret_access_key,
None,
None,
"pile",
);
let mut s3_config = aws_sdk_s3::config::Builder::new()
.behavior_version(BehaviorVersion::latest())
.region(Region::new(region))
.credentials_provider(creds);
if let Some(ep) = endpoint {
s3_config = s3_config.endpoint_url(ep).force_path_style(true);
}
aws_sdk_s3::Client::from_conf(s3_config.build())
};
Ok(Self {
name: name.clone(),
bucket: bucket.into(),
prefix: prefix.map(|x| x.into()),
sidecars,
client: Arc::new(client),
})
}
fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
Item::S3 {
source: Arc::clone(self),
key: key.into(),
sidecar: None, // TODO: add sidecars
}
}
}
impl DataSource for Arc<S3DataSource> {
/// Look up a single object by key.
///
/// Returns `Ok(None)` when the object does not exist, or when sidecars are
/// enabled and `key` ends in `.toml`. Any other SDK failure is returned as an
/// [`std::io::Error`].
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
    let is_sidecar = self.sidecars && key.ends_with(".toml");
    if is_sidecar {
        return Ok(None);
    }

    // A HEAD request is enough to learn whether the object exists.
    let head = self
        .client
        .head_object()
        .bucket(self.bucket.as_str())
        .key(key)
        .send()
        .await;

    let failure = match head {
        Ok(_) => return Ok(Some(self.make_item(key))),
        Err(failure) => failure,
    };

    let missing = failure
        .as_service_error()
        .is_some_and(|service| service.is_not_found());

    if missing {
        Ok(None)
    } else {
        Err(std::io::Error::other(failure))
    }
}
/// Stream every object in the bucket (under `prefix`, if set) as an [`Item`].
///
/// Listing runs in a spawned task that pages through `ListObjectsV2` and
/// pushes results into a bounded channel. When sidecars are enabled, keys
/// ending in `.toml` are skipped. A listing failure is sent as a single `Err`
/// and ends the stream. The task also stops as soon as the receiver is dropped.
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
    let (tx, rx) = tokio::sync::mpsc::channel(64);
    let source = Arc::clone(self);

    tokio::spawn(async move {
        let mut continuation_token: Option<String> = None;

        loop {
            let mut req = source
                .client
                .list_objects_v2()
                .bucket(source.bucket.as_str());

            if let Some(prefix) = &source.prefix {
                req = req.prefix(prefix.as_str());
            }

            if let Some(token) = continuation_token.take() {
                req = req.continuation_token(token);
            }

            let resp = match req.send().await {
                Ok(resp) => resp,
                Err(e) => {
                    // The receiver may already be gone; nothing more to do then.
                    let _ = tx.send(Err(std::io::Error::other(e))).await;
                    return;
                }
            };

            for obj in resp.contents() {
                let Some(key) = obj.key() else {
                    continue;
                };

                if source.sidecars && key.ends_with(".toml") {
                    continue;
                }

                // Build items the same way `get` does.
                if tx.send(Ok(source.make_item(key))).await.is_err() {
                    return;
                }
            }

            // Only continue when the page is truncated AND we were given a
            // token; a truncated page without a token would otherwise restart
            // the listing from the beginning forever.
            match resp.next_continuation_token() {
                Some(token) if resp.is_truncated().unwrap_or(false) => {
                    continuation_token = Some(token.to_owned());
                }
                _ => break,
            }
        }
    });

    ReceiverStream::new(rx)
}
/// Return the most recent `last_modified` timestamp among all objects in the
/// bucket (under `prefix`, if set), or `Ok(None)` if no object reports one.
///
/// # Errors
/// A failed listing request is returned as an [`std::io::Error`], matching
/// `get` and `iter`. Reporting it as "no data" would hide outages and throw
/// away timestamps gathered from earlier pages.
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
    let mut latest: Option<DateTime<Utc>> = None;
    let mut continuation_token: Option<String> = None;

    loop {
        let mut req = self.client.list_objects_v2().bucket(self.bucket.as_str());

        if let Some(prefix) = &self.prefix {
            req = req.prefix(prefix.as_str());
        }

        if let Some(token) = continuation_token.take() {
            req = req.continuation_token(token);
        }

        let resp = req.send().await.map_err(std::io::Error::other)?;

        // Newest timestamp on this page; unrepresentable timestamps are ignored.
        let page_latest = resp
            .contents()
            .iter()
            .filter_map(|obj| obj.last_modified())
            .filter_map(|stamp| DateTime::from_timestamp(stamp.secs(), stamp.subsec_nanos()))
            .max();

        // `None` orders below every `Some`, so this keeps the running maximum.
        latest = latest.max(page_latest);

        // Stop unless the page is truncated AND a token was supplied; otherwise
        // a missing token would restart the listing from the beginning forever.
        match resp.next_continuation_token() {
            Some(token) if resp.is_truncated().unwrap_or(false) => {
                continuation_token = Some(token.to_owned());
            }
            _ => break,
        }
    }

    Ok(latest)
}
}

View File

@@ -0,0 +1,158 @@
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
use mime::Mime;
use std::io::{Error as IoError, Seek, SeekFrom, Write};
use thiserror::Error;
use super::S3Client;
use crate::retry;
/// Errors produced while reading from an [`S3Reader`].
#[derive(Debug, Error)]
#[expect(clippy::large_enum_variant)]
pub enum S3ReaderError {
/// The `GetObject` request itself failed.
#[error("sdk error")]
SdkError(#[from] SdkError<GetObjectError>),
/// The response body could not be collected.
#[error("byte stream error")]
ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
/// A local I/O operation failed (for example, writing downloaded bytes).
#[error("i/o error")]
IoError(#[from] IoError),
}
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
///
/// Also implements [`std::io::Seek`]
pub struct S3Reader {
/// Client wrapper used for requests; its `retries` setting is passed to `retry!`.
pub(super) client: S3Client,
/// Bucket containing the object.
pub(super) bucket: String,
/// Key of the object inside `bucket`.
pub(super) key: String,
/// Byte offset at which the next read starts.
pub(super) cursor: u64,
/// Total size of the object in bytes.
pub(super) size: u64,
/// MIME type returned by `mime()`; presumably the object's content type (set by the constructor, not visible here).
pub(super) mime: Mime,
}
impl S3Reader {
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
let len_left = self.size - self.cursor;
if len_left == 0 || buf.is_empty() {
return Ok(0);
}
#[expect(clippy::unwrap_used)] // TODO: probably fits?
let start_byte = usize::try_from(self.cursor).unwrap();
#[expect(clippy::unwrap_used)] // usize fits in u64
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
#[expect(clippy::unwrap_used)] // must fit, we called min()
let len_to_read = usize::try_from(len_to_read).unwrap();
let end_byte = start_byte + len_to_read - 1;
let b = retry!(
self.client.retries,
self.client
.client
.get_object()
.bucket(self.bucket.as_str())
.key(self.key.as_str())
.range(format!("bytes={start_byte}-{end_byte}"))
.send()
.await
)?;
// Looks like `bytes 31000000-31999999/33921176``
// println!("{:?}", b.content_range);
let mut bytes = b.body.collect().await?.into_bytes();
bytes.truncate(len_to_read);
let l = bytes.len();
// Memory to memory writes are infallible
#[expect(clippy::unwrap_used)]
buf.write_all(&bytes).unwrap();
// Cannot fail, usize should always fit into u64
#[expect(clippy::unwrap_used)]
{
self.cursor += u64::try_from(l).unwrap();
}
return Ok(len_to_read);
}
pub fn is_done(&self) -> bool {
return self.cursor == self.size;
}
pub fn mime(&self) -> &Mime {
&self.mime
}
/// Write the entire contents of this reader to `r`.
///
/// This method always downloads the whole object,
/// and always preserves `self.cursor`.
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
let pos = self.stream_position()?;
const BUF_LEN: usize = 10_000_000;
#[expect(clippy::unwrap_used)] // Cannot fail
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
while !self.is_done() {
let b = self.read(&mut buf[..]).await?;
r.write_all(&buf[0..b])?;
}
self.seek(SeekFrom::Start(pos))?;
Ok(())
}
}
impl Seek for S3Reader {
    /// Move the cursor and return its new position.
    ///
    /// Positions past the end are clamped to the object size, so
    /// `SeekFrom::End(0)` returns the size and a following read returns 0.
    ///
    /// # Errors
    /// Returns `InvalidInput` when the target would lie before byte 0.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        /// Apply a signed offset to `base`: `None` if the result would be
        /// negative, saturating at `u64::MAX` on the positive side.
        fn offset_from(base: u64, delta: i64) -> Option<u64> {
            // `unsigned_abs` is well-defined even for `i64::MIN`.
            let magnitude = delta.unsigned_abs();
            if delta < 0 {
                base.checked_sub(magnitude)
            } else {
                Some(base.saturating_add(magnitude))
            }
        }

        let target = match pos {
            SeekFrom::Start(offset) => Some(offset),
            SeekFrom::Current(delta) => offset_from(self.cursor, delta),
            SeekFrom::End(delta) => offset_from(self.size, delta),
        };

        let Some(target) = target else {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "cannot seek past start",
            ));
        };

        // Clamp to `size`, not `size - 1`: the end position must be reachable,
        // and `size - 1` underflows for empty objects.
        self.cursor = target.min(self.size);
        Ok(self.cursor)
    }
}

View File

@@ -1,23 +1,18 @@
use chrono::{DateTime, Utc};
use std::error::Error;
use tokio_stream::wrappers::ReceiverStream;
use crate::{Item, Key};
use crate::Item;
/// A read-only set of [Item]s.
pub trait DataSource {
/// The type used to retrieve items from this source
/// (e.g, a PathBuf or a primary key)
type Key: Key;
type Item: Item<Key = Self::Key>;
type Error: Error + Sync + Send;
/// Get an item from this datasource
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
fn get(&self, key: &str) -> impl Future<Output = Result<Option<Item>, std::io::Error>> + Send;
/// Iterate over all items in this source in an arbitrary order
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>>;
/// Return the time of the latest change to the data in this source
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>;
fn latest_change(
&self,
) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
}

View File

@@ -1,26 +1,25 @@
use std::rc::Rc;
use pile_config::objectpath::{ObjectPath, PathSegment};
use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::{Item, extract::Extractor};
use crate::extract::Extractor;
/// An immutable, lazily-computed value similar to [serde_json::Value].
pub enum PileValue<'a, I: crate::Item> {
pub enum PileValue<'a> {
Null,
/// A string
String(SmartString<LazyCompact>),
/// An array of values
Array(Vec<PileValue<'a, I>>),
Array(Vec<PileValue<'a>>),
/// A lazily-computed map of {label: value}
Extractor(Rc<dyn Extractor<I> + 'a>),
Extractor(Arc<dyn Extractor + 'a>),
}
impl<I: Item> Clone for PileValue<'_, I> {
impl Clone for PileValue<'_> {
fn clone(&self) -> Self {
match self {
Self::Null => Self::Null,
@@ -31,8 +30,8 @@ impl<I: Item> Clone for PileValue<'_, I> {
}
}
impl<'a, I: Item> PileValue<'a, I> {
pub fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
impl<'a> PileValue<'a> {
pub async fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
let mut out = Some(self);
for s in &query.segments {
@@ -44,7 +43,7 @@ impl<'a, I: Item> PileValue<'a, I> {
Some(Self::Null) => None,
Some(Self::Array(_)) => None,
Some(Self::String(_)) => None,
Some(Self::Extractor(e)) => e.field(field)?,
Some(Self::Extractor(e)) => e.field(field).await?,
}
}
@@ -78,30 +77,29 @@ impl<'a, I: Item> PileValue<'a, I> {
}
}
pub fn to_json(&self) -> Result<Value, std::io::Error> {
pub async fn to_json(&self) -> Result<Value, std::io::Error> {
Ok(match self {
Self::Null => Value::Null,
Self::String(x) => Value::String(x.to_string()),
Self::Array(x) => Value::Array(
x.iter()
.map(|x| x.to_json())
.collect::<Result<Vec<_>, _>>()?,
),
Self::Array(x) => {
let mut arr = Vec::new();
for item in x {
arr.push(Box::pin(item.to_json()).await?);
}
Value::Array(arr)
}
Self::Extractor(e) => {
let keys = e.fields()?;
let map = keys
.iter()
.map(|k| {
#[expect(clippy::expect_used)]
let v = e.field(k)?.expect("key must be valid");
let v = v.to_json()?;
Ok((k.to_string(), v))
})
.collect::<Result<Map<String, Value>, std::io::Error>>()?;
let keys = e.fields().await?;
let mut map = Map::new();
for k in &keys {
let v = match e.field(k).await? {
Some(x) => x,
None => continue,
};
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
}
Value::Object(map)
}
})

View File

@@ -15,6 +15,7 @@ pile-config = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
clap = { workspace = true }
#clap_complete = { workspace = true }
serde = { workspace = true }

View File

@@ -3,9 +3,10 @@ use clap::Args;
use pile_config::{Label, Source};
use pile_dataset::index::DbFtsIndex;
use pile_dataset::source::DirDataSource;
use pile_dataset::{DataSource, Dataset, FileItem, Item, PileValue, extract::MetaExtractor};
use pile_dataset::{DataSource, Datasets, Item, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{path::PathBuf, rc::Rc};
use std::{path::PathBuf, sync::Arc};
use tokio_stream::StreamExt;
use tracing::{info, warn};
use crate::{CliCmd, GlobalContext};
@@ -43,7 +44,7 @@ impl CliCmd for AnnotateCommand {
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
let dest_path = Self::parse_dest(&self.dest)?;
let ds = Dataset::open(&self.config)
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
if !ds.config.schema.contains_key(&field) {
@@ -51,7 +52,7 @@ impl CliCmd for AnnotateCommand {
}
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
let mut count = 0u64;
let count = 0u64;
for (name, source) in &ds.config.dataset.source {
match source {
@@ -61,31 +62,40 @@ impl CliCmd for AnnotateCommand {
continue;
}
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
let source = Arc::new(DirDataSource::new(name, path.clone(), *sidecars));
for res in source.iter() {
let (_key, item) =
res.with_context(|| format!("while reading source {name}"))?;
let mut stream = source.iter();
while let Some(res) = stream.next().await {
let item = res.with_context(|| format!("while reading source {name}"))?;
let Item::File { path, .. } = &item else {
continue;
};
let meta = MetaExtractor::new(&item);
let extractor = PileValue::<FileItem>::Extractor(Rc::new(meta));
let extractor = PileValue::Extractor(Arc::new(meta));
let Some(value) =
index.get_field(&extractor, &field).with_context(|| {
format!("while extracting field from {}", item.path.display())
index.get_field(&extractor, &field).await.with_context(|| {
format!("while extracting field from {}", path.display())
})?
else {
continue;
};
item.write_sidecar(dest_path.clone(), PileValue::String(value.into()))
.with_context(|| {
format!("while writing sidecar for {}", item.path.display())
})?;
// TODO: implement sidecar writing
let _ = (&dest_path, &value);
todo!("write_sidecar not yet implemented");
count += 1;
#[expect(unreachable_code)]
{
count += 1;
}
}
}
Source::S3 { .. } => {
warn!("Source {name} is an S3 source; sidecar annotation is not yet supported");
}
}
}

View File

@@ -1,7 +1,7 @@
use anyhow::{Context, Result, anyhow};
use clap::Args;
use pile_config::ConfigToml;
use pile_dataset::Dataset;
use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf};
use tracing::{debug, error, info, warn};
@@ -43,11 +43,11 @@ impl CliCmd for CheckCommand {
}
}
let ds = Dataset::open(&self.config)
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let ts_fts = ds.ts_fts().context("while determining fts age")?;
let ts_data = ds.ts_data().context("while determining data age")?;
let ts_data = ds.ts_data().await.context("while determining data age")?;
match (ts_fts, ts_data) {
(None, Some(_)) => warn!("Could not determine fts age"),

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_dataset::Dataset;
use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf};
@@ -23,10 +23,10 @@ impl CliCmd for IndexCommand {
_ctx: GlobalContext,
flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Dataset::open(&self.config)
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| {
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
x.map_err(|x| {
anyhow::Error::from(x).context(format!(
"while refreshing fts for {}",

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_dataset::Dataset;
use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf};
use tracing::info;
@@ -39,12 +39,12 @@ impl CliCmd for LookupCommand {
_ctx: GlobalContext,
flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Dataset::open(&self.config)
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
if self.refresh && ds.needs_fts().context("while checking dataset fts")? {
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
info!("FTS index is missing or out-of-date, regenerating");
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| {
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
x.map_err(|x| {
anyhow::Error::from(x).context(format!(
"while refreshing fts for {}",

View File

@@ -1,16 +1,23 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_config::Label;
use pile_dataset::{FileItem, PileValue, extract::MetaExtractor};
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{fmt::Debug, path::PathBuf, rc::Rc};
use std::{path::PathBuf, sync::Arc};
use crate::{CliCmd, GlobalContext};
// Command-line arguments for the probe command.
//
// NOTE(review): this listing interleaves removed and added diff lines.
// `file` is only read by the `FileItem`-based lines of `run`, while the
// `Datasets`-based lines read `source`, `key` and `config` -- confirm which
// fields the committed struct actually keeps.
//
// Plain `//` comments are used for these notes on purpose: the `///` lines
// below are left untouched because clap derive presumably surfaces them as
// help text -- TODO confirm.
#[derive(Debug, Args)]
pub struct ProbeCommand {
// Used as the `FileItem` path and in the "while extracting" error context.
/// The file to probe
file: PathBuf,
// Converted with `Label::new`; a `None` result is reported as
// "invalid source name".
/// Source name (as defined in pile.toml)
source: String,
// Looked up via `ds.get(&source, &self.key)`; a missing item is reported
// as "not found in source".
/// Item key within the source
key: String,
// Opened with `Datasets::open`; the attribute below declares a long flag,
// the short flag `-c`, and the default "./pile.toml".
/// Path to dataset config
#[arg(long, short = 'c', default_value = "./pile.toml")]
config: PathBuf,
}
impl CliCmd for ProbeCommand {
@@ -21,19 +28,23 @@ impl CliCmd for ProbeCommand {
_ctx: GlobalContext,
_flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let item = FileItem {
path: self.file.clone(),
source_name: Label::new("probe-source").unwrap(),
sidecar: true,
};
let source = Label::new(&self.source)
.ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
let value = PileValue::Extractor(Rc::new(MetaExtractor::new(&item)));
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let item = ds.get(&source, &self.key).await.ok_or_else(|| {
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
})?;
let value = PileValue::Extractor(Arc::new(MetaExtractor::new(&item)));
let json = value
.to_json()
.with_context(|| format!("while extracting {}", self.file.display()))?;
.await
.with_context(|| format!("while extracting {}", self.key))?;
let json = serde_json::to_string_pretty(&json).unwrap();
println!("{json}");
return Ok(0);
}