Extractor refactor, S3 support
This commit is contained in:
1502
Cargo.lock
generated
1502
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -73,6 +73,10 @@ tantivy = "0.25.0"
|
||||
|
||||
# Async & Parallelism
|
||||
tokio = { version = "1.49.0", features = ["full"] }
|
||||
tokio-stream = "0.1"
|
||||
async-trait = "0.1"
|
||||
aws-sdk-s3 = "1"
|
||||
aws-config = "1"
|
||||
|
||||
# CLI & logging
|
||||
tracing = "0.1.44"
|
||||
|
||||
@@ -9,8 +9,7 @@ name = "dataset"
|
||||
# working_dir = ".pile"
|
||||
|
||||
# Data sources available in this dataset
|
||||
source."music" = { type = "flac", path = ["music", "music-2"] }
|
||||
|
||||
source."music" = { type = "filesystem", path = "music" }
|
||||
|
||||
# This dataset's schema.
|
||||
# Defines normalized fields that are extracted from source entries on-demand.
|
||||
|
||||
@@ -46,16 +46,21 @@ pub struct DatasetConfig {
|
||||
pub post: Vec<FieldSpecPost>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct S3Credentials {
|
||||
pub access_key_id: String,
|
||||
pub secret_access_key: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Source {
|
||||
/// A directory files
|
||||
/// A directory of files
|
||||
Filesystem {
|
||||
/// The directories to scan.
|
||||
/// Must be relative.
|
||||
#[serde(alias = "paths")]
|
||||
path: OneOrMany<PathBuf>,
|
||||
path: PathBuf,
|
||||
|
||||
/// If true, all toml files are ignored.
|
||||
/// Metadata can be added to any file using a {filename}.toml.
|
||||
@@ -65,6 +70,23 @@ pub enum Source {
|
||||
#[serde(default = "default_true")]
|
||||
sidecars: bool,
|
||||
},
|
||||
|
||||
/// An S3-compatible object store bucket
|
||||
S3 {
|
||||
bucket: String,
|
||||
prefix: Option<String>,
|
||||
|
||||
/// Custom endpoint URL (for MinIO, etc.)
|
||||
endpoint: Option<String>,
|
||||
|
||||
region: String,
|
||||
|
||||
credentials: S3Credentials,
|
||||
|
||||
/// If true, all .toml objects are treated as sidecar metadata files.
|
||||
#[serde(default = "default_true")]
|
||||
sidecars: bool,
|
||||
},
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -20,9 +20,11 @@ tracing = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
toml = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
rayon = { workspace = true }
|
||||
smartstring = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
toml_edit = { workspace = true }
|
||||
pdf = { workspace = true }
|
||||
id3 = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tokio-stream = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
aws-sdk-s3 = { workspace = true }
|
||||
|
||||
@@ -1,30 +1,17 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{ConfigToml, Label, Source};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use rayon::{
|
||||
ThreadPoolBuilder,
|
||||
iter::{IntoParallelIterator, ParallelIterator},
|
||||
};
|
||||
use std::{
|
||||
io::ErrorKind,
|
||||
path::PathBuf,
|
||||
sync::{
|
||||
Arc,
|
||||
atomic::{AtomicU64, Ordering},
|
||||
mpsc::Receiver,
|
||||
},
|
||||
thread::JoinHandle,
|
||||
time::Instant,
|
||||
};
|
||||
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
||||
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
||||
use thiserror::Error;
|
||||
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
DataSource, FileItem,
|
||||
DataSource, Item,
|
||||
index::{DbFtsIndex, FtsLookupResult},
|
||||
path_ts_earliest,
|
||||
source::DirDataSource,
|
||||
source::{DirDataSource, S3DataSource},
|
||||
};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -39,15 +26,54 @@ pub enum DatasetError {
|
||||
NoFtsIndex,
|
||||
}
|
||||
|
||||
pub struct Dataset {
|
||||
//
|
||||
// MARK: Dataset enum
|
||||
//
|
||||
|
||||
/// An opened data source — either a local filesystem directory or an S3 bucket.
///
/// Each variant holds its source behind an [`Arc`].
|
||||
pub enum Dataset {
|
||||
/// A source backed by a [`DirDataSource`].
Dir(Arc<DirDataSource>),
|
||||
/// A source backed by an [`S3DataSource`].
S3(Arc<S3DataSource>),
|
||||
}
|
||||
|
||||
impl Dataset {
|
||||
pub async fn get(&self, key: &str) -> Option<Item> {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
|
||||
Self::S3(ds) => ds.get(key).await.ok().flatten(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.iter(),
|
||||
Self::S3(ds) => ds.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.latest_change().await,
|
||||
Self::S3(ds) => ds.latest_change().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: Datasets collection
|
||||
//
|
||||
|
||||
/// An opened dataset: config, working directory, and all opened sources.
|
||||
pub struct Datasets {
|
||||
/// Path of the configuration file, as passed to `open`.
pub path_config: PathBuf,
|
||||
/// Base path derived from `path_config`; relative filesystem source
/// paths are joined onto it.
/// NOTE(review): presumably the directory containing the config file — confirm in `open`.
pub path_parent: PathBuf,
|
||||
/// Working directory for this dataset. Its last component is the dataset
/// name, and the temporary fts index (`.tmp-fts`) is created inside it.
pub path_workdir: PathBuf,
|
||||

||||
/// The dataset configuration.
pub config: ConfigToml,
|
||||
/// Opened sources, keyed by their label in the config.
/// S3 sources that fail to open are logged with a warning and left out.
pub sources: HashMap<Label, Dataset>,
|
||||
}
|
||||
|
||||
impl Dataset {
|
||||
impl Datasets {
|
||||
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
|
||||
let path_config = config.into();
|
||||
let path_parent = path_config
|
||||
@@ -84,11 +110,54 @@ impl Dataset {
|
||||
.unwrap_or(path_parent.join(".pile"))
|
||||
.join(config.dataset.name.as_str());
|
||||
|
||||
let mut sources = HashMap::new();
|
||||
for (label, source) in &config.dataset.source {
|
||||
match source {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
sources.insert(
|
||||
label.clone(),
|
||||
Dataset::Dir(Arc::new(DirDataSource::new(
|
||||
label,
|
||||
path_parent.join(path),
|
||||
*sidecars,
|
||||
))),
|
||||
);
|
||||
}
|
||||
|
||||
Source::S3 {
|
||||
bucket,
|
||||
prefix,
|
||||
endpoint,
|
||||
region,
|
||||
credentials,
|
||||
sidecars,
|
||||
} => {
|
||||
match S3DataSource::new(
|
||||
label,
|
||||
bucket.clone(),
|
||||
prefix.clone(),
|
||||
endpoint.clone(),
|
||||
region.clone(),
|
||||
credentials,
|
||||
*sidecars,
|
||||
) {
|
||||
Ok(ds) => {
|
||||
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Could not open S3 source {label}: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(Self {
|
||||
path_config,
|
||||
path_parent,
|
||||
path_workdir,
|
||||
config,
|
||||
sources,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -96,15 +165,8 @@ impl Dataset {
|
||||
// MARK: get
|
||||
//
|
||||
|
||||
pub fn get(&self, source: &Label, key: &PathBuf) -> Option<FileItem> {
|
||||
let s = self.config.dataset.source.get(source)?;
|
||||
let s = match s {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
DirDataSource::new(source, path.clone().to_vec(), *sidecars)
|
||||
}
|
||||
};
|
||||
|
||||
s.get(key).ok().flatten()
|
||||
pub async fn get(&self, source: &Label, key: &str) -> Option<Item> {
|
||||
self.sources.get(source)?.get(key).await
|
||||
}
|
||||
|
||||
//
|
||||
@@ -112,9 +174,9 @@ impl Dataset {
|
||||
//
|
||||
|
||||
/// Refresh this dataset's fts index.
|
||||
pub fn fts_refresh(
|
||||
pub async fn fts_refresh(
|
||||
&self,
|
||||
threads: usize,
|
||||
_threads: usize,
|
||||
flag: Option<CancelFlag>,
|
||||
) -> Result<(), CancelableTaskError<DatasetError>> {
|
||||
let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
|
||||
@@ -134,58 +196,40 @@ impl Dataset {
|
||||
let mut index_writer: IndexWriter =
|
||||
index.writer(50 * 1024 * 1024).map_err(DatasetError::from)?;
|
||||
|
||||
let batch_size = 1000;
|
||||
let (_read_task, read_rx) = start_read_task(&self.config, batch_size);
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let write_pool = ThreadPoolBuilder::new()
|
||||
.num_threads(threads.max(1))
|
||||
.thread_name(|x| format!("fts_refresh_thread_{x}"))
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let mut total = 0u64;
|
||||
while let Ok(batch) = read_rx.recv() {
|
||||
let batch = batch?;
|
||||
if let Some(flag) = &flag
|
||||
&& flag.is_cancelled()
|
||||
{
|
||||
return Err(CancelableTaskError::Cancelled);
|
||||
let mut logged_at = Instant::now();
|
||||
|
||||
for (name, dataset) in &self.sources {
|
||||
info!("Loading source {name}");
|
||||
|
||||
let mut stream = dataset.iter();
|
||||
while let Some(item_result) = stream.next().await {
|
||||
if let Some(flag) = &flag
|
||||
&& flag.is_cancelled()
|
||||
{
|
||||
return Err(CancelableTaskError::Cancelled);
|
||||
}
|
||||
|
||||
let item = item_result.map_err(DatasetError::from)?;
|
||||
let key = item.key();
|
||||
|
||||
match db_index.entry_to_document(&item).await {
|
||||
Ok(Some(doc)) => {
|
||||
index_writer.add_document(doc).map_err(DatasetError::from)?;
|
||||
total += 1;
|
||||
if logged_at.elapsed().as_secs() >= 5 {
|
||||
debug!("Indexed {total} documents so far");
|
||||
logged_at = Instant::now();
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
warn!("Skipping {key:?}, document is empty");
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Could not read {key:?}, skipping. {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let this = AtomicU64::new(0);
|
||||
let start = Instant::now();
|
||||
write_pool
|
||||
.install(|| {
|
||||
batch
|
||||
.into_par_iter()
|
||||
.filter_map(|(key, item)| match db_index.entry_to_document(&item) {
|
||||
Ok(Some(doc)) => Some((key, doc)),
|
||||
Ok(None) => {
|
||||
warn!("Skipping {key:?}, document is empty");
|
||||
None
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Could not read {key:?}, skipping. {err}");
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(|(key, doc)| {
|
||||
this.fetch_add(1, Ordering::Relaxed);
|
||||
index_writer
|
||||
.add_document(doc)
|
||||
.map_err(|err| (key, err))
|
||||
.map(|_| ())
|
||||
})
|
||||
.find_first(|x| x.is_err())
|
||||
.unwrap_or(Ok(()))
|
||||
})
|
||||
.map_err(|(_key, err)| DatasetError::from(err))?;
|
||||
|
||||
let this = this.load(Ordering::Relaxed);
|
||||
total += this;
|
||||
let time_ms = start.elapsed().as_millis();
|
||||
debug!("Added a batch of {this} in {time_ms} ms ({total} total)");
|
||||
}
|
||||
|
||||
if let Some(flag) = flag.as_ref()
|
||||
@@ -194,7 +238,7 @@ impl Dataset {
|
||||
return Err(CancelableTaskError::Cancelled);
|
||||
}
|
||||
|
||||
info!("Committing index");
|
||||
info!("Committing {total} documents");
|
||||
index_writer.commit().map_err(DatasetError::from)?;
|
||||
|
||||
if fts_dir.is_dir() {
|
||||
@@ -247,19 +291,14 @@ impl Dataset {
|
||||
}
|
||||
|
||||
/// Time at which data was last modified
|
||||
pub fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
pub async fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
let mut ts: Option<DateTime<Utc>> = None;
|
||||
|
||||
for (label, source) in &self.config.dataset.source {
|
||||
match source {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
let s = DirDataSource::new(label, path.clone().to_vec(), *sidecars);
|
||||
match (ts, s.latest_change()?) {
|
||||
(_, None) => continue,
|
||||
(None, Some(new)) => ts = Some(new),
|
||||
(Some(old), Some(new)) => ts = Some(old.max(new)),
|
||||
};
|
||||
}
|
||||
for dataset in self.sources.values() {
|
||||
match (ts, dataset.latest_change().await?) {
|
||||
(_, None) => continue,
|
||||
(None, Some(new)) => ts = Some(new),
|
||||
(Some(old), Some(new)) => ts = Some(old.max(new)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,10 +307,10 @@ impl Dataset {
|
||||
|
||||
/// Returns true if we do not have an fts index,
|
||||
/// or if our fts index is older than our data.
|
||||
pub fn needs_fts(&self) -> Result<bool, std::io::Error> {
|
||||
pub async fn needs_fts(&self) -> Result<bool, std::io::Error> {
|
||||
let start = Instant::now();
|
||||
let ts_fts = self.ts_fts()?;
|
||||
let ts_data = self.ts_data()?;
|
||||
let ts_data = self.ts_data().await?;
|
||||
|
||||
let result = match (ts_fts, ts_data) {
|
||||
(None, Some(_)) => true,
|
||||
@@ -292,59 +331,3 @@ impl Dataset {
|
||||
return Ok(result);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: read_task
|
||||
//
|
||||
|
||||
/// Spawn a background thread that reads every source in `config` and sends
/// the items it finds to the caller in batches.
///
/// - `config`: cloned into the thread; its `dataset.source` map is iterated.
/// - `batch_size`: number of items accumulated before a batch is sent.
///
/// Returns the thread's join handle and the receiving end of the channel.
/// Each message is either a batch of items or a single [`DatasetError`];
/// after sending an error the thread stops. The thread also stops as soon
/// as a send fails (the receiver was dropped).
fn start_read_task(
|
||||
config: &ConfigToml,
|
||||
batch_size: usize,
|
||||
) -> (
|
||||
JoinHandle<()>,
|
||||
Receiver<Result<Vec<(PathBuf, FileItem)>, DatasetError>>,
|
||||
) {
|
||||
let config = config.clone();
|
||||
// Bounded channel: at most two batches are buffered before `send` blocks.
let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2);
|
||||

||||
let read_task = std::thread::spawn(move || {
|
||||
let mut batch = Vec::with_capacity(batch_size);
|
||||
for (name, source) in &config.dataset.source {
|
||||
info!("Loading source {name}");
|
||||

||||
match source {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
|
||||
for i in source.iter() {
|
||||
match i {
|
||||
Ok(x) => batch.push(x),
|
||||
Err(err) => {
|
||||
// Forward the error and stop reading. A failed send is
// ignored because the thread exits either way.
let err = Err(DatasetError::from(err));
|
||||
let _ = read_tx.send(err);
|
||||
return;
|
||||
}
|
||||
}
|
||||

||||
if batch.len() >= batch_size {
|
||||
// Hand off the full batch and start a fresh one with the same capacity.
let b = std::mem::replace(&mut batch, Vec::with_capacity(batch_size));
|
||||

||||
match read_tx.send(Ok(b)) {
|
||||
Ok(()) => {}
|
||||
// Receiver is gone; nothing left to do.
Err(_) => return,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||

||||
// Send whatever is left over after the last source.
if !batch.is_empty() {
|
||||
match read_tx.send(Ok(batch)) {
|
||||
Ok(()) => {}
|
||||
Err(_) => return,
|
||||
};
|
||||
}
|
||||
});
|
||||

||||
return (read_task, read_rx);
|
||||
}
|
||||
|
||||
@@ -1,34 +1,40 @@
|
||||
use pile_config::Label;
|
||||
use pile_flac::{FlacBlock, FlacReader};
|
||||
use std::{collections::HashMap, fs::File, io::BufReader, sync::OnceLock};
|
||||
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct FlacExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> FlacExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
// If this isn't a flac file, ignore it.
|
||||
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") {
|
||||
return Ok(self.output.get_or_init(|| HashMap::new()));
|
||||
let key = match self.item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
|
||||
Item::S3 { key, .. } => key.to_string(),
|
||||
};
|
||||
|
||||
if !key.ends_with(".flac") {
|
||||
let _ = self.output.set(HashMap::new());
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
|
||||
let file = File::open(&self.item.path)?;
|
||||
let reader = FlacReader::new(BufReader::new(file));
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let reader = FlacReader::new(BufReader::new(std::io::Cursor::new(bytes)));
|
||||
|
||||
let mut output: HashMap<Label, Vec<_>> = HashMap::new();
|
||||
for block in reader {
|
||||
@@ -53,19 +59,22 @@ impl<'a> FlacExtractor<'a> {
|
||||
.map(|(k, v)| (k, PileValue::Array(v)))
|
||||
.collect();
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for FlacExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for FlacExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,50 +1,48 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, path::Component, sync::OnceLock};
|
||||
|
||||
use crate::{FileItem, Key, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct FsExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> FsExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let Item::File { path, .. } = self.item else {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([
|
||||
(
|
||||
Label::new("extension").unwrap(),
|
||||
self.item
|
||||
.path
|
||||
.extension()
|
||||
path.extension()
|
||||
.and_then(|x| x.to_str())
|
||||
.map(|x| PileValue::String(x.into()))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
(
|
||||
Label::new("path").unwrap(),
|
||||
self.item
|
||||
.path
|
||||
.to_string()
|
||||
path.to_str()
|
||||
.map(|x| PileValue::String(x.into()))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
(
|
||||
Label::new("segments").unwrap(),
|
||||
self.item
|
||||
.path
|
||||
.components()
|
||||
path.components()
|
||||
.map(|x| match x {
|
||||
Component::CurDir => Some(".".to_owned()),
|
||||
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
|
||||
@@ -63,15 +61,16 @@ impl<'a> FsExtractor<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for FsExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for FsExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,38 +1,50 @@
|
||||
use id3::Tag;
|
||||
use pile_config::Label;
|
||||
use std::{borrow::Cow, collections::HashMap, sync::OnceLock};
|
||||
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct Id3Extractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> Id3Extractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let ext = self.item.path.extension().and_then(|x| x.to_str());
|
||||
let key = match self.item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
|
||||
Item::S3 { key, .. } => key.to_string(),
|
||||
};
|
||||
|
||||
let ext = key.rsplit('.').next();
|
||||
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
let _ = self.output.set(HashMap::new());
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
|
||||
let tag = match Tag::read_from_path(&self.item.path) {
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let tag = match Tag::read_from2(BufReader::new(std::io::Cursor::new(bytes))) {
|
||||
Ok(tag) => tag,
|
||||
Err(id3::Error {
|
||||
kind: id3::ErrorKind::NoTag,
|
||||
..
|
||||
}) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
}) => {
|
||||
let _ = self.output.set(HashMap::new());
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
Err(id3::Error {
|
||||
kind: id3::ErrorKind::Io(e),
|
||||
..
|
||||
@@ -40,7 +52,7 @@ impl<'a> Id3Extractor<'a> {
|
||||
Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, Vec<PileValue<'a, FileItem>>> = HashMap::new();
|
||||
let mut output: HashMap<Label, Vec<PileValue<'a>>> = HashMap::new();
|
||||
for frame in tag.frames() {
|
||||
if let Some(text) = frame.content().text() {
|
||||
let name = frame_id_to_field(frame.id());
|
||||
@@ -58,7 +70,9 @@ impl<'a> Id3Extractor<'a> {
|
||||
.map(|(k, v)| (k, PileValue::Array(v)))
|
||||
.collect();
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,6 +80,7 @@ impl<'a> Id3Extractor<'a> {
|
||||
/// Falls back to the lowercased frame ID if no mapping exists.
|
||||
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
|
||||
match id {
|
||||
// spell:off
|
||||
"TIT2" => Cow::Borrowed("title"),
|
||||
"TIT1" => Cow::Borrowed("grouping"),
|
||||
"TIT3" => Cow::Borrowed("subtitle"),
|
||||
@@ -98,18 +113,20 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
|
||||
"MVNM" => Cow::Borrowed("movement"),
|
||||
"MVIN" => Cow::Borrowed("movementnumber"),
|
||||
_ => Cow::Owned(id.to_lowercase()),
|
||||
// spell:on
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for Id3Extractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for Id3Extractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,22 @@
|
||||
use pile_config::Label;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
use crate::{PileValue, extract::Extractor};
|
||||
|
||||
pub struct MapExtractor<'a, I: Item> {
|
||||
pub(crate) inner: HashMap<Label, PileValue<'a, I>>,
|
||||
pub struct MapExtractor<'a> {
|
||||
pub(crate) inner: HashMap<Label, PileValue<'a>>,
|
||||
}
|
||||
|
||||
impl<I: Item> Extractor<I> for MapExtractor<'_, I> {
|
||||
fn field<'a>(&'a self, name: &Label) -> Result<Option<&'a PileValue<'a, I>>, std::io::Error> {
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for MapExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.inner.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.inner.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, rc::Rc};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
mod flac;
|
||||
pub use flac::*;
|
||||
@@ -13,59 +13,73 @@ pub use fs::*;
|
||||
mod pdf;
|
||||
pub use pdf::*;
|
||||
|
||||
mod sidecar;
|
||||
pub use sidecar::*;
|
||||
mod toml;
|
||||
pub use toml::*;
|
||||
|
||||
mod map;
|
||||
pub use map::*;
|
||||
|
||||
mod sidecar;
|
||||
pub use sidecar::*;
|
||||
|
||||
use crate::Item;
|
||||
|
||||
/// An attachment that extracts metadata from an [Item].
|
||||
///
|
||||
/// Metadata is exposed as an immutable map of {label: value},
|
||||
/// much like a json object.
|
||||
pub trait Extractor<I: crate::Item> {
|
||||
#[async_trait::async_trait]
|
||||
pub trait Extractor: Send + Sync {
|
||||
/// Get the field at `name` from `item`.
|
||||
/// - returns `None` if `name` is not a valid field
|
||||
/// - returns `Some(Null)` if `name` is not available
|
||||
fn field<'a>(
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a crate::PileValue<'a, I>>, std::io::Error>;
|
||||
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
|
||||
|
||||
/// Return all fields in this extractor.
|
||||
/// `Self::field` must return [Some] for all these keys
|
||||
/// and [None] for all others.
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
|
||||
}
|
||||
|
||||
pub struct MetaExtractor<'a, I: crate::Item> {
|
||||
inner: MapExtractor<'a, I>,
|
||||
pub struct MetaExtractor<'a> {
|
||||
inner: MapExtractor<'a>,
|
||||
}
|
||||
|
||||
impl<'a> MetaExtractor<'a, crate::FileItem> {
|
||||
//
|
||||
// MARK: file
|
||||
//
|
||||
|
||||
impl<'a> MetaExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a crate::FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("flac").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(FlacExtractor::new(item))),
|
||||
crate::PileValue::Extractor(Arc::new(FlacExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("id3").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(Id3Extractor::new(item))),
|
||||
crate::PileValue::Extractor(Arc::new(Id3Extractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("fs").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))),
|
||||
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("pdf").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))),
|
||||
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("toml").unwrap(),
|
||||
crate::PileValue::Extractor(Arc::new(TomlExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("sidecar").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))),
|
||||
crate::PileValue::Extractor(Arc::new(SidecarExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
@@ -74,16 +88,17 @@ impl<'a> MetaExtractor<'a, crate::FileItem> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for MetaExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a crate::PileValue<'a, crate::FileItem>>, std::io::Error> {
|
||||
self.inner.field(name)
|
||||
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error> {
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
return Ok(vec![
|
||||
Label::new("flac").unwrap(),
|
||||
Label::new("id3").unwrap(),
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, rc::Rc};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
mod pdf_meta;
|
||||
pub use pdf_meta::*;
|
||||
@@ -8,26 +8,26 @@ mod pdf_text;
|
||||
pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
FileItem, PileValue,
|
||||
Item, PileValue,
|
||||
extract::{Extractor, MapExtractor},
|
||||
};
|
||||
|
||||
pub struct PdfExtractor<'a> {
|
||||
inner: MapExtractor<'a, FileItem>,
|
||||
inner: MapExtractor<'a>,
|
||||
}
|
||||
|
||||
impl<'a> PdfExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))),
|
||||
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))),
|
||||
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
@@ -36,23 +36,25 @@ impl<'a> PdfExtractor<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for PdfExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
if name.as_str() == "text" {
|
||||
match self.inner.inner.get(name).unwrap() {
|
||||
PileValue::Extractor(x) => return x.field(name),
|
||||
PileValue::Extractor(x) => return x.field(name).await,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
self.inner.field(name)
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
|
||||
@@ -1,40 +1,44 @@
|
||||
use pdf::file::FileOptions;
|
||||
use pdf::primitive::{Date, TimeRel};
|
||||
use pdf::primitive::{Date, PdfString, TimeRel};
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct PdfMetaExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfMetaExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
|
||||
return Ok(self.output.get_or_init(|| HashMap::new()));
|
||||
}
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let file = FileOptions::cached()
|
||||
.open(&self.item.path)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
|
||||
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||
|
||||
if let Some(info) = &file.trailer.info_dict {
|
||||
let fields: &[(&str, Option<&_>)] = &[
|
||||
let fields: &[(&str, Option<&PdfString>)] = &[
|
||||
("title", info.title.as_ref()),
|
||||
("author", info.author.as_ref()),
|
||||
("subject", info.subject.as_ref()),
|
||||
@@ -88,15 +92,16 @@ fn format_date(d: &Date) -> String {
|
||||
)
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for PdfMetaExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,34 +2,38 @@ use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct PdfTextExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfTextExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
|
||||
return Ok(self.output.get_or_init(|| HashMap::new()));
|
||||
}
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let file = FileOptions::cached()
|
||||
.open(&self.item.path)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
@@ -65,19 +69,22 @@ impl<'a> PdfTextExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfTextExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for PdfTextExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,71 +1,47 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use crate::{FileItem, Item, PileValue, extract::Extractor};
|
||||
|
||||
fn toml_to_pile<I: Item>(value: toml::Value) -> PileValue<'static, I> {
|
||||
match value {
|
||||
toml::Value::String(s) => PileValue::String(s.into()),
|
||||
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
|
||||
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
|
||||
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
|
||||
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
|
||||
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
|
||||
toml::Value::Table(_) => PileValue::Null,
|
||||
}
|
||||
}
|
||||
use crate::{
|
||||
Item, PileValue,
|
||||
extract::{Extractor, TomlExtractor},
|
||||
};
|
||||
|
||||
pub struct SidecarExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
item: &'a Item,
|
||||
output: OnceLock<Option<TomlExtractor<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> SidecarExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let sidecar_file = self.item.path.with_extension("toml");
|
||||
|
||||
if !(sidecar_file.is_file() && self.item.sidecar) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let sidecar = std::fs::read(&sidecar_file)?;
|
||||
let sidecar: toml::Value = match toml::from_slice(&sidecar) {
|
||||
Ok(x) => x,
|
||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar {
|
||||
toml::Value::Table(t) => t
|
||||
.into_iter()
|
||||
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
|
||||
.collect(),
|
||||
_ => HashMap::new(),
|
||||
};
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for SidecarExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for SidecarExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
match self
|
||||
.output
|
||||
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||
{
|
||||
Some(x) => Ok(x.field(name).await?),
|
||||
None => Ok(Some(&PileValue::Null)),
|
||||
}
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
match self
|
||||
.output
|
||||
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||
{
|
||||
Some(x) => Ok(x.fields().await?),
|
||||
None => Ok(Vec::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
66
crates/pile-dataset/src/extract/toml.rs
Normal file
66
crates/pile-dataset/src/extract/toml.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
|
||||
match value {
|
||||
toml::Value::String(s) => PileValue::String(s.into()),
|
||||
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
|
||||
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
|
||||
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
|
||||
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
|
||||
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
|
||||
toml::Value::Table(_) => PileValue::Null,
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TomlExtractor<'a> {
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> TomlExtractor<'a> {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let toml: toml::Value = match toml::from_slice(&bytes) {
|
||||
Ok(x) => x,
|
||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let output: HashMap<Label, PileValue<'_>> = match toml {
|
||||
toml::Value::Table(t) => t
|
||||
.into_iter()
|
||||
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
|
||||
.collect(),
|
||||
_ => HashMap::new(),
|
||||
};
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for TomlExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,9 @@
|
||||
use itertools::Itertools;
|
||||
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
|
||||
use std::{path::PathBuf, rc::Rc, sync::LazyLock};
|
||||
use std::{
|
||||
path::PathBuf,
|
||||
sync::{Arc, LazyLock},
|
||||
};
|
||||
use tantivy::{
|
||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||
collector::Collector,
|
||||
@@ -9,7 +12,7 @@ use tantivy::{
|
||||
};
|
||||
use tracing::{debug, trace, warn};
|
||||
|
||||
use crate::{Item, Key, PileValue, extract::MetaExtractor};
|
||||
use crate::{Item, PileValue, extract::MetaExtractor};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FtsLookupResult {
|
||||
@@ -63,37 +66,21 @@ impl DbFtsIndex {
|
||||
//
|
||||
|
||||
/// Turn an entry into a tantivy document
|
||||
pub fn entry_to_document<K: Key, I: Item<Key = K>>(
|
||||
pub async fn entry_to_document(
|
||||
&self,
|
||||
item: &I,
|
||||
item: &Item,
|
||||
) -> Result<Option<TantivyDocument>, TantivyError> {
|
||||
let mut doc = TantivyDocument::default();
|
||||
|
||||
let key = match item.key().to_string() {
|
||||
Some(x) => x,
|
||||
None => {
|
||||
warn!(
|
||||
message = "Item key cannot be converted to a string, skipping",
|
||||
key = ?item.key(),
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
let key = item.key();
|
||||
|
||||
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
|
||||
doc.add_text(self.schema.get_field("_meta_key")?, key);
|
||||
|
||||
let item = match item.as_file() {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let extractor = MetaExtractor::new(item);
|
||||
let extractor = PileValue::Extractor(Rc::new(extractor));
|
||||
let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item)));
|
||||
|
||||
let mut empty = true;
|
||||
for name in self.fts_cfg().fields.keys() {
|
||||
let x = self.get_field(&extractor, name)?;
|
||||
let x = self.get_field(&extractor, name).await?;
|
||||
|
||||
let val = match x {
|
||||
Some(x) => x,
|
||||
@@ -115,9 +102,9 @@ impl DbFtsIndex {
|
||||
// MARK: read
|
||||
//
|
||||
|
||||
pub fn get_field<I: Item>(
|
||||
pub async fn get_field(
|
||||
&self,
|
||||
extractor: &PileValue<'_, I>,
|
||||
extractor: &PileValue<'_>,
|
||||
field_name: &Label,
|
||||
) -> Result<Option<String>, std::io::Error> {
|
||||
let field = match self.cfg.schema.get(field_name) {
|
||||
@@ -130,7 +117,7 @@ impl DbFtsIndex {
|
||||
|
||||
// Try paths in order, using the first value we find
|
||||
'outer: for path in field.path.as_slice() {
|
||||
let val = match extractor.query(path)? {
|
||||
let val = match extractor.query(path).await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
@@ -292,10 +279,7 @@ impl DbFtsIndex {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply<'a, I: Item>(
|
||||
post: &FieldSpecPost,
|
||||
val: &PileValue<'a, I>,
|
||||
) -> Option<PileValue<'a, I>> {
|
||||
pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<'a>> {
|
||||
Some(match post {
|
||||
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
|
||||
FieldSpecPost::NotEmpty { notempty: true } => match val {
|
||||
|
||||
@@ -1,178 +1,222 @@
|
||||
use pile_config::Label;
|
||||
use std::{fmt::Debug, path::PathBuf, rc::Rc};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{fs::File, io::Seek, path::PathBuf, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
PileValue,
|
||||
extract::{Extractor, SidecarExtractor},
|
||||
};
|
||||
|
||||
//
|
||||
// MARK: key
|
||||
//
|
||||
|
||||
pub trait Key: Debug + Clone + Send + Sync + 'static {
|
||||
/// Convert this key to a string, returning `None`
|
||||
/// if we encounter any kind of error.
|
||||
fn to_string(&self) -> Option<String>;
|
||||
|
||||
fn from_string(str: &str) -> Option<Self>;
|
||||
}
|
||||
|
||||
impl Key for PathBuf {
|
||||
fn from_string(str: &str) -> Option<Self> {
|
||||
str.parse().ok()
|
||||
}
|
||||
|
||||
fn to_string(&self) -> Option<String> {
|
||||
self.to_str().map(|x| x.to_owned())
|
||||
}
|
||||
}
|
||||
use crate::source::{DirDataSource, S3DataSource};
|
||||
|
||||
//
|
||||
// MARK: item
|
||||
//
|
||||
|
||||
/// A pointer to raw data
|
||||
pub trait Item: Debug + Send + Sync + 'static + Sized {
|
||||
type Key: Key;
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum Item {
|
||||
File {
|
||||
source: Arc<DirDataSource>,
|
||||
|
||||
fn source_name(&self) -> &str;
|
||||
fn key(&self) -> &Self::Key;
|
||||
path: PathBuf,
|
||||
sidecar: Option<Box<Item>>,
|
||||
},
|
||||
|
||||
/// Get this item's sidecar metadata
|
||||
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error>;
|
||||
S3 {
|
||||
source: Arc<S3DataSource>,
|
||||
|
||||
/// Set this file's sidecar metadata,
|
||||
/// overwriting any existing file.
|
||||
fn write_sidecar(
|
||||
&self,
|
||||
path: Vec<Label>,
|
||||
value: PileValue<'_, Self>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
|
||||
fn hash(&self) -> Result<blake3::Hash, std::io::Error>;
|
||||
|
||||
/// Item conversion, downcast to specific type.
|
||||
/// Returns `None` if this is not a [FileItem]
|
||||
fn as_file(&self) -> Option<&FileItem>;
|
||||
key: SmartString<LazyCompact>,
|
||||
sidecar: Option<Box<Item>>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FileItem {
|
||||
/// Path to this file.
|
||||
/// Must be relative to source root dir.
|
||||
pub path: PathBuf,
|
||||
pub source_name: Label,
|
||||
impl Item {
|
||||
/// Open the item for reading. For S3, performs a HEAD request to determine
|
||||
/// the object size.
|
||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||
Ok(match self {
|
||||
Self::File { path, .. } => ItemReader::File(File::open(path)?),
|
||||
|
||||
/// If true, look for a sidecar file
|
||||
pub sidecar: bool,
|
||||
Self::S3 { source, key, .. } => {
|
||||
let head = source
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(source.bucket.as_str())
|
||||
.key(key.as_str())
|
||||
.send()
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let size = head.content_length().unwrap_or(0) as u64;
|
||||
|
||||
ItemReader::S3(S3Reader {
|
||||
client: source.client.clone(),
|
||||
bucket: source.bucket.clone(),
|
||||
key: key.to_owned(),
|
||||
cursor: 0,
|
||||
size,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn source_name(&self) -> &pile_config::Label {
|
||||
match self {
|
||||
Self::File { source, .. } => &source.name,
|
||||
Self::S3 { source, .. } => &source.name,
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
pub fn key(&self) -> SmartString<LazyCompact> {
|
||||
match self {
|
||||
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
|
||||
Self::S3 { key, .. } => key.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||
match self {
|
||||
Self::File { path, .. } => {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
let mut file = std::fs::File::open(path)?;
|
||||
std::io::copy(&mut file, &mut hasher)?;
|
||||
return Ok(hasher.finalize());
|
||||
}
|
||||
|
||||
Self::S3 { .. } => todo!(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sidecar(&self) -> Option<&Self> {
|
||||
match self {
|
||||
Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
|
||||
Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Item for FileItem {
|
||||
type Key = PathBuf;
|
||||
pub enum ItemReader {
|
||||
File(File),
|
||||
S3(S3Reader),
|
||||
}
|
||||
|
||||
fn source_name(&self) -> &str {
|
||||
&self.source_name
|
||||
}
|
||||
|
||||
fn key(&self) -> &Self::Key {
|
||||
&self.path
|
||||
}
|
||||
|
||||
fn as_file(&self) -> Option<&FileItem> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
let mut file = std::fs::File::open(&self.path)?;
|
||||
std::io::copy(&mut file, &mut hasher)?;
|
||||
return Ok(hasher.finalize());
|
||||
}
|
||||
|
||||
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error> {
|
||||
if !self.sidecar {
|
||||
return Ok(None);
|
||||
impl ItemReader {
|
||||
/// Read a chunk of bytes.
|
||||
pub async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
match self {
|
||||
Self::File(x) => std::io::Read::read(x, buf),
|
||||
Self::S3(x) => x.read(buf).await,
|
||||
}
|
||||
|
||||
// TODO: use a generic tomlextractor instead?
|
||||
// you'll need a fake _ref_ to the toml file, though.
|
||||
return Ok(Some(Rc::new(SidecarExtractor::new(self))));
|
||||
}
|
||||
|
||||
fn write_sidecar(
|
||||
&self,
|
||||
path: Vec<Label>,
|
||||
value: PileValue<'_, Self>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if !self.sidecar {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let sidecar_path = self.path.with_extension("toml");
|
||||
|
||||
let mut doc: toml_edit::DocumentMut = if sidecar_path.is_file() {
|
||||
let content = std::fs::read_to_string(&sidecar_path)?;
|
||||
content.parse().unwrap_or_default()
|
||||
} else {
|
||||
toml_edit::DocumentMut::new()
|
||||
};
|
||||
|
||||
fn to_edit_item(v: toml::Value) -> toml_edit::Item {
|
||||
match v {
|
||||
toml::Value::String(s) => toml_edit::value(s),
|
||||
toml::Value::Integer(i) => toml_edit::value(i),
|
||||
toml::Value::Float(f) => toml_edit::value(f),
|
||||
toml::Value::Boolean(b) => toml_edit::value(b),
|
||||
toml::Value::Datetime(d) => toml_edit::value(d.to_string()),
|
||||
toml::Value::Array(arr) => {
|
||||
let mut array = toml_edit::Array::new();
|
||||
for item in arr {
|
||||
if let toml_edit::Item::Value(v) = to_edit_item(item) {
|
||||
array.push_formatted(v);
|
||||
}
|
||||
/// Read all remaining bytes into a `Vec`.
|
||||
pub async fn read_to_end(mut self) -> std::io::Result<Vec<u8>> {
|
||||
match self {
|
||||
Self::File(mut f) => {
|
||||
let mut buf = Vec::new();
|
||||
std::io::Read::read_to_end(&mut f, &mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
Self::S3(ref mut r) => {
|
||||
let mut buf = Vec::new();
|
||||
let mut chunk = vec![0u8; 65536];
|
||||
loop {
|
||||
let n = r.read(&mut chunk).await?;
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
toml_edit::Item::Value(toml_edit::Value::Array(array))
|
||||
buf.extend_from_slice(&chunk[..n]);
|
||||
}
|
||||
toml::Value::Table(t) => {
|
||||
let mut table = toml_edit::Table::new();
|
||||
for (k, v) in t {
|
||||
table.insert(&k, to_edit_item(v));
|
||||
Ok(buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
|
||||
match self {
|
||||
Self::File(x) => x.seek(pos),
|
||||
Self::S3(x) => x.seek(pos),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: S3Reader
|
||||
//
|
||||
|
||||
pub struct S3Reader {
|
||||
client: Arc<aws_sdk_s3::Client>,
|
||||
bucket: SmartString<LazyCompact>,
|
||||
key: SmartString<LazyCompact>,
|
||||
cursor: u64,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
impl S3Reader {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
let len_left = self.size.saturating_sub(self.cursor);
|
||||
if len_left == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let start_byte = self.cursor;
|
||||
let len_to_read = (buf.len() as u64).min(len_left);
|
||||
let end_byte = start_byte + len_to_read - 1;
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(self.key.as_str())
|
||||
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||
.send()
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let bytes = resp
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.map(|x| x.into_bytes())
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let n = bytes.len().min(buf.len());
|
||||
buf[..n].copy_from_slice(&bytes[..n]);
|
||||
self.cursor += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
|
||||
match pos {
|
||||
std::io::SeekFrom::Start(x) => self.cursor = x.min(self.size),
|
||||
|
||||
std::io::SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
toml_edit::Item::Table(table)
|
||||
self.cursor -= abs;
|
||||
} else {
|
||||
self.cursor += x as u64;
|
||||
}
|
||||
}
|
||||
|
||||
std::io::SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor = self.size - abs;
|
||||
} else {
|
||||
self.cursor = self.size + x as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let json_value = value.to_json()?;
|
||||
let toml_value: toml::Value = serde_json::from_value(json_value)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
let item = to_edit_item(toml_value);
|
||||
|
||||
let Some((path_last, path_init)) = path.split_last() else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let mut table = doc.as_table_mut();
|
||||
for label in path_init {
|
||||
let key = label.as_str();
|
||||
if !table.contains_key(key) {
|
||||
table.insert(key, toml_edit::Item::Table(toml_edit::Table::new()));
|
||||
}
|
||||
table = table
|
||||
.get_mut(key)
|
||||
.and_then(|item| item.as_table_mut())
|
||||
.ok_or_else(|| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"path element is not a table",
|
||||
)
|
||||
})?;
|
||||
}
|
||||
table.insert(path_last.as_str(), item);
|
||||
|
||||
std::fs::write(&sidecar_path, doc.to_string())?;
|
||||
|
||||
Ok(())
|
||||
self.cursor = self.cursor.min(self.size);
|
||||
Ok(self.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ mod misc;
|
||||
pub use misc::*;
|
||||
|
||||
mod dataset;
|
||||
pub use dataset::*;
|
||||
pub use dataset::{Dataset, DatasetError, Datasets};
|
||||
|
||||
mod item;
|
||||
pub use item::*;
|
||||
|
||||
@@ -1,35 +1,36 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use itertools::Itertools;
|
||||
use pile_config::Label;
|
||||
use std::path::PathBuf;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::{DataSource, item::FileItem, path_ts_latest};
|
||||
use crate::{DataSource, Item, path_ts_latest};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DirDataSource {
|
||||
pub name: Label,
|
||||
pub dirs: Vec<PathBuf>,
|
||||
pub dir: PathBuf,
|
||||
|
||||
pub sidecars: bool,
|
||||
}
|
||||
|
||||
impl DirDataSource {
|
||||
pub fn new(name: &Label, dirs: Vec<PathBuf>, sidecars: bool) -> Self {
|
||||
pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
|
||||
Self {
|
||||
name: name.clone(),
|
||||
dirs,
|
||||
dir,
|
||||
sidecars,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DataSource for DirDataSource {
|
||||
type Key = PathBuf;
|
||||
type Item = FileItem;
|
||||
type Error = std::io::Error;
|
||||
impl DataSource for Arc<DirDataSource> {
|
||||
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
||||
let key = match key.parse::<PathBuf>() {
|
||||
Ok(x) => self.dir.join(x),
|
||||
Err(_) => return Ok(None),
|
||||
};
|
||||
|
||||
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if !key.is_file() {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -39,64 +40,84 @@ impl DataSource for DirDataSource {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
return Ok(Some(FileItem {
|
||||
source_name: self.name.clone(),
|
||||
path: key.to_owned(),
|
||||
sidecar: self.sidecars,
|
||||
return Ok(Some(Item::File {
|
||||
source: Arc::clone(self),
|
||||
path: key.clone(),
|
||||
sidecar: self.sidecars.then(|| {
|
||||
Box::new(Item::File {
|
||||
source: Arc::clone(self),
|
||||
path: key.with_extension("toml"),
|
||||
sidecar: None,
|
||||
})
|
||||
}),
|
||||
}));
|
||||
}
|
||||
|
||||
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
|
||||
return self
|
||||
.dirs
|
||||
.iter()
|
||||
.flat_map(|x| WalkDir::new(x).into_iter().map_ok(move |d| (x, d)))
|
||||
.filter_ok(|(_, entry)| !entry.file_type().is_dir())
|
||||
.filter_map(|x| match x {
|
||||
Err(err) => {
|
||||
let msg = format!("other walkdir error: {err:?}");
|
||||
Some(Err(err
|
||||
.into_io_error()
|
||||
.unwrap_or(std::io::Error::other(msg))))
|
||||
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(64);
|
||||
let source = Arc::clone(self);
|
||||
|
||||
let dir = self.dir.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
for entry in WalkDir::new(dir) {
|
||||
let entry = match entry {
|
||||
Err(e) => {
|
||||
let msg = format!("walkdir error: {e:?}");
|
||||
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
|
||||
if tx.blocking_send(Err(err)).is_err() {
|
||||
return;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Ok(e) => e,
|
||||
};
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
Ok((_, entry)) => {
|
||||
let path = entry.into_path();
|
||||
let path = entry.into_path();
|
||||
|
||||
let item = match path.extension().and_then(|x| x.to_str()) {
|
||||
None => return None,
|
||||
let item = match path.extension().and_then(|x| x.to_str()) {
|
||||
None => continue,
|
||||
Some("toml") if source.sidecars => continue,
|
||||
Some(_) => Item::File {
|
||||
source: Arc::clone(&source),
|
||||
path: path.clone(),
|
||||
|
||||
// Ignore toml if sidecars are enabled
|
||||
Some("toml") if self.sidecars => return None,
|
||||
sidecar: source.sidecars.then(|| {
|
||||
Box::new(Item::File {
|
||||
source: Arc::clone(&source),
|
||||
path: path.with_extension("toml"),
|
||||
sidecar: None,
|
||||
})
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
Some(_) => FileItem {
|
||||
source_name: self.name.clone(),
|
||||
path: path.clone(),
|
||||
sidecar: self.sidecars,
|
||||
},
|
||||
};
|
||||
|
||||
Some(Ok((path, item)))
|
||||
if tx.blocking_send(Ok(item)).is_err() {
|
||||
return;
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
ReceiverStream::new(rx)
|
||||
}
|
||||
|
||||
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error> {
|
||||
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
let mut ts: Option<DateTime<Utc>> = None;
|
||||
|
||||
for path in &self.dirs {
|
||||
if !path.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let new = path_ts_latest(path)?;
|
||||
match (ts, new) {
|
||||
(_, None) => continue,
|
||||
(None, Some(new)) => ts = Some(new),
|
||||
(Some(old), Some(new)) => ts = Some(old.max(new)),
|
||||
};
|
||||
if !self.dir.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let new = path_ts_latest(&self.dir)?;
|
||||
match (ts, new) {
|
||||
(_, None) => {}
|
||||
(None, Some(new)) => ts = Some(new),
|
||||
(Some(old), Some(new)) => ts = Some(old.max(new)),
|
||||
};
|
||||
|
||||
return Ok(ts);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
mod dir;
|
||||
pub use dir::*;
|
||||
|
||||
mod s3;
|
||||
pub use s3::*;
|
||||
|
||||
206
crates/pile-dataset/src/source/s3.rs
Normal file
206
crates/pile-dataset/src/source/s3.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{Label, S3Credentials};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
use crate::{DataSource, Item};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3DataSource {
|
||||
pub name: Label,
|
||||
pub bucket: SmartString<LazyCompact>,
|
||||
pub prefix: Option<SmartString<LazyCompact>>,
|
||||
pub sidecars: bool,
|
||||
pub client: Arc<aws_sdk_s3::Client>,
|
||||
}
|
||||
|
||||
impl S3DataSource {
|
||||
pub fn new(
|
||||
name: &Label,
|
||||
bucket: String,
|
||||
prefix: Option<String>,
|
||||
endpoint: Option<String>,
|
||||
region: String,
|
||||
credentials: &S3Credentials,
|
||||
sidecars: bool,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let client = {
|
||||
let creds = Credentials::new(
|
||||
&credentials.access_key_id,
|
||||
&credentials.secret_access_key,
|
||||
None,
|
||||
None,
|
||||
"pile",
|
||||
);
|
||||
|
||||
let mut s3_config = aws_sdk_s3::config::Builder::new()
|
||||
.behavior_version(BehaviorVersion::latest())
|
||||
.region(Region::new(region))
|
||||
.credentials_provider(creds);
|
||||
|
||||
if let Some(ep) = endpoint {
|
||||
s3_config = s3_config.endpoint_url(ep).force_path_style(true);
|
||||
}
|
||||
|
||||
aws_sdk_s3::Client::from_conf(s3_config.build())
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
name: name.clone(),
|
||||
bucket: bucket.into(),
|
||||
prefix: prefix.map(|x| x.into()),
|
||||
sidecars,
|
||||
client: Arc::new(client),
|
||||
})
|
||||
}
|
||||
|
||||
fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
|
||||
Item::S3 {
|
||||
source: Arc::clone(self),
|
||||
key: key.into(),
|
||||
sidecar: None, // TODO: add sidecars
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DataSource for Arc<S3DataSource> {
|
||||
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
||||
if self.sidecars && key.ends_with(".toml") {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let result = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(key)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(sdk_err) => {
|
||||
let not_found = sdk_err
|
||||
.as_service_error()
|
||||
.map(|e| e.is_not_found())
|
||||
.unwrap_or(false);
|
||||
if not_found {
|
||||
return Ok(None);
|
||||
}
|
||||
Err(std::io::Error::other(sdk_err))
|
||||
}
|
||||
Ok(_) => Ok(Some(self.make_item(key))),
|
||||
}
|
||||
}
|
||||
|
||||
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(64);
|
||||
let source = Arc::clone(self);
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut continuation_token: Option<String> = None;
|
||||
|
||||
loop {
|
||||
let mut req = source
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(source.bucket.as_str());
|
||||
|
||||
if let Some(prefix) = &source.prefix {
|
||||
req = req.prefix(prefix.as_str());
|
||||
}
|
||||
|
||||
if let Some(token) = continuation_token {
|
||||
req = req.continuation_token(token);
|
||||
}
|
||||
|
||||
let resp = match req.send().await {
|
||||
Err(e) => {
|
||||
let _ = tx.send(Err(std::io::Error::other(e))).await;
|
||||
break;
|
||||
}
|
||||
Ok(resp) => resp,
|
||||
};
|
||||
|
||||
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||
|
||||
for obj in resp.contents() {
|
||||
let key = match obj.key() {
|
||||
Some(k) => k.to_owned(),
|
||||
None => continue,
|
||||
};
|
||||
|
||||
if source.sidecars && key.ends_with(".toml") {
|
||||
continue;
|
||||
}
|
||||
|
||||
let item = Item::S3 {
|
||||
source: Arc::clone(&source),
|
||||
key: key.into(),
|
||||
sidecar: None, // TODO: add sidecars
|
||||
};
|
||||
|
||||
if tx.send(Ok(item)).await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if !is_truncated {
|
||||
break;
|
||||
}
|
||||
continuation_token = next_token;
|
||||
}
|
||||
});
|
||||
|
||||
ReceiverStream::new(rx)
|
||||
}
|
||||
|
||||
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
let mut ts: Option<DateTime<Utc>> = None;
|
||||
let mut continuation_token: Option<String> = None;
|
||||
|
||||
loop {
|
||||
let mut req = self.client.list_objects_v2().bucket(self.bucket.as_str());
|
||||
|
||||
if let Some(prefix) = &self.prefix {
|
||||
req = req.prefix(prefix.as_str());
|
||||
}
|
||||
|
||||
if let Some(token) = continuation_token {
|
||||
req = req.continuation_token(token);
|
||||
}
|
||||
|
||||
let resp = match req.send().await {
|
||||
Err(_) => return Ok(None),
|
||||
Ok(resp) => resp,
|
||||
};
|
||||
|
||||
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||
|
||||
for obj in resp.contents() {
|
||||
if let Some(last_modified) = obj.last_modified() {
|
||||
let dt = DateTime::from_timestamp(
|
||||
last_modified.secs(),
|
||||
last_modified.subsec_nanos(),
|
||||
);
|
||||
if let Some(dt) = dt {
|
||||
ts = Some(match ts {
|
||||
None => dt,
|
||||
Some(prev) => prev.max(dt),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !is_truncated {
|
||||
break;
|
||||
}
|
||||
continuation_token = next_token;
|
||||
}
|
||||
|
||||
Ok(ts)
|
||||
}
|
||||
}
|
||||
158
crates/pile-dataset/src/source/s3reader.rs
Normal file
158
crates/pile-dataset/src/source/s3reader.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
|
||||
use mime::Mime;
|
||||
use std::io::{Error as IoError, Seek, SeekFrom, Write};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::S3Client;
|
||||
use crate::retry;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[expect(clippy::large_enum_variant)]
|
||||
pub enum S3ReaderError {
|
||||
#[error("sdk error")]
|
||||
SdkError(#[from] SdkError<GetObjectError>),
|
||||
|
||||
#[error("byte stream error")]
|
||||
ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
|
||||
|
||||
#[error("i/o error")]
|
||||
IoError(#[from] IoError),
|
||||
}
|
||||
|
||||
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
|
||||
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
|
||||
///
|
||||
/// Also implements [`std::io::Seek`]
|
||||
pub struct S3Reader {
|
||||
pub(super) client: S3Client,
|
||||
pub(super) bucket: String,
|
||||
pub(super) key: String,
|
||||
|
||||
pub(super) cursor: u64,
|
||||
pub(super) size: u64,
|
||||
pub(super) mime: Mime,
|
||||
}
|
||||
|
||||
impl S3Reader {
|
||||
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
|
||||
let len_left = self.size - self.cursor;
|
||||
if len_left == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)] // TODO: probably fits?
|
||||
let start_byte = usize::try_from(self.cursor).unwrap();
|
||||
|
||||
#[expect(clippy::unwrap_used)] // usize fits in u64
|
||||
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
|
||||
|
||||
#[expect(clippy::unwrap_used)] // must fit, we called min()
|
||||
let len_to_read = usize::try_from(len_to_read).unwrap();
|
||||
|
||||
let end_byte = start_byte + len_to_read - 1;
|
||||
|
||||
let b = retry!(
|
||||
self.client.retries,
|
||||
self.client
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(self.key.as_str())
|
||||
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||
.send()
|
||||
.await
|
||||
)?;
|
||||
|
||||
// Looks like `bytes 31000000-31999999/33921176``
|
||||
// println!("{:?}", b.content_range);
|
||||
|
||||
let mut bytes = b.body.collect().await?.into_bytes();
|
||||
bytes.truncate(len_to_read);
|
||||
let l = bytes.len();
|
||||
|
||||
// Memory to memory writes are infallible
|
||||
#[expect(clippy::unwrap_used)]
|
||||
buf.write_all(&bytes).unwrap();
|
||||
|
||||
// Cannot fail, usize should always fit into u64
|
||||
#[expect(clippy::unwrap_used)]
|
||||
{
|
||||
self.cursor += u64::try_from(l).unwrap();
|
||||
}
|
||||
|
||||
return Ok(len_to_read);
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
return self.cursor == self.size;
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
&self.mime
|
||||
}
|
||||
|
||||
/// Write the entire contents of this reader to `r`.
|
||||
///
|
||||
/// This method always downloads the whole object,
|
||||
/// and always preserves `self.cursor`.
|
||||
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
|
||||
let pos = self.stream_position()?;
|
||||
|
||||
const BUF_LEN: usize = 10_000_000;
|
||||
#[expect(clippy::unwrap_used)] // Cannot fail
|
||||
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
|
||||
|
||||
while !self.is_done() {
|
||||
let b = self.read(&mut buf[..]).await?;
|
||||
r.write_all(&buf[0..b])?;
|
||||
}
|
||||
|
||||
self.seek(SeekFrom::Start(pos))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for S3Reader {
|
||||
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.size - 1),
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
self.cursor += u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
// Cannot fail, is abs
|
||||
self.cursor = self.size - u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
// Cannot fail, is positive
|
||||
self.cursor = self.size + u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.size - 1);
|
||||
return Ok(self.cursor);
|
||||
}
|
||||
}
|
||||
@@ -1,23 +1,18 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use std::error::Error;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
use crate::{Item, Key};
|
||||
use crate::Item;
|
||||
|
||||
/// A read-only set of [Item]s.
|
||||
pub trait DataSource {
|
||||
/// The type used to retrieve items from this source
|
||||
/// (e.g, a PathBuf or a primary key)
|
||||
type Key: Key;
|
||||
type Item: Item<Key = Self::Key>;
|
||||
|
||||
type Error: Error + Sync + Send;
|
||||
|
||||
/// Get an item from this datasource
|
||||
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
|
||||
fn get(&self, key: &str) -> impl Future<Output = Result<Option<Item>, std::io::Error>> + Send;
|
||||
|
||||
/// Iterate over all items in this source in an arbitrary order
|
||||
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
|
||||
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>>;
|
||||
|
||||
/// Return the time of the latest change to the data in this source
|
||||
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>;
|
||||
fn latest_change(
|
||||
&self,
|
||||
) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
|
||||
}
|
||||
|
||||
@@ -1,26 +1,25 @@
|
||||
use std::rc::Rc;
|
||||
|
||||
use pile_config::objectpath::{ObjectPath, PathSegment};
|
||||
use serde_json::{Map, Value};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{Item, extract::Extractor};
|
||||
use crate::extract::Extractor;
|
||||
|
||||
/// An immutable, lazily-computed value similar to [serde_json::Value].
|
||||
pub enum PileValue<'a, I: crate::Item> {
|
||||
pub enum PileValue<'a> {
|
||||
Null,
|
||||
|
||||
/// A string
|
||||
String(SmartString<LazyCompact>),
|
||||
|
||||
/// An array of values
|
||||
Array(Vec<PileValue<'a, I>>),
|
||||
Array(Vec<PileValue<'a>>),
|
||||
|
||||
/// A lazily-computed map of {label: value}
|
||||
Extractor(Rc<dyn Extractor<I> + 'a>),
|
||||
Extractor(Arc<dyn Extractor + 'a>),
|
||||
}
|
||||
|
||||
impl<I: Item> Clone for PileValue<'_, I> {
|
||||
impl Clone for PileValue<'_> {
|
||||
fn clone(&self) -> Self {
|
||||
match self {
|
||||
Self::Null => Self::Null,
|
||||
@@ -31,8 +30,8 @@ impl<I: Item> Clone for PileValue<'_, I> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, I: Item> PileValue<'a, I> {
|
||||
pub fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
|
||||
impl<'a> PileValue<'a> {
|
||||
pub async fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
|
||||
let mut out = Some(self);
|
||||
|
||||
for s in &query.segments {
|
||||
@@ -44,7 +43,7 @@ impl<'a, I: Item> PileValue<'a, I> {
|
||||
Some(Self::Null) => None,
|
||||
Some(Self::Array(_)) => None,
|
||||
Some(Self::String(_)) => None,
|
||||
Some(Self::Extractor(e)) => e.field(field)?,
|
||||
Some(Self::Extractor(e)) => e.field(field).await?,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,30 +77,29 @@ impl<'a, I: Item> PileValue<'a, I> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_json(&self) -> Result<Value, std::io::Error> {
|
||||
pub async fn to_json(&self) -> Result<Value, std::io::Error> {
|
||||
Ok(match self {
|
||||
Self::Null => Value::Null,
|
||||
Self::String(x) => Value::String(x.to_string()),
|
||||
|
||||
Self::Array(x) => Value::Array(
|
||||
x.iter()
|
||||
.map(|x| x.to_json())
|
||||
.collect::<Result<Vec<_>, _>>()?,
|
||||
),
|
||||
Self::Array(x) => {
|
||||
let mut arr = Vec::new();
|
||||
for item in x {
|
||||
arr.push(Box::pin(item.to_json()).await?);
|
||||
}
|
||||
Value::Array(arr)
|
||||
}
|
||||
|
||||
Self::Extractor(e) => {
|
||||
let keys = e.fields()?;
|
||||
let map = keys
|
||||
.iter()
|
||||
.map(|k| {
|
||||
#[expect(clippy::expect_used)]
|
||||
let v = e.field(k)?.expect("key must be valid");
|
||||
|
||||
let v = v.to_json()?;
|
||||
Ok((k.to_string(), v))
|
||||
})
|
||||
.collect::<Result<Map<String, Value>, std::io::Error>>()?;
|
||||
|
||||
let keys = e.fields().await?;
|
||||
let mut map = Map::new();
|
||||
for k in &keys {
|
||||
let v = match e.field(k).await? {
|
||||
Some(x) => x,
|
||||
None => continue,
|
||||
};
|
||||
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
|
||||
}
|
||||
Value::Object(map)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -15,6 +15,7 @@ pile-config = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tokio-stream = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
#clap_complete = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
|
||||
@@ -3,9 +3,10 @@ use clap::Args;
|
||||
use pile_config::{Label, Source};
|
||||
use pile_dataset::index::DbFtsIndex;
|
||||
use pile_dataset::source::DirDataSource;
|
||||
use pile_dataset::{DataSource, Dataset, FileItem, Item, PileValue, extract::MetaExtractor};
|
||||
use pile_dataset::{DataSource, Datasets, Item, PileValue, extract::MetaExtractor};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{path::PathBuf, rc::Rc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::{CliCmd, GlobalContext};
|
||||
@@ -43,7 +44,7 @@ impl CliCmd for AnnotateCommand {
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
|
||||
let dest_path = Self::parse_dest(&self.dest)?;
|
||||
|
||||
let ds = Dataset::open(&self.config)
|
||||
let ds = Datasets::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
if !ds.config.schema.contains_key(&field) {
|
||||
@@ -51,7 +52,7 @@ impl CliCmd for AnnotateCommand {
|
||||
}
|
||||
|
||||
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
|
||||
let mut count = 0u64;
|
||||
let count = 0u64;
|
||||
|
||||
for (name, source) in &ds.config.dataset.source {
|
||||
match source {
|
||||
@@ -61,31 +62,40 @@ impl CliCmd for AnnotateCommand {
|
||||
continue;
|
||||
}
|
||||
|
||||
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
|
||||
let source = Arc::new(DirDataSource::new(name, path.clone(), *sidecars));
|
||||
|
||||
for res in source.iter() {
|
||||
let (_key, item) =
|
||||
res.with_context(|| format!("while reading source {name}"))?;
|
||||
let mut stream = source.iter();
|
||||
while let Some(res) = stream.next().await {
|
||||
let item = res.with_context(|| format!("while reading source {name}"))?;
|
||||
|
||||
let Item::File { path, .. } = &item else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let meta = MetaExtractor::new(&item);
|
||||
let extractor = PileValue::<FileItem>::Extractor(Rc::new(meta));
|
||||
let extractor = PileValue::Extractor(Arc::new(meta));
|
||||
|
||||
let Some(value) =
|
||||
index.get_field(&extractor, &field).with_context(|| {
|
||||
format!("while extracting field from {}", item.path.display())
|
||||
index.get_field(&extractor, &field).await.with_context(|| {
|
||||
format!("while extracting field from {}", path.display())
|
||||
})?
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
|
||||
item.write_sidecar(dest_path.clone(), PileValue::String(value.into()))
|
||||
.with_context(|| {
|
||||
format!("while writing sidecar for {}", item.path.display())
|
||||
})?;
|
||||
// TODO: implement sidecar writing
|
||||
let _ = (&dest_path, &value);
|
||||
todo!("write_sidecar not yet implemented");
|
||||
|
||||
count += 1;
|
||||
#[expect(unreachable_code)]
|
||||
{
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Source::S3 { .. } => {
|
||||
warn!("Source {name} is an S3 source; sidecar annotation is not yet supported");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use clap::Args;
|
||||
use pile_config::ConfigToml;
|
||||
use pile_dataset::Dataset;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
use tracing::{debug, error, info, warn};
|
||||
@@ -43,11 +43,11 @@ impl CliCmd for CheckCommand {
|
||||
}
|
||||
}
|
||||
|
||||
let ds = Dataset::open(&self.config)
|
||||
let ds = Datasets::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
let ts_fts = ds.ts_fts().context("while determining fts age")?;
|
||||
let ts_data = ds.ts_data().context("while determining data age")?;
|
||||
let ts_data = ds.ts_data().await.context("while determining data age")?;
|
||||
|
||||
match (ts_fts, ts_data) {
|
||||
(None, Some(_)) => warn!("Could not determine fts age"),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_dataset::Dataset;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
|
||||
@@ -23,10 +23,10 @@ impl CliCmd for IndexCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Dataset::open(&self.config)
|
||||
let ds = Datasets::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| {
|
||||
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
|
||||
x.map_err(|x| {
|
||||
anyhow::Error::from(x).context(format!(
|
||||
"while refreshing fts for {}",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_dataset::Dataset;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
use tracing::info;
|
||||
@@ -39,12 +39,12 @@ impl CliCmd for LookupCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Dataset::open(&self.config)
|
||||
let ds = Datasets::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
if self.refresh && ds.needs_fts().context("while checking dataset fts")? {
|
||||
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
|
||||
info!("FTS index is missing or out-of-date, regenerating");
|
||||
ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| {
|
||||
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
|
||||
x.map_err(|x| {
|
||||
anyhow::Error::from(x).context(format!(
|
||||
"while refreshing fts for {}",
|
||||
|
||||
@@ -1,16 +1,23 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_config::Label;
|
||||
use pile_dataset::{FileItem, PileValue, extract::MetaExtractor};
|
||||
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{fmt::Debug, path::PathBuf, rc::Rc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
use crate::{CliCmd, GlobalContext};
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct ProbeCommand {
|
||||
/// The file to probe
|
||||
file: PathBuf,
|
||||
/// Source name (as defined in pile.toml)
|
||||
source: String,
|
||||
|
||||
/// Item key within the source
|
||||
key: String,
|
||||
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for ProbeCommand {
|
||||
@@ -21,19 +28,23 @@ impl CliCmd for ProbeCommand {
|
||||
_ctx: GlobalContext,
|
||||
_flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let item = FileItem {
|
||||
path: self.file.clone(),
|
||||
source_name: Label::new("probe-source").unwrap(),
|
||||
sidecar: true,
|
||||
};
|
||||
let source = Label::new(&self.source)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
|
||||
|
||||
let value = PileValue::Extractor(Rc::new(MetaExtractor::new(&item)));
|
||||
let ds = Datasets::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
let item = ds.get(&source, &self.key).await.ok_or_else(|| {
|
||||
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
||||
})?;
|
||||
|
||||
let value = PileValue::Extractor(Arc::new(MetaExtractor::new(&item)));
|
||||
let json = value
|
||||
.to_json()
|
||||
.with_context(|| format!("while extracting {}", self.file.display()))?;
|
||||
.await
|
||||
.with_context(|| format!("while extracting {}", self.key))?;
|
||||
|
||||
let json = serde_json::to_string_pretty(&json).unwrap();
|
||||
|
||||
println!("{json}");
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user