Extractor rewrite

This commit is contained in:
2026-03-11 10:12:36 -07:00
parent b789255ea9
commit 4868a96b1e
51 changed files with 661 additions and 687 deletions

44
Cargo.lock generated
View File

@@ -2491,6 +2491,7 @@ dependencies = [
"pile-config", "pile-config",
"pile-dataset", "pile-dataset",
"pile-toolbox", "pile-toolbox",
"pile-value",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",
@@ -2515,26 +2516,13 @@ dependencies = [
name = "pile-dataset" name = "pile-dataset"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"async-trait",
"aws-sdk-s3",
"axum", "axum",
"blake3",
"chrono", "chrono",
"epub",
"id3",
"image",
"itertools 0.14.0",
"kamadak-exif",
"mime",
"mime_guess",
"pdf",
"pdfium-render",
"pile-config", "pile-config",
"pile-flac",
"pile-toolbox", "pile-toolbox",
"pile-value",
"serde", "serde",
"serde_json", "serde_json",
"smartstring",
"tantivy", "tantivy",
"thiserror", "thiserror",
"tokio", "tokio",
@@ -2543,7 +2531,6 @@ dependencies = [
"tracing", "tracing",
"utoipa", "utoipa",
"utoipa-swagger-ui", "utoipa-swagger-ui",
"walkdir",
] ]
[[package]] [[package]]
@@ -2568,6 +2555,33 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "pile-value"
version = "0.0.1"
dependencies = [
"async-trait",
"aws-sdk-s3",
"blake3",
"chrono",
"epub",
"id3",
"image",
"kamadak-exif",
"mime",
"mime_guess",
"pdf",
"pdfium-render",
"pile-config",
"pile-flac",
"serde_json",
"smartstring",
"tokio",
"tokio-stream",
"toml",
"tracing",
"walkdir",
]
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.16" version = "0.2.16"

View File

@@ -67,6 +67,7 @@ pile-toolbox = { path = "crates/pile-toolbox" }
pile-config = { path = "crates/pile-config" } pile-config = { path = "crates/pile-config" }
pile-flac = { path = "crates/pile-flac" } pile-flac = { path = "crates/pile-flac" }
pile-dataset = { path = "crates/pile-dataset" } pile-dataset = { path = "crates/pile-dataset" }
pile-value = { path = "crates/pile-value" }
# Clients & servers # Clients & servers
tantivy = "0.25.0" tantivy = "0.25.0"

View File

@@ -1,9 +1,6 @@
use serde::Deserialize; use serde::Deserialize;
use std::{collections::HashMap, fmt::Debug, path::PathBuf}; use std::{collections::HashMap, fmt::Debug, path::PathBuf};
mod post;
pub use post::*;
mod misc; mod misc;
pub use misc::*; pub use misc::*;
@@ -40,10 +37,6 @@ pub struct DatasetConfig {
/// Where to find this field /// Where to find this field
pub source: HashMap<Label, Source>, pub source: HashMap<Label, Source>,
/// How to post-process this field
#[serde(default)]
pub post: Vec<FieldSpecPost>,
} }
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
@@ -100,10 +93,6 @@ pub struct FieldSpec {
/// How to find this field in a data entry /// How to find this field in a data entry
pub path: Vec<ObjectPath>, pub path: Vec<ObjectPath>,
/// How to post-process this field
#[serde(default)]
pub post: Vec<FieldSpecPost>,
} }
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]

View File

@@ -1,18 +0,0 @@
use serde::Deserialize;
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
pub enum FieldSpecPost {
TrimSuffix { trim_suffix: String },
TrimPrefix { trim_prefix: String },
SetCase { case: Case },
Join { join: String },
NotEmpty { notempty: bool },
}
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum Case {
Lower,
Upper,
}

View File

@@ -10,37 +10,23 @@ workspace = true
[dependencies] [dependencies]
pile-config = { workspace = true } pile-config = { workspace = true }
pile-toolbox = { workspace = true } pile-toolbox = { workspace = true }
pile-flac = { workspace = true } pile-value = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
itertools = { workspace = true }
walkdir = { workspace = true }
tantivy = { workspace = true } tantivy = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
toml = { workspace = true } toml = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
smartstring = { workspace = true }
blake3 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
pdfium-render = { workspace = true, optional = true }
image = { workspace = true, optional = true }
id3 = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
tokio-stream = { workspace = true } tokio-stream = { workspace = true }
async-trait = { workspace = true }
aws-sdk-s3 = { workspace = true }
mime = { workspace = true }
mime_guess = { workspace = true }
serde = { workspace = true }
serde = { workspace = true, optional = true }
axum = { workspace = true, optional = true } axum = { workspace = true, optional = true }
utoipa = { workspace = true, optional = true } utoipa = { workspace = true, optional = true }
utoipa-swagger-ui = { workspace = true, optional = true } utoipa-swagger-ui = { workspace = true, optional = true }
[features] [features]
default = [] default = []
pdfium = ["dep:pdfium-render", "dep:image"] pdfium = ["pile-value/pdfium"]
axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui"] axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui", "dep:serde"]

View File

@@ -1,6 +1,10 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath}; use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{
source::{DataSource, DirDataSource, S3DataSource, misc::path_ts_earliest},
value::{Item, PileValue},
};
use serde_json::Value; use serde_json::Value;
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant}; use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs}; use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
@@ -9,13 +13,7 @@ use tokio::task::JoinSet;
use tokio_stream::{StreamExt, wrappers::ReceiverStream}; use tokio_stream::{StreamExt, wrappers::ReceiverStream};
use tracing::{debug, info, trace, warn}; use tracing::{debug, info, trace, warn};
use crate::{ use crate::index::{DbFtsIndex, FtsLookupResult};
DataSource, Item, PileValue,
extract::MetaExtractor,
index::{DbFtsIndex, FtsLookupResult},
path_ts_earliest,
source::{DirDataSource, S3DataSource},
};
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum DatasetError { pub enum DatasetError {
@@ -183,11 +181,12 @@ impl Datasets {
let Some(item) = self.get(source, key).await else { let Some(item) = self.get(source, key).await else {
return Ok(None); return Ok(None);
}; };
let extractor = MetaExtractor::new(&item);
let root = PileValue::ObjectExtractor(Arc::new(extractor)); let item = PileValue::Item(item);
let Some(value) = root.query(path).await? else { let Some(value) = item.query(path).await? else {
return Ok(None); return Ok(None);
}; };
Ok(Some(value.to_json().await?)) Ok(Some(value.to_json().await?))
} }

View File

@@ -1,165 +0,0 @@
use pile_config::Label;
use std::{collections::HashMap, sync::Arc};
mod flac;
pub use flac::*;
mod id3;
pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod exif;
pub use exif::*;
mod pdf;
pub use pdf::*;
mod toml;
pub use toml::*;
mod map;
pub use map::*;
mod sidecar;
pub use sidecar::*;
use crate::{Item, PileValue};
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
/// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error>;
/// Return all fields in this extractor.
/// `Self::field` must return [Some] for all these keys
/// and [None] for all others.
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
/// Convert this to a JSON value.
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
let keys = self.fields().await?;
let mut map = serde_json::Map::new();
for k in &keys {
let v = match self.field(k).await? {
Some(x) => x,
None => continue,
};
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
}
Ok(serde_json::Value::Object(map))
}
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
/// Get the item at index `idx`.
/// Indices start at zero, and must be consecutive.
/// - returns `None` if `idx` is out of range
/// - returns `Some(Null)` if `None` is at `idx`
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error>;
async fn len(&self) -> Result<usize, std::io::Error>;
async fn is_empty(&self) -> Result<bool, std::io::Error> {
Ok(self.len().await? == 0)
}
/// Convert this list to a JSON value.
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
let len = self.len().await?;
let mut list = Vec::with_capacity(len);
for i in 0..len {
#[expect(clippy::expect_used)]
let v = self
.get(i)
.await?
.expect("value must be present according to length");
list.push(Box::pin(v.to_json()).await?);
}
Ok(serde_json::Value::Array(list))
}
}
pub struct MetaExtractor {
inner: MapExtractor,
}
impl MetaExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("flac").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
),
(
Label::new("id3").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
),
(
Label::new("fs").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("epub").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
),
(
Label::new("exif").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
crate::PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
),
]),
};
Self { inner }
}
}
#[async_trait::async_trait]
impl ObjectExtractor for MetaExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
return Ok(vec![
Label::new("flac").unwrap(),
Label::new("id3").unwrap(),
Label::new("fs").unwrap(),
Label::new("epub").unwrap(),
Label::new("exif").unwrap(),
Label::new("pdf").unwrap(),
Label::new("sidecar").unwrap(),
]);
}
}

View File

@@ -1,95 +0,0 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use pile_config::Label;
use std::{
collections::HashMap,
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct PdfCoverExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfCoverExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let cover = tokio::task::spawn_blocking(move || {
let mut bytes = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
let pdfium = Pdfium::default();
let document = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let render_config = PdfRenderConfig::new().set_target_width(1024);
let page = document
.pages()
.get(0)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let image = page
.render_with_config(&render_config)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
.as_image();
let mut png_bytes = Vec::new();
image
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
.map_err(|e| std::io::Error::other(e.to_string()))?;
Ok::<_, std::io::Error>(png_bytes)
})
.await
.map_err(std::io::Error::other)?;
let output = match cover {
Ok(data) => {
#[expect(clippy::unwrap_used)]
let label = Label::new("cover").unwrap();
HashMap::from([(
label,
PileValue::Blob {
mime: mime::IMAGE_PNG,
bytes: Arc::new(data),
},
)])
}
Err(error) => {
trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key());
HashMap::new()
}
};
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfCoverExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -1,9 +1,6 @@
use itertools::Itertools; use pile_config::{ConfigToml, DatasetFts, Label};
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label}; use pile_value::value::{Item, PileValue};
use std::{ use std::{path::PathBuf, sync::LazyLock};
path::PathBuf,
sync::{Arc, LazyLock},
};
use tantivy::{ use tantivy::{
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError, DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
collector::Collector, collector::Collector,
@@ -12,8 +9,6 @@ use tantivy::{
}; };
use tracing::{debug, trace, warn}; use tracing::{debug, trace, warn};
use crate::{Item, PileValue, extract::MetaExtractor};
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct FtsLookupResult { pub struct FtsLookupResult {
pub score: f32, pub score: f32,
@@ -76,11 +71,11 @@ impl DbFtsIndex {
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name()); doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
doc.add_text(self.schema.get_field("_meta_key")?, key); doc.add_text(self.schema.get_field("_meta_key")?, key);
let extractor = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(item))); let item = PileValue::Item(item.clone());
let mut empty = true; let mut empty = true;
for name in self.fts_cfg().fields.keys() { for name in self.fts_cfg().fields.keys() {
let x = self.get_field(&extractor, name).await?; let x = self.get_field(&item, name).await?;
let val = match x { let val = match x {
Some(x) => x, Some(x) => x,
@@ -135,13 +130,6 @@ impl DbFtsIndex {
x => x.clone(), x => x.clone(),
}; };
for post in &field.post {
val = match apply(post, &val) {
Some(x) => x,
None => return Ok(None),
};
}
loop { loop {
val = match val { val = match val {
PileValue::String(x) => return Ok(Some(x.to_string())), PileValue::String(x) => return Ok(Some(x.to_string())),
@@ -186,6 +174,15 @@ impl DbFtsIndex {
continue 'outer; continue 'outer;
} }
PileValue::Item(_) => {
trace!(
message = "Skipping field, is item",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::ListExtractor(_) => { PileValue::ListExtractor(_) => {
trace!( trace!(
message = "Skipping field, is ListExtractor", message = "Skipping field, is ListExtractor",
@@ -296,104 +293,3 @@ impl DbFtsIndex {
return Ok(out); return Ok(out);
} }
} }
pub fn apply(post: &FieldSpecPost, val: &PileValue) -> Option<PileValue> {
Some(match post {
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
FieldSpecPost::NotEmpty { notempty: true } => match val {
PileValue::Null => return None,
PileValue::String(x) if x.is_empty() => return None,
PileValue::Array(x) if x.is_empty() => return None,
x => x.clone(),
},
FieldSpecPost::SetCase { case: Case::Lower } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_lowercase().into())),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?,
)),
},
FieldSpecPost::SetCase { case: Case::Upper } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_uppercase().into())),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_suffix(trim_suffix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_prefix(trim_prefix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::Join { join } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.clone()),
PileValue::Array(x) => PileValue::String(Arc::new(
x.iter()
.map(|x| apply(post, x))
.map(|x| x.and_then(|x| x.as_str().map(|x| x.to_owned())))
.collect::<Option<Vec<_>>>()?
.into_iter()
.join(join)
.into(),
)),
},
})
}

View File

@@ -1,21 +1,7 @@
mod traits;
pub use traits::*;
mod misc;
pub use misc::*;
mod dataset; mod dataset;
pub use dataset::{Dataset, DatasetError, Datasets}; pub use dataset::{Dataset, DatasetError, Datasets};
mod item;
pub use item::*;
mod value;
pub use value::*;
pub mod extract;
pub mod index; pub mod index;
pub mod source;
#[cfg(feature = "axum")] #[cfg(feature = "axum")]
pub mod serve; pub mod serve;

View File

@@ -5,12 +5,13 @@ use axum::{
response::{IntoResponse, Response}, response::{IntoResponse, Response},
}; };
use pile_config::{Label, objectpath::ObjectPath}; use pile_config::{Label, objectpath::ObjectPath};
use pile_value::value::PileValue;
use serde::Deserialize; use serde::Deserialize;
use std::{sync::Arc, time::Instant}; use std::{sync::Arc, time::Instant};
use tracing::debug; use tracing::debug;
use utoipa::ToSchema; use utoipa::ToSchema;
use crate::{Datasets, PileValue, extract::MetaExtractor}; use crate::Datasets;
#[derive(Deserialize, ToSchema)] #[derive(Deserialize, ToSchema)]
pub struct FieldQuery { pub struct FieldQuery {
@@ -61,10 +62,8 @@ pub async fn get_field(
return StatusCode::NOT_FOUND.into_response(); return StatusCode::NOT_FOUND.into_response();
}; };
let extractor = MetaExtractor::new(&item); let item = PileValue::Item(item);
let root: PileValue = PileValue::ObjectExtractor(Arc::new(extractor)); let value = match item.query(&path).await {
let value = match root.query(&path).await {
Ok(Some(v)) => v, Ok(Some(v)) => v,
Ok(None) => return StatusCode::NOT_FOUND.into_response(), Ok(None) => return StatusCode::NOT_FOUND.into_response(),
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(), Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),

View File

@@ -4,12 +4,13 @@ use axum::{
response::{IntoResponse, Response}, response::{IntoResponse, Response},
}; };
use pile_config::Label; use pile_config::Label;
use pile_value::value::AsyncReader;
use serde::Deserialize; use serde::Deserialize;
use std::{sync::Arc, time::Instant}; use std::{sync::Arc, time::Instant};
use tracing::debug; use tracing::debug;
use utoipa::ToSchema; use utoipa::ToSchema;
use crate::{AsyncReader, Datasets}; use crate::Datasets;
#[derive(Deserialize, ToSchema)] #[derive(Deserialize, ToSchema)]
pub struct ItemQuery { pub struct ItemQuery {

View File

@@ -1,5 +0,0 @@
mod dir;
pub use dir::*;
mod s3;
pub use s3::*;

View File

@@ -0,0 +1,36 @@
[package]
name = "pile-value"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }
[lints]
workspace = true
[dependencies]
pile-config = { workspace = true }
pile-flac = { workspace = true }
serde_json = { workspace = true }
walkdir = { workspace = true }
tracing = { workspace = true }
chrono = { workspace = true }
toml = { workspace = true }
smartstring = { workspace = true }
blake3 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
pdfium-render = { workspace = true, optional = true }
image = { workspace = true, optional = true }
id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
async-trait = { workspace = true }
aws-sdk-s3 = { workspace = true }
mime = { workspace = true }
mime_guess = { workspace = true }
[features]
default = []
pdfium = ["dep:pdfium-render", "dep:image"]

View File

@@ -6,7 +6,10 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct EpubMetaExtractor { pub struct EpubMetaExtractor {
item: Item, item: Item,

View File

@@ -6,7 +6,7 @@ use std::{
}; };
use tracing::debug; use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; use crate::{value::{Item, PileValue, SyncReadBridge}, extract::traits::ObjectExtractor};
pub struct EpubTextExtractor { pub struct EpubTextExtractor {
item: Item, item: Item,

View File

@@ -7,7 +7,10 @@ pub use epub_meta::*;
mod epub_text; mod epub_text;
pub use epub_text::*; pub use epub_text::*;
use crate::{Item, PileValue, extract::ObjectExtractor}; use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct EpubExtractor { pub struct EpubExtractor {
text: Arc<EpubTextExtractor>, text: Arc<EpubTextExtractor>,

View File

@@ -6,7 +6,7 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; use crate::{value::{Item, PileValue, SyncReadBridge}, extract::traits::ObjectExtractor};
pub struct ExifExtractor { pub struct ExifExtractor {
item: Item, item: Item,

View File

@@ -8,8 +8,8 @@ use std::{
}; };
use crate::{ use crate::{
Item, PileValue, SyncReadBridge, value::{Item, PileValue, SyncReadBridge},
extract::{ListExtractor, ObjectExtractor}, extract::traits::{ListExtractor, ObjectExtractor},
}; };
pub struct FlacImagesExtractor { pub struct FlacImagesExtractor {

View File

@@ -5,7 +5,7 @@ use std::{
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
use crate::{Item, PileValue, extract::ObjectExtractor}; use crate::{value::{Item, PileValue}, extract::traits::ObjectExtractor};
pub struct FsExtractor { pub struct FsExtractor {
item: Item, item: Item,

View File

@@ -7,7 +7,10 @@ use std::{
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct Id3Extractor { pub struct Id3Extractor {
item: Item, item: Item,

View File

@@ -0,0 +1,99 @@
mod flac;
use std::{collections::HashMap, sync::Arc};
pub use flac::*;
mod id3;
pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod exif;
pub use exif::*;
mod pdf;
pub use pdf::*;
mod toml;
use pile_config::Label;
pub use toml::*;
mod sidecar;
pub use sidecar::*;
use crate::{
extract::{misc::MapExtractor, traits::ObjectExtractor},
value::{Item, PileValue},
};
pub struct ItemExtractor {
inner: MapExtractor,
}
impl ItemExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("flac").unwrap(),
PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
),
(
Label::new("id3").unwrap(),
PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
),
(
Label::new("fs").unwrap(),
PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("epub").unwrap(),
PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
),
(
Label::new("exif").unwrap(),
PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
),
]),
};
Self { inner }
}
}
#[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor {
    /// Delegate lookup to the underlying per-format map.
    async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
        self.inner.field(name).await
    }

    /// All keys for which [`Self::field`] returns `Some`.
    ///
    /// Bug fix: "toml" was missing from this list even though the map
    /// built in `new` contains a "toml" entry, so `field("toml")`
    /// returned `Some` for a key `fields()` never reported — violating
    /// the `ObjectExtractor` contract ("`field` must return `Some` for
    /// all these keys and `None` for all others").
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(vec![
            Label::new("flac").unwrap(),
            Label::new("id3").unwrap(),
            Label::new("fs").unwrap(),
            Label::new("epub").unwrap(),
            Label::new("exif").unwrap(),
            Label::new("pdf").unwrap(),
            Label::new("toml").unwrap(),
            Label::new("sidecar").unwrap(),
        ])
    }
}

View File

@@ -1,11 +1,6 @@
use pile_config::Label; use pile_config::Label;
use std::sync::Arc; use std::sync::Arc;
#[cfg(feature = "pdfium")]
mod pdf_cover;
#[cfg(feature = "pdfium")]
pub use pdf_cover::*;
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
mod pdf_pages; mod pdf_pages;
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
@@ -17,14 +12,12 @@ pub use pdf_meta::*;
mod pdf_text; mod pdf_text;
pub use pdf_text::*; pub use pdf_text::*;
use crate::{Item, PileValue, extract::ObjectExtractor}; use crate::{value::{Item, PileValue}, extract::traits::ObjectExtractor};
pub struct PdfExtractor { pub struct PdfExtractor {
text: Arc<PdfTextExtractor>, text: Arc<PdfTextExtractor>,
meta: Arc<PdfMetaExtractor>, meta: Arc<PdfMetaExtractor>,
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
cover: Arc<PdfCoverExtractor>,
#[cfg(feature = "pdfium")]
pages: Arc<PdfPagesExtractor>, pages: Arc<PdfPagesExtractor>,
} }
@@ -34,8 +27,6 @@ impl PdfExtractor {
text: Arc::new(PdfTextExtractor::new(item)), text: Arc::new(PdfTextExtractor::new(item)),
meta: Arc::new(PdfMetaExtractor::new(item)), meta: Arc::new(PdfMetaExtractor::new(item)),
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
cover: Arc::new(PdfCoverExtractor::new(item)),
#[cfg(feature = "pdfium")]
pages: Arc::new(PdfPagesExtractor::new(item)), pages: Arc::new(PdfPagesExtractor::new(item)),
} }
} }
@@ -48,8 +39,6 @@ impl ObjectExtractor for PdfExtractor {
"text" => self.text.field(name).await, "text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))), "meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
"cover" => self.cover.field(name).await,
#[cfg(feature = "pdfium")]
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))), "pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
_ => Ok(None), _ => Ok(None),
} }

View File

@@ -8,8 +8,7 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::extract::ObjectExtractor; use crate::{extract::traits::ObjectExtractor, value::{Item, PileValue, SyncReadBridge}};
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfMetaExtractor { pub struct PdfMetaExtractor {
item: Item, item: Item,

View File

@@ -6,7 +6,10 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ListExtractor}; use crate::{
extract::traits::ListExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct PdfPagesExtractor { pub struct PdfPagesExtractor {
item: Item, item: Item,

View File

@@ -8,8 +8,7 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::extract::ObjectExtractor; use crate::{extract::traits::ObjectExtractor, value::{Item, PileValue, SyncReadBridge}};
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfTextExtractor { pub struct PdfTextExtractor {
item: Item, item: Item,

View File

@@ -2,9 +2,10 @@ use pile_config::Label;
use std::sync::OnceLock; use std::sync::OnceLock;
use crate::{ use crate::{
Item, PileValue, value::{Item, PileValue},
extract::{ObjectExtractor, TomlExtractor}, extract::traits::ObjectExtractor,
}; };
use super::TomlExtractor;
pub struct SidecarExtractor { pub struct SidecarExtractor {
item: Item, item: Item,

View File

@@ -4,7 +4,10 @@ use std::{
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
use crate::{AsyncReader, Item, PileValue, extract::ObjectExtractor}; use crate::{
extract::traits::ObjectExtractor,
value::{AsyncReader, Item, PileValue},
};
fn toml_to_pile(value: toml::Value) -> PileValue { fn toml_to_pile(value: toml::Value) -> PileValue {
match value { match value {

View File

@@ -0,0 +1,24 @@
use std::sync::Arc;
use crate::{extract::traits::ListExtractor, value::PileValue};
/// A [ListExtractor] backed by a shared, immutable vector of values.
pub struct ArrayExtractor {
    inner: Arc<Vec<PileValue>>,
}

impl ArrayExtractor {
    /// Wrap an existing shared vector; no copy is made.
    pub fn new(inner: Arc<Vec<PileValue>>) -> Self {
        ArrayExtractor { inner }
    }
}

#[async_trait::async_trait]
impl ListExtractor for ArrayExtractor {
    /// `None` when `idx` is out of range; otherwise a clone of the element.
    async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
        let value = self.inner.get(idx);
        Ok(value.cloned())
    }

    async fn len(&self) -> Result<usize, std::io::Error> {
        let n = self.inner.len();
        Ok(n)
    }
}

View File

@@ -1,10 +1,18 @@
use pile_config::Label; use pile_config::Label;
use std::collections::HashMap; use std::collections::HashMap;
use crate::{PileValue, extract::ObjectExtractor}; use crate::{extract::traits::ObjectExtractor, value::PileValue};
pub struct MapExtractor { pub struct MapExtractor {
pub(crate) inner: HashMap<Label, PileValue>, pub inner: HashMap<Label, PileValue>,
}
impl Default for MapExtractor {
fn default() -> Self {
Self {
inner: HashMap::new(),
}
}
} }
#[async_trait::async_trait] #[async_trait::async_trait]

View File

@@ -0,0 +1,8 @@
mod list;
pub use list::*;
mod vec;
pub use vec::*;
mod map;
pub use map::*;

View File

@@ -0,0 +1,22 @@
use crate::{extract::traits::ListExtractor, value::PileValue};
/// A [ListExtractor] over an owned, in-memory list of values.
///
/// Idiom fix: the hand-written `Default` impl is replaced by
/// `#[derive(Default)]` — `Vec<T>` is unconditionally `Default`
/// (empty vec), so the derive is exactly equivalent and what
/// clippy's `derivable_impls` lint asks for.
#[derive(Default)]
pub struct VecExtractor {
    pub inner: Vec<PileValue>,
}

#[async_trait::async_trait]
impl ListExtractor for VecExtractor {
    /// `None` when `idx` is out of range; otherwise a clone of the element.
    async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
        Ok(self.inner.get(idx).cloned())
    }

    async fn len(&self) -> Result<usize, std::io::Error> {
        Ok(self.inner.len())
    }
}

View File

@@ -0,0 +1,4 @@
pub mod item;
pub mod misc;
pub mod string;
pub mod traits;

View File

@@ -0,0 +1,51 @@
use pile_config::Label;
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::{extract::traits::ObjectExtractor, value::PileValue};
/// Exposes derived string fields (`trim`, `upper`, `lower`, `nonempty`)
/// on a shared string value.
pub struct StringExtractor {
    /// The string this extractor operates on; shared, never mutated.
    item: Arc<SmartString<LazyCompact>>,
}

impl StringExtractor {
    /// Create an extractor over `item`. Only the `Arc` is cloned.
    pub fn new(item: &Arc<SmartString<LazyCompact>>) -> Self {
        Self { item: item.clone() }
    }
}

#[async_trait::async_trait]
impl ObjectExtractor for StringExtractor {
    /// Resolve one derived field of the string.
    /// - returns `None` for unknown field names
    /// - `nonempty` yields `Null` for an empty string, the string itself otherwise
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        Ok(match name.as_str() {
            "trim" => Some(PileValue::String(Arc::new(
                self.item.as_str().trim().into(),
            ))),
            // BUG FIX: `upper` and `lower` were swapped —
            // `upper` returned to_lowercase() and `lower` returned to_uppercase().
            "upper" => Some(PileValue::String(Arc::new(
                self.item.as_str().to_uppercase().into(),
            ))),
            "lower" => Some(PileValue::String(Arc::new(
                self.item.as_str().to_lowercase().into(),
            ))),
            "nonempty" => Some(match self.item.is_empty() {
                true => PileValue::Null,
                false => PileValue::String(self.item.clone()),
            }),
            _ => None,
        })
    }

    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        // These labels are static, known-valid identifiers, so `unwrap` cannot fail.
        Ok(vec![
            Label::new("trim").unwrap(),
            Label::new("upper").unwrap(),
            Label::new("lower").unwrap(),
            Label::new("nonempty").unwrap(),
        ])
    }
}

View File

@@ -0,0 +1,68 @@
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
    /// Get the field at `name` from `item`.
    /// - returns `None` if `name` is not a valid field
    /// - returns `Some(Null)` if `name` is not available
    async fn field(
        &self,
        name: &pile_config::Label,
    ) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// Return all fields in this extractor.
    /// `Self::field` must return [Some] for all these keys
    /// and [None] for all others.
    async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;

    /// Convert this to a JSON value.
    ///
    /// Keys for which [Self::field] returns `None` are omitted from
    /// the resulting object.
    async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
        let mut map = serde_json::Map::new();
        for key in self.fields().await? {
            // Skip keys the extractor does not actually provide.
            if let Some(value) = self.field(&key).await? {
                // Box::pin breaks the recursive async call cycle.
                map.insert(key.to_string(), Box::pin(value.to_json()).await?);
            }
        }
        Ok(serde_json::Value::Object(map))
    }
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
    /// Get the item at index `idx`.
    /// Indices start at zero, and must be consecutive.
    /// - returns `None` if `idx` is out of range
    /// - returns `Some(Null)` if `None` is at `idx`
    async fn get(&self, idx: usize) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// Number of items in this list.
    async fn len(&self) -> Result<usize, std::io::Error>;

    /// True when this list holds no items.
    async fn is_empty(&self) -> Result<bool, std::io::Error> {
        let count = self.len().await?;
        Ok(count == 0)
    }

    /// Convert this list to a JSON value.
    async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
        let total = self.len().await?;
        let mut items = Vec::with_capacity(total);
        for idx in 0..total {
            // `len` promises every index below `total` is populated.
            #[expect(clippy::expect_used)]
            let value = self
                .get(idx)
                .await?
                .expect("value must be present according to length");
            // Box::pin breaks the recursive async call cycle.
            items.push(Box::pin(value.to_json()).await?);
        }
        Ok(serde_json::Value::Array(items))
    }
}

View File

@@ -0,0 +1,3 @@
pub mod extract;
pub mod source;
pub mod value;

View File

@@ -4,7 +4,10 @@ use std::{path::PathBuf, sync::Arc};
use tokio_stream::wrappers::ReceiverStream; use tokio_stream::wrappers::ReceiverStream;
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::{DataSource, Item, path_ts_latest}; use crate::{
source::{DataSource, misc::path_ts_latest},
value::Item,
};
#[derive(Debug)] #[derive(Debug)]
pub struct DirDataSource { pub struct DirDataSource {

View File

@@ -1,15 +1,24 @@
mod dir;
pub use dir::*;
mod s3;
pub use s3::*;
pub mod misc;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use tokio_stream::wrappers::ReceiverStream; use tokio_stream::wrappers::ReceiverStream;
use crate::Item;
/// A read-only set of [Item]s. /// A read-only set of [Item]s.
pub trait DataSource { pub trait DataSource {
/// Get an item from this datasource /// Get an item from this datasource
fn get(&self, key: &str) -> impl Future<Output = Result<Option<Item>, std::io::Error>> + Send; fn get(
&self,
key: &str,
) -> impl Future<Output = Result<Option<crate::value::Item>, std::io::Error>> + Send;
/// Iterate over all items in this source in an arbitrary order /// Iterate over all items in this source in an arbitrary order
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>>; fn iter(&self) -> ReceiverStream<Result<crate::value::Item, std::io::Error>>;
/// Return the time of the latest change to the data in this source /// Return the time of the latest change to the data in this source
fn latest_change( fn latest_change(

View File

@@ -5,7 +5,7 @@ use smartstring::{LazyCompact, SmartString};
use std::sync::Arc; use std::sync::Arc;
use tokio_stream::wrappers::ReceiverStream; use tokio_stream::wrappers::ReceiverStream;
use crate::{DataSource, Item}; use crate::{source::DataSource, value::Item};
#[derive(Debug)] #[derive(Debug)]
pub struct S3DataSource { pub struct S3DataSource {

View File

@@ -0,0 +1,105 @@
use mime::Mime;
use smartstring::{LazyCompact, SmartString};
use std::{fs::File, path::PathBuf, sync::Arc};
use crate::{
source::{DirDataSource, S3DataSource},
value::{ItemReader, S3Reader},
};
//
// MARK: item
//
/// A cheaply-clonable pointer to an item in a dataset
#[derive(Debug, Clone)]
pub enum Item {
    File {
        /// The directory-backed source this item belongs to
        source: Arc<DirDataSource>,
        /// Media type of this item's contents
        mime: Mime,
        /// Path of the file on the local filesystem
        path: PathBuf,
        /// Optional companion item attached to this one
        sidecar: Option<Box<Item>>,
    },
    S3 {
        /// The S3-backed source this item belongs to
        source: Arc<S3DataSource>,
        /// Media type of this item's contents
        mime: Mime,
        /// Object key within the source's bucket
        key: SmartString<LazyCompact>,
        /// Optional companion item attached to this one
        sidecar: Option<Box<Item>>,
    },
}
impl Item {
    /// Open the item for reading. For S3, performs a HEAD request to determine
    /// the object size.
    ///
    /// Errors: fails if the local file cannot be opened, or if the S3 HEAD
    /// request fails (SDK errors are wrapped via `std::io::Error::other`).
    pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
        Ok(match self {
            Self::File { path, .. } => ItemReader::File(File::open(path)?),
            Self::S3 { source, key, .. } => {
                let head = source
                    .client
                    .head_object()
                    .bucket(source.bucket.as_str())
                    .key(key.as_str())
                    .send()
                    .await
                    .map_err(std::io::Error::other)?;
                // NOTE(review): a missing content-length is treated as size 0 —
                // confirm downstream readers tolerate a zero-size S3 object.
                let size = head.content_length().unwrap_or(0) as u64;
                ItemReader::S3(S3Reader {
                    client: source.client.clone(),
                    bucket: source.bucket.clone(),
                    key: key.to_owned(),
                    cursor: 0,
                    size,
                })
            }
        })
    }

    /// Name of the datasource this item came from.
    pub fn source_name(&self) -> &pile_config::Label {
        match self {
            Self::File { source, .. } => &source.name,
            Self::S3 { source, .. } => &source.name,
        }
    }

    /// Key identifying this item within its source: the file path for
    /// directory sources, the object key for S3 sources.
    ///
    /// Panics if a file path is not valid UTF-8.
    #[expect(clippy::expect_used)]
    pub fn key(&self) -> SmartString<LazyCompact> {
        match self {
            Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
            Self::S3 { key, .. } => key.clone(),
        }
    }

    /// Compute the blake3 hash of this item's contents.
    ///
    /// Reads the entire file synchronously; hashing S3 objects is not yet
    /// implemented and will panic via `todo!`.
    pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
        match self {
            Self::File { path, .. } => {
                let mut hasher = blake3::Hasher::new();
                let mut file = std::fs::File::open(path)?;
                std::io::copy(&mut file, &mut hasher)?;
                return Ok(hasher.finalize());
            }
            Self::S3 { .. } => todo!(),
        }
    }

    /// Media type of this item's contents.
    pub fn mime(&self) -> &Mime {
        match self {
            Self::File { mime, .. } => mime,
            Self::S3 { mime, .. } => mime,
        }
    }

    /// Companion item attached to this one, if any.
    pub fn sidecar(&self) -> Option<&Self> {
        match self {
            Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
            Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
        }
    }
}

View File

@@ -0,0 +1,8 @@
mod item;
pub use item::*;
mod readers;
pub use readers::*;
mod value;
pub use value::*;

View File

@@ -1,114 +1,13 @@
use mime::Mime;
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::{ use std::{
fs::File, fs::File,
io::{Read, Seek, SeekFrom}, io::{Read, Seek, SeekFrom},
path::PathBuf,
sync::Arc, sync::Arc,
}; };
use tokio::runtime::Handle; use tokio::runtime::Handle;
use crate::source::{DirDataSource, S3DataSource};
// //
// MARK: item // MARK: traits
//
/// A cheaply-clonable pointer to an item in a dataset
#[derive(Debug, Clone)]
pub enum Item {
File {
source: Arc<DirDataSource>,
mime: Mime,
path: PathBuf,
sidecar: Option<Box<Item>>,
},
S3 {
source: Arc<S3DataSource>,
mime: Mime,
key: SmartString<LazyCompact>,
sidecar: Option<Box<Item>>,
},
}
impl Item {
/// Open the item for reading. For S3, performs a HEAD request to determine
/// the object size.
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
Ok(match self {
Self::File { path, .. } => ItemReader::File(File::open(path)?),
Self::S3 { source, key, .. } => {
let head = source
.client
.head_object()
.bucket(source.bucket.as_str())
.key(key.as_str())
.send()
.await
.map_err(std::io::Error::other)?;
let size = head.content_length().unwrap_or(0) as u64;
ItemReader::S3(S3Reader {
client: source.client.clone(),
bucket: source.bucket.clone(),
key: key.to_owned(),
cursor: 0,
size,
})
}
})
}
pub fn source_name(&self) -> &pile_config::Label {
match self {
Self::File { source, .. } => &source.name,
Self::S3 { source, .. } => &source.name,
}
}
#[expect(clippy::expect_used)]
pub fn key(&self) -> SmartString<LazyCompact> {
match self {
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
Self::S3 { key, .. } => key.clone(),
}
}
pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
match self {
Self::File { path, .. } => {
let mut hasher = blake3::Hasher::new();
let mut file = std::fs::File::open(path)?;
std::io::copy(&mut file, &mut hasher)?;
return Ok(hasher.finalize());
}
Self::S3 { .. } => todo!(),
}
}
pub fn mime(&self) -> &Mime {
match self {
Self::File { mime, .. } => mime,
Self::S3 { mime, .. } => mime,
}
}
pub fn sidecar(&self) -> Option<&Self> {
match self {
Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
}
}
}
//
// MARK: reader
// //
pub trait AsyncReader: Send { pub trait AsyncReader: Send {
@@ -210,11 +109,11 @@ impl AsyncSeekReader for ItemReader {
// //
pub struct S3Reader { pub struct S3Reader {
client: Arc<aws_sdk_s3::Client>, pub client: Arc<aws_sdk_s3::Client>,
bucket: SmartString<LazyCompact>, pub bucket: SmartString<LazyCompact>,
key: SmartString<LazyCompact>, pub key: SmartString<LazyCompact>,
cursor: u64, pub cursor: u64,
size: u64, pub size: u64,
} }
impl AsyncReader for S3Reader { impl AsyncReader for S3Reader {

View File

@@ -4,7 +4,15 @@ use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::sync::Arc; use std::sync::Arc;
use crate::extract::{ListExtractor, ObjectExtractor}; use crate::{
extract::{
item::ItemExtractor,
misc::{ArrayExtractor, MapExtractor, VecExtractor},
string::StringExtractor,
traits::{ListExtractor, ObjectExtractor},
},
value::Item,
};
/// An immutable, cheaply-clonable, lazily-computed value. /// An immutable, cheaply-clonable, lazily-computed value.
/// Very similar to [serde_json::Value]. /// Very similar to [serde_json::Value].
@@ -30,6 +38,9 @@ pub enum PileValue {
/// A lazily-computed array /// A lazily-computed array
ListExtractor(Arc<dyn ListExtractor>), ListExtractor(Arc<dyn ListExtractor>),
/// An pointer to an item in this dataset
Item(Item),
} }
impl Clone for PileValue { impl Clone for PileValue {
@@ -46,11 +57,40 @@ impl Clone for PileValue {
mime: mime.clone(), mime: mime.clone(),
bytes: bytes.clone(), bytes: bytes.clone(),
}, },
Self::Item(i) => Self::Item(i.clone()),
} }
} }
} }
impl PileValue { impl PileValue {
pub fn object_extractor(&self) -> Arc<dyn ObjectExtractor> {
match self {
Self::Null => Arc::new(MapExtractor::default()),
Self::U64(_) => Arc::new(MapExtractor::default()),
Self::I64(_) => Arc::new(MapExtractor::default()),
Self::Array(_) => Arc::new(MapExtractor::default()),
Self::String(s) => Arc::new(StringExtractor::new(&s)),
Self::Blob { .. } => Arc::new(MapExtractor::default()),
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
Self::ObjectExtractor(e) => e.clone(),
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
}
}
pub fn list_extractor(&self) -> Arc<dyn ListExtractor> {
match self {
Self::Null => Arc::new(VecExtractor::default()),
Self::U64(_) => Arc::new(VecExtractor::default()),
Self::I64(_) => Arc::new(VecExtractor::default()),
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
Self::String(_) => Arc::new(VecExtractor::default()),
Self::Blob { .. } => Arc::new(VecExtractor::default()),
Self::ListExtractor(e) => e.clone(),
Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()),
Self::Item(_) => Arc::new(VecExtractor::default()),
}
}
pub async fn query(&self, query: &ObjectPath) -> Result<Option<Self>, std::io::Error> { pub async fn query(&self, query: &ObjectPath) -> Result<Option<Self>, std::io::Error> {
let mut out: Option<PileValue> = Some(self.clone()); let mut out: Option<PileValue> = Some(self.clone());
@@ -58,50 +98,41 @@ impl PileValue {
match s { match s {
PathSegment::Root => out = Some(self.clone()), PathSegment::Root => out = Some(self.clone()),
PathSegment::Field(field) => { PathSegment::Field(field) => {
out = match out { let e = match out.map(|x| x.object_extractor()) {
None => return Ok(None), Some(e) => e,
Some(Self::Null) => None, None => {
Some(Self::U64(_)) => None, out = None;
Some(Self::I64(_)) => None, continue;
Some(Self::Array(_)) => None,
Some(Self::String(_)) => None,
Some(Self::Blob { .. }) => None,
Some(Self::ListExtractor(_)) => None,
Some(Self::ObjectExtractor(e)) => e.field(field).await?,
} }
};
out = e.field(&field).await?;
} }
PathSegment::Index(idx) => { PathSegment::Index(idx) => {
out = match &out { let e = match out.map(|x| x.list_extractor()) {
None => return Ok(None), Some(e) => e,
Some(Self::Null) => None, None => {
Some(Self::U64(_)) => None, out = None;
Some(Self::I64(_)) => None, continue;
Some(Self::Blob { .. }) => None, }
Some(Self::Array(v)) => {
let idx = if *idx >= 0 {
usize::try_from(*idx).ok()
} else {
usize::try_from(v.len() as i64 - idx).ok()
}; };
idx.and_then(|idx| v.get(idx)).cloned()
}
Some(Self::String(_)) => None,
Some(Self::ObjectExtractor(_)) => None,
Some(Self::ListExtractor(e)) => {
let idx = if *idx >= 0 { let idx = if *idx >= 0 {
usize::try_from(*idx).ok() usize::try_from(*idx).ok()
} else { } else {
usize::try_from(e.len().await? as i64 - idx).ok() usize::try_from(e.len().await? as i64 - idx).ok()
}; };
match idx { let idx = match idx {
Some(idx) => e.get(idx).await?, Some(idx) => idx,
None => None, None => {
} out = None;
} continue;
} }
};
out = e.get(idx).await?;
} }
} }
} }
@@ -127,7 +158,8 @@ impl PileValue {
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())), Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
Self::ListExtractor(x) => (!x.is_empty().await?).then(|| Value::Number(1u64.into())), Self::ListExtractor(x) => (!x.is_empty().await?).then(|| Value::Number(1u64.into())),
Self::ObjectExtractor(e) => { Self::ObjectExtractor(_) | Self::Item(_) => {
let e = self.object_extractor();
let keys = e.fields().await?; let keys = e.fields().await?;
let mut map = Map::new(); let mut map = Map::new();
for k in &keys { for k in &keys {
@@ -160,22 +192,27 @@ impl PileValue {
Self::Null => Value::Null, Self::Null => Value::Null,
Self::U64(x) => Value::Number((*x).into()), Self::U64(x) => Value::Number((*x).into()),
Self::I64(x) => Value::Number((*x).into()), Self::I64(x) => Value::Number((*x).into()),
Self::String(x) => Value::String(x.to_string()),
// TODO: replace with something meaningful // TODO: replace with something meaningful?
Self::Blob { mime, bytes } => { Self::Blob { mime, bytes } => {
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len())) Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
} }
Self::String(x) => Value::String(x.to_string()),
Self::Array(x) => { #[expect(clippy::expect_used)]
Self::Array(_) | Self::ListExtractor(_) => {
let e = self.list_extractor();
let len = e.len().await?;
let mut arr = Vec::new(); let mut arr = Vec::new();
for item in &**x { for i in 0..len {
arr.push(Box::pin(item.to_json()).await?); let v = e.get(i).await?.expect("item must be present");
arr.push(Box::pin(v.to_json()).await?);
} }
Value::Array(arr) Value::Array(arr)
} }
Self::ObjectExtractor(e) => { Self::ObjectExtractor(_) | Self::Item(_) => {
let e = self.object_extractor();
let keys = e.fields().await?; let keys = e.fields().await?;
let mut map = Map::new(); let mut map = Map::new();
for k in &keys { for k in &keys {
@@ -187,8 +224,6 @@ impl PileValue {
} }
Value::Object(map) Value::Object(map)
} }
Self::ListExtractor(e) => e.to_json().await?,
}) })
} }
} }

View File

@@ -10,6 +10,7 @@ workspace = true
[dependencies] [dependencies]
pile-toolbox = { workspace = true } pile-toolbox = { workspace = true }
pile-dataset = { workspace = true, features = ["axum", "pdfium"] } pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
pile-value = { workspace = true, features = ["pdfium"] }
pile-config = { workspace = true } pile-config = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }

View File

@@ -1,10 +1,12 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_config::{Label, Source}; use pile_config::{Label, Source};
use pile_dataset::index::DbFtsIndex; use pile_dataset::{Datasets, index::DbFtsIndex};
use pile_dataset::source::DirDataSource;
use pile_dataset::{DataSource, Datasets, Item, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{
source::{DataSource, DirDataSource},
value::{Item, PileValue},
};
use std::{path::PathBuf, sync::Arc}; use std::{path::PathBuf, sync::Arc};
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
use tracing::{info, warn}; use tracing::{info, warn};
@@ -72,11 +74,9 @@ impl CliCmd for AnnotateCommand {
continue; continue;
}; };
let meta = MetaExtractor::new(&item); let item = PileValue::Item(item.clone());
let extractor = PileValue::ObjectExtractor(Arc::new(meta));
let Some(value) = let Some(value) =
index.get_field(&extractor, &field).await.with_context(|| { index.get_field(&item, &field).await.with_context(|| {
format!("while extracting field from {}", path.display()) format!("while extracting field from {}", path.display())
})? })?
else { else {

View File

@@ -1,9 +1,10 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor}; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::value::PileValue;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use std::{path::PathBuf, sync::Arc, time::Instant}; use std::{path::PathBuf, time::Instant};
use tokio::task::JoinSet; use tokio::task::JoinSet;
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
use tracing::info; use tracing::info;
@@ -93,9 +94,8 @@ impl CliCmd for FieldsCommand {
item_result.with_context(|| format!("while reading source {name}"))?; item_result.with_context(|| format!("while reading source {name}"))?;
let name = name.clone(); let name = name.clone();
join_set.spawn(async move { join_set.spawn(async move {
let meta = MetaExtractor::new(&item); let item = PileValue::Item(item);
let value = PileValue::ObjectExtractor(Arc::new(meta)); let result = item.count_fields().await.with_context(|| {
let result = value.count_fields().await.with_context(|| {
format!("while counting fields in source {name}") format!("while counting fields in source {name}")
})?; })?;
Ok(result.and_then(|v| { Ok(result.and_then(|v| {

View File

@@ -1,8 +1,9 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_config::objectpath::ObjectPath; use pile_config::objectpath::ObjectPath;
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor}; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::value::PileValue;
use std::{path::PathBuf, str::FromStr, sync::Arc}; use std::{path::PathBuf, str::FromStr, sync::Arc};
use tokio::task::JoinSet; use tokio::task::JoinSet;
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
@@ -79,9 +80,8 @@ impl CliCmd for ListCommand {
let invert = self.invert; let invert = self.invert;
join_set.spawn(async move { join_set.spawn(async move {
let meta = MetaExtractor::new(&item); let item = PileValue::Item(item);
let root = PileValue::ObjectExtractor(Arc::new(meta)); let value = item.query(&path).await?;
let value = root.query(&path).await?;
let is_present = let is_present =
matches!(value, Some(v) if !matches!(v, PileValue::Null)); matches!(value, Some(v) if !matches!(v, PileValue::Null));

View File

@@ -1,9 +1,10 @@
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Args; use clap::Args;
use pile_config::{Label, objectpath::ObjectPath}; use pile_config::{Label, objectpath::ObjectPath};
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor}; use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{path::PathBuf, sync::Arc}; use pile_value::value::PileValue;
use std::path::PathBuf;
use crate::{CliCmd, GlobalContext}; use crate::{CliCmd, GlobalContext};
@@ -54,9 +55,8 @@ impl CliCmd for ProbeCommand {
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source) anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
})?; })?;
let value = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(&item))); let item = PileValue::Item(item);
value item.to_json()
.to_json()
.await .await
.with_context(|| format!("while extracting {}", self.key))? .with_context(|| format!("while extracting {}", self.key))?
}; };