Refactor grouping

This commit is contained in:
2026-03-28 11:20:16 -07:00
parent 9967e066bb
commit 5527b61d39
40 changed files with 466 additions and 630 deletions

4
Cargo.lock generated
View File

@@ -2042,15 +2042,15 @@ dependencies = [
"chrono", "chrono",
"percent-encoding", "percent-encoding",
"pile-config", "pile-config",
"pile-io",
"pile-toolbox", "pile-toolbox",
"pile-value", "pile-value",
"regex",
"serde", "serde",
"serde_json", "serde_json",
"tantivy", "tantivy",
"thiserror", "thiserror",
"tokio", "tokio",
"tokio-stream", "tokio-util",
"toml", "toml",
"tracing", "tracing",
"utoipa", "utoipa",

View File

@@ -1,7 +1,7 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fmt::Debug, path::PathBuf}; use std::{collections::HashMap, fmt::Debug, path::PathBuf};
use crate::{objectpath::ObjectPath, pattern::GroupPattern}; use crate::objectpath::ObjectPath;
mod misc; mod misc;
pub use misc::*; pub use misc::*;
@@ -15,6 +15,15 @@ fn default_true() -> bool {
true true
} }
/// Default value for a filesystem source's `base_pattern`.
///
/// The single capture group spans the entire relative path,
/// so by default every file path is its own item key.
pub fn default_base() -> String {
    String::from("(.*)")
}
/// Default value for a filesystem source's `files` map.
///
/// Contains a single entry, `item`, whose template is `{base}`.
#[expect(clippy::unwrap_used)]
pub fn default_files() -> HashMap<Label, String> {
    // The label is a fixed literal; presumably always accepted by `Label::new`.
    let item_label = Label::new("item").unwrap();

    let mut files = HashMap::new();
    files.insert(item_label, "{base}".to_owned());
    files
}
#[test] #[test]
#[expect(clippy::expect_used)] #[expect(clippy::expect_used)]
fn init_db_toml_valid() { fn init_db_toml_valid() {
@@ -51,9 +60,17 @@ pub enum Source {
/// Must be relative. /// Must be relative.
path: PathBuf, path: PathBuf,
/// How to group files into items in this source /// Regex that extracts an item key from a file path.
#[serde(default)] /// - File paths are relative to `path`.
pattern: GroupPattern, /// - The first group in this regex is the file's item key.
#[serde(default = "default_base")]
base_pattern: String,
/// Map of files included in each item.
/// `{base}` is replaced with the string extracted by base_pattern.
/// Default is `{ item: "{base}" }`
#[serde(default = "default_files")]
files: HashMap<Label, String>,
}, },
} }

View File

@@ -11,8 +11,8 @@ workspace = true
pile-config = { workspace = true } pile-config = { workspace = true }
pile-toolbox = { workspace = true } pile-toolbox = { workspace = true }
pile-value = { workspace = true } pile-value = { workspace = true }
pile-io = { workspace = true }
regex = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
tantivy = { workspace = true } tantivy = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
@@ -20,7 +20,7 @@ chrono = { workspace = true }
toml = { workspace = true } toml = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
tokio-stream = { workspace = true } tokio-util = { version = "0.7", features = ["io"] }
serde = { workspace = true, optional = true } serde = { workspace = true, optional = true }
axum = { workspace = true, optional = true } axum = { workspace = true, optional = true }

View File

@@ -1,6 +1,6 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use pile_config::{ use pile_config::{
ConfigToml, DatasetConfig, Label, Source, objectpath::ObjectPath, pattern::GroupPattern, ConfigToml, DatasetConfig, Label, Source, default_base, default_files, objectpath::ObjectPath,
}; };
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{ use pile_value::{
@@ -8,6 +8,7 @@ use pile_value::{
source::{DataSource, DirDataSource, misc::path_ts_earliest}, source::{DataSource, DirDataSource, misc::path_ts_earliest},
value::{Item, PileValue}, value::{Item, PileValue},
}; };
use regex::Regex;
use serde_json::Value; use serde_json::Value;
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant}; use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs}; use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
@@ -107,7 +108,8 @@ impl Datasets {
Source::Filesystem { Source::Filesystem {
enabled: true, enabled: true,
path: path_parent.clone(), path: path_parent.clone(),
pattern: GroupPattern::default(), base_pattern: default_base(),
files: default_files(),
}, },
)] )]
.into_iter() .into_iter()
@@ -125,17 +127,36 @@ impl Datasets {
Source::Filesystem { Source::Filesystem {
enabled, enabled,
path, path,
pattern, base_pattern,
files,
} => { } => {
let target = match enabled { let target = match enabled {
true => &mut sources, true => &mut sources,
false => &mut disabled_sources, false => &mut disabled_sources,
}; };
let base_regex = Regex::new(base_pattern).map_err(|e| {
std::io::Error::new(
ErrorKind::InvalidInput,
format!("invalid base_pattern: {e}"),
)
})?;
if base_regex.captures_len() != 2 {
return Err(std::io::Error::new(
ErrorKind::InvalidInput,
"base_pattern must have exactly one capture group",
));
}
target.insert( target.insert(
label.clone(), label.clone(),
Dataset::Dir( Dataset::Dir(
DirDataSource::new(label, path_parent.join(path), pattern.clone()) DirDataSource::new(
label,
path_parent.join(path),
base_regex,
files.clone(),
)
.await?, .await?,
), ),
); );
@@ -194,17 +215,36 @@ impl Datasets {
Source::Filesystem { Source::Filesystem {
enabled, enabled,
path, path,
pattern, base_pattern,
files,
} => { } => {
let target = match enabled { let target = match enabled {
true => &mut sources, true => &mut sources,
false => &mut disabled_sources, false => &mut disabled_sources,
}; };
let base_regex = Regex::new(base_pattern).map_err(|e| {
std::io::Error::new(
ErrorKind::InvalidInput,
format!("invalid base_pattern: {e}"),
)
})?;
if base_regex.captures_len() != 2 {
return Err(std::io::Error::new(
ErrorKind::InvalidInput,
"base_pattern must have exactly one capture group",
));
}
target.insert( target.insert(
label.clone(), label.clone(),
Dataset::Dir( Dataset::Dir(
DirDataSource::new(label, path_parent.join(path), pattern.clone()) DirDataSource::new(
label,
path_parent.join(path),
base_regex,
files.clone(),
)
.await?, .await?,
), ),
); );

View File

@@ -245,7 +245,7 @@ async fn val_to_string(
PileValue::Null => {} PileValue::Null => {}
PileValue::ObjectExtractor(_) => {} PileValue::ObjectExtractor(_) => {}
PileValue::Item(_) => {} PileValue::Item(_) => {}
PileValue::Blob { .. } => {} PileValue::Binary(_) => {}
} }
return Ok(Vec::new()); return Ok(Vec::new());

View File

@@ -1,14 +1,19 @@
use axum::{ use axum::{
Json, Json,
body::Body,
extract::{Query, RawQuery, State}, extract::{Query, RawQuery, State},
http::{StatusCode, header}, http::{StatusCode, header},
response::{IntoResponse, Response}, response::{IntoResponse, Response},
}; };
use percent_encoding::percent_decode_str; use percent_encoding::percent_decode_str;
use pile_config::{Label, objectpath::ObjectPath}; use pile_config::{Label, objectpath::ObjectPath};
use pile_value::{extract::traits::ExtractState, value::PileValue}; use pile_value::{
extract::traits::ExtractState,
value::{BinaryPileValue, PileValue},
};
use serde::Deserialize; use serde::Deserialize;
use std::{sync::Arc, time::Instant}; use std::{sync::Arc, time::Instant};
use tokio_util::io::ReaderStream;
use tracing::debug; use tracing::debug;
use utoipa::ToSchema; use utoipa::ToSchema;
@@ -141,15 +146,30 @@ pub async fn get_extract(
s.to_string(), s.to_string(),
) )
.into_response(), .into_response(),
PileValue::Blob { mime, bytes } => (
PileValue::Binary(binary) => {
let mime = binary.mime().to_string();
let body = match binary {
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
Ok(file) => Body::from_stream(ReaderStream::new(file)),
Err(e) => {
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
.into_response();
}
},
};
(
StatusCode::OK, StatusCode::OK,
[ [
(header::CONTENT_TYPE, mime.to_string()), (header::CONTENT_TYPE, mime),
(header::CONTENT_DISPOSITION, disposition), (header::CONTENT_DISPOSITION, disposition),
], ],
bytes.as_ref().clone(), body,
) )
.into_response(), .into_response()
}
_ => match value.to_json(&extract_state).await { _ => match value.to_json(&extract_state).await {
Ok(json) => ( Ok(json) => (
StatusCode::OK, StatusCode::OK,

View File

@@ -1,202 +0,0 @@
use axum::{
body::Body,
extract::{Query, State},
http::{HeaderMap, StatusCode, header},
response::{IntoResponse, Response},
};
use pile_config::Label;
use pile_io::{AsyncReader, AsyncSeekReader};
use serde::Deserialize;
use std::{io::SeekFrom, sync::Arc, time::Instant};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::debug;
use utoipa::ToSchema;
use crate::Datasets;
/// Query parameters accepted by the `/item` endpoint.
#[derive(Deserialize, ToSchema)]
pub struct ItemQuery {
/// Label of the source to read from. Must parse as a `Label`.
source: String,
/// Key of the requested item within that source.
key: String,
/// If true, the response is sent with an `attachment` content disposition
/// instead of `inline`. Defaults to false when omitted.
#[serde(default)]
download: bool,
/// Filename to report in the content disposition.
/// If omitted, the last `/`-separated segment of `key` is used.
name: Option<String>,
}
/// Parse a `Range: bytes=...` header value.
///
/// Returns `(start, end)` where either may be `None`:
/// - `bytes=S-E` yields `(Some(S), Some(E))`
/// - `bytes=S-` yields `(Some(S), None)`
/// - `bytes=-N` (suffix form) yields `(None, Some(N))`
///
/// Returns `None` when the value is not a single, syntactically valid
/// byte range: wrong unit, multiple ranges, a missing `-`, or a position
/// that is not a plain run of decimal digits fitting in a `u64`.
///
/// No semantic validation is done here; `start > end` is returned as-is.
fn parse_byte_range(s: &str) -> Option<(Option<u64>, Option<u64>)> {
    /// Parse a non-empty run of ASCII digits.
    /// `str::parse` on its own would also accept a leading `+`.
    fn position(digits: &str) -> Option<u64> {
        if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        digits.parse().ok()
    }

    let spec = s.strip_prefix("bytes=")?;
    if spec.contains(',') {
        return None; // multiple ranges not supported
    }

    // Exactly one `-` is allowed. Any further `-` ends up in `last`
    // and is rejected by `position`.
    let (first, last) = spec.split_once('-')?;

    match (first.is_empty(), last.is_empty()) {
        // "bytes=-" selects nothing
        (true, true) => None,
        // Suffix form: the last N bytes
        (true, false) => Some((None, Some(position(last)?))),
        // Open-ended: from `first` to the end
        (false, true) => Some((Some(position(first)?), None)),
        // Closed range. A malformed end invalidates the whole range
        // instead of silently turning it into an open-ended one.
        (false, false) => Some((Some(position(first)?), Some(position(last)?))),
    }
}
/// Fetch the raw bytes of an item by source and key
#[utoipa::path(
get,
path = "/item",
params(
("source" = String, Query, description = "Source label"),
("key" = String, Query, description = "Item key"),
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
),
responses(
(status = 200, description = "Raw item bytes"),
(status = 206, description = "Partial content"),
(status = 400, description = "Invalid source label"),
(status = 404, description = "Item not found"),
(status = 416, description = "Range not satisfiable"),
(status = 500, description = "Internal server error"),
)
)]
pub async fn item_get(
State(state): State<Arc<Datasets>>,
Query(params): Query<ItemQuery>,
headers: HeaderMap,
) -> Response {
let start = Instant::now();
debug!(
message = "Serving /item",
source = params.source,
key = params.key
);
let label = match Label::try_from(params.source.clone()) {
Ok(l) => l,
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
};
let Some(item) = state.get(&label, &params.key).await else {
return StatusCode::NOT_FOUND.into_response();
};
let mime = item.mime().to_string();
let mut reader = match item.read().await {
Ok(r) => r,
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
};
let total = match reader.seek(SeekFrom::End(0)).await {
Ok(n) => n,
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
};
let range = headers
.get(header::RANGE)
.and_then(|v| v.to_str().ok())
.and_then(parse_byte_range);
// Resolve (byte_start, byte_end, content_length, is_range)
let (byte_start, byte_end, length, is_range) = match range {
Some((Some(s), e)) => {
let e = e
.unwrap_or(total.saturating_sub(1))
.min(total.saturating_sub(1));
if s >= total || s > e {
return (
StatusCode::RANGE_NOT_SATISFIABLE,
[(header::CONTENT_RANGE, format!("bytes */{total}"))],
)
.into_response();
}
(s, e, e - s + 1, true)
}
Some((None, Some(suffix))) => {
let s = total.saturating_sub(suffix);
let e = total.saturating_sub(1);
(s, e, total.saturating_sub(s), true)
}
_ => (0, total.saturating_sub(1), total, false),
};
if let Err(e) = reader.seek(SeekFrom::Start(byte_start)).await {
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
}
debug!(
message = "Served /item",
source = params.source,
key = params.key,
time_ms = start.elapsed().as_millis()
);
let (tx, rx) = mpsc::channel::<Result<Vec<u8>, std::io::Error>>(8);
tokio::spawn(async move {
let mut buf = vec![0u8; 65536];
let mut remaining = length;
loop {
if remaining == 0 {
break;
}
let to_read = (buf.len() as u64).min(remaining) as usize;
match reader.read(&mut buf[..to_read]).await {
Ok(0) => break,
Ok(n) => {
remaining -= n as u64;
if tx.send(Ok(buf[..n].to_vec())).await.is_err() {
break;
}
}
Err(e) => {
let _ = tx.send(Err(e)).await;
break;
}
}
}
});
let body = Body::from_stream(ReceiverStream::new(rx));
let status = if is_range {
StatusCode::PARTIAL_CONTENT
} else {
StatusCode::OK
};
let disposition_type = if params.download {
"attachment"
} else {
"inline"
};
let file_name = params.name.unwrap_or_else(|| {
params
.key
.rsplit('/')
.next()
.unwrap_or(&params.key)
.to_owned()
});
let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
let mut builder = axum::http::Response::builder()
.status(status)
.header(header::CONTENT_TYPE, mime)
.header(header::ACCEPT_RANGES, "bytes")
.header(header::CONTENT_LENGTH, length)
.header(header::CONTENT_DISPOSITION, disposition);
if is_range {
builder = builder.header(
header::CONTENT_RANGE,
format!("bytes {byte_start}-{byte_end}/{total}"),
);
}
builder
.body(body)
.map(IntoResponse::into_response)
.unwrap_or_else(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response())
}

View File

@@ -11,9 +11,6 @@ use crate::Datasets;
mod lookup; mod lookup;
pub use lookup::*; pub use lookup::*;
mod item;
pub use item::*;
mod extract; mod extract;
pub use extract::*; pub use extract::*;
@@ -34,7 +31,6 @@ pub use schema::*;
tags(), tags(),
paths( paths(
lookup, lookup,
item_get,
get_extract, get_extract,
items_list, items_list,
config_schema, config_schema,
@@ -45,7 +41,6 @@ pub use schema::*;
LookupRequest, LookupRequest,
LookupResponse, LookupResponse,
LookupResult, LookupResult,
ItemQuery,
ExtractQuery, ExtractQuery,
ItemsQuery, ItemsQuery,
ItemsResponse, ItemsResponse,
@@ -64,7 +59,6 @@ impl Datasets {
pub fn router_prefix(self: Arc<Self>, with_docs: bool, prefix: Option<&str>) -> Router<()> { pub fn router_prefix(self: Arc<Self>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
let mut router = Router::new() let mut router = Router::new()
.route("/lookup", post(lookup)) .route("/lookup", post(lookup))
.route("/item", get(item_get))
.route("/extract", get(get_extract)) .route("/extract", get(get_extract))
.route("/items", get(items_list)) .route("/items", get(items_list))
.route("/config/schema", get(config_schema)) .route("/config/schema", get(config_schema))

View File

@@ -44,9 +44,9 @@ async fn pile_value_to_api(
PileValue::I64(n) => Ok(ApiValue::Number(n.into())), PileValue::I64(n) => Ok(ApiValue::Number(n.into())),
PileValue::Null => Ok(ApiValue::Null), PileValue::Null => Ok(ApiValue::Null),
PileValue::Blob { mime, .. } => Ok(ApiValue::Binary { PileValue::Binary(x) => Ok(ApiValue::Binary {
binary: true, binary: true,
mime: mime.to_string(), mime: x.mime().to_string(),
}), }),
PileValue::Array(arr) => { PileValue::Array(arr) => {

View File

@@ -1,13 +1,18 @@
use axum::{ use axum::{
Json, Json,
body::Body,
extract::{Path, Query, State}, extract::{Path, Query, State},
http::{StatusCode, header}, http::{StatusCode, header},
response::{IntoResponse, Response}, response::{IntoResponse, Response},
}; };
use pile_config::Label; use pile_config::Label;
use pile_value::{extract::traits::ExtractState, value::PileValue}; use pile_value::{
extract::traits::ExtractState,
value::{BinaryPileValue, PileValue},
};
use serde::Deserialize; use serde::Deserialize;
use std::{sync::Arc, time::Instant}; use std::{sync::Arc, time::Instant};
use tokio_util::io::ReaderStream;
use tracing::debug; use tracing::debug;
use utoipa::IntoParams; use utoipa::IntoParams;
@@ -125,15 +130,30 @@ pub async fn schema_field(
s.to_string(), s.to_string(),
) )
.into_response(), .into_response(),
PileValue::Blob { mime, bytes } => (
PileValue::Binary(binary) => {
let mime = binary.mime().to_string();
let body = match binary {
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
Ok(file) => Body::from_stream(ReaderStream::new(file)),
Err(e) => {
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
.into_response();
}
},
};
(
StatusCode::OK, StatusCode::OK,
[ [
(header::CONTENT_TYPE, mime.to_string()), (header::CONTENT_TYPE, mime),
(header::CONTENT_DISPOSITION, disposition), (header::CONTENT_DISPOSITION, disposition),
], ],
bytes.as_ref().clone(), body,
) )
.into_response(), .into_response()
}
_ => match value.to_json(&extract_state).await { _ => match value.to_json(&extract_state).await {
Ok(json) => ( Ok(json) => (
StatusCode::OK, StatusCode::OK,

View File

@@ -6,16 +6,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::ExtractState, extract::traits::ExtractState,
value::{Item, PileValue}, value::{ArcBytes, BinaryPileValue, PileValue},
}; };
pub struct EpubCoverExtractor { pub struct EpubCoverExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<Option<(Mime, Vec<u8>)>>, output: OnceLock<Option<(Mime, Vec<u8>)>>,
} }
impl EpubCoverExtractor { impl EpubCoverExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -51,7 +51,7 @@ impl EpubCoverExtractor {
Err(error) => match error.downcast::<std::io::Error>() { Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x), Ok(x) => return Err(x),
Err(error) => { Err(error) => {
trace!(message = "Could not extract epub cover", ?error, key = ?self.item.key()); trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
None None
} }
}, },
@@ -65,12 +65,11 @@ impl EpubCoverExtractor {
return Ok(None); return Ok(None);
} }
Ok(self Ok(self.get_inner().await?.map(|(mime, bytes)| {
.get_inner() PileValue::Binary(BinaryPileValue::Blob {
.await?
.map(|(mime, bytes)| PileValue::Blob {
mime: mime.clone(), mime: mime.clone(),
bytes: Arc::new(bytes.clone()), bytes: ArcBytes(Arc::new(bytes.clone())),
})
})) }))
} }
} }

View File

@@ -9,16 +9,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct EpubMetaExtractor { pub struct EpubMetaExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl EpubMetaExtractor { impl EpubMetaExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -57,7 +57,7 @@ impl EpubMetaExtractor {
Err(error) => match error.downcast::<std::io::Error>() { Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x), Ok(x) => return Err(x),
Err(error) => { Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key()); trace!(message = "Could not process epub", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}, },

View File

@@ -9,16 +9,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct EpubTextExtractor { pub struct EpubTextExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl EpubTextExtractor { impl EpubTextExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -54,7 +54,7 @@ impl EpubTextExtractor {
Err(error) => match error.downcast::<std::io::Error>() { Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x), Ok(x) => return Err(x),
Err(error) => { Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key()); trace!(message = "Could not process epub", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}, },

View File

@@ -12,7 +12,7 @@ pub use epub_text::*;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct EpubExtractor { pub struct EpubExtractor {
@@ -22,7 +22,7 @@ pub struct EpubExtractor {
} }
impl EpubExtractor { impl EpubExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
text: Arc::new(EpubTextExtractor::new(item)), text: Arc::new(EpubTextExtractor::new(item)),
meta: Arc::new(EpubMetaExtractor::new(item)), meta: Arc::new(EpubMetaExtractor::new(item)),
@@ -87,9 +87,13 @@ impl ObjectExtractor for EpubExtractor {
if k.as_str() == "cover" { if k.as_str() == "cover" {
let summary = match &v { let summary = match &v {
PileValue::Blob { mime, bytes } => { PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
format!("<Blob ({}, {} bytes)>", mime, bytes.len()) format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
} }
PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
format!("<File ({mime})>")
}
PileValue::Null => "<null>".to_owned(), PileValue::Null => "<null>".to_owned(),
_ => "<cover>".to_owned(), _ => "<cover>".to_owned(),
}; };

View File

@@ -9,16 +9,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct ExifExtractor { pub struct ExifExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl ExifExtractor { impl ExifExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -53,7 +53,7 @@ impl ExifExtractor {
Ok(x) => x, Ok(x) => x,
Err(exif::Error::Io(x)) => return Err(x), Err(exif::Error::Io(x)) => return Err(x),
Err(error) => { Err(error) => {
trace!(message = "Could not process exif", ?error, key = ?self.item.key()); trace!(message = "Could not process exif", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}; };
@@ -94,7 +94,7 @@ impl ObjectExtractor for ExifExtractor {
) -> Result<Option<PileValue>, std::io::Error> { ) -> Result<Option<PileValue>, std::io::Error> {
trace!( trace!(
?args, ?args,
key = self.item.key().as_str(), item = ?self.item,
"Getting field {name:?} from ExifExtractor", "Getting field {name:?} from ExifExtractor",
); );

View File

@@ -11,16 +11,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ListExtractor, ObjectExtractor}, extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::{Item, PileValue}, value::{ArcBytes, BinaryPileValue, PileValue},
}; };
pub struct FlacImagesExtractor { pub struct FlacImagesExtractor {
item: Item, item: BinaryPileValue,
cached_count: OnceLock<usize>, cached_count: OnceLock<usize>,
} }
impl FlacImagesExtractor { impl FlacImagesExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
cached_count: OnceLock::new(), cached_count: OnceLock::new(),
@@ -65,7 +65,7 @@ impl ListExtractor for FlacImagesExtractor {
mut idx: usize, mut idx: usize,
) -> Result<Option<PileValue>, std::io::Error> { ) -> Result<Option<PileValue>, std::io::Error> {
trace!( trace!(
key = self.item.key().as_str(), item = ?self.item,
"Getting index {idx} from FlacImagesExtractor", "Getting index {idx} from FlacImagesExtractor",
); );
@@ -73,7 +73,7 @@ impl ListExtractor for FlacImagesExtractor {
return Ok(None); return Ok(None);
} }
let key = self.item.key(); let item = self.item.clone();
let reader = SyncReadBridge::new_current(self.item.read().await?); let reader = SyncReadBridge::new_current(self.item.read().await?);
let image = tokio::task::spawn_blocking(move || { let image = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader)); let reader = FlacReader::new(BufReader::new(reader));
@@ -93,11 +93,7 @@ impl ListExtractor for FlacImagesExtractor {
Err(FlacDecodeError::IoError(err)) => return Err(err), Err(FlacDecodeError::IoError(err)) => return Err(err),
Err(error) => { Err(error) => {
trace!( trace!(message = "Could not parse FLAC images", ?item, ?error);
message = "Could not parse FLAC images",
key = key.as_str(),
?error
);
return Ok(None); return Ok(None);
} }
@@ -109,9 +105,11 @@ impl ListExtractor for FlacImagesExtractor {
.await .await
.map_err(std::io::Error::other)??; .map_err(std::io::Error::other)??;
Ok(image.map(|(mime, data)| PileValue::Blob { Ok(image.map(|(mime, data)| {
PileValue::Binary(BinaryPileValue::Blob {
mime, mime,
bytes: Arc::new(data), bytes: ArcBytes(Arc::new(data)),
})
})) }))
} }
@@ -130,13 +128,13 @@ impl ListExtractor for FlacImagesExtractor {
} }
pub struct FlacExtractor { pub struct FlacExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
images: PileValue, images: PileValue,
} }
impl FlacExtractor { impl FlacExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -149,12 +147,9 @@ impl FlacExtractor {
return Ok(x); return Ok(x);
} }
trace!( trace!(message = "Reading FLAC tags", item = ?self.item);
message = "Reading FLAC tags",
key = self.item.key().as_str()
);
let key = self.item.key(); let item = self.item.clone();
let reader = SyncReadBridge::new_current(self.item.read().await?); let reader = SyncReadBridge::new_current(self.item.read().await?);
let output = tokio::task::spawn_blocking(move || { let output = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader)); let reader = FlacReader::new(BufReader::new(reader));
@@ -176,11 +171,7 @@ impl FlacExtractor {
Err(FlacDecodeError::IoError(err)) => return Err(err), Err(FlacDecodeError::IoError(err)) => return Err(err),
Err(error) => { Err(error) => {
trace!( trace!(message = "Could not parse FLAC metadata", ?item, ?error);
message = "Could not parse FLAC metadata",
key = key.as_str(),
?error
);
return Ok(HashMap::new()); return Ok(HashMap::new());
} }

View File

@@ -1,21 +1,21 @@
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
use pile_config::Label; use pile_config::Label;
use std::{ use std::{
collections::HashMap, collections::HashMap,
path::{Component, PathBuf}, path::Component,
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
pub struct FsExtractor { pub struct FsExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl FsExtractor { impl FsExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -27,7 +27,10 @@ impl FsExtractor {
return Ok(x); return Ok(x);
} }
let path = PathBuf::from(self.item.key().as_str()); let path = match &self.item {
BinaryPileValue::File { path, .. } => path,
_ => return Ok(self.output.get_or_init(HashMap::new)),
};
let mut root = false; let mut root = false;
let components = path let components = path

View File

@@ -11,16 +11,16 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct Id3Extractor { pub struct Id3Extractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl Id3Extractor { impl Id3Extractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -32,9 +32,9 @@ impl Id3Extractor {
return Ok(x); return Ok(x);
} }
trace!(message = "Reading id3 tags", key = self.item.key().as_str()); trace!(message = "Reading id3 tags", key = ?self.item);
let key = self.item.key(); let item = self.item.clone();
let reader = SyncReadBridge::new_current(self.item.read().await?); let reader = SyncReadBridge::new_current(self.item.read().await?);
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader))) let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
.await .await
@@ -48,11 +48,7 @@ impl Id3Extractor {
})) => return Err(e), })) => return Err(e),
Ok(Err(error)) => { Ok(Err(error)) => {
trace!( trace!(message = "Could not parse id3 tags", ?item, ?error);
message = "Could not parse id3 tags",
key = key.as_str(),
?error
);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}; };

View File

@@ -1,63 +1,25 @@
mod transform;
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
use image::ImageFormat; use image::ImageFormat;
use mime::Mime; use mime::Mime;
use pile_config::Label; use pile_config::Label;
use pile_io::AsyncReader; use pile_io::AsyncReader;
use std::{ use std::{io::Cursor, str::FromStr, sync::Arc};
io::Cursor,
str::FromStr,
sync::{Arc, OnceLock},
};
use tracing::trace; use tracing::trace;
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
mod transform;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{ArcBytes, BinaryPileValue, PileValue},
}; };
enum ImageSource {
Item(Item, OnceLock<Arc<Vec<u8>>>),
Blob(Arc<Vec<u8>>, Mime),
}
pub struct ImageExtractor { pub struct ImageExtractor {
source: ImageSource, item: BinaryPileValue,
} }
impl ImageExtractor { impl ImageExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self { item: item.clone() }
source: ImageSource::Item(item.clone(), OnceLock::new()),
}
}
pub fn from_blob(bytes: Arc<Vec<u8>>, mime: Mime) -> Self {
Self {
source: ImageSource::Blob(bytes, mime),
}
}
fn mime(&self) -> &Mime {
match &self.source {
ImageSource::Item(item, _) => item.mime(),
ImageSource::Blob(_, mime) => mime,
}
}
async fn read_bytes(&self) -> Result<Arc<Vec<u8>>, std::io::Error> {
match &self.source {
ImageSource::Blob(bytes, _) => Ok(bytes.clone()),
ImageSource::Item(item, cache) => {
if let Some(x) = cache.get() {
return Ok(x.clone());
}
let mut reader = item.read().await?;
let bytes = reader.read_to_end().await?;
Ok(cache.get_or_init(|| Arc::new(bytes)).clone())
}
}
} }
async fn apply<T: ImageTransformer + Send + 'static>( async fn apply<T: ImageTransformer + Send + 'static>(
@@ -69,11 +31,14 @@ impl ImageExtractor {
Err(_) => return Ok(None), Err(_) => return Ok(None),
}; };
let mime = self.mime().clone(); let mime = self.item.mime().clone();
let bytes = self.read_bytes().await?; let bytes = self.item.read().await?.read_to_end().await?;
let Some(format) = ImageFormat::from_mime_type(&mime) else { let Some(format) = ImageFormat::from_mime_type(&mime) else {
return Ok(Some(PileValue::Blob { mime, bytes })); return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime,
bytes: ArcBytes(Arc::new(bytes)),
})));
}; };
let bytes_for_closure = bytes.clone(); let bytes_for_closure = bytes.clone();
@@ -91,11 +56,15 @@ impl ImageExtractor {
.await?; .await?;
match result { match result {
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Blob { Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime: out_mime, mime: out_mime,
bytes: Arc::new(out_bytes), bytes: ArcBytes(Arc::new(out_bytes)),
})), }))),
Err(_) => Ok(Some(PileValue::Blob { mime, bytes })),
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime,
bytes: ArcBytes(Arc::new(bytes)),
}))),
} }
} }
} }

View File

@@ -7,7 +7,7 @@ use std::{
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
fn json_to_pile(value: serde_json::Value) -> PileValue { fn json_to_pile(value: serde_json::Value) -> PileValue {
@@ -24,12 +24,12 @@ fn json_to_pile(value: serde_json::Value) -> PileValue {
} }
pub struct JsonExtractor { pub struct JsonExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl JsonExtractor { impl JsonExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),

View File

@@ -25,9 +25,6 @@ mod toml;
use pile_config::Label; use pile_config::Label;
pub use toml::*; pub use toml::*;
mod group;
pub use group::*;
mod text; mod text;
pub use text::*; pub use text::*;
@@ -39,17 +36,17 @@ use crate::{
misc::MapExtractor, misc::MapExtractor,
traits::{ExtractState, ObjectExtractor}, traits::{ExtractState, ObjectExtractor},
}, },
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct ItemExtractor { pub struct BinaryExtractor {
inner: MapExtractor, inner: MapExtractor,
image: Arc<ImageExtractor>, image: Arc<ImageExtractor>,
} }
impl ItemExtractor { impl BinaryExtractor {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
let inner = MapExtractor { let inner = MapExtractor {
inner: HashMap::from([ inner: HashMap::from([
( (
@@ -88,10 +85,6 @@ impl ItemExtractor {
Label::new("text").unwrap(), Label::new("text").unwrap(),
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
), ),
(
Label::new("groups").unwrap(),
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
),
]), ]),
}; };
@@ -103,7 +96,7 @@ impl ItemExtractor {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor { impl ObjectExtractor for BinaryExtractor {
async fn field( async fn field(
&self, &self,
state: &ExtractState, state: &ExtractState,

View File

@@ -15,7 +15,7 @@ pub use pdf_text::*;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct PdfExtractor { pub struct PdfExtractor {
@@ -26,7 +26,7 @@ pub struct PdfExtractor {
} }
impl PdfExtractor { impl PdfExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
text: Arc::new(PdfTextExtractor::new(item)), text: Arc::new(PdfTextExtractor::new(item)),
meta: Arc::new(PdfMetaExtractor::new(item)), meta: Arc::new(PdfMetaExtractor::new(item)),
@@ -46,7 +46,7 @@ impl ObjectExtractor for PdfExtractor {
) -> Result<Option<PileValue>, std::io::Error> { ) -> Result<Option<PileValue>, std::io::Error> {
trace!( trace!(
?args, ?args,
key = self.text.item.key().as_str(), item = ?self.text.item,
"Getting field {name:?} from PdfExtractor", "Getting field {name:?} from PdfExtractor",
); );

View File

@@ -9,18 +9,19 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::value::BinaryPileValue;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::PileValue,
}; };
pub struct PdfMetaExtractor { pub struct PdfMetaExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl PdfMetaExtractor { impl PdfMetaExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -82,7 +83,7 @@ impl PdfMetaExtractor {
let (page_count, raw_meta) = match raw_meta { let (page_count, raw_meta) = match raw_meta {
Ok(x) => x, Ok(x) => x,
Err(error) => { Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key()); trace!(message = "Could not process pdf", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}; };

View File

@@ -9,15 +9,15 @@ use tracing::trace;
use crate::{ use crate::{
extract::traits::{ExtractState, ListExtractor}, extract::traits::{ExtractState, ListExtractor},
value::{Item, PileValue}, value::{ArcBytes, BinaryPileValue, PileValue},
}; };
pub struct PdfPagesExtractor { pub struct PdfPagesExtractor {
item: Item, item: BinaryPileValue,
} }
impl PdfPagesExtractor { impl PdfPagesExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { item: item.clone() } Self { item: item.clone() }
} }
@@ -41,7 +41,7 @@ impl ListExtractor for PdfPagesExtractor {
idx: usize, idx: usize,
) -> Result<Option<PileValue>, std::io::Error> { ) -> Result<Option<PileValue>, std::io::Error> {
trace!( trace!(
key = self.item.key().as_str(), item = ?self.item,
"Getting index {idx} from PdfPagesExtractor", "Getting index {idx} from PdfPagesExtractor",
); );
@@ -78,12 +78,12 @@ impl ListExtractor for PdfPagesExtractor {
let value = match png { let value = match png {
Ok(None) => return Ok(None), Ok(None) => return Ok(None),
Ok(Some(bytes)) => PileValue::Blob { Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
mime: mime::IMAGE_PNG, mime: mime::IMAGE_PNG,
bytes: Arc::new(bytes), bytes: ArcBytes(Arc::new(bytes)),
}, }),
Err(error) => { Err(error) => {
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key()); trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
PileValue::Null PileValue::Null
} }
}; };
@@ -108,7 +108,7 @@ impl ListExtractor for PdfPagesExtractor {
match count { match count {
Ok(n) => Ok(n), Ok(n) => Ok(n),
Err(error) => { Err(error) => {
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key()); trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
Ok(0) Ok(0)
} }
} }

View File

@@ -9,18 +9,19 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::value::BinaryPileValue;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::PileValue,
}; };
pub struct PdfTextExtractor { pub struct PdfTextExtractor {
pub(super) item: Item, pub(super) item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl PdfTextExtractor { impl PdfTextExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),
@@ -86,7 +87,7 @@ impl PdfTextExtractor {
let raw_text = match raw_text { let raw_text = match raw_text {
Ok(x) => x, Ok(x) => x,
Err(error) => { Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key()); trace!(message = "Could not process pdf", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
}; };

View File

@@ -4,16 +4,16 @@ use std::sync::{Arc, OnceLock};
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
pub struct TextExtractor { pub struct TextExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<PileValue>, output: OnceLock<PileValue>,
} }
impl TextExtractor { impl TextExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),

View File

@@ -7,7 +7,7 @@ use std::{
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue}, value::{BinaryPileValue, PileValue},
}; };
fn toml_to_pile(value: toml::Value) -> PileValue { fn toml_to_pile(value: toml::Value) -> PileValue {
@@ -25,12 +25,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
} }
pub struct TomlExtractor { pub struct TomlExtractor {
item: Item, item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>, output: OnceLock<HashMap<Label, PileValue>>,
} }
impl TomlExtractor { impl TomlExtractor {
pub fn new(item: &Item) -> Self { pub fn new(item: &BinaryPileValue) -> Self {
Self { Self {
item: item.clone(), item: item.clone(),
output: OnceLock::new(), output: OnceLock::new(),

View File

@@ -0,0 +1,58 @@
use std::{collections::HashMap, sync::Arc};
use pile_config::Label;
use crate::{
extract::{
misc::MapExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::{Item, PileValue},
};
/// Object extractor for an [`Item`].
///
/// Exposes two fields:
/// - `files`: an object holding every labeled value attached to the item
/// - `key`: the item's key, as a string
pub struct ItemExtractor {
    inner: MapExtractor,
}

impl ItemExtractor {
    /// Build an extractor for `item`.
    ///
    /// The item's file map and key are cloned into the extractor,
    /// so the result does not borrow from `item`.
    pub fn new(item: &Item) -> Self {
        let Item::File {
            files: item_files, ..
        } = item;

        // Copy every (label, value) pair into a fresh map
        // and expose it as a nested object.
        let files_value = PileValue::ObjectExtractor(Arc::new(MapExtractor {
            inner: item_files
                .iter()
                .map(|(label, value)| (label.clone(), value.clone()))
                .collect(),
        }));

        #[expect(clippy::unwrap_used)]
        let inner = {
            let files_label = Label::new("files").unwrap();
            let key_label = Label::new("key").unwrap();
            let key_value = PileValue::String(Arc::new(item.key()));

            MapExtractor {
                inner: HashMap::from([(files_label, files_value), (key_label, key_value)]),
            }
        };

        Self { inner }
    }
}
/// All lookups are forwarded to the inner map built in `ItemExtractor::new`.
#[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor {
    /// Look up `name` in the inner map, passing `state` and `args` through unchanged.
    async fn field(
        &self,
        state: &ExtractState,
        name: &pile_config::Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        self.inner.field(state, name, args).await
    }

    /// List the labels available on the inner map.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        self.inner.fields().await
    }
}

View File

@@ -1,56 +0,0 @@
use std::sync::Arc;
use pile_config::Label;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
/// Object extractor over the group members of an [`Item`].
///
/// Each field is the label of one group member.
pub struct GroupExtractor {
    item: Item,
}

impl GroupExtractor {
    /// Build an extractor holding its own clone of `item`.
    pub fn new(item: &Item) -> Self {
        let item = item.clone();
        Self { item }
    }
}

#[async_trait::async_trait]
impl ObjectExtractor for GroupExtractor {
    /// Return the group member labeled `name`, wrapped in an `ItemExtractor`.
    ///
    /// Yields `None` when any `args` are given, or when no member has this label.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        let member = match args {
            // Group lookups take no arguments
            Some(_) => None,
            None => self.item.group().get(name),
        };

        Ok(member
            .map(|item| PileValue::ObjectExtractor(Arc::new(super::ItemExtractor::new(item)))))
    }

    /// List the labels of all group members.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let labels = self.item.group().keys().cloned().collect();
        Ok(labels)
    }

    /// Render the group as a json object.
    ///
    /// Members are not expanded; each one becomes a placeholder string
    /// containing its key.
    async fn to_json(&self, _state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let mut object = serde_json::Map::new();
        for (label, member) in self.item.group() {
            object.insert(
                label.to_string(),
                serde_json::Value::String(format!("<GroupItem ({})>", member.key())),
            );
        }

        Ok(serde_json::Value::Object(object))
    }
}

View File

@@ -1,3 +1,4 @@
pub mod blob;
pub mod item; pub mod item;
pub mod misc; pub mod misc;
pub mod regex; pub mod regex;

View File

@@ -1,27 +1,25 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use pile_config::{ use pile_config::Label;
Label, use regex::Regex;
pattern::{GroupPattern, GroupSegment},
};
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::{ use std::{
collections::{BTreeMap, HashMap, HashSet}, collections::{BTreeMap, HashMap},
path::PathBuf, path::PathBuf,
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::{ use crate::{
extract::traits::ExtractState,
source::{DataSource, misc::path_ts_latest}, source::{DataSource, misc::path_ts_latest},
value::{Item, PileValue}, value::{BinaryPileValue, Item, PileValue},
}; };
#[derive(Debug)] #[derive(Debug)]
pub struct DirDataSource { pub struct DirDataSource {
pub name: Label, pub name: Label,
pub dir: PathBuf, pub dir: PathBuf,
pub pattern: GroupPattern, pub base_pattern: Regex,
pub files: HashMap<Label, String>,
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>, pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
} }
@@ -29,21 +27,18 @@ impl DirDataSource {
pub async fn new( pub async fn new(
name: &Label, name: &Label,
dir: PathBuf, dir: PathBuf,
pattern: GroupPattern, base_pattern: Regex,
files: HashMap<Label, String>,
) -> Result<Arc<Self>, std::io::Error> { ) -> Result<Arc<Self>, std::io::Error> {
let source = Arc::new(Self { let source = Arc::new(Self {
name: name.clone(), name: name.clone(),
dir, dir,
pattern, base_pattern,
files,
index: OnceLock::new(), index: OnceLock::new(),
}); });
// let mut index = BTreeMap::new();
// MARK: list paths
//
let mut paths_items = HashSet::new();
let mut paths_grouped_items = HashSet::new();
'entry: for entry in WalkDir::new(&source.dir) { 'entry: for entry in WalkDir::new(&source.dir) {
let entry = match entry { let entry = match entry {
Err(e) => { Err(e) => {
@@ -59,51 +54,52 @@ impl DirDataSource {
} }
let path = entry.into_path(); let path = entry.into_path();
let path_str = match path.to_str() { let rel_path = match path.strip_prefix(&source.dir) {
Ok(p) => p,
Err(_) => continue 'entry,
};
let path_str = match rel_path.to_str() {
Some(x) => x, Some(x) => x,
None => continue 'entry, None => continue 'entry,
}; };
let groups = resolve_groups(&source.pattern, path_str).await; let captures = match source.base_pattern.captures(path_str) {
paths_grouped_items.extend(groups.into_values()); Some(c) => c,
paths_items.insert(path); None => continue 'entry,
};
let base = match captures.get(1) {
Some(m) => m.as_str(),
None => continue 'entry,
};
let key: SmartString<LazyCompact> = base.into();
if index.contains_key(&key) {
continue 'entry;
} }
// let mut item_files = HashMap::new();
// MARK: resolve groups for (label, template) in &source.files {
// let file_path = source.dir.join(template.replace("{base}", base));
if file_path.exists() {
let mut index = BTreeMap::new(); let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
'entry: for path in paths_items.difference(&paths_grouped_items) { item_files.insert(
let path_str = match path.to_str() { label.clone(),
Some(x) => x, PileValue::Binary(BinaryPileValue::File {
None => continue 'entry, mime,
}; path: file_path,
let group = resolve_groups(&source.pattern, path_str).await;
let group = group
.into_iter()
.map(|(k, group_path)| {
(
k,
Box::new(Item::File {
source: Arc::clone(&source),
mime: mime_guess::from_path(&group_path).first_or_octet_stream(),
path: group_path.clone(),
group: Arc::new(HashMap::new()),
}), }),
) );
}) }
.collect::<HashMap<_, _>>(); }
let item = Item::File { index.insert(
key.clone(),
Item::File {
key,
source: Arc::clone(&source), source: Arc::clone(&source),
mime: mime_guess::from_path(path).first_or_octet_stream(), files: item_files,
path: path.into(), },
group: Arc::new(group), );
};
index.insert(item.key(), item);
} }
source.index.get_or_init(|| index); source.index.get_or_init(|| index);
@@ -139,43 +135,3 @@ impl DataSource for Arc<DirDataSource> {
path_ts_latest(&self.dir) path_ts_latest(&self.dir)
} }
} }
/// Resolve every entry of `pattern` against `path_str`.
///
/// For each labeled pattern, its segments are concatenated into a target path:
/// literal segments are appended as-is, path segments are queried against
/// `path_str` (wrapped in a string value) and must produce a string.
///
/// A label is left out of the result when one of its queries fails, returns
/// nothing, or returns a non-string, and when the resulting path does not exist.
async fn resolve_groups(pattern: &GroupPattern, path_str: &str) -> HashMap<Label, PathBuf> {
    let state = ExtractState { ignore_mime: false };
    let item = PileValue::String(Arc::new(path_str.into()));
    let mut resolved = HashMap::new();

    'pattern: for (label, segments) in &pattern.pattern {
        let mut target = String::new();

        for segment in segments {
            match segment {
                GroupSegment::Literal(text) => target.push_str(text),
                GroupSegment::Path(op) => {
                    let Ok(Some(value)) = item.query(&state, op).await else {
                        continue 'pattern;
                    };

                    let Some(text) = value.as_str() else {
                        continue 'pattern;
                    };

                    target.push_str(text);
                }
            }
        }

        // Only keep targets that exist on disk
        let group_path = PathBuf::from(target);
        if group_path.exists() {
            resolved.insert(label.clone(), group_path);
        }
    }

    resolved
}

View File

@@ -1,74 +1,45 @@
use mime::Mime;
use pile_config::Label; use pile_config::Label;
use pile_io::SyncReadBridge;
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc}; use std::{collections::HashMap, sync::Arc};
use crate::{source::DirDataSource, value::ItemReader}; use crate::{source::DirDataSource, value::PileValue};
// //
// MARK: item // MARK: item
// //
/// A cheaply-cloneable pointer to an item in a dataset /// A cheaply-cloneable pointer to an item in a dataset
#[derive(Debug, Clone)] #[derive(Clone)]
pub enum Item { pub enum Item {
File { File {
key: SmartString<LazyCompact>,
source: Arc<DirDataSource>, source: Arc<DirDataSource>,
mime: Mime, files: HashMap<Label, PileValue>,
path: PathBuf,
group: Arc<HashMap<Label, Box<Item>>>,
}, },
} }
impl Item { impl std::fmt::Debug for Item {
/// Open the item for reading. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
pub async fn read(&self) -> Result<ItemReader, std::io::Error> { match self {
Ok(match self { Self::File { key, files, .. } => f
Self::File { path, .. } => ItemReader::File(File::open(path)?), .debug_struct("Item::File")
}) .field("key", key)
.field("files", &files.keys().collect::<Vec<_>>())
.finish(),
}
}
} }
impl Item {
pub fn source_name(&self) -> &pile_config::Label { pub fn source_name(&self) -> &pile_config::Label {
match self { match self {
Self::File { source, .. } => &source.name, Self::File { source, .. } => &source.name,
} }
} }
#[expect(clippy::expect_used)]
pub fn key(&self) -> SmartString<LazyCompact> { pub fn key(&self) -> SmartString<LazyCompact> {
match self { match self {
Self::File { source, path, .. } => path Self::File { key, .. } => key.clone(),
.strip_prefix(&source.dir)
.expect("item must be inside source")
.to_str()
.expect("path is not utf-8")
.into(),
}
}
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
let read = self.read().await?;
let mut read = SyncReadBridge::new_current(read);
let out = tokio::task::spawn_blocking(move || {
let mut hasher = blake3::Hasher::new();
std::io::copy(&mut read, &mut hasher)?;
return Ok::<_, std::io::Error>(hasher.finalize());
})
.await??;
return Ok(out);
}
pub fn mime(&self) -> &Mime {
match self {
Self::File { mime, .. } => mime,
}
}
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
match self {
Self::File { group, .. } => group,
} }
} }
} }

View File

@@ -1,5 +1,10 @@
use pile_io::{AsyncReader, AsyncSeekReader}; use pile_io::{AsyncReader, AsyncSeekReader};
use std::{fs::File, io::Seek}; use std::{
fs::File,
io::{Cursor, Seek},
};
use crate::value::ArcBytes;
// //
// MARK: itemreader // MARK: itemreader
@@ -7,12 +12,14 @@ use std::{fs::File, io::Seek};
pub enum ItemReader { pub enum ItemReader {
File(File), File(File),
Vec(Cursor<ArcBytes>),
} }
impl AsyncReader for ItemReader { impl AsyncReader for ItemReader {
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> { async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
match self { match self {
Self::File(x) => std::io::Read::read(x, buf), Self::File(x) => std::io::Read::read(x, buf),
Self::Vec(x) => std::io::Read::read(x, buf),
} }
} }
} }
@@ -21,6 +28,7 @@ impl AsyncSeekReader for ItemReader {
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> { async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
match self { match self {
Self::File(x) => x.seek(pos), Self::File(x) => x.seek(pos),
Self::Vec(x) => x.seek(pos),
} }
} }
} }

View File

@@ -1,19 +1,75 @@
use mime::Mime; use mime::Mime;
use pile_config::objectpath::{ObjectPath, PathSegment}; use pile_config::objectpath::{ObjectPath, PathSegment};
use pile_io::SyncReadBridge;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::sync::Arc; use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
use crate::{ use crate::{
extract::{ extract::{
item::{ImageExtractor, ItemExtractor}, blob::BinaryExtractor,
item::ItemExtractor,
misc::{ArrayExtractor, MapExtractor, VecExtractor}, misc::{ArrayExtractor, MapExtractor, VecExtractor},
string::StringExtractor, string::StringExtractor,
traits::{ExtractState, ListExtractor, ObjectExtractor}, traits::{ExtractState, ListExtractor, ObjectExtractor},
}, },
value::Item, value::{Item, ItemReader},
}; };
/// A shared byte buffer.
///
/// Cloning copies the [`Arc`] handle, not the bytes.
#[derive(Clone)]
pub struct ArcBytes(pub Arc<Vec<u8>>);

/// Prints only the buffer's length, never its contents.
impl Debug for ArcBytes {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let len = self.0.len();

        let mut out = f.debug_struct("ArcBytes");
        out.field("len()", &len);
        out.finish()
    }
}

/// Borrow the buffer as a byte slice.
impl AsRef<[u8]> for ArcBytes {
    fn as_ref(&self) -> &[u8] {
        self.0.as_slice()
    }
}
/// Binary data with a mime type, either held in memory or stored in a file.
#[derive(Debug, Clone)]
pub enum BinaryPileValue {
    /// A binary blob held in memory
    Blob { mime: Mime, bytes: ArcBytes },

    /// A pointer to a file
    File { mime: Mime, path: PathBuf },
}

impl BinaryPileValue {
    /// Open this value for reading.
    ///
    /// - `File` opens `path`; any error from opening it is returned.
    /// - `Blob` wraps a clone of its bytes in a cursor.
    pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
        Ok(match self {
            Self::Blob { bytes, .. } => ItemReader::Vec(Cursor::new(bytes.clone())),
            Self::File { path, .. } => ItemReader::File(File::open(path)?),
        })
    }

    /// Compute the blake3 hash of this value's contents.
    ///
    /// The data is read through a sync bridge inside a blocking task.
    /// Returns an error if the value cannot be opened, if reading fails,
    /// or if the blocking task does not complete.
    pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
        let reader = self.read().await?;
        let mut bridge = SyncReadBridge::new_current(reader);

        tokio::task::spawn_blocking(move || {
            let mut hasher = blake3::Hasher::new();
            std::io::copy(&mut bridge, &mut hasher)?;
            Ok::<_, std::io::Error>(hasher.finalize())
        })
        .await?
    }

    /// The mime type stored with this value.
    pub fn mime(&self) -> &Mime {
        match self {
            Self::Blob { mime, .. } | Self::File { mime, .. } => mime,
        }
    }
}
/// An immutable, cheaply-cloneable, lazily-computed value. /// An immutable, cheaply-cloneable, lazily-computed value.
/// Very similar to [serde_json::Value]. /// Very similar to [serde_json::Value].
pub enum PileValue { pub enum PileValue {
@@ -27,12 +83,6 @@ pub enum PileValue {
/// An array of values /// An array of values
Array(Arc<Vec<PileValue>>), Array(Arc<Vec<PileValue>>),
/// A binary blob
Blob {
mime: Mime,
bytes: Arc<Vec<u8>>,
},
/// A lazily-computed map of {label: value} /// A lazily-computed map of {label: value}
ObjectExtractor(Arc<dyn ObjectExtractor>), ObjectExtractor(Arc<dyn ObjectExtractor>),
@@ -41,6 +91,9 @@ pub enum PileValue {
/// An pointer to an item in this dataset /// An pointer to an item in this dataset
Item(Item), Item(Item),
/// Binary data
Binary(BinaryPileValue),
} }
impl Clone for PileValue { impl Clone for PileValue {
@@ -53,11 +106,8 @@ impl Clone for PileValue {
Self::Array(x) => Self::Array(x.clone()), Self::Array(x) => Self::Array(x.clone()),
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()), Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
Self::ListExtractor(x) => Self::ListExtractor(x.clone()), Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
Self::Blob { mime, bytes } => Self::Blob {
mime: mime.clone(),
bytes: bytes.clone(),
},
Self::Item(i) => Self::Item(i.clone()), Self::Item(i) => Self::Item(i.clone()),
Self::Binary(b) => Self::Binary(b.clone()),
} }
} }
} }
@@ -70,13 +120,10 @@ impl PileValue {
Self::I64(_) => Arc::new(MapExtractor::default()), Self::I64(_) => Arc::new(MapExtractor::default()),
Self::Array(_) => Arc::new(MapExtractor::default()), Self::Array(_) => Arc::new(MapExtractor::default()),
Self::String(s) => Arc::new(StringExtractor::new(s)), Self::String(s) => Arc::new(StringExtractor::new(s)),
Self::Blob { mime, bytes } => {
// TODO: make a blobextractor (with pdf, epub, etc; like item)
Arc::new(ImageExtractor::from_blob(bytes.clone(), mime.clone()))
}
Self::ListExtractor(_) => Arc::new(MapExtractor::default()), Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
Self::ObjectExtractor(e) => e.clone(), Self::ObjectExtractor(e) => e.clone(),
Self::Item(i) => Arc::new(ItemExtractor::new(i)), Self::Item(i) => Arc::new(ItemExtractor::new(i)),
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
} }
} }
@@ -87,12 +134,12 @@ impl PileValue {
Self::I64(_) => Arc::new(VecExtractor::default()), Self::I64(_) => Arc::new(VecExtractor::default()),
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())), Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
Self::String(_) => Arc::new(VecExtractor::default()), Self::String(_) => Arc::new(VecExtractor::default()),
Self::Blob { .. } => Arc::new(VecExtractor::default()),
Self::ListExtractor(e) => e.clone(), Self::ListExtractor(e) => e.clone(),
Self::ObjectExtractor(e) => e Self::ObjectExtractor(e) => e
.as_list() .as_list()
.unwrap_or_else(|| Arc::new(VecExtractor::default())), .unwrap_or_else(|| Arc::new(VecExtractor::default())),
Self::Item(_) => Arc::new(VecExtractor::default()), Self::Item(_) => Arc::new(VecExtractor::default()),
Self::Binary(_) => Arc::new(VecExtractor::default()),
} }
} }
@@ -197,14 +244,17 @@ impl PileValue {
Ok(match self { Ok(match self {
Self::Null => None, Self::Null => None,
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => { Self::U64(_)
Some(Value::Number(1u64.into())) | Self::I64(_)
} | Self::String(_)
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())), Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())), Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
Self::ObjectExtractor(_) | Self::Item(_) => { Self::ObjectExtractor(_)
| Self::Item(_)
| Self::Binary(BinaryPileValue::File { .. }) => {
let e = self.object_extractor(); let e = self.object_extractor();
let keys = e.fields().await?; let keys = e.fields().await?;
let mut map = Map::new(); let mut map = Map::new();
@@ -241,8 +291,8 @@ impl PileValue {
Self::String(x) => Value::String(x.to_string()), Self::String(x) => Value::String(x.to_string()),
// TODO: replace with something meaningful? // TODO: replace with something meaningful?
Self::Blob { mime, bytes } => { Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len())) Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
} }
Self::Array(_) | Self::ListExtractor(_) => { Self::Array(_) | Self::ListExtractor(_) => {
@@ -250,7 +300,9 @@ impl PileValue {
return e.to_json(state).await; return e.to_json(state).await;
} }
Self::ObjectExtractor(_) | Self::Item(_) => { Self::ObjectExtractor(_)
| Self::Item(_)
| Self::Binary(BinaryPileValue::File { .. }) => {
let e = self.object_extractor(); let e = self.object_extractor();
return e.to_json(state).await; return e.to_json(state).await;
} }

View File

@@ -22,7 +22,7 @@ pub struct ItemCommand {
/// If present, print the schema fields instead of item data /// If present, print the schema fields instead of item data
#[arg(long)] #[arg(long)]
fields: bool, schema: bool,
#[arg(long, short = 'x')] #[arg(long, short = 'x')]
exclude: Vec<String>, exclude: Vec<String>,
@@ -58,7 +58,7 @@ impl CliCmd for ItemCommand {
})?; })?;
let pv = PileValue::Item(item); let pv = PileValue::Item(item);
if self.fields { if self.schema {
let mut map = serde_json::Map::new(); let mut map = serde_json::Map::new();
for (name, spec) in &ds.config.schema { for (name, spec) in &ds.config.schema {
if self.exclude.contains(&name.to_string()) { if self.exclude.contains(&name.to_string()) {