Owned items, static values
Some checks failed
CI / Typos (push) Failing after 20s
CI / Build and test (push) Failing after 2m17s
CI / Clippy (push) Failing after 3m27s
CI / Build and test (all features) (push) Failing after 5m56s

This commit is contained in:
2026-03-10 21:05:51 -07:00
parent 48ac93c78e
commit bfa67994bf
20 changed files with 304 additions and 378 deletions

View File

@@ -1,24 +1,27 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct EpubMetaExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct EpubMetaExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> EpubMetaExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl EpubMetaExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -61,13 +64,13 @@ impl<'a> EpubMetaExtractor<'a> {
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
let mut output: HashMap<Label, PileValue> = HashMap::new();
#[expect(clippy::unwrap_used)]
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.into()),
Some(s) => PileValue::String(Arc::new(s.into())),
None => PileValue::Null,
};
output.insert(label, value);
@@ -78,12 +81,9 @@ impl<'a> EpubMetaExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubMetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for EpubMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -1,24 +1,27 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct EpubTextExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct EpubTextExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> EpubTextExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl EpubTextExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -61,7 +64,7 @@ impl<'a> EpubTextExtractor<'a> {
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(
Label::new("text").unwrap(),
PileValue::String(raw_text.into()),
PileValue::String(Arc::new(raw_text.into())),
)]);
let _ = self.output.set(output);
@@ -88,12 +91,9 @@ fn strip_html(html: &str) -> String {
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for EpubTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -12,13 +12,13 @@ use crate::{
extract::{MapExtractor, ObjectExtractor},
};
pub struct EpubExtractor<'a> {
inner: MapExtractor<'a>,
pub struct EpubExtractor {
inner: MapExtractor,
}
impl<'a> EpubExtractor<'a> {
impl EpubExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self {
pub fn new(item: &Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
@@ -37,19 +37,8 @@ impl<'a> EpubExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor<'_> {
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
#[expect(clippy::unwrap_used)]
if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() {
PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
impl ObjectExtractor for EpubExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
}

View File

@@ -1,23 +1,27 @@
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct ExifExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct ExifExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> ExifExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl ExifExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -52,7 +56,7 @@ impl<'a> ExifExtractor<'a> {
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
let mut output: HashMap<Label, PileValue> = HashMap::new();
for (tag_name, value) in raw_fields {
let Some(label) = tag_to_label(&tag_name) else {
@@ -61,7 +65,7 @@ impl<'a> ExifExtractor<'a> {
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
output
.entry(label)
.or_insert_with(|| PileValue::String(value.into()));
.or_insert_with(|| PileValue::String(Arc::new(value.into())));
}
return Ok(self.output.get_or_init(|| output));
@@ -78,12 +82,9 @@ fn tag_to_label(tag: &str) -> Option<Label> {
}
#[async_trait::async_trait]
impl ObjectExtractor for ExifExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for ExifExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -12,24 +12,16 @@ use crate::{
extract::{ListExtractor, ObjectExtractor},
};
pub struct FlacImagesExtractor<'a> {
item: &'a Item,
output: OnceLock<Vec<PileValue<'a>>>,
pub struct FlacImagesExtractor {
item: Item,
}
impl<'a> FlacImagesExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&Vec<PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
impl FlacImagesExtractor {
pub fn new(item: &Item) -> Self {
Self { item: item.clone() }
}
async fn get_images(&self) -> Result<Vec<PileValue>, std::io::Error> {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_images = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader));
@@ -48,39 +40,35 @@ impl<'a> FlacImagesExtractor<'a> {
.await
.map_err(std::io::Error::other)??;
let images = raw_images
Ok(raw_images
.into_iter()
.map(|(mime, data)| PileValue::Blob {
mime,
bytes: Arc::new(data),
})
.collect();
let _ = self.output.set(images);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
.collect())
}
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor<'_> {
async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(idx))
impl ListExtractor for FlacImagesExtractor {
async fn get<'a>(&'a self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_images().await?.into_iter().nth(idx))
}
async fn len(&self) -> Result<usize, std::io::Error> {
Ok(self.get_inner().await?.len())
Ok(self.get_images().await?.len())
}
}
pub struct FlacExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
images: Option<PileValue<'a>>,
pub struct FlacExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
images: Option<PileValue>,
}
impl<'a> FlacExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl FlacExtractor {
pub fn new(item: &Item) -> Self {
let is_flac = match item {
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
Item::S3 { key, .. } => key.ends_with(".flac"),
@@ -90,18 +78,18 @@ impl<'a> FlacExtractor<'a> {
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
Self {
item,
item: item.clone(),
output: OnceLock::new(),
images,
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = match self.item {
let key = match &self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
@@ -132,18 +120,18 @@ impl<'a> FlacExtractor<'a> {
.await
.map_err(std::io::Error::other)??;
let mut output: HashMap<Label, Vec<PileValue<'a>>> = HashMap::new();
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for (k, v) in raw_tags {
if let Some(label) = Label::new(k) {
output
.entry(label)
.or_default()
.push(PileValue::String(v.into()));
.push(PileValue::String(Arc::new(v.into())));
}
}
let output: HashMap<Label, PileValue<'a>> = output
let output: HashMap<Label, PileValue> = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(v)))
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
let _ = self.output.set(output);
@@ -153,17 +141,14 @@ impl<'a> FlacExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
impl ObjectExtractor for FlacExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
if name.as_str() == "images"
&& let Some(ref images) = self.images
{
return Ok(Some(images));
return Ok(Some(images.clone()));
}
Ok(self.get_inner().await?.get(name))
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -1,27 +1,31 @@
use pile_config::Label;
use std::{collections::HashMap, path::Component, sync::OnceLock};
use std::{
collections::HashMap,
path::Component,
sync::{Arc, OnceLock},
};
use crate::{Item, PileValue, extract::ObjectExtractor};
pub struct FsExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct FsExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> FsExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl FsExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let Item::File { path, .. } = self.item else {
let Item::File { path, .. } = &self.item else {
return Ok(self.output.get_or_init(HashMap::new));
};
@@ -31,13 +35,13 @@ impl<'a> FsExtractor<'a> {
Label::new("extension").unwrap(),
path.extension()
.and_then(|x| x.to_str())
.map(|x| PileValue::String(x.into()))
.map(|x| PileValue::String(Arc::new(x.into())))
.unwrap_or(PileValue::Null),
),
(
Label::new("path").unwrap(),
path.to_str()
.map(|x| PileValue::String(x.into()))
.map(|x| PileValue::String(Arc::new(x.into())))
.unwrap_or(PileValue::Null),
),
(
@@ -50,9 +54,9 @@ impl<'a> FsExtractor<'a> {
Component::RootDir => Some("/".to_owned()),
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
})
.map(|x| x.map(|x| PileValue::String(x.into())))
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
.collect::<Option<Vec<_>>>()
.map(PileValue::Array)
.map(|v| PileValue::Array(Arc::new(v)))
.unwrap_or(PileValue::Null),
),
]);
@@ -62,12 +66,9 @@ impl<'a> FsExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for FsExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
impl ObjectExtractor for FsExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner()?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -1,23 +1,28 @@
use id3::Tag;
use pile_config::Label;
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
use std::{
borrow::Cow,
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct Id3Extractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct Id3Extractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> Id3Extractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl Id3Extractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -50,7 +55,7 @@ impl<'a> Id3Extractor<'a> {
Err(e) => return Err(e.into()),
};
let mut output: HashMap<Label, Vec<PileValue<'a>>> = HashMap::new();
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for frame in tag.frames() {
if let Some(text) = frame.content().text() {
let name = frame_id_to_field(frame.id());
@@ -58,14 +63,14 @@ impl<'a> Id3Extractor<'a> {
output
.entry(key)
.or_default()
.push(PileValue::String(text.into()));
.push(PileValue::String(Arc::new(text.into())));
}
}
}
let output = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(v)))
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
return Ok(self.output.get_or_init(|| output));
@@ -114,12 +119,9 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
}
#[async_trait::async_trait]
impl ObjectExtractor for Id3Extractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for Id3Extractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -3,17 +3,14 @@ use std::collections::HashMap;
use crate::{PileValue, extract::ObjectExtractor};
pub struct MapExtractor<'a> {
pub(crate) inner: HashMap<Label, PileValue<'a>>,
pub struct MapExtractor {
pub(crate) inner: HashMap<Label, PileValue>,
}
#[async_trait::async_trait]
impl ObjectExtractor for MapExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.inner.get(name))
impl ObjectExtractor for MapExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.inner.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -28,7 +28,7 @@ pub use map::*;
mod sidecar;
pub use sidecar::*;
use crate::Item;
use crate::{Item, PileValue};
/// An attachment that extracts metadata from an [Item].
///
@@ -39,10 +39,7 @@ pub trait ObjectExtractor: Send + Sync {
/// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error>;
/// Return all fields in this extractor.
/// `Self::field` must return [Some] for all these keys
@@ -59,10 +56,7 @@ pub trait ListExtractor: Send + Sync {
/// Indices start at zero, and must be consecutive.
/// - returns `None` if `idx` is out of range
/// - returns `Some(Null)` if `None` is at `idx`
async fn get<'a>(
&'a self,
idx: usize,
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error>;
async fn len(&self) -> Result<usize, std::io::Error>;
@@ -71,13 +65,13 @@ pub trait ListExtractor: Send + Sync {
}
}
pub struct MetaExtractor<'a> {
inner: MapExtractor<'a>,
pub struct MetaExtractor {
inner: MapExtractor,
}
impl<'a> MetaExtractor<'a> {
impl MetaExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self {
pub fn new(item: &Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
@@ -120,11 +114,8 @@ impl<'a> MetaExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for MetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error> {
impl ObjectExtractor for MetaExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
}

View File

@@ -22,13 +22,13 @@ use crate::{
extract::{MapExtractor, ObjectExtractor},
};
pub struct PdfExtractor<'a> {
inner: MapExtractor<'a>,
pub struct PdfExtractor {
inner: MapExtractor,
}
impl<'a> PdfExtractor<'a> {
impl PdfExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self {
pub fn new(item: &Item) -> Self {
let mut inner_map = HashMap::new();
inner_map.insert(
Label::new("text").unwrap(),
@@ -56,28 +56,8 @@ impl<'a> PdfExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfExtractor<'_> {
async fn field<'a>(
&'a self,
name: &pile_config::Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
#[expect(clippy::unwrap_used)]
if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() {
PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
#[cfg(feature = "pdfium")]
#[expect(clippy::unwrap_used)]
if name.as_str() == "cover" {
match self.inner.inner.get(name).unwrap() {
PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
impl ObjectExtractor for PdfExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
}

View File

@@ -10,20 +10,20 @@ use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct PdfCoverExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct PdfCoverExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> PdfCoverExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl PdfCoverExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -84,12 +84,9 @@ impl<'a> PdfCoverExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfCoverExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for PdfCoverExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -1,26 +1,30 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfMetaExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct PdfMetaExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> PdfMetaExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl PdfMetaExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -80,7 +84,7 @@ impl<'a> PdfMetaExtractor<'a> {
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
let mut output: HashMap<Label, PileValue> = HashMap::new();
#[expect(clippy::unwrap_used)]
output.insert(
@@ -92,7 +96,7 @@ impl<'a> PdfMetaExtractor<'a> {
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.into()),
Some(s) => PileValue::String(Arc::new(s.into())),
None => PileValue::Null,
};
output.insert(label, value);
@@ -115,12 +119,9 @@ fn format_date(d: &Date) -> String {
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for PdfMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -2,86 +2,45 @@ use image::ImageFormat;
use pdfium_render::prelude::*;
use std::{
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
sync::Arc,
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ListExtractor};
pub struct PdfPagesExtractor<'a> {
item: &'a Item,
bytes: OnceLock<Arc<Vec<u8>>>,
pages: OnceLock<Vec<OnceLock<PileValue<'a>>>>,
pub struct PdfPagesExtractor {
item: Item,
}
impl<'a> PdfPagesExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
Self {
item,
bytes: OnceLock::new(),
pages: OnceLock::new(),
}
impl PdfPagesExtractor {
pub fn new(item: &Item) -> Self {
Self { item: item.clone() }
}
async fn get_bytes(&self) -> Result<&Arc<Vec<u8>>, std::io::Error> {
if let Some(x) = self.bytes.get() {
return Ok(x);
}
async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let bytes = tokio::task::spawn_blocking(move || {
tokio::task::spawn_blocking(move || {
let mut b = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
Ok::<_, std::io::Error>(b)
})
.await
.map_err(std::io::Error::other)??;
let _ = self.bytes.set(Arc::new(bytes));
#[expect(clippy::unwrap_used)]
return Ok(self.bytes.get().unwrap());
}
async fn init_pages(&self) -> Result<&Vec<OnceLock<PileValue<'a>>>, std::io::Error> {
if let Some(x) = self.pages.get() {
return Ok(x);
}
let bytes = Arc::clone(self.get_bytes().await?);
let count = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
let doc = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
Ok::<_, std::io::Error>(doc.pages().len() as usize)
})
.await
.map_err(std::io::Error::other)?;
let slots = match count {
Ok(n) => (0..n).map(|_| OnceLock::new()).collect(),
Err(error) => {
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
Vec::new()
}
};
return Ok(self.pages.get_or_init(|| slots));
.map_err(std::io::Error::other)?
}
}
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor<'_> {
async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
let pages = self.init_pages().await?;
let Some(slot) = pages.get(idx) else {
return Ok(None);
};
if let Some(v) = slot.get() {
return Ok(Some(v));
}
let bytes = Arc::clone(self.get_bytes().await?);
impl ListExtractor for PdfPagesExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
let bytes = self.get_bytes().await?;
let png = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
let doc = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
if idx >= doc.pages().len() as usize {
return Ok::<_, std::io::Error>(None);
}
let render_config = PdfRenderConfig::new().set_target_width(1024);
let page = doc
.pages()
@@ -95,13 +54,14 @@ impl ListExtractor for PdfPagesExtractor<'_> {
image
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
.map_err(|e| std::io::Error::other(e.to_string()))?;
Ok::<_, std::io::Error>(png_bytes)
Ok(Some(png_bytes))
})
.await
.map_err(std::io::Error::other)?;
let value = match png {
Ok(bytes) => PileValue::Blob {
Ok(None) => return Ok(None),
Ok(Some(bytes)) => PileValue::Blob {
mime: mime::IMAGE_PNG,
bytes: Arc::new(bytes),
},
@@ -110,10 +70,26 @@ impl ListExtractor for PdfPagesExtractor<'_> {
PileValue::Null
}
};
return Ok(Some(slot.get_or_init(|| value)));
Ok(Some(value))
}
async fn len(&self) -> Result<usize, std::io::Error> {
Ok(self.init_pages().await?.len())
let bytes = self.get_bytes().await?;
let count = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
let doc = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
Ok::<_, std::io::Error>(doc.pages().len() as usize)
})
.await
.map_err(std::io::Error::other)?;
match count {
Ok(n) => Ok(n),
Err(error) => {
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
Ok(0)
}
}
}
}

View File

@@ -1,26 +1,30 @@
use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfTextExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct PdfTextExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> PdfTextExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl PdfTextExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -87,7 +91,7 @@ impl<'a> PdfTextExtractor<'a> {
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(
Label::new("text").unwrap(),
PileValue::String(raw_text.into()),
PileValue::String(Arc::new(raw_text.into())),
)]);
return Ok(self.output.get_or_init(|| output));
@@ -95,12 +99,9 @@ impl<'a> PdfTextExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfTextExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for PdfTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -6,32 +6,29 @@ use crate::{
extract::{ObjectExtractor, TomlExtractor},
};
pub struct SidecarExtractor<'a> {
item: &'a Item,
output: OnceLock<Option<TomlExtractor<'a>>>,
pub struct SidecarExtractor {
item: Item,
output: OnceLock<Option<TomlExtractor>>,
}
impl<'a> SidecarExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl SidecarExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for SidecarExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
impl ObjectExtractor for SidecarExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name).await?),
None => Ok(Some(&PileValue::Null)),
None => Ok(Some(PileValue::Null)),
}
}

View File

@@ -1,34 +1,39 @@
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use crate::{AsyncReader, Item, PileValue, extract::ObjectExtractor};
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
fn toml_to_pile(value: toml::Value) -> PileValue {
match value {
toml::Value::String(s) => PileValue::String(s.into()),
toml::Value::Integer(i) => PileValue::String(i.to_string().into()),
toml::Value::Float(f) => PileValue::String(f.to_string().into()),
toml::Value::Boolean(b) => PileValue::String(b.to_string().into()),
toml::Value::Datetime(d) => PileValue::String(d.to_string().into()),
toml::Value::Array(a) => PileValue::Array(a.into_iter().map(toml_to_pile).collect()),
toml::Value::String(s) => PileValue::String(Arc::new(s.into())),
toml::Value::Integer(i) => PileValue::String(Arc::new(i.to_string().into())),
toml::Value::Float(f) => PileValue::String(Arc::new(f.to_string().into())),
toml::Value::Boolean(b) => PileValue::String(Arc::new(b.to_string().into())),
toml::Value::Datetime(d) => PileValue::String(Arc::new(d.to_string().into())),
toml::Value::Array(a) => {
PileValue::Array(Arc::new(a.into_iter().map(toml_to_pile).collect()))
}
toml::Value::Table(_) => PileValue::Null,
}
}
pub struct TomlExtractor<'a> {
item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>,
pub struct TomlExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl<'a> TomlExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
impl TomlExtractor {
pub fn new(item: &Item) -> Self {
Self {
item,
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
@@ -39,7 +44,7 @@ impl<'a> TomlExtractor<'a> {
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue<'_>> = match toml {
let output: HashMap<Label, PileValue> = match toml {
toml::Value::Table(t) => t
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
@@ -52,12 +57,9 @@ impl<'a> TomlExtractor<'a> {
}
#[async_trait::async_trait]
impl ObjectExtractor for TomlExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
impl ObjectExtractor for TomlExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {

View File

@@ -104,7 +104,7 @@ impl DbFtsIndex {
pub async fn get_field(
&self,
extractor: &PileValue<'_>,
extractor: &PileValue,
field_name: &Label,
) -> Result<Option<String>, std::io::Error> {
let field = match self.cfg.schema.get(field_name) {
@@ -148,16 +148,14 @@ impl DbFtsIndex {
PileValue::U64(x) => return Ok(Some(x.to_string())),
PileValue::I64(x) => return Ok(Some(x.to_string())),
#[expect(clippy::unwrap_used)]
PileValue::Array(ref mut x) => {
PileValue::Array(x) => {
if x.len() == 1 {
x.pop().unwrap()
x[0].clone()
} else if x.len() > 1 {
debug!(
message = "Skipping field, is array with more than one element",
field = field_name.to_string(),
?path,
//value = ?val
);
continue 'outer;
} else {
@@ -299,7 +297,7 @@ impl DbFtsIndex {
}
}
pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<'a>> {
pub fn apply(post: &FieldSpecPost, val: &PileValue) -> Option<PileValue> {
Some(match post {
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
FieldSpecPost::NotEmpty { notempty: true } => match val {
@@ -316,11 +314,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.to_lowercase().into()),
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_lowercase().into())),
PileValue::Array(x) => {
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?,
)),
},
FieldSpecPost::SetCase { case: Case::Upper } => match val {
@@ -330,11 +328,13 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.to_uppercase().into()),
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_uppercase().into())),
PileValue::Array(x) => {
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
@@ -345,13 +345,15 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => {
PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into())
}
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_suffix(trim_suffix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => {
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
@@ -362,13 +364,15 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => {
PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into())
}
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_prefix(trim_prefix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => {
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::Join { join } => match val {
@@ -381,7 +385,7 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
PileValue::String(x) => PileValue::String(x.clone()),
PileValue::Array(x) => PileValue::String(
PileValue::Array(x) => PileValue::String(Arc::new(
x.iter()
.map(|x| apply(post, x))
.map(|x| x.and_then(|x| x.as_str().map(|x| x.to_owned())))
@@ -389,7 +393,7 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
.into_iter()
.join(join)
.into(),
),
)),
},
})
}

View File

@@ -14,6 +14,7 @@ use crate::source::{DirDataSource, S3DataSource};
// MARK: item
//
/// A cheaply-clonable pointer to an item in a dataset
#[derive(Debug, Clone)]
pub enum Item {
File {

View File

@@ -62,7 +62,7 @@ pub async fn get_field(
};
let extractor = MetaExtractor::new(&item);
let root: PileValue<'_> = PileValue::ObjectExtractor(Arc::new(extractor));
let root: PileValue = PileValue::ObjectExtractor(Arc::new(extractor));
let value = match root.query(&path).await {
Ok(Some(v)) => v,

View File

@@ -6,17 +6,18 @@ use std::sync::Arc;
use crate::extract::{ListExtractor, ObjectExtractor};
/// An immutable, lazily-computed value similar to [serde_json::Value].
pub enum PileValue<'a> {
/// An immutable, cheaply-clonable, lazily-computed value.
/// Very similar to [serde_json::Value].
pub enum PileValue {
Null,
U64(u64),
I64(i64),
/// A string
String(SmartString<LazyCompact>),
String(Arc<SmartString<LazyCompact>>),
/// An array of values
Array(Vec<PileValue<'a>>),
Array(Arc<Vec<PileValue>>),
/// A binary blob
Blob {
@@ -25,13 +26,13 @@ pub enum PileValue<'a> {
},
/// A lazily-computed map of {label: value}
ObjectExtractor(Arc<dyn ObjectExtractor + 'a>),
ObjectExtractor(Arc<dyn ObjectExtractor>),
/// A lazily-computed array
ListExtractor(Arc<dyn ListExtractor + 'a>),
ListExtractor(Arc<dyn ListExtractor>),
}
impl Clone for PileValue<'_> {
impl Clone for PileValue {
fn clone(&self) -> Self {
match self {
Self::Null => Self::Null,
@@ -49,15 +50,15 @@ impl Clone for PileValue<'_> {
}
}
impl<'a> PileValue<'a> {
pub async fn query(&'a self, query: &ObjectPath) -> Result<Option<&'a Self>, std::io::Error> {
let mut out = Some(self);
impl PileValue {
pub async fn query(&self, query: &ObjectPath) -> Result<Option<Self>, std::io::Error> {
let mut out: Option<PileValue> = Some(self.clone());
for s in &query.segments {
match s {
PathSegment::Root => out = Some(self),
PathSegment::Root => out = Some(self.clone()),
PathSegment::Field(field) => {
out = match &out {
out = match out {
None => return Ok(None),
Some(Self::Null) => None,
Some(Self::U64(_)) => None,
@@ -84,7 +85,7 @@ impl<'a> PileValue<'a> {
usize::try_from(v.len() as i64 - idx).ok()
};
idx.and_then(|idx| v.get(idx))
idx.and_then(|idx| v.get(idx)).cloned()
}
Some(Self::String(_)) => None,
Some(Self::ObjectExtractor(_)) => None,
@@ -105,7 +106,7 @@ impl<'a> PileValue<'a> {
}
}
return Ok(out);
return Ok(out.clone());
}
pub fn as_str(&self) -> Option<&str> {
@@ -129,7 +130,7 @@ impl<'a> PileValue<'a> {
Self::Array(x) => {
let mut arr = Vec::new();
for item in x {
for item in &**x {
arr.push(Box::pin(item.to_json()).await?);
}
Value::Array(arr)