Add ListExtractor

This commit is contained in:
2026-03-10 20:24:56 -07:00
parent 280bbcb83e
commit 48ac93c78e
22 changed files with 386 additions and 93 deletions

View File

@@ -184,7 +184,7 @@ impl Datasets {
return Ok(None); return Ok(None);
}; };
let extractor = MetaExtractor::new(&item); let extractor = MetaExtractor::new(&item);
let root = PileValue::Extractor(Arc::new(extractor)); let root = PileValue::ObjectExtractor(Arc::new(extractor));
let Some(value) = root.query(path).await? else { let Some(value) = root.query(path).await? else {
return Ok(None); return Ok(None);
}; };

View File

@@ -3,7 +3,7 @@ use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::{collections::HashMap, sync::OnceLock};
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct EpubMetaExtractor<'a> { pub struct EpubMetaExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -78,7 +78,7 @@ impl<'a> EpubMetaExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for EpubMetaExtractor<'_> { impl ObjectExtractor for EpubMetaExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -3,7 +3,7 @@ use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::{collections::HashMap, sync::OnceLock};
use tracing::debug; use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct EpubTextExtractor<'a> { pub struct EpubTextExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -88,7 +88,7 @@ fn strip_html(html: &str) -> String {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for EpubTextExtractor<'_> { impl ObjectExtractor for EpubTextExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -9,7 +9,7 @@ pub use epub_text::*;
use crate::{ use crate::{
Item, PileValue, Item, PileValue,
extract::{Extractor, MapExtractor}, extract::{MapExtractor, ObjectExtractor},
}; };
pub struct EpubExtractor<'a> { pub struct EpubExtractor<'a> {
@@ -23,11 +23,11 @@ impl<'a> EpubExtractor<'a> {
inner: HashMap::from([ inner: HashMap::from([
( (
Label::new("text").unwrap(), Label::new("text").unwrap(),
PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(EpubTextExtractor::new(item))),
), ),
( (
Label::new("meta").unwrap(), Label::new("meta").unwrap(),
PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(EpubMetaExtractor::new(item))),
), ),
]), ]),
}; };
@@ -37,7 +37,7 @@ impl<'a> EpubExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for EpubExtractor<'_> { impl ObjectExtractor for EpubExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,
@@ -45,7 +45,7 @@ impl Extractor for EpubExtractor<'_> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
if name.as_str() == "text" { if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() { match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name).await, PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(), _ => unreachable!(),
}; };
} }

View File

@@ -2,7 +2,7 @@ use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock}; use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::debug; use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct ExifExtractor<'a> { pub struct ExifExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -78,7 +78,7 @@ fn tag_to_label(tag: &str) -> Option<Label> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for ExifExtractor<'_> { impl ObjectExtractor for ExifExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -7,14 +7,17 @@ use std::{
sync::{Arc, OnceLock}, sync::{Arc, OnceLock},
}; };
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{
Item, PileValue, SyncReadBridge,
extract::{ListExtractor, ObjectExtractor},
};
pub struct FlacExtractor<'a> { pub struct FlacImagesExtractor<'a> {
item: &'a Item, item: &'a Item,
output: OnceLock<HashMap<Label, PileValue<'a>>>, output: OnceLock<Vec<PileValue<'a>>>,
} }
impl<'a> FlacExtractor<'a> { impl<'a> FlacImagesExtractor<'a> {
pub fn new(item: &'a Item) -> Self { pub fn new(item: &'a Item) -> Self {
Self { Self {
item, item,
@@ -22,6 +25,77 @@ impl<'a> FlacExtractor<'a> {
} }
} }
/// Return the pictures embedded in this item, reading them on first use.
///
/// The item is read as a FLAC stream on a blocking task. Every `Picture`
/// block seen before the first audio frame is converted into a
/// [`PileValue::Blob`] carrying the picture's mime type and image data.
/// The resulting list is stored in `self.output`, and later calls return
/// the stored list without reading the item again.
///
/// # Errors
/// Returns an error if the item cannot be opened, if a block cannot be
/// read (reported as `InvalidData`), or if the blocking task fails.
async fn get_inner(&self) -> Result<&Vec<PileValue<'a>>, std::io::Error> {
    if let Some(cached) = self.output.get() {
        return Ok(cached);
    }

    let reader = SyncReadBridge::new_current(self.item.read().await?);
    let raw_images = tokio::task::spawn_blocking(move || {
        let reader = FlacReader::new(BufReader::new(reader));
        let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
        for block in reader {
            match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
                FlacBlock::Picture(picture) => {
                    images.push((picture.mime, picture.img_data));
                }
                // Stop at the first audio frame; everything after it is skipped.
                FlacBlock::AudioFrame(_) => break,
                _ => {}
            }
        }
        Ok::<_, std::io::Error>(images)
    })
    .await
    .map_err(std::io::Error::other)??;

    let images: Vec<PileValue<'a>> = raw_images
        .into_iter()
        .map(|(mime, data)| PileValue::Blob {
            mime,
            bytes: Arc::new(data),
        })
        .collect();

    // `get_or_init` stores `images` unless the cell was filled while we were
    // reading, in which case the value already present is returned instead.
    Ok(self.output.get_or_init(|| images))
}
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor<'_> {
    /// Fetch the embedded picture at position `idx`, or `None` when `idx`
    /// is past the end of the picture list.
    async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let images = self.get_inner().await?;
        Ok(images.get(idx))
    }

    /// Count the embedded pictures found in this item.
    async fn len(&self) -> Result<usize, std::io::Error> {
        let images = self.get_inner().await?;
        Ok(images.len())
    }
}
/// Exposes the tags of an [Item] read as a FLAC stream as a map of fields,
/// plus an optional `images` field that lists embedded pictures.
pub struct FlacExtractor<'a> {
/// The item this extractor reads from.
item: &'a Item,
/// Tag map built by `get_inner`; empty until the first successful read.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
/// List extractor for embedded pictures.
/// `Some` only when the item's path or key ends in `.flac`.
images: Option<PileValue<'a>>,
}
impl<'a> FlacExtractor<'a> {
/// Build an extractor for `item`.
///
/// Nothing is read here. The `images` list extractor is attached only
/// when the item's file path or S3 key ends in `.flac`; a file path that
/// is not valid UTF-8 is treated as not matching.
pub fn new(item: &'a Item) -> Self {
    let looks_like_flac = match item {
        Item::File { path, .. } => path
            .to_str()
            .is_some_and(|path_str| path_str.ends_with(".flac")),
        Item::S3 { key, .. } => key.ends_with(".flac"),
    };

    let images = if looks_like_flac {
        let pictures = FlacImagesExtractor::new(item);
        Some(PileValue::ListExtractor(Arc::new(pictures)))
    } else {
        None
    };

    Self {
        item,
        output: OnceLock::new(),
        images,
    }
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> { async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() { if let Some(x) = self.output.get() {
return Ok(x); return Ok(x);
@@ -39,10 +113,9 @@ impl<'a> FlacExtractor<'a> {
} }
let reader = SyncReadBridge::new_current(self.item.read().await?); let reader = SyncReadBridge::new_current(self.item.read().await?);
let (raw_tags, raw_images) = tokio::task::spawn_blocking(move || { let raw_tags = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader)); let reader = FlacReader::new(BufReader::new(reader));
let mut tags: Vec<(String, String)> = Vec::new(); let mut tags: Vec<(String, String)> = Vec::new();
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
for block in reader { for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? { match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::VorbisComment(comment) => { FlacBlock::VorbisComment(comment) => {
@@ -50,14 +123,11 @@ impl<'a> FlacExtractor<'a> {
tags.push((k.to_string().to_lowercase(), v.into())); tags.push((k.to_string().to_lowercase(), v.into()));
} }
} }
FlacBlock::Picture(picture) => {
images.push((picture.mime, picture.img_data));
}
FlacBlock::AudioFrame(_) => break, FlacBlock::AudioFrame(_) => break,
_ => {} _ => {}
} }
} }
Ok::<_, std::io::Error>((tags, images)) Ok::<_, std::io::Error>(tags)
}) })
.await .await
.map_err(std::io::Error::other)??; .map_err(std::io::Error::other)??;
@@ -71,24 +141,11 @@ impl<'a> FlacExtractor<'a> {
.push(PileValue::String(v.into())); .push(PileValue::String(v.into()));
} }
} }
let mut output: HashMap<Label, PileValue<'a>> = output let output: HashMap<Label, PileValue<'a>> = output
.into_iter() .into_iter()
.map(|(k, v)| (k, PileValue::Array(v))) .map(|(k, v)| (k, PileValue::Array(v)))
.collect(); .collect();
if !raw_images.is_empty()
&& let Some(label) = Label::new("images".to_owned())
{
let images = raw_images
.into_iter()
.map(|(mime, data)| PileValue::Blob {
mime,
bytes: Arc::new(data),
})
.collect();
output.insert(label, PileValue::Array(images));
}
let _ = self.output.set(output); let _ = self.output.set(output);
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap()); return Ok(self.output.get().unwrap());
@@ -96,15 +153,25 @@ impl<'a> FlacExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for FlacExtractor<'_> { impl ObjectExtractor for FlacExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> { ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
if name.as_str() == "images"
&& let Some(ref images) = self.images
{
return Ok(Some(images));
}
Ok(self.get_inner().await?.get(name)) Ok(self.get_inner().await?.get(name))
} }
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> { async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect()) let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
if self.images.is_some() {
#[expect(clippy::unwrap_used)]
fields.push(Label::new("images").unwrap());
}
Ok(fields)
} }
} }

View File

@@ -1,7 +1,7 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, path::Component, sync::OnceLock}; use std::{collections::HashMap, path::Component, sync::OnceLock};
use crate::{Item, PileValue, extract::Extractor}; use crate::{Item, PileValue, extract::ObjectExtractor};
pub struct FsExtractor<'a> { pub struct FsExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -62,7 +62,7 @@ impl<'a> FsExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for FsExtractor<'_> { impl ObjectExtractor for FsExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -2,7 +2,7 @@ use id3::Tag;
use pile_config::Label; use pile_config::Label;
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock}; use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct Id3Extractor<'a> { pub struct Id3Extractor<'a> {
item: &'a Item, item: &'a Item,
@@ -114,7 +114,7 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for Id3Extractor<'_> { impl ObjectExtractor for Id3Extractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -1,14 +1,14 @@
use pile_config::Label; use pile_config::Label;
use std::collections::HashMap; use std::collections::HashMap;
use crate::{PileValue, extract::Extractor}; use crate::{PileValue, extract::ObjectExtractor};
pub struct MapExtractor<'a> { pub struct MapExtractor<'a> {
pub(crate) inner: HashMap<Label, PileValue<'a>>, pub(crate) inner: HashMap<Label, PileValue<'a>>,
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for MapExtractor<'_> { impl ObjectExtractor for MapExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -35,7 +35,7 @@ use crate::Item;
/// Metadata is exposed as an immutable map of {label: value}, /// Metadata is exposed as an immutable map of {label: value},
/// much like a json object. /// much like a json object.
#[async_trait::async_trait] #[async_trait::async_trait]
pub trait Extractor: Send + Sync { pub trait ObjectExtractor: Send + Sync {
/// Get the field at `name` from `item`. /// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field /// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available /// - returns `Some(Null)` if `name` is not available
@@ -50,14 +50,31 @@ pub trait Extractor: Send + Sync {
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>; async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
} }
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
/// Get the value at index `idx`.
/// Indices start at zero, and must be consecutive.
/// - returns `None` if `idx` is out of range
/// - returns `Some(Null)` if `idx` is in range but no value is available there
async fn get<'a>(
&'a self,
idx: usize,
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
/// Get the number of values in this list.
async fn len(&self) -> Result<usize, std::io::Error>;
/// Returns `true` if this list holds no values, i.e. `len()` is zero.
/// Any error from `len()` is passed through unchanged.
async fn is_empty(&self) -> Result<bool, std::io::Error> {
Ok(self.len().await? == 0)
}
}
pub struct MetaExtractor<'a> { pub struct MetaExtractor<'a> {
inner: MapExtractor<'a>, inner: MapExtractor<'a>,
} }
//
// MARK: file
//
impl<'a> MetaExtractor<'a> { impl<'a> MetaExtractor<'a> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self { pub fn new(item: &'a Item) -> Self {
@@ -65,35 +82,35 @@ impl<'a> MetaExtractor<'a> {
inner: HashMap::from([ inner: HashMap::from([
( (
Label::new("flac").unwrap(), Label::new("flac").unwrap(),
crate::PileValue::Extractor(Arc::new(FlacExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
), ),
( (
Label::new("id3").unwrap(), Label::new("id3").unwrap(),
crate::PileValue::Extractor(Arc::new(Id3Extractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
), ),
( (
Label::new("fs").unwrap(), Label::new("fs").unwrap(),
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
), ),
( (
Label::new("epub").unwrap(), Label::new("epub").unwrap(),
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
), ),
( (
Label::new("exif").unwrap(), Label::new("exif").unwrap(),
crate::PileValue::Extractor(Arc::new(ExifExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
), ),
( (
Label::new("pdf").unwrap(), Label::new("pdf").unwrap(),
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
), ),
( (
Label::new("toml").unwrap(), Label::new("toml").unwrap(),
crate::PileValue::Extractor(Arc::new(TomlExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
), ),
( (
Label::new("sidecar").unwrap(), Label::new("sidecar").unwrap(),
crate::PileValue::Extractor(Arc::new(SidecarExtractor::new(item))), crate::PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
), ),
]), ]),
}; };
@@ -103,7 +120,7 @@ impl<'a> MetaExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for MetaExtractor<'_> { impl ObjectExtractor for MetaExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,

View File

@@ -6,6 +6,11 @@ mod pdf_cover;
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
pub use pdf_cover::*; pub use pdf_cover::*;
#[cfg(feature = "pdfium")]
mod pdf_pages;
#[cfg(feature = "pdfium")]
pub use pdf_pages::*;
mod pdf_meta; mod pdf_meta;
pub use pdf_meta::*; pub use pdf_meta::*;
@@ -14,7 +19,7 @@ pub use pdf_text::*;
use crate::{ use crate::{
Item, PileValue, Item, PileValue,
extract::{Extractor, MapExtractor}, extract::{MapExtractor, ObjectExtractor},
}; };
pub struct PdfExtractor<'a> { pub struct PdfExtractor<'a> {
@@ -27,16 +32,21 @@ impl<'a> PdfExtractor<'a> {
let mut inner_map = HashMap::new(); let mut inner_map = HashMap::new();
inner_map.insert( inner_map.insert(
Label::new("text").unwrap(), Label::new("text").unwrap(),
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(PdfTextExtractor::new(item))),
); );
inner_map.insert( inner_map.insert(
Label::new("meta").unwrap(), Label::new("meta").unwrap(),
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(PdfMetaExtractor::new(item))),
); );
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
inner_map.insert( inner_map.insert(
Label::new("cover").unwrap(), Label::new("cover").unwrap(),
PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))), PileValue::ObjectExtractor(Arc::new(PdfCoverExtractor::new(item))),
);
#[cfg(feature = "pdfium")]
inner_map.insert(
Label::new("pages").unwrap(),
PileValue::ListExtractor(Arc::new(PdfPagesExtractor::new(item))),
); );
Self { Self {
@@ -46,7 +56,7 @@ impl<'a> PdfExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for PdfExtractor<'_> { impl ObjectExtractor for PdfExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &pile_config::Label, name: &pile_config::Label,
@@ -54,7 +64,7 @@ impl Extractor for PdfExtractor<'_> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
if name.as_str() == "text" { if name.as_str() == "text" {
match self.inner.inner.get(name).unwrap() { match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name).await, PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(), _ => unreachable!(),
}; };
} }
@@ -63,7 +73,7 @@ impl Extractor for PdfExtractor<'_> {
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
if name.as_str() == "cover" { if name.as_str() == "cover" {
match self.inner.inner.get(name).unwrap() { match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name).await, PileValue::ObjectExtractor(x) => return x.field(name).await,
_ => unreachable!(), _ => unreachable!(),
}; };
} }
@@ -78,6 +88,8 @@ impl Extractor for PdfExtractor<'_> {
Label::new("meta").unwrap(), Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")] #[cfg(feature = "pdfium")]
Label::new("cover").unwrap(), Label::new("cover").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("pages").unwrap(),
]) ])
} }
} }

View File

@@ -8,7 +8,7 @@ use std::{
}; };
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct PdfCoverExtractor<'a> { pub struct PdfCoverExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -84,7 +84,7 @@ impl<'a> PdfCoverExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for PdfCoverExtractor<'_> { impl ObjectExtractor for PdfCoverExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -4,7 +4,8 @@ use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock}; use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfMetaExtractor<'a> { pub struct PdfMetaExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -40,6 +41,8 @@ impl<'a> PdfMetaExtractor<'a> {
} }
}; };
let page_count = file.num_pages();
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new(); let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
if let Some(info) = &file.trailer.info_dict { if let Some(info) = &file.trailer.info_dict {
@@ -64,12 +67,12 @@ impl<'a> PdfMetaExtractor<'a> {
meta.push(("mod_date", info.mod_date.as_ref().map(format_date))); meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
} }
Ok::<_, std::io::Error>(meta) Ok::<_, std::io::Error>((page_count, meta))
}) })
.await .await
.map_err(std::io::Error::other)?; .map_err(std::io::Error::other)?;
let raw_meta = match raw_meta { let (page_count, raw_meta) = match raw_meta {
Ok(x) => x, Ok(x) => x,
Err(error) => { Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key()); trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
@@ -79,6 +82,12 @@ impl<'a> PdfMetaExtractor<'a> {
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new(); let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
#[expect(clippy::unwrap_used)]
output.insert(
Label::new("pages").unwrap(),
PileValue::U64(page_count as u64),
);
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
for (key, val) in raw_meta { for (key, val) in raw_meta {
let label = Label::new(key).unwrap(); let label = Label::new(key).unwrap();
@@ -106,7 +115,7 @@ fn format_date(d: &Date) -> String {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for PdfMetaExtractor<'_> { impl ObjectExtractor for PdfMetaExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -0,0 +1,119 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use std::{
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ListExtractor};
/// Exposes the pages of a PDF [Item] as a list, one rendered PNG per page.
pub struct PdfPagesExtractor<'a> {
/// The item this extractor reads from.
item: &'a Item,
/// Full contents of the item, filled in by `get_bytes` on first use.
bytes: OnceLock<Arc<Vec<u8>>>,
/// One slot per page, created by `init_pages`.
/// Each slot is filled the first time that page is requested.
pages: OnceLock<Vec<OnceLock<PileValue<'a>>>>,
}
impl<'a> PdfPagesExtractor<'a> {
    /// Build an extractor for `item`. Nothing is read until a page or the
    /// page count is requested.
    pub fn new(item: &'a Item) -> Self {
        Self {
            item,
            bytes: OnceLock::new(),
            pages: OnceLock::new(),
        }
    }

    /// Return the full contents of the item, reading them on first use.
    ///
    /// The read runs on a blocking task and the result is kept in
    /// `self.bytes`, so later calls reuse the same buffer.
    ///
    /// # Errors
    /// Returns an error if the item cannot be opened or read, or if the
    /// blocking task fails.
    async fn get_bytes(&self) -> Result<&Arc<Vec<u8>>, std::io::Error> {
        if let Some(cached) = self.bytes.get() {
            return Ok(cached);
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let bytes = tokio::task::spawn_blocking(move || {
            let mut buf = Vec::new();
            std::io::Read::read_to_end(&mut BufReader::new(reader), &mut buf)?;
            Ok::<_, std::io::Error>(buf)
        })
        .await
        .map_err(std::io::Error::other)??;

        // Same caching style as `init_pages`: store our buffer unless the
        // cell was filled in the meantime, then hand back whatever is stored.
        Ok(self.bytes.get_or_init(|| Arc::new(bytes)))
    }

    /// Return the per-page slots, creating them on first use.
    ///
    /// The document is loaded with pdfium on a blocking task to learn the
    /// page count, and one empty slot is created per page. If the document
    /// cannot be loaded, the failure is logged at trace level and the slot
    /// list is empty.
    ///
    /// # Errors
    /// Returns an error if the item's bytes cannot be read or if the
    /// blocking task fails.
    async fn init_pages(&self) -> Result<&Vec<OnceLock<PileValue<'a>>>, std::io::Error> {
        if let Some(cached) = self.pages.get() {
            return Ok(cached);
        }

        let bytes = Arc::clone(self.get_bytes().await?);
        let count = tokio::task::spawn_blocking(move || {
            let pdfium = Pdfium::default();
            let doc = pdfium
                .load_pdf_from_byte_slice(&bytes, None)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
            Ok::<_, std::io::Error>(doc.pages().len() as usize)
        })
        .await
        .map_err(std::io::Error::other)?;

        let slots: Vec<OnceLock<PileValue<'a>>> = match count {
            Ok(n) => (0..n).map(|_| OnceLock::new()).collect(),
            Err(error) => {
                trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
                Vec::new()
            }
        };

        Ok(self.pages.get_or_init(|| slots))
    }
}
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor<'_> {
    /// Get page `idx` rendered as a PNG blob.
    ///
    /// - returns `None` if `idx` is not a page of this document
    /// - returns `Some(Null)` if the page exists but could not be rendered
    ///   (the failure is logged at trace level)
    ///
    /// The page is rendered on a blocking task with a target width of 1024
    /// and stored in its slot, so later requests for it reuse the result.
    ///
    /// # Errors
    /// Returns an error if the item's bytes cannot be read or if the
    /// blocking task fails.
    async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        /// Wrap a pdfium failure as an `InvalidData` I/O error.
        fn invalid_data(error: impl ToString) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidData, error.to_string())
        }

        let pages = self.init_pages().await?;
        let Some(slot) = pages.get(idx) else {
            return Ok(None);
        };
        if let Some(cached) = slot.get() {
            return Ok(Some(cached));
        }

        // Checked conversion instead of a silently truncating `as u16` cast.
        // An index that does not fit cannot name a page, so it is out of range.
        let Ok(page_index) = u16::try_from(idx) else {
            return Ok(None);
        };

        let bytes = Arc::clone(self.get_bytes().await?);
        let png = tokio::task::spawn_blocking(move || {
            let pdfium = Pdfium::default();
            let doc = pdfium
                .load_pdf_from_byte_slice(&bytes, None)
                .map_err(invalid_data)?;
            let render_config = PdfRenderConfig::new().set_target_width(1024);
            let page = doc.pages().get(page_index).map_err(invalid_data)?;
            let image = page
                .render_with_config(&render_config)
                .map_err(invalid_data)?
                .as_image();

            let mut png_bytes = Vec::new();
            image
                .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
                .map_err(|e| std::io::Error::other(e.to_string()))?;
            Ok::<_, std::io::Error>(png_bytes)
        })
        .await
        .map_err(std::io::Error::other)?;

        let value = match png {
            Ok(bytes) => PileValue::Blob {
                mime: mime::IMAGE_PNG,
                bytes: Arc::new(bytes),
            },
            Err(error) => {
                trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
                PileValue::Null
            }
        };

        Ok(Some(slot.get_or_init(|| value)))
    }

    /// Get the number of pages in this document.
    async fn len(&self) -> Result<usize, std::io::Error> {
        Ok(self.init_pages().await?.len())
    }
}

View File

@@ -4,7 +4,8 @@ use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock}; use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::trace; use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfTextExtractor<'a> { pub struct PdfTextExtractor<'a> {
item: &'a Item, item: &'a Item,
@@ -94,7 +95,7 @@ impl<'a> PdfTextExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for PdfTextExtractor<'_> { impl ObjectExtractor for PdfTextExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -3,7 +3,7 @@ use std::sync::OnceLock;
use crate::{ use crate::{
Item, PileValue, Item, PileValue,
extract::{Extractor, TomlExtractor}, extract::{ObjectExtractor, TomlExtractor},
}; };
pub struct SidecarExtractor<'a> { pub struct SidecarExtractor<'a> {
@@ -21,7 +21,7 @@ impl<'a> SidecarExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for SidecarExtractor<'_> { impl ObjectExtractor for SidecarExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -1,7 +1,7 @@
use pile_config::Label; use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock}; use std::{collections::HashMap, sync::OnceLock};
use crate::{AsyncReader, Item, PileValue, extract::Extractor}; use crate::{AsyncReader, Item, PileValue, extract::ObjectExtractor};
fn toml_to_pile(value: toml::Value) -> PileValue<'static> { fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
match value { match value {
@@ -52,7 +52,7 @@ impl<'a> TomlExtractor<'a> {
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl Extractor for TomlExtractor<'_> { impl ObjectExtractor for TomlExtractor<'_> {
async fn field<'a>( async fn field<'a>(
&'a self, &'a self,
name: &Label, name: &Label,

View File

@@ -76,7 +76,7 @@ impl DbFtsIndex {
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name()); doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
doc.add_text(self.schema.get_field("_meta_key")?, key); doc.add_text(self.schema.get_field("_meta_key")?, key);
let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item))); let extractor = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(item)));
let mut empty = true; let mut empty = true;
for name in self.fts_cfg().fields.keys() { for name in self.fts_cfg().fields.keys() {
@@ -145,6 +145,8 @@ impl DbFtsIndex {
loop { loop {
val = match val { val = match val {
PileValue::String(x) => return Ok(Some(x.to_string())), PileValue::String(x) => return Ok(Some(x.to_string())),
PileValue::U64(x) => return Ok(Some(x.to_string())),
PileValue::I64(x) => return Ok(Some(x.to_string())),
#[expect(clippy::unwrap_used)] #[expect(clippy::unwrap_used)]
PileValue::Array(ref mut x) => { PileValue::Array(ref mut x) => {
@@ -177,7 +179,7 @@ impl DbFtsIndex {
continue 'outer; continue 'outer;
} }
PileValue::Extractor(_) => { PileValue::ObjectExtractor(_) => {
trace!( trace!(
message = "Skipping field, is object", message = "Skipping field, is object",
field = field_name.to_string(), field = field_name.to_string(),
@@ -186,6 +188,15 @@ impl DbFtsIndex {
continue 'outer; continue 'outer;
} }
PileValue::ListExtractor(_) => {
trace!(
message = "Skipping field, is ListExtractor",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::Blob { .. } => { PileValue::Blob { .. } => {
trace!( trace!(
message = "Skipping field, is blob", message = "Skipping field, is blob",
@@ -300,8 +311,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
FieldSpecPost::SetCase { case: Case::Lower } => match val { FieldSpecPost::SetCase { case: Case::Lower } => match val {
PileValue::Null => return None, PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None, PileValue::Blob { .. } => return None,
PileValue::Extractor(_) => return None, PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.to_lowercase().into()), PileValue::String(x) => PileValue::String(x.to_lowercase().into()),
PileValue::Array(x) => { PileValue::Array(x) => {
@@ -311,8 +325,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
FieldSpecPost::SetCase { case: Case::Upper } => match val { FieldSpecPost::SetCase { case: Case::Upper } => match val {
PileValue::Null => return None, PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None, PileValue::Blob { .. } => return None,
PileValue::Extractor(_) => return None, PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.to_uppercase().into()), PileValue::String(x) => PileValue::String(x.to_uppercase().into()),
PileValue::Array(x) => { PileValue::Array(x) => {
@@ -322,8 +339,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
FieldSpecPost::TrimSuffix { trim_suffix } => match val { FieldSpecPost::TrimSuffix { trim_suffix } => match val {
PileValue::Null => return None, PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None, PileValue::Blob { .. } => return None,
PileValue::Extractor(_) => return None, PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => { PileValue::String(x) => {
PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into()) PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into())
@@ -336,8 +356,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
FieldSpecPost::TrimPrefix { trim_prefix } => match val { FieldSpecPost::TrimPrefix { trim_prefix } => match val {
PileValue::Null => return None, PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None, PileValue::Blob { .. } => return None,
PileValue::Extractor(_) => return None, PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => { PileValue::String(x) => {
PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into()) PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into())
@@ -350,10 +373,14 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
FieldSpecPost::Join { join } => match val { FieldSpecPost::Join { join } => match val {
PileValue::Null => return None, PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None, PileValue::Blob { .. } => return None,
PileValue::Extractor(_) => return None, PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.clone()), PileValue::String(x) => PileValue::String(x.clone()),
PileValue::Array(x) => PileValue::String( PileValue::Array(x) => PileValue::String(
x.iter() x.iter()
.map(|x| apply(post, x)) .map(|x| apply(post, x))

View File

@@ -62,7 +62,7 @@ pub async fn get_field(
}; };
let extractor = MetaExtractor::new(&item); let extractor = MetaExtractor::new(&item);
let root: PileValue<'_> = PileValue::Extractor(Arc::new(extractor)); let root: PileValue<'_> = PileValue::ObjectExtractor(Arc::new(extractor));
let value = match root.query(&path).await { let value = match root.query(&path).await {
Ok(Some(v)) => v, Ok(Some(v)) => v,

View File

@@ -4,11 +4,13 @@ use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::sync::Arc; use std::sync::Arc;
use crate::extract::Extractor; use crate::extract::{ListExtractor, ObjectExtractor};
/// An immutable, lazily-computed value similar to [serde_json::Value]. /// An immutable, lazily-computed value similar to [serde_json::Value].
pub enum PileValue<'a> { pub enum PileValue<'a> {
Null, Null,
U64(u64),
I64(i64),
/// A string /// A string
String(SmartString<LazyCompact>), String(SmartString<LazyCompact>),
@@ -23,16 +25,22 @@ pub enum PileValue<'a> {
}, },
/// A lazily-computed map of {label: value} /// A lazily-computed map of {label: value}
Extractor(Arc<dyn Extractor + 'a>), ObjectExtractor(Arc<dyn ObjectExtractor + 'a>),
/// A lazily-computed array
ListExtractor(Arc<dyn ListExtractor + 'a>),
} }
impl Clone for PileValue<'_> { impl Clone for PileValue<'_> {
fn clone(&self) -> Self { fn clone(&self) -> Self {
match self { match self {
Self::Null => Self::Null, Self::Null => Self::Null,
Self::U64(x) => Self::U64(*x),
Self::I64(x) => Self::I64(*x),
Self::String(x) => Self::String(x.clone()), Self::String(x) => Self::String(x.clone()),
Self::Array(x) => Self::Array(x.clone()), Self::Array(x) => Self::Array(x.clone()),
Self::Extractor(x) => Self::Extractor(x.clone()), Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
Self::Blob { mime, bytes } => Self::Blob { Self::Blob { mime, bytes } => Self::Blob {
mime: mime.clone(), mime: mime.clone(),
bytes: bytes.clone(), bytes: bytes.clone(),
@@ -52,10 +60,13 @@ impl<'a> PileValue<'a> {
out = match &out { out = match &out {
None => return Ok(None), None => return Ok(None),
Some(Self::Null) => None, Some(Self::Null) => None,
Some(Self::U64(_)) => None,
Some(Self::I64(_)) => None,
Some(Self::Array(_)) => None, Some(Self::Array(_)) => None,
Some(Self::String(_)) => None, Some(Self::String(_)) => None,
Some(Self::Blob { .. }) => None, Some(Self::Blob { .. }) => None,
Some(Self::Extractor(e)) => e.field(field).await?, Some(Self::ListExtractor(_)) => None,
Some(Self::ObjectExtractor(e)) => e.field(field).await?,
} }
} }
@@ -63,6 +74,8 @@ impl<'a> PileValue<'a> {
out = match &out { out = match &out {
None => return Ok(None), None => return Ok(None),
Some(Self::Null) => None, Some(Self::Null) => None,
Some(Self::U64(_)) => None,
Some(Self::I64(_)) => None,
Some(Self::Blob { .. }) => None, Some(Self::Blob { .. }) => None,
Some(Self::Array(v)) => { Some(Self::Array(v)) => {
let idx = if *idx >= 0 { let idx = if *idx >= 0 {
@@ -74,7 +87,19 @@ impl<'a> PileValue<'a> {
idx.and_then(|idx| v.get(idx)) idx.and_then(|idx| v.get(idx))
} }
Some(Self::String(_)) => None, Some(Self::String(_)) => None,
Some(Self::Extractor(_)) => None, Some(Self::ObjectExtractor(_)) => None,
// Index into a lazily-computed list. A non-negative index is used as-is;
// a negative index is resolved relative to the list length, so -1 is the
// last element.
Some(Self::ListExtractor(e)) => {
    let idx = if *idx >= 0 {
        usize::try_from(*idx).ok()
    } else {
        // `idx` is negative here, so the target position is `len + idx`.
        // (Subtracting instead would always land past the end of the
        // list, so a negative index could never resolve.) Checked
        // conversions keep an oversized length or an overflowing sum
        // from wrapping; either case simply resolves to no element.
        i64::try_from(e.len().await?)
            .ok()
            .and_then(|len| len.checked_add(*idx))
            .and_then(|pos| usize::try_from(pos).ok())
    };
    match idx {
        Some(idx) => e.get(idx).await?,
        None => None,
    }
}
} }
} }
} }
@@ -93,6 +118,8 @@ impl<'a> PileValue<'a> {
pub async fn to_json(&self) -> Result<Value, std::io::Error> { pub async fn to_json(&self) -> Result<Value, std::io::Error> {
Ok(match self { Ok(match self {
Self::Null => Value::Null, Self::Null => Value::Null,
Self::U64(x) => Value::Number((*x).into()),
Self::I64(x) => Value::Number((*x).into()),
// TODO: replace with something meaningful // TODO: replace with something meaningful
Self::Blob { mime, bytes } => { Self::Blob { mime, bytes } => {
@@ -108,7 +135,7 @@ impl<'a> PileValue<'a> {
Value::Array(arr) Value::Array(arr)
} }
Self::Extractor(e) => { Self::ObjectExtractor(e) => {
let keys = e.fields().await?; let keys = e.fields().await?;
let mut map = Map::new(); let mut map = Map::new();
for k in &keys { for k in &keys {
@@ -120,6 +147,20 @@ impl<'a> PileValue<'a> {
} }
Value::Object(map) Value::Object(map)
} }
Self::ListExtractor(e) => {
let len = e.len().await?;
let mut list = Vec::with_capacity(len);
for i in 0..len {
#[expect(clippy::expect_used)]
let v = e.get(i)
.await?
.expect("value must be present according to length");
list.push(Box::pin(v.to_json()).await?);
}
Value::Array(list)
}
}) })
} }
} }

View File

@@ -73,7 +73,7 @@ impl CliCmd for AnnotateCommand {
}; };
let meta = MetaExtractor::new(&item); let meta = MetaExtractor::new(&item);
let extractor = PileValue::Extractor(Arc::new(meta)); let extractor = PileValue::ObjectExtractor(Arc::new(meta));
let Some(value) = let Some(value) =
index.get_field(&extractor, &field).await.with_context(|| { index.get_field(&extractor, &field).await.with_context(|| {

View File

@@ -54,7 +54,7 @@ impl CliCmd for ProbeCommand {
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source) anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
})?; })?;
let value = PileValue::Extractor(Arc::new(MetaExtractor::new(&item))); let value = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(&item)));
value value
.to_json() .to_json()
.await .await