Extractor rewrite

This commit is contained in:
2026-03-11 10:12:36 -07:00
parent b789255ea9
commit 078801be40
51 changed files with 676 additions and 693 deletions

View File

@@ -0,0 +1,95 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct EpubMetaExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubMetaExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let fields: &[&'static str] = &[
"title",
"creator",
"description",
"language",
"publisher",
"date",
"subject",
"identifier",
];
let meta: Vec<(&'static str, Option<String>)> =
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
Ok::<_, std::io::Error>(meta)
})
.await
.map_err(std::io::Error::other)?;
let raw_meta = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue> = HashMap::new();
#[expect(clippy::unwrap_used)]
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(Arc::new(s.into())),
None => PileValue::Null,
};
output.insert(label, value);
}
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,105 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::debug;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct EpubTextExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubTextExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_text = tokio::task::spawn_blocking(move || {
let mut doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut text_parts: Vec<String> = Vec::new();
loop {
if let Ok(content) = doc.get_current_str() {
text_parts.push(strip_html(&content));
}
if doc.go_next().is_err() {
break;
}
}
Ok::<_, std::io::Error>(text_parts.join(" "))
})
.await
.map_err(std::io::Error::other)?;
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(
Label::new("text").unwrap(),
PileValue::String(Arc::new(raw_text.into())),
)]);
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
}
/// Strip HTML/XHTML tags from a string, leaving only text nodes.
fn strip_html(html: &str) -> String {
let mut result = String::with_capacity(html.len());
let mut in_tag = false;
for c in html.chars() {
match c {
'<' => in_tag = true,
'>' => in_tag = false,
_ if !in_tag => result.push(c),
_ => {}
}
}
result
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,46 @@
use pile_config::Label;
use std::sync::Arc;
mod epub_meta;
pub use epub_meta::*;
mod epub_text;
pub use epub_text::*;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct EpubExtractor {
text: Arc<EpubTextExtractor>,
meta: Arc<EpubMetaExtractor>,
}
impl EpubExtractor {
pub fn new(item: &Item) -> Self {
Self {
text: Arc::new(EpubTextExtractor::new(item)),
meta: Arc::new(EpubMetaExtractor::new(item)),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
match name.as_str() {
"text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
_ => Ok(None),
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
])
}
}

View File

@@ -0,0 +1,96 @@
use pile_config::Label;
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct ExifExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl ExifExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_fields = tokio::task::spawn_blocking(move || {
let mut br = BufReader::new(reader);
let exif = exif::Reader::new()
.read_from_container(&mut br)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let fields: Vec<(String, String)> = exif
.fields()
.map(|f| {
(
f.tag.to_string(),
f.display_value().with_unit(&exif).to_string(),
)
})
.collect();
Ok::<_, std::io::Error>(fields)
})
.await
.map_err(std::io::Error::other)?;
let raw_fields = match raw_fields {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue> = HashMap::new();
for (tag_name, value) in raw_fields {
let Some(label) = tag_to_label(&tag_name) else {
continue;
};
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
output
.entry(label)
.or_insert_with(|| PileValue::String(Arc::new(value.into())));
}
return Ok(self.output.get_or_init(|| output));
}
}
fn tag_to_label(tag: &str) -> Option<Label> {
let sanitized: String = tag
.chars()
.map(|c| if c == ' ' { '_' } else { c })
.filter(|c| Label::VALID_CHARS.contains(*c))
.collect();
Label::new(sanitized)
}
#[async_trait::async_trait]
impl ObjectExtractor for ExifExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,162 @@
use mime::Mime;
use pile_config::Label;
use pile_flac::{FlacBlock, FlacReader};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::{ListExtractor, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
pub struct FlacImagesExtractor {
item: Item,
}
impl FlacImagesExtractor {
pub fn new(item: &Item) -> Self {
Self { item: item.clone() }
}
async fn get_images(&self) -> Result<Vec<PileValue>, std::io::Error> {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_images = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader));
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::Picture(picture) => {
images.push((picture.mime, picture.img_data));
}
FlacBlock::AudioFrame(_) => break,
_ => {}
}
}
Ok::<_, std::io::Error>(images)
})
.await
.map_err(std::io::Error::other)??;
Ok(raw_images
.into_iter()
.map(|(mime, data)| PileValue::Blob {
mime,
bytes: Arc::new(data),
})
.collect())
}
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor {
async fn get<'a>(&'a self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_images().await?.into_iter().nth(idx))
}
async fn len(&self) -> Result<usize, std::io::Error> {
Ok(self.get_images().await?.len())
}
}
pub struct FlacExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
images: Option<PileValue>,
}
impl FlacExtractor {
pub fn new(item: &Item) -> Self {
let is_flac = match item {
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
Item::S3 { key, .. } => key.ends_with(".flac"),
};
let images =
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
Self {
item: item.clone(),
output: OnceLock::new(),
images,
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = match &self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
if !key.ends_with(".flac") {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_tags = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader));
let mut tags: Vec<(String, String)> = Vec::new();
for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::VorbisComment(comment) => {
for (k, v) in comment.comment.comments {
tags.push((k.to_string().to_lowercase(), v.into()));
}
}
FlacBlock::AudioFrame(_) => break,
_ => {}
}
}
Ok::<_, std::io::Error>(tags)
})
.await
.map_err(std::io::Error::other)??;
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for (k, v) in raw_tags {
if let Some(label) = Label::new(k) {
output
.entry(label)
.or_default()
.push(PileValue::String(Arc::new(v.into())));
}
}
let output: HashMap<Label, PileValue> = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
}
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
if name.as_str() == "images"
&& let Some(ref images) = self.images
{
return Ok(Some(images.clone()));
}
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
if self.images.is_some() {
#[expect(clippy::unwrap_used)]
fields.push(Label::new("images").unwrap());
}
Ok(fields)
}
}

View File

@@ -0,0 +1,80 @@
use pile_config::Label;
use std::{
collections::HashMap,
path::Component,
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct FsExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl FsExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let Item::File { path, .. } = &self.item else {
return Ok(self.output.get_or_init(HashMap::new));
};
#[expect(clippy::unwrap_used)]
let output = HashMap::from([
(
Label::new("extension").unwrap(),
path.extension()
.and_then(|x| x.to_str())
.map(|x| PileValue::String(Arc::new(x.into())))
.unwrap_or(PileValue::Null),
),
(
Label::new("path").unwrap(),
path.to_str()
.map(|x| PileValue::String(Arc::new(x.into())))
.unwrap_or(PileValue::Null),
),
(
Label::new("segments").unwrap(),
path.components()
.map(|x| match x {
Component::CurDir => Some(".".to_owned()),
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
Component::ParentDir => Some("..".to_owned()),
Component::RootDir => Some("/".to_owned()),
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
})
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
.collect::<Option<Vec<_>>>()
.map(|v| PileValue::Array(Arc::new(v)))
.unwrap_or(PileValue::Null),
),
]);
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for FsExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner()?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,135 @@
use id3::Tag;
use pile_config::Label;
use std::{
borrow::Cow,
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct Id3Extractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl Id3Extractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
.await
{
Ok(Ok(tag)) => tag,
Ok(Err(id3::Error {
kind: id3::ErrorKind::NoTag,
..
})) => {
return Ok(self.output.get_or_init(HashMap::new));
}
Ok(Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
})) => return Err(e),
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
Err(e) => return Err(e.into()),
};
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for frame in tag.frames() {
if let Some(texts) = frame.content().text_values() {
let name = frame_id_to_field(frame.id());
if let Some(key) = Label::new(name) {
for text in texts {
output
.entry(key.clone())
.or_default()
.push(PileValue::String(Arc::new(text.into())));
}
}
}
}
let output = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
return Ok(self.output.get_or_init(|| output));
}
}
/// Map an ID3 frame ID to the equivalent Vorbis Comment field name.
/// Falls back to the lowercased frame ID if no mapping exists.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
match id {
// spell:off
"TIT2" => Cow::Borrowed("title"),
"TIT1" => Cow::Borrowed("grouping"),
"TIT3" => Cow::Borrowed("subtitle"),
"TPE1" => Cow::Borrowed("artist"),
"TPE2" => Cow::Borrowed("albumartist"),
"TPE3" => Cow::Borrowed("conductor"),
"TOPE" => Cow::Borrowed("originalartist"),
"TALB" => Cow::Borrowed("album"),
"TOAL" => Cow::Borrowed("originalalbum"),
"TRCK" => Cow::Borrowed("tracknumber"),
"TPOS" => Cow::Borrowed("discnumber"),
"TSST" => Cow::Borrowed("discsubtitle"),
"TDRC" | "TYER" => Cow::Borrowed("date"),
"TDOR" | "TORY" => Cow::Borrowed("originaldate"),
"TCON" => Cow::Borrowed("genre"),
"TCOM" => Cow::Borrowed("composer"),
"TEXT" => Cow::Borrowed("lyricist"),
"TPUB" => Cow::Borrowed("label"),
"TSRC" => Cow::Borrowed("isrc"),
"TBPM" => Cow::Borrowed("bpm"),
"TLAN" => Cow::Borrowed("language"),
"TMED" => Cow::Borrowed("media"),
"TMOO" => Cow::Borrowed("mood"),
"TCOP" => Cow::Borrowed("copyright"),
"TENC" => Cow::Borrowed("encodedby"),
"TSSE" => Cow::Borrowed("encodersettings"),
"TSOA" => Cow::Borrowed("albumsort"),
"TSOP" => Cow::Borrowed("artistsort"),
"TSOT" => Cow::Borrowed("titlesort"),
"MVNM" => Cow::Borrowed("movement"),
"MVIN" => Cow::Borrowed("movementnumber"),
_ => Cow::Owned(id.to_lowercase()),
// spell:on
}
}
#[async_trait::async_trait]
impl ObjectExtractor for Id3Extractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,99 @@
mod flac;
use std::{collections::HashMap, sync::Arc};
pub use flac::*;
mod id3;
pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod exif;
pub use exif::*;
mod pdf;
pub use pdf::*;
mod toml;
use pile_config::Label;
pub use toml::*;
mod sidecar;
pub use sidecar::*;
use crate::{
extract::{misc::MapExtractor, traits::ObjectExtractor},
value::{Item, PileValue},
};
pub struct ItemExtractor {
inner: MapExtractor,
}
impl ItemExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("flac").unwrap(),
PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
),
(
Label::new("id3").unwrap(),
PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
),
(
Label::new("fs").unwrap(),
PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("epub").unwrap(),
PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
),
(
Label::new("exif").unwrap(),
PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
),
]),
};
Self { inner }
}
}
#[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
return Ok(vec![
Label::new("flac").unwrap(),
Label::new("id3").unwrap(),
Label::new("fs").unwrap(),
Label::new("epub").unwrap(),
Label::new("exif").unwrap(),
Label::new("pdf").unwrap(),
Label::new("sidecar").unwrap(),
]);
}
}

View File

@@ -0,0 +1,61 @@
use pile_config::Label;
use std::sync::Arc;
#[cfg(feature = "pdfium")]
mod pdf_pages;
#[cfg(feature = "pdfium")]
pub use pdf_pages::*;
mod pdf_meta;
pub use pdf_meta::*;
mod pdf_text;
pub use pdf_text::*;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct PdfExtractor {
text: Arc<PdfTextExtractor>,
meta: Arc<PdfMetaExtractor>,
#[cfg(feature = "pdfium")]
pages: Arc<PdfPagesExtractor>,
}
impl PdfExtractor {
pub fn new(item: &Item) -> Self {
Self {
text: Arc::new(PdfTextExtractor::new(item)),
meta: Arc::new(PdfMetaExtractor::new(item)),
#[cfg(feature = "pdfium")]
pages: Arc::new(PdfPagesExtractor::new(item)),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
match name.as_str() {
"text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
#[cfg(feature = "pdfium")]
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
_ => Ok(None),
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("cover").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("pages").unwrap(),
])
}
}

View File

@@ -0,0 +1,132 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct PdfMetaExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfMetaExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let mut bytes = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
error.to_string(),
));
}
};
let page_count = file.num_pages();
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
if let Some(info) = &file.trailer.info_dict {
use pdf::primitive::PdfString;
let fields: &[(&'static str, Option<&PdfString>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
("keywords", info.keywords.as_ref()),
("creator", info.creator.as_ref()),
("producer", info.producer.as_ref()),
];
for (key, val) in fields {
meta.push((key, val.map(|s| s.to_string_lossy())));
}
meta.push((
"creation_date",
info.creation_date.as_ref().map(format_date),
));
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
}
Ok::<_, std::io::Error>((page_count, meta))
})
.await
.map_err(std::io::Error::other)?;
let (page_count, raw_meta) = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue> = HashMap::new();
#[expect(clippy::unwrap_used)]
output.insert(
Label::new("pages").unwrap(),
PileValue::U64(page_count as u64),
);
#[expect(clippy::unwrap_used)]
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(Arc::new(s.into())),
None => PileValue::Null,
};
output.insert(label, value);
}
return Ok(self.output.get_or_init(|| output));
}
}
fn format_date(d: &Date) -> String {
let tz = match d.rel {
TimeRel::Universal => "Z".to_owned(),
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
};
format!(
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
)
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,107 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use std::{
io::{BufReader, Cursor},
sync::Arc,
};
use tracing::trace;
use crate::{
extract::traits::ListExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct PdfPagesExtractor {
item: Item,
}
impl PdfPagesExtractor {
pub fn new(item: &Item) -> Self {
Self { item: item.clone() }
}
async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
let reader = SyncReadBridge::new_current(self.item.read().await?);
tokio::task::spawn_blocking(move || {
let mut b = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
Ok::<_, std::io::Error>(b)
})
.await
.map_err(std::io::Error::other)?
}
}
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
let bytes = self.get_bytes().await?;
let png = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
let doc = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
if idx >= doc.pages().len() as usize {
return Ok::<_, std::io::Error>(None);
}
let render_config = PdfRenderConfig::new().set_target_width(1024);
let page = doc
.pages()
.get(idx as u16)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let image = page
.render_with_config(&render_config)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
.as_image();
let mut png_bytes = Vec::new();
image
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
.map_err(|e| std::io::Error::other(e.to_string()))?;
Ok(Some(png_bytes))
})
.await
.map_err(std::io::Error::other)?;
let value = match png {
Ok(None) => return Ok(None),
Ok(Some(bytes)) => PileValue::Blob {
mime: mime::IMAGE_PNG,
bytes: Arc::new(bytes),
},
Err(error) => {
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
PileValue::Null
}
};
Ok(Some(value))
}
async fn len(&self) -> Result<usize, std::io::Error> {
let bytes = self.get_bytes().await?;
let count = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
let doc = pdfium
.load_pdf_from_byte_slice(&bytes, None)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
Ok::<_, std::io::Error>(doc.pages().len() as usize)
})
.await
.map_err(std::io::Error::other)?;
match count {
Ok(n) => Ok(n),
Err(error) => {
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
Ok(0)
}
}
}
// Override, extracting all pages is very slow,
// and we can't display binary in json anyway
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
Ok(serde_json::Value::String(format!(
"<PdfPages ({} pages)>",
self.len().await?
)))
}
}

View File

@@ -0,0 +1,112 @@
use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
pub struct PdfTextExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfTextExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_text = tokio::task::spawn_blocking(move || {
let mut bytes = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
error.to_string(),
));
}
};
let mut text_parts: Vec<String> = Vec::new();
for page in file.pages() {
let page = page.map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
})?;
if let Some(content) = &page.contents {
let ops = content.operations(&file.resolver()).map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
})?;
for op in ops {
match op {
Op::TextDraw { text } => {
text_parts.push(text.to_string_lossy());
}
Op::TextDrawAdjusted { array } => {
for item in array {
if let TextDrawAdjusted::Text(text) = item {
text_parts.push(text.to_string_lossy());
}
}
}
_ => {}
}
}
}
}
Ok::<_, std::io::Error>(text_parts.join(" "))
})
.await
.map_err(std::io::Error::other)?;
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(
Label::new("text").unwrap(),
PileValue::String(Arc::new(raw_text.into())),
)]);
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,45 @@
use pile_config::Label;
use std::sync::OnceLock;
use super::TomlExtractor;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct SidecarExtractor {
item: Item,
output: OnceLock<Option<TomlExtractor>>,
}
impl SidecarExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for SidecarExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name).await?),
None => Ok(Some(PileValue::Null)),
}
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.fields().await?),
None => Ok(Vec::new()),
}
}
}

View File

@@ -0,0 +1,78 @@
use pile_config::Label;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::ObjectExtractor,
value::{AsyncReader, Item, PileValue},
};
fn toml_to_pile(value: toml::Value) -> PileValue {
match value {
toml::Value::String(s) => PileValue::String(Arc::new(s.into())),
toml::Value::Integer(i) => PileValue::String(Arc::new(i.to_string().into())),
toml::Value::Float(f) => PileValue::String(Arc::new(f.to_string().into())),
toml::Value::Boolean(b) => PileValue::String(Arc::new(b.to_string().into())),
toml::Value::Datetime(d) => PileValue::String(Arc::new(d.to_string().into())),
toml::Value::Array(a) => {
PileValue::Array(Arc::new(a.into_iter().map(toml_to_pile).collect()))
}
toml::Value::Table(_) => PileValue::Null,
}
}
pub struct TomlExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl TomlExtractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let mut reader = match self.item.read().await {
Ok(r) => r,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
return Ok(self.output.get_or_init(HashMap::new));
}
Err(e) => return Err(e),
};
let bytes = reader.read_to_end().await?;
let toml: toml::Value = match toml::from_slice(&bytes) {
Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue> = match toml {
toml::Value::Table(t) => t
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
.collect(),
_ => HashMap::new(),
};
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for TomlExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,24 @@
use std::sync::Arc;
use crate::{extract::traits::ListExtractor, value::PileValue};
pub struct ArrayExtractor {
inner: Arc<Vec<PileValue>>,
}
impl ArrayExtractor {
pub fn new(inner: Arc<Vec<PileValue>>) -> Self {
Self { inner }
}
}
#[async_trait::async_trait]
impl ListExtractor for ArrayExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.inner.get(idx).cloned())
}
async fn len(&self) -> Result<usize, std::io::Error> {
Ok(self.inner.len())
}
}

View File

@@ -0,0 +1,20 @@
use pile_config::Label;
use std::collections::HashMap;
use crate::{extract::traits::ObjectExtractor, value::PileValue};
#[derive(Default)]
pub struct MapExtractor {
pub inner: HashMap<Label, PileValue>,
}
#[async_trait::async_trait]
impl ObjectExtractor for MapExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.inner.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.inner.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,8 @@
mod list;
pub use list::*;
mod vec;
pub use vec::*;
mod map;
pub use map::*;

View File

@@ -0,0 +1,17 @@
use crate::{extract::traits::ListExtractor, value::PileValue};
#[derive(Default)]
pub struct VecExtractor {
pub inner: Vec<PileValue>,
}
#[async_trait::async_trait]
impl ListExtractor for VecExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
Ok(self.inner.get(idx).cloned())
}
async fn len(&self) -> Result<usize, std::io::Error> {
Ok(self.inner.len())
}
}

View File

@@ -0,0 +1,4 @@
pub mod item;
pub mod misc;
pub mod string;
pub mod traits;

View File

@@ -0,0 +1,51 @@
use pile_config::Label;
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::{extract::traits::ObjectExtractor, value::PileValue};
pub struct StringExtractor {
item: Arc<SmartString<LazyCompact>>,
}
impl StringExtractor {
pub fn new(item: &Arc<SmartString<LazyCompact>>) -> Self {
Self { item: item.clone() }
}
}
#[async_trait::async_trait]
impl ObjectExtractor for StringExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
Ok(match name.as_str() {
"trim" => Some(PileValue::String(Arc::new(
self.item.as_str().trim().into(),
))),
"upper" => Some(PileValue::String(Arc::new(
self.item.as_str().to_lowercase().into(),
))),
"lower" => Some(PileValue::String(Arc::new(
self.item.as_str().to_uppercase().into(),
))),
"nonempty" => Some(match self.item.is_empty() {
true => PileValue::Null,
false => PileValue::String(self.item.clone()),
}),
_ => None,
})
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
return Ok(vec![
Label::new("trim").unwrap(),
Label::new("upper").unwrap(),
Label::new("lower").unwrap(),
Label::new("nonempty").unwrap(),
]);
}
}

View File

@@ -0,0 +1,68 @@
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
/// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available
async fn field(
&self,
name: &pile_config::Label,
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
/// Return all fields in this extractor.
/// `Self::field` must return [Some] for all these keys
/// and [None] for all others.
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
/// Convert this to a JSON value.
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
let keys = self.fields().await?;
let mut map = serde_json::Map::new();
for k in &keys {
let v = match self.field(k).await? {
Some(x) => x,
None => continue,
};
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
}
Ok(serde_json::Value::Object(map))
}
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
/// Get the item at index `idx`.
/// Indices start at zero, and must be consecutive.
/// - returns `None` if `idx` is out of range
/// - returns `Some(Null)` if `None` is at `idx`
async fn get(&self, idx: usize) -> Result<Option<crate::value::PileValue>, std::io::Error>;
async fn len(&self) -> Result<usize, std::io::Error>;
async fn is_empty(&self) -> Result<bool, std::io::Error> {
Ok(self.len().await? == 0)
}
/// Convert this list to a JSON value.
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
let len = self.len().await?;
let mut list = Vec::with_capacity(len);
for i in 0..len {
#[expect(clippy::expect_used)]
let v = self
.get(i)
.await?
.expect("value must be present according to length");
list.push(Box::pin(v.to_json()).await?);
}
Ok(serde_json::Value::Array(list))
}
}