Extractor refactor, S3 support
Some checks failed
CI / Typos (push) Successful in 1m5s
CI / Clippy (push) Failing after 1m50s
CI / Build and test (push) Successful in 3m1s

This commit is contained in:
2026-03-06 17:49:12 -08:00
parent 77b3125af4
commit aecc84233b
31 changed files with 2676 additions and 675 deletions

View File

@@ -1,40 +1,44 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pdf::primitive::{Date, PdfString, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{FileItem, PileValue, extract::Extractor};
use crate::{Item, PileValue, extract::Extractor};
// NOTE(review): this span appears to come from a rendered diff with the +/- markers
// stripped, so each field is listed twice: the pre-change `FileItem` form first,
// then the post-change `Item` form. Confirm which side applies before compiling.
/// Extractor that exposes PDF metadata for a single borrowed item.
///
/// The extracted fields are stored in `output`, which is filled at most once
/// (see `get_inner`) and then reused by every later lookup.
pub struct PdfMetaExtractor<'a> {
// The item whose contents are parsed as a PDF (pre-change type).
item: &'a FileItem,
// Write-once cache mapping a field label to its extracted value (pre-change type).
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
// The item whose contents are parsed as a PDF (post-change type).
item: &'a Item,
// Write-once cache mapping a field label to its extracted value (post-change type).
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfMetaExtractor<'a> {
// NOTE(review): the two signature lines below look like the old (`FileItem`) and
// new (`Item`) sides of a diff shown together — only one can apply at a time.
/// Creates an extractor for `item` with an empty output cache.
///
/// Nothing is read or parsed here; the cache starts unset.
pub fn new(item: &'a FileItem) -> Self {
pub fn new(item: &'a Item) -> Self {
Self {
item,
// Starts uninitialized; `get_inner` is the code that populates it.
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
return Ok(self.output.get_or_init(|| HashMap::new()));
}
let bytes = self.item.read().await?.read_to_end().await?;
let file = FileOptions::cached()
.open(&self.item.path)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
if let Some(info) = &file.trailer.info_dict {
let fields: &[(&str, Option<&_>)] = &[
let fields: &[(&str, Option<&PdfString>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
@@ -88,15 +92,16 @@ fn format_date(d: &Date) -> String {
)
}
// NOTE(review): this span appears to interleave the pre-change lines (synchronous,
// `Extractor<FileItem>`) with the post-change lines (async via `async_trait`,
// plain `Extractor`) of a diff. It is not compilable as shown — confirm which
// side is intended before reuse.
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
// Pre-change signature of `field` (synchronous).
fn field<'a>(
#[async_trait::async_trait]
impl Extractor for PdfMetaExtractor<'_> {
/// Looks up a single metadata field by label.
///
/// Returns `Ok(None)` when the map produced by `get_inner` has no entry for
/// `name`; any error from `get_inner` is propagated unchanged.
async fn field<'a>(
&'a self,
name: &Label,
// Pre-change return type and body.
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
Ok(self.get_inner()?.get(name))
// Post-change return type and body: identical lookup, but `get_inner` is awaited.
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
Ok(self.get_inner().await?.get(name))
}
/// Lists the labels of every field present in the map produced by `get_inner`.
///
/// The labels are cloned out of the cached map's keys; any error from
/// `get_inner` is propagated unchanged.
// Pre-change (synchronous) form.
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner()?.keys().cloned().collect())
// Post-change (async) form.
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}