Add pdf extractors

This commit is contained in:
2026-03-06 16:26:09 -08:00
parent d51b8b51bf
commit 32c611186f
9 changed files with 669 additions and 29 deletions

View File

@@ -0,0 +1,98 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
/// Extractor that exposes a PDF file's document-information entries as labelled values.
///
/// The metadata map is built lazily and stored in `output`, so the file is only
/// opened when a field is first requested.
pub struct PdfMetaExtractor<'a> {
    /// The file item whose `path` is opened to read the metadata.
    item: &'a FileItem,

    /// Metadata map, set once after the first successful read.
    output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
}
impl<'a> PdfMetaExtractor<'a> {
pub fn new(item: &'a FileItem) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let file = FileOptions::cached()
.open(&self.item.path)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
if let Some(info) = &file.trailer.info_dict {
let fields: &[(&str, Option<&_>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
("keywords", info.keywords.as_ref()),
("creator", info.creator.as_ref()),
("producer", info.producer.as_ref()),
];
#[expect(clippy::unwrap_used)]
for (key, val) in fields {
let label = Label::new(*key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.to_string_lossy().into()),
None => PileValue::Null,
};
output.insert(label, value);
}
#[expect(clippy::unwrap_used)]
{
output.insert(
Label::new("creation_date").unwrap(),
info.creation_date
.as_ref()
.map(|d| PileValue::String(format_date(d).into()))
.unwrap_or(PileValue::Null),
);
output.insert(
Label::new("mod_date").unwrap(),
info.mod_date
.as_ref()
.map(|d| PileValue::String(format_date(d).into()))
.unwrap_or(PileValue::Null),
);
}
}
return Ok(self.output.get_or_init(|| output));
}
}
fn format_date(d: &Date) -> String {
let tz = match d.rel {
TimeRel::Universal => "Z".to_owned(),
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
};
format!(
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
)
}
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
    /// Looks up the metadata value stored under `name`.
    ///
    /// Returns `Ok(None)` when the metadata map has no entry for `name`.
    ///
    /// # Errors
    ///
    /// Propagates the error from building the metadata map.
    fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
        let meta = self.get_inner()?;
        Ok(meta.get(name))
    }

    /// Lists the labels of every entry in the metadata map.
    ///
    /// # Errors
    ///
    /// Propagates the error from building the metadata map.
    fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let meta = self.get_inner()?;
        let labels: Vec<Label> = meta.keys().cloned().collect();
        Ok(labels)
    }
}