use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{
    collections::HashMap,
    io::BufReader,
    sync::{Arc, OnceLock},
};
use tracing::trace;

use crate::{
    extract::traits::ObjectExtractor,
    value::{Item, PileValue, SyncReadBridge},
};

/// Page count plus `(field name, optional value)` pairs collected from a PDF
/// on the blocking thread, before they are converted into [`PileValue`]s.
type RawPdfMeta = (u64, Vec<(&'static str, Option<String>)>);

/// Exposes a PDF's page count and info-dictionary entries as labelled fields.
///
/// The metadata map is built on first access and cached in `output`; later
/// lookups reuse the cached map.
pub struct PdfMetaExtractor {
    item: Item,
    output: OnceLock<HashMap<Label, PileValue>>,
}

impl PdfMetaExtractor {
    /// Creates an extractor holding a clone of `item`.
    ///
    /// Nothing is read from the item until a field is requested.
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Returns the cached metadata map, computing it on first use.
    ///
    /// # Errors
    ///
    /// Returns an error if opening the item for reading fails or if the
    /// blocking task cannot be joined. Failures *inside* the blocking task
    /// (reading the bytes or parsing the PDF) are not returned: they are logged
    /// at trace level and an empty map is cached instead.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        // Reading and parsing are synchronous, so run them off the async executor.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let raw_meta = tokio::task::spawn_blocking(move || read_raw_meta(reader))
            .await
            .map_err(std::io::Error::other)?;

        let (page_count, raw_meta) = match raw_meta {
            Ok(x) => x,
            Err(error) => {
                // NOTE(review): read errors from the blocking task land here too,
                // so a transient I/O failure is cached as "no metadata" — confirm
                // this best-effort behaviour is intended.
                trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        let mut output: HashMap<Label, PileValue> = HashMap::with_capacity(raw_meta.len() + 1);

        #[expect(clippy::unwrap_used)]
        output.insert(Label::new("pages").unwrap(), PileValue::U64(page_count));

        #[expect(clippy::unwrap_used)]
        for (key, val) in raw_meta {
            let label = Label::new(key).unwrap();
            let value = match val {
                Some(s) => PileValue::String(Arc::new(s.into())),
                None => PileValue::Null,
            };
            output.insert(label, value);
        }

        // If another caller initialised the cell in the meantime, its map wins
        // and ours is dropped.
        Ok(self.output.get_or_init(|| output))
    }
}

/// Reads `reader` to the end, parses the bytes as a PDF and collects the page
/// count together with the info-dictionary entries.
///
/// Info-dictionary fields that are absent are reported as `None`; when the
/// trailer has no info dictionary at all, only the page count is returned.
///
/// # Errors
///
/// Returns the underlying I/O error for read failures and for
/// `pdf::PdfError::Io`; any other parse error is wrapped as
/// [`std::io::ErrorKind::InvalidData`] carrying the error's message.
fn read_raw_meta(reader: impl std::io::Read) -> Result<RawPdfMeta, std::io::Error> {
    let mut bytes = Vec::new();
    std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;

    let file = FileOptions::cached()
        .load(bytes)
        .map_err(|error| match error {
            pdf::PdfError::Io { source } => source,
            other => std::io::Error::new(std::io::ErrorKind::InvalidData, other.to_string()),
        })?;

    // Widening cast kept exactly as in the original conversion.
    let page_count = file.num_pages() as u64;

    let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();

    if let Some(info) = &file.trailer.info_dict {
        use pdf::primitive::PdfString;

        let fields: &[(&'static str, Option<&PdfString>)] = &[
            ("title", info.title.as_ref()),
            ("author", info.author.as_ref()),
            ("subject", info.subject.as_ref()),
            ("keywords", info.keywords.as_ref()),
            ("creator", info.creator.as_ref()),
            ("producer", info.producer.as_ref()),
        ];

        meta.extend(
            fields
                .iter()
                .map(|&(key, val)| (key, val.map(|s| s.to_string_lossy()))),
        );
        meta.push((
            "creation_date",
            info.creation_date.as_ref().map(format_date),
        ));
        meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
    }

    Ok((page_count, meta))
}

/// Formats a PDF date as `YYYY-MM-DDTHH:MM:SS` followed by a zone suffix.
///
/// The suffix is `Z` for [`TimeRel::Universal`], `+HH:MM` for
/// [`TimeRel::Later`] and `-HH:MM` for [`TimeRel::Earlier`].
fn format_date(d: &Date) -> String {
    let tz = match d.rel {
        TimeRel::Universal => "Z".to_owned(),
        TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
        TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
    };
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
        d.year, d.month, d.day, d.hour, d.minute, d.second, tz
    )
}

#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor {
    /// Looks up a single metadata field by label.
    ///
    /// Returns `Ok(None)` when `args` is supplied (this extractor accepts no
    /// arguments) or when the label is not present in the metadata map.
    async fn field(
        &self,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        if args.is_some() {
            return Ok(None);
        }
        Ok(self.get_inner().await?.get(name).cloned())
    }

    /// Lists the labels of every field in the metadata map.
    // NOTE(review): return type reconstructed as `Vec<Label>` from the body;
    // confirm against the `ObjectExtractor` trait definition.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(self.get_inner().await?.keys().cloned().collect())
    }
}