// NOTE(review): several generic argument lists in this copy of the file appear
// to be missing (e.g. `OnceLock>>`, `HashMap> =`, `Option)>`, `Result>, ...`).
// This looks like extraction damage rather than intent — confirm the exact
// type arguments against the repository version before building.

use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::trace;

use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};

/// Extracts PDF metadata (page count and info-dictionary fields) from an
/// [`Item`], computing the result on first use and caching it.
pub struct PdfMetaExtractor<'a> {
    /// The item whose content is read and parsed as a PDF.
    item: &'a Item,
    /// Cache for the extracted metadata; filled by `get_inner`.
    output: OnceLock>>,
}

impl<'a> PdfMetaExtractor<'a> {
    /// Creates an extractor for `item` with an empty cache.
    ///
    /// Nothing is read or parsed here; the work happens in `get_inner`.
    pub fn new(item: &'a Item) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the metadata map, building and caching it if it is not cached yet.
    ///
    /// The map holds a `"pages"` entry and, when the PDF has an info
    /// dictionary, one entry per info field (`title`, `author`, `subject`,
    /// `keywords`, `creator`, `producer`, `creation_date`, `mod_date`). A field
    /// that is absent from the dictionary is stored as `PileValue::Null`.
    ///
    /// # Errors
    ///
    /// Returns an error when `self.item.read()` fails or when the blocking
    /// task cannot be joined. An error produced *inside* the blocking task is
    /// not returned: it is logged and an empty map is cached instead.
    async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> {
        // Fast path: a previous call already populated the cache.
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);

        // Reading the whole file and parsing it are synchronous, so they run
        // on tokio's blocking pool rather than on the async task.
        let raw_meta = tokio::task::spawn_blocking(move || {
            // The PDF loader is handed the full byte buffer.
            let mut bytes = Vec::new();
            std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;

            let file = match FileOptions::cached().load(bytes) {
                Ok(x) => x,
                // Pass the underlying I/O error through unchanged.
                Err(pdf::PdfError::Io { source }) => return Err(source),
                // Every other parser error becomes an `InvalidData` I/O error
                // carrying the parser's message.
                Err(error) => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        error.to_string(),
                    ));
                }
            };

            let page_count = file.num_pages();

            // (field name, value) pairs; stays empty when there is no info dictionary.
            let mut meta: Vec<(&'static str, Option)> = Vec::new();

            if let Some(info) = &file.trailer.info_dict {
                use pdf::primitive::PdfString;

                // Text fields, all converted the same way below.
                let fields: &[(&'static str, Option<&PdfString>)] = &[
                    ("title", info.title.as_ref()),
                    ("author", info.author.as_ref()),
                    ("subject", info.subject.as_ref()),
                    ("keywords", info.keywords.as_ref()),
                    ("creator", info.creator.as_ref()),
                    ("producer", info.producer.as_ref()),
                ];

                for (key, val) in fields {
                    meta.push((key, val.map(|s| s.to_string_lossy())));
                }

                // Date fields are rendered to text by `format_date`.
                meta.push((
                    "creation_date",
                    info.creation_date.as_ref().map(format_date),
                ));
                meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
            }

            Ok::<_, std::io::Error>((page_count, meta))
        })
        .await
        // A failure to join the blocking task is propagated to the caller.
        .map_err(std::io::Error::other)?;

        let (page_count, raw_meta) = match raw_meta {
            Ok(x) => x,
            // Any error from the closure above (a failed read as well as a
            // parse failure) is logged at trace level, and an empty map is
            // cached so the file is not processed again.
            Err(error) => {
                trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        let mut output: HashMap> = HashMap::new();

        // NOTE(review): presumably `Label::new` cannot fail for these fixed
        // keys, which is why `unwrap` is accepted here — confirm.
        #[expect(clippy::unwrap_used)]
        output.insert(
            Label::new("pages").unwrap(),
            PileValue::U64(page_count as u64),
        );

        #[expect(clippy::unwrap_used)]
        for (key, val) in raw_meta {
            let label = Label::new(key).unwrap();
            // Fields missing from the info dictionary are kept as explicit nulls.
            let value = match val {
                Some(s) => PileValue::String(s.into()),
                None => PileValue::Null,
            };
            output.insert(label, value);
        }

        // `get_or_init` returns the stored value; if the cell was filled in
        // the meantime, that existing value is returned and `output` is dropped.
        return Ok(self.output.get_or_init(|| output));
    }
}

/// Renders a PDF date as `YYYY-MM-DDThh:mm:ss` followed by a timezone suffix.
///
/// The suffix is `Z` for `TimeRel::Universal`, `+hh:mm` for `TimeRel::Later`
/// and `-hh:mm` for `TimeRel::Earlier`, built from `tz_hour` and `tz_minute`.
/// All numeric fields are zero-padded (four digits for the year, two otherwise).
fn format_date(d: &Date) -> String {
    let tz = match d.rel {
        TimeRel::Universal => "Z".to_owned(),
        TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
        TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
    };
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
        d.year, d.month, d.day, d.hour, d.minute, d.second, tz
    )
}

/// Exposes the cached PDF metadata through the `ObjectExtractor` interface.
#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor<'_> {
    /// Looks up a single metadata field by label.
    ///
    /// Yields the result of `HashMap::get` on the cached map, so a label that
    /// is not present gives `None`. Errors from `get_inner` are propagated.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result>, std::io::Error> {
        Ok(self.get_inner().await?.get(name))
    }

    /// Returns clones of all labels present in the cached metadata map.
    ///
    /// Errors from `get_inner` are propagated.
    async fn fields(&self) -> Result, std::io::Error> {
        Ok(self.get_inner().await?.keys().cloned().collect())
    }
}