use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{
    collections::HashMap,
    io::BufReader,
    sync::{Arc, OnceLock},
};
use tracing::trace;

use crate::{
    extract::traits::ObjectExtractor,
    value::{Item, PileValue, SyncReadBridge},
};

/// Extracts the text drawn by a PDF item's content streams and exposes it as
/// a single `text` field.
///
/// The extracted fields are stored in `output` the first time they are
/// computed and reused by every later call.
pub struct PdfTextExtractor {
    /// The item whose bytes are read and parsed as a PDF.
    item: Item,

    /// Cached extraction result. Holds an empty map when the PDF could not be
    /// processed.
    output: OnceLock<HashMap<Label, PileValue>>,
}

impl PdfTextExtractor {
    /// Creates an extractor for `item`. Nothing is read until a field is
    /// requested.
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Returns the cached field map, computing it first if necessary.
    ///
    /// # Errors
    ///
    /// Returns an error if the item cannot be opened for reading or if the
    /// blocking extraction task fails to complete. A PDF that cannot be read
    /// or parsed is *not* an error: the failure is traced and an empty map is
    /// cached instead.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);

        // Reading and parsing are synchronous, so they run on the blocking
        // thread pool rather than on the async executor.
        let raw_text = tokio::task::spawn_blocking(move || extract_text(reader))
            .await
            .map_err(std::io::Error::other)?;

        let raw_text = match raw_text {
            Ok(x) => x,
            Err(error) => {
                trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([(
            Label::new("text").unwrap(),
            PileValue::String(Arc::new(raw_text.into())),
        )]);

        Ok(self.output.get_or_init(|| output))
    }
}

/// Wraps any displayable error in an [`std::io::Error`] of kind `InvalidData`.
fn invalid_data<E: std::fmt::Display>(error: E) -> std::io::Error {
    std::io::Error::new(std::io::ErrorKind::InvalidData, error.to_string())
}

/// Reads a whole PDF from `reader` and returns the text of its pages.
///
/// Only `TextDraw` operations and the text entries of `TextDrawAdjusted`
/// operations are collected; every other operation is ignored. The collected
/// pieces are joined with a single space.
///
/// # Errors
///
/// I/O errors are returned unchanged. Any other PDF error is reported as
/// `InvalidData` carrying the error's message.
fn extract_text<R: std::io::Read>(reader: R) -> Result<String, std::io::Error> {
    let mut bytes = Vec::new();
    std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;

    let file = match FileOptions::cached().load(bytes) {
        Ok(x) => x,
        Err(pdf::PdfError::Io { source }) => return Err(source),
        Err(error) => return Err(invalid_data(error)),
    };

    let mut text_parts: Vec<String> = Vec::new();

    for page in file.pages() {
        let page = page.map_err(invalid_data)?;

        // Pages without a content stream contribute no text.
        let Some(content) = &page.contents else {
            continue;
        };

        let ops = content
            .operations(&file.resolver())
            .map_err(invalid_data)?;

        for op in ops {
            match op {
                Op::TextDraw { text } => {
                    text_parts.push(text.to_string_lossy());
                }

                Op::TextDrawAdjusted { array } => {
                    // Non-text entries of the array carry no characters.
                    for part in array {
                        if let TextDrawAdjusted::Text(text) = part {
                            text_parts.push(text.to_string_lossy());
                        }
                    }
                }

                _ => {}
            }
        }
    }

    Ok(text_parts.join(" "))
}

#[async_trait::async_trait]
impl ObjectExtractor for PdfTextExtractor {
    /// Returns the value of field `name`, or `None` if this extractor does
    /// not provide it.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        Ok(self.get_inner().await?.get(name).cloned())
    }

    /// Returns the labels of every field this extractor provides.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(self.get_inner().await?.keys().cloned().collect())
    }
}