use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::debug;

use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};

/// Extracts the text drawn on the pages of a PDF item and exposes it as a
/// single field labelled `text`.
///
/// Extraction runs at most once per extractor: the resulting field map is
/// stored in `output` and every later call reads from that cache.
pub struct PdfTextExtractor<'a> {
    /// The item whose bytes are read and parsed as a PDF.
    item: &'a Item,
    /// Cache for the extracted field map (a `HashMap`, see `get_inner`).
    // NOTE(review): the generic arguments of this type appear to have been
    // stripped from the text (`OnceLock>>` is not valid Rust). The cell is
    // initialised with `HashMap::new` and looked up by `&Label` below, so it
    // presumably wraps a `HashMap` keyed by `Label` — restore the exact type
    // from version control.
    output: OnceLock>>,
}

impl<'a> PdfTextExtractor<'a> {
    /// Creates an extractor for `item` with an empty cache; nothing is read
    /// or parsed until a field is requested.
    pub fn new(item: &'a Item) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the cached field map, computing it on the first call.
    ///
    /// The item is read fully into memory and parsed inside
    /// `tokio::task::spawn_blocking`. Every `TextDraw` string and every text
    /// element of a `TextDrawAdjusted` array is collected, in page and
    /// operation order, and the pieces are joined with a single space.
    ///
    /// # Errors
    ///
    /// Returns an error if opening the item's reader fails or if the blocking
    /// task cannot be joined. Failures *inside* the task (reading the bytes,
    /// loading the PDF, decoding a page) are not returned: they are logged at
    /// debug level and an empty map is cached instead.
    // NOTE(review): the return type below has also lost its generic arguments
    // (`Result<&HashMap>, …>`); it is a reference to the map held in `output`.
    async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> {
        // Fast path: extraction already ran (successfully or not).
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        // `reader` is consumed through `std::io::Read` below, so the bridge
        // gives synchronous access to the item's reader.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let raw_text = tokio::task::spawn_blocking(move || {
            let mut bytes = Vec::new();
            std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;

            // Pass genuine I/O errors through unchanged; report every other
            // PDF error as `InvalidData` carrying the error's message.
            let file = match FileOptions::cached().load(bytes) {
                Ok(x) => x,
                Err(pdf::PdfError::Io { source }) => return Err(source),
                Err(error) => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        error.to_string(),
                    ));
                }
            };

            // NOTE(review): the element type was stripped here (`Vec = …`).
            // The values pushed are the results of `to_string_lossy()` and
            // the vector is `join`ed into the final text.
            let mut text_parts: Vec = Vec::new();

            for page in file.pages() {
                let page = page.map_err(|e| {
                    std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
                })?;

                // Pages without a content stream contribute no text.
                if let Some(content) = &page.contents {
                    let ops = content.operations(&file.resolver()).map_err(|e| {
                        std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
                    })?;

                    for op in ops {
                        match op {
                            Op::TextDraw { text } => {
                                text_parts.push(text.to_string_lossy());
                            }
                            Op::TextDrawAdjusted { array } => {
                                // Keep only the text elements; other array
                                // entries (presumably spacing adjustments)
                                // are skipped.
                                for item in array {
                                    if let TextDrawAdjusted::Text(text) = item {
                                        text_parts.push(text.to_string_lossy());
                                    }
                                }
                            }
                            // All other operations are ignored.
                            _ => {}
                        }
                    }
                }
            }

            Ok::<_, std::io::Error>(text_parts.join(" "))
        })
        .await
        // Only the join failure is propagated here; the closure's own result
        // is still wrapped in `raw_text` and handled below.
        .map_err(std::io::Error::other)?;

        let raw_text = match raw_text {
            Ok(x) => x,
            Err(error) => {
                debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
                // Cache an empty map so the item is reported as having no
                // fields and later calls take the fast path instead of
                // retrying the extraction.
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        // The literal "text" is assumed to always be a valid label, hence the
        // permitted `unwrap`.
        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([(
            Label::new("text").unwrap(),
            PileValue::String(raw_text.into()),
        )]);

        // `set` fails only if the cell was filled in the meantime; in that
        // case the existing value is kept and this result is dropped.
        let _ = self.output.set(output);
        // The cell is initialised at this point either way, so `get` cannot
        // return `None`.
        #[expect(clippy::unwrap_used)]
        return Ok(self.output.get().unwrap());
    }
}

#[async_trait::async_trait]
impl Extractor for PdfTextExtractor<'_> {
    /// Looks up the field called `name` in the extracted map, triggering the
    /// extraction on first use. Propagates any error from `get_inner`.
    // NOTE(review): the return type has lost its generic arguments
    // (`Result>, …>`); the body returns the result of `HashMap::get`, i.e. an
    // optional reference to the stored value.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result>, std::io::Error> {
        Ok(self.get_inner().await?.get(name))
    }

    /// Lists the labels of all extracted fields as cloned keys of the map,
    /// triggering the extraction on first use. Propagates any error from
    /// `get_inner`.
    // NOTE(review): the return type has lost its generic arguments
    // (`Result, …>`); the body collects the map's cloned keys.
    async fn fields(&self) -> Result, std::io::Error> {
        Ok(self.get_inner().await?.keys().cloned().collect())
    }
}