Files
pile/crates/pile-dataset/src/extract/pdf/pdf_text.rs
rm-dr d41ed7a8cd
Some checks failed
CI / Typos (push) Successful in 23s
CI / Clippy (push) Failing after 1m47s
CI / Build and test (push) Failing after 2m10s
Read files incrementally
2026-03-10 09:19:17 -07:00

111 lines
2.7 KiB
Rust

use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
/// Extracts the text content of a PDF [`Item`] and exposes it as a labelled field.
///
/// The extraction result is computed on first access and cached in `output`,
/// so later field lookups reuse the stored map.
pub struct PdfTextExtractor<'a> {
/// The item whose bytes are read and parsed as a PDF.
item: &'a Item,
/// Cached extraction result: filled at most once, holding either the
/// extracted `text` entry or an empty map when the PDF could not be processed.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfTextExtractor<'a> {
    /// Creates an extractor for `item`.
    ///
    /// Construction only stores the reference and an empty cache; the item is
    /// not read until a field is requested.
    pub fn new(item: &'a Item) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the cached field map, computing it on first use.
    ///
    /// The item is read to the end and parsed on a blocking task. On success
    /// the map holds a single `text` entry. If reading or parsing the PDF
    /// fails inside the blocking task, the failure is logged at debug level
    /// and an empty map is cached instead.
    ///
    /// # Errors
    ///
    /// Returns an error if opening the item for reading fails, or if the
    /// blocking task itself cannot be joined.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);

        // Reading and PDF parsing are synchronous, so keep them off the async executor.
        let extracted = tokio::task::spawn_blocking(move || {
            let mut bytes = Vec::new();
            std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
            extract_pdf_text(bytes)
        })
        .await
        .map_err(std::io::Error::other)?;

        let raw_text = match extracted {
            Ok(text) => text,
            Err(error) => {
                debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        #[expect(clippy::unwrap_used)]
        let text_label = Label::new("text").unwrap();

        // `get_or_init` stores our value unless another caller filled the cache
        // first, and hands back whichever map ended up stored. This avoids a
        // separate `set` followed by `get().unwrap()`.
        Ok(self
            .output
            .get_or_init(|| HashMap::from([(text_label, PileValue::String(raw_text.into()))])))
    }
}

/// Wraps any displayable error as an `InvalidData` I/O error carrying its message.
fn invalid_data<E: ToString>(error: E) -> std::io::Error {
    std::io::Error::new(std::io::ErrorKind::InvalidData, error.to_string())
}

/// Parses `bytes` as a PDF and returns the text of all text-drawing operations,
/// in page order, joined by single spaces.
///
/// I/O errors reported by the PDF loader are returned unchanged; every other
/// PDF error is converted to an `InvalidData` I/O error.
fn extract_pdf_text(bytes: Vec<u8>) -> Result<String, std::io::Error> {
    let file = match FileOptions::cached().load(bytes) {
        Ok(file) => file,
        Err(pdf::PdfError::Io { source }) => return Err(source),
        Err(error) => return Err(invalid_data(error)),
    };

    let mut text_parts: Vec<String> = Vec::new();
    for page in file.pages() {
        let page = page.map_err(invalid_data)?;

        // Pages without a content stream contribute no text.
        let Some(content) = &page.contents else {
            continue;
        };

        let ops = content.operations(&file.resolver()).map_err(invalid_data)?;
        for op in ops {
            match op {
                Op::TextDraw { text } => text_parts.push(text.to_string_lossy()),
                Op::TextDrawAdjusted { array } => {
                    // NOTE(review): each string fragment of one adjusted draw is
                    // pushed separately and later joined with a space, so a word
                    // split across fragments may come out with spaces inside it.
                    // Confirm whether that is intended.
                    for item in array {
                        if let TextDrawAdjusted::Text(text) = item {
                            text_parts.push(text.to_string_lossy());
                        }
                    }
                }
                _ => {}
            }
        }
    }

    Ok(text_parts.join(" "))
}
#[async_trait::async_trait]
impl Extractor for PdfTextExtractor<'_> {
    /// Looks up the value stored under `name` in the extracted field map.
    ///
    /// Returns `Ok(None)` when no such field exists; errors from computing the
    /// field map are propagated.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let value = extracted.get(name);
        Ok(value)
    }

    /// Lists the labels of every field present in the extracted field map.
    ///
    /// Errors from computing the field map are propagated.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let labels: Vec<Label> = extracted.keys().cloned().collect();
        Ok(labels)
    }
}