Add pdf extractors

This commit is contained in:
2026-03-06 16:26:09 -08:00
parent d51b8b51bf
commit 32c611186f
9 changed files with 669 additions and 29 deletions

View File

@@ -0,0 +1,79 @@
use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
/// Extracts the text drawn in a PDF file and exposes it as a single `text` field.
///
/// The PDF is parsed on first access; the resulting field map is then kept in
/// `output` and reused by later calls.
pub struct PdfTextExtractor<'a> {
/// The file item whose `path` is opened as a PDF.
item: &'a FileItem,
/// Field map produced by the first successful extraction; empty until then.
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
}
impl<'a> PdfTextExtractor<'a> {
    /// Creates an extractor for `item`.
    ///
    /// No I/O happens here; the PDF is only opened when the fields are first
    /// requested.
    pub fn new(item: &'a FileItem) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Converts a PDF parsing failure into an [`std::io::Error`] of kind
    /// `InvalidData`, carrying the original error's message.
    fn invalid_data<E: std::fmt::Display>(err: E) -> std::io::Error {
        std::io::Error::new(std::io::ErrorKind::InvalidData, err.to_string())
    }

    /// Returns the extracted field map, parsing the PDF if it has not been
    /// parsed successfully yet.
    ///
    /// The map contains a single `text` entry: every text-drawing operation on
    /// every page, in document order, joined with single spaces.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` error if the file cannot be opened as a PDF, a
    /// page cannot be loaded, or a page's content stream cannot be decoded.
    /// Failures are not cached, so a later call will try again.
    fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
        // Fast path: a previous call already did the work.
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let file = FileOptions::cached()
            .open(&self.item.path)
            .map_err(Self::invalid_data)?;
        // The resolver does not depend on the page, so build it once.
        let resolver = file.resolver();

        let mut text_parts: Vec<String> = Vec::new();
        for page in file.pages() {
            let page = page.map_err(Self::invalid_data)?;
            // Pages without a content stream contribute no text.
            let Some(content) = &page.contents else {
                continue;
            };
            let ops = content
                .operations(&resolver)
                .map_err(Self::invalid_data)?;

            for op in ops {
                match op {
                    Op::TextDraw { text } => text_parts.push(text.to_string_lossy()),
                    Op::TextDrawAdjusted { array } => {
                        // Keep only the text entries; the other entries of the
                        // array are skipped.
                        // NOTE(review): each fragment becomes its own part and is
                        // later joined with a space, which may split a word if the
                        // PDF breaks it up for kerning -- confirm this is acceptable.
                        text_parts.extend(array.into_iter().filter_map(|piece| match piece {
                            TextDrawAdjusted::Text(text) => Some(text.to_string_lossy()),
                            _ => None,
                        }));
                    }
                    // All non-text operations are irrelevant here.
                    _ => {}
                }
            }
        }

        let text = text_parts.join(" ");
        // "text" is a fixed literal; presumably always a valid label -- the
        // unwrap is expected to be unreachable.
        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
        // If the cell was filled concurrently, the existing value wins and
        // `output` is simply dropped.
        Ok(self.output.get_or_init(|| output))
    }
}
impl Extractor<FileItem> for PdfTextExtractor<'_> {
    /// Looks up a single extracted field by label.
    ///
    /// Triggers extraction on first use. Yields `Ok(None)` when the label is
    /// not among the extracted fields, and an error if extraction fails.
    fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
        let extracted = self.get_inner()?;
        Ok(extracted.get(name))
    }

    /// Lists the labels of every extracted field.
    ///
    /// Triggers extraction on first use and propagates its error, if any.
    fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        self.get_inner()
            .map(|extracted| extracted.keys().cloned().collect())
    }
}