Add pdf extractors

2026-03-06 16:26:09 -08:00
parent d51b8b51bf
commit 32c611186f
9 changed files with 669 additions and 29 deletions
--- a/crates/pile-dataset/src/extract/pdf/pdf_text.rs
+++ b/crates/pile-dataset/src/extract/pdf/pdf_text.rs
@@ -0,0 +1,79 @@
+use pdf::content::{Op, TextDrawAdjusted};
+use pdf::file::FileOptions;
+use pile_config::Label;
+use std::{collections::HashMap, sync::OnceLock};
+
+use crate::{FileItem, PileValue, extract::Extractor};
+
+/// Extractor that exposes the text drawn in a PDF file as a `text` field.
+///
+/// The PDF is not read at construction time; it is parsed the first time a
+/// field is requested and the resulting field map is cached in `output`.
+pub struct PdfTextExtractor<'a> {
+	/// The file to extract from; its `path` is opened on first access.
+	item: &'a FileItem,
+	/// Field map, filled once by the first successful extraction.
+	output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
+}
+
+impl<'a> PdfTextExtractor<'a> {
+	/// Smallest rightward displacement inside a `TJ` array (in thousandths of a
+	/// text-space unit) that is treated as a word gap rather than kerning.
+	///
+	/// This is a heuristic: kerning pairs are usually well below this value,
+	/// while producers that emit no space glyphs use larger offsets between words.
+	const WORD_GAP: f32 = 200.0;
+
+	/// Creates an extractor for `item`.
+	///
+	/// No I/O happens here; the PDF is opened lazily on first field access.
+	pub fn new(item: &'a FileItem) -> Self {
+		Self {
+			item,
+			output: OnceLock::new(),
+		}
+	}
+
+	/// Wraps a PDF parsing failure in an `InvalidData` [`std::io::Error`],
+	/// keeping the original message.
+	fn invalid_data(err: impl ToString) -> std::io::Error {
+		std::io::Error::new(std::io::ErrorKind::InvalidData, err.to_string())
+	}
+
+	/// Returns the cached field map, extracting the PDF's text on first use.
+	///
+	/// Walks every page's content stream and collects the strings shown by
+	/// `TextDraw` and `TextDrawAdjusted` operations; the collected runs are
+	/// joined with single spaces and stored under the `text` label.
+	///
+	/// # Errors
+	///
+	/// Returns an `InvalidData` error if the file cannot be opened as a PDF,
+	/// a page cannot be loaded, or a content stream cannot be decoded.
+	/// Nothing is cached in that case, so a later call retries the extraction.
+	fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
+		if let Some(cached) = self.output.get() {
+			return Ok(cached);
+		}
+
+		let file = FileOptions::cached()
+			.open(&self.item.path)
+			.map_err(Self::invalid_data)?;
+
+		let mut text_parts: Vec<String> = Vec::new();
+
+		for page in file.pages() {
+			let page = page.map_err(Self::invalid_data)?;
+
+			// Pages without a content stream contribute no text.
+			let Some(content) = &page.contents else {
+				continue;
+			};
+
+			let ops = content
+				.operations(&file.resolver())
+				.map_err(Self::invalid_data)?;
+
+			for op in ops {
+				match op {
+					Op::TextDraw { text } => text_parts.push(text.to_string_lossy()),
+					Op::TextDrawAdjusted { array } => {
+						// The strings of one `TJ` array belong to a single text run;
+						// the numbers between them only shift the pen position.
+						// Joining every fragment with a space would split kerned
+						// words, so fragments are concatenated and a space is
+						// inserted only where the shift is wide enough to be a
+						// word gap.
+						let mut run = String::new();
+						let mut gap_pending = false;
+
+						for item in array {
+							match item {
+								TextDrawAdjusted::Text(text) => {
+									let piece = text.to_string_lossy();
+									let needs_space = gap_pending
+										&& !run.is_empty() && !run.ends_with(' ')
+										&& !piece.starts_with(' ');
+									if needs_space {
+										run.push(' ');
+									}
+									gap_pending = false;
+									run.push_str(&piece);
+								}
+								// The offset is subtracted from the pen position, so a
+								// large negative value moves the next glyph to the right.
+								TextDrawAdjusted::Spacing(offset) => {
+									if offset <= -Self::WORD_GAP {
+										gap_pending = true;
+									}
+								}
+							}
+						}
+
+						// Arrays that show no text must not add a stray separator.
+						if !run.is_empty() {
+							text_parts.push(run);
+						}
+					}
+					_ => {}
+				}
+			}
+		}
+
+		let text = text_parts.join(" ");
+
+		// "text" is a fixed literal, so label construction is expected to succeed.
+		#[expect(clippy::unwrap_used)]
+		let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
+
+		// If another thread finished first, its map wins and ours is dropped.
+		Ok(self.output.get_or_init(|| output))
+	}
+}
+
+impl Extractor<FileItem> for PdfTextExtractor<'_> {
+	/// Looks up the field called `name`, running the extraction if it has not
+	/// happened yet. Yields `Ok(None)` when this extractor has no such field.
+	fn field<'a>(
+		&'a self,
+		name: &Label,
+	) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
+		let values = self.get_inner()?;
+		Ok(values.get(name))
+	}
+
+	/// Lists the labels of every field this extractor produces, running the
+	/// extraction if it has not happened yet.
+	fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
+		let values = self.get_inner()?;
+		let mut labels = Vec::with_capacity(values.len());
+		labels.extend(values.keys().cloned());
+		Ok(labels)
+	}
+}