113 lines
2.7 KiB
Rust
113 lines
2.7 KiB
Rust
use pdf::content::{Op, TextDrawAdjusted};
|
|
use pdf::file::FileOptions;
|
|
use pile_config::Label;
|
|
use std::{
|
|
collections::HashMap,
|
|
io::BufReader,
|
|
sync::{Arc, OnceLock},
|
|
};
|
|
use tracing::trace;
|
|
|
|
use crate::{
|
|
extract::traits::ObjectExtractor,
|
|
value::{Item, PileValue, SyncReadBridge},
|
|
};
|
|
|
|
pub struct PdfTextExtractor {
|
|
item: Item,
|
|
output: OnceLock<HashMap<Label, PileValue>>,
|
|
}
|
|
|
|
impl PdfTextExtractor {
|
|
pub fn new(item: &Item) -> Self {
|
|
Self {
|
|
item: item.clone(),
|
|
output: OnceLock::new(),
|
|
}
|
|
}
|
|
|
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
|
if let Some(x) = self.output.get() {
|
|
return Ok(x);
|
|
}
|
|
|
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
let raw_text = tokio::task::spawn_blocking(move || {
|
|
let mut bytes = Vec::new();
|
|
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
|
|
|
let file = match FileOptions::cached().load(bytes) {
|
|
Ok(x) => x,
|
|
Err(pdf::PdfError::Io { source }) => return Err(source),
|
|
Err(error) => {
|
|
return Err(std::io::Error::new(
|
|
std::io::ErrorKind::InvalidData,
|
|
error.to_string(),
|
|
));
|
|
}
|
|
};
|
|
|
|
let mut text_parts: Vec<String> = Vec::new();
|
|
|
|
for page in file.pages() {
|
|
let page = page.map_err(|e| {
|
|
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
|
})?;
|
|
|
|
if let Some(content) = &page.contents {
|
|
let ops = content.operations(&file.resolver()).map_err(|e| {
|
|
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
|
})?;
|
|
|
|
for op in ops {
|
|
match op {
|
|
Op::TextDraw { text } => {
|
|
text_parts.push(text.to_string_lossy());
|
|
}
|
|
Op::TextDrawAdjusted { array } => {
|
|
for item in array {
|
|
if let TextDrawAdjusted::Text(text) = item {
|
|
text_parts.push(text.to_string_lossy());
|
|
}
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok::<_, std::io::Error>(text_parts.join(" "))
|
|
})
|
|
.await
|
|
.map_err(std::io::Error::other)?;
|
|
|
|
let raw_text = match raw_text {
|
|
Ok(x) => x,
|
|
Err(error) => {
|
|
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
|
return Ok(self.output.get_or_init(HashMap::new));
|
|
}
|
|
};
|
|
|
|
#[expect(clippy::unwrap_used)]
|
|
let output = HashMap::from([(
|
|
Label::new("text").unwrap(),
|
|
PileValue::String(Arc::new(raw_text.into())),
|
|
)]);
|
|
|
|
return Ok(self.output.get_or_init(|| output));
|
|
}
|
|
}
|
|
|
|
#[async_trait::async_trait]
|
|
impl ObjectExtractor for PdfTextExtractor {
|
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
|
Ok(self.get_inner().await?.get(name).cloned())
|
|
}
|
|
|
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
|
}
|
|
}
|