Read files incrementally
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
||||
|
||||
pub struct PdfTextExtractor<'a> {
|
||||
item: &'a Item,
|
||||
@@ -24,50 +24,70 @@ impl<'a> PdfTextExtractor<'a> {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
error.to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
for page in file.pages() {
|
||||
let page = page.map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
if let Some(content) = &page.contents {
|
||||
let ops = content.operations(&file.resolver()).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
Op::TextDraw { text } => {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
Op::TextDrawAdjusted { array } => {
|
||||
for item in array {
|
||||
if let TextDrawAdjusted::Text(text) = item {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
for page in file.pages() {
|
||||
let page = page
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
if let Some(content) = &page.contents {
|
||||
let ops = content.operations(&file.resolver()).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
Op::TextDraw { text } => {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
Op::TextDrawAdjusted { array } => {
|
||||
for item in array {
|
||||
if let TextDrawAdjusted::Text(text) = item {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let text = text_parts.join(" ");
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
let output = HashMap::from([(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::String(raw_text.into()),
|
||||
)]);
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
|
||||
Reference in New Issue
Block a user