Extractor refactor, S3 support
This commit is contained in:
@@ -2,34 +2,38 @@ use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
/// Extractor that exposes values derived from a PDF item, keyed by [`Label`].
///
/// The struct only borrows the item for `'a`; the extracted map lives in
/// `output`, a `OnceLock` that `get_inner` fills at most once and then
/// returns by reference on later calls.
//
// NOTE(review): this listing is a rendered diff, so both sides of the change
// appear below. The two `FileItem` field lines look like the pre-commit
// declarations and the two `Item` field lines their replacements.
pub struct PdfTextExtractor<'a> {
|
||||
// Borrowed item, pre-commit type (`FileItem`).
item: &'a FileItem,
|
||||
// Once-initialised field map, pre-commit type (`PileValue` parameterised by `FileItem`).
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
// Borrowed item, post-commit type (`Item`).
item: &'a Item,
|
||||
// Once-initialised field map, post-commit type (`PileValue` without the item parameter).
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfTextExtractor<'a> {
|
||||
/// Creates an extractor for `item`.
///
/// Only stores the reference and an empty `OnceLock`; nothing is read or
/// parsed here.
//
// NOTE(review): rendered diff. The first signature (taking `&'a FileItem`)
// appears to be the pre-commit line and the second (taking `&'a Item`) its
// replacement; the body below is shared by both.
pub fn new(item: &'a FileItem) -> Self {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
// Left unset so the first `get_inner` call populates it.
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("pdf") {
|
||||
return Ok(self.output.get_or_init(|| HashMap::new()));
|
||||
}
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let file = FileOptions::cached()
|
||||
.open(&self.item.path)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
@@ -65,19 +69,22 @@ impl<'a> PdfTextExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
// `Extractor` implementation: both methods delegate to `get_inner` and only
// read from the map it returns.
//
// NOTE(review): rendered diff. The `Extractor<FileItem>` header and the
// non-async method lines appear to be the pre-commit version; the
// `#[async_trait::async_trait]` header, the `async fn` lines and the
// `.await` bodies appear to be their replacements.
impl Extractor<FileItem> for PdfTextExtractor<'_> {
|
||||
fn field<'a>(
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for PdfTextExtractor<'_> {
|
||||
/// Looks up one extracted value by label.
///
/// Returns `Ok(None)` when the map from `get_inner` has no entry for
/// `name`; any `std::io::Error` from `get_inner` is propagated with `?`.
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||

||||
/// Lists the labels currently present in the extracted map.
///
/// The keys are cloned into a new `Vec`; any `std::io::Error` from
/// `get_inner` is propagated with `?`.
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user