Extractor rewrite
This commit is contained in:
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
mod pdf_pages;
|
||||
#[cfg(feature = "pdfium")]
|
||||
pub use pdf_pages::*;
|
||||
|
||||
mod pdf_meta;
|
||||
pub use pdf_meta::*;
|
||||
|
||||
mod pdf_text;
|
||||
pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
/// Facade extractor for PDF items: dispatches to per-aspect
/// sub-extractors for text content, document metadata, and — when the
/// `pdfium` feature is enabled — rendered page images.
pub struct PdfExtractor {
    // Lazily-parsed full text of the document (pdf crate based).
    text: Arc<PdfTextExtractor>,
    // Lazily-parsed info dictionary (title, author, dates, ...).
    meta: Arc<PdfMetaExtractor>,
    // Page rasterizer; only compiled in with the `pdfium` feature.
    #[cfg(feature = "pdfium")]
    pages: Arc<PdfPagesExtractor>,
}
|
||||
|
||||
impl PdfExtractor {
    /// Builds a PDF extractor over `item`.
    ///
    /// Construction is cheap: each sub-extractor only clones the item
    /// handle; reading and parsing the PDF is deferred until a field
    /// is actually requested.
    pub fn new(item: &Item) -> Self {
        Self {
            text: Arc::new(PdfTextExtractor::new(item)),
            meta: Arc::new(PdfMetaExtractor::new(item)),
            #[cfg(feature = "pdfium")]
            pages: Arc::new(PdfPagesExtractor::new(item)),
        }
    }
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
match name.as_str() {
|
||||
"text" => self.text.field(name).await,
|
||||
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
#[cfg(feature = "pdfium")]
|
||||
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("cover").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("pages").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
132
crates/pile-value/src/extract/item/pdf/pdf_meta.rs
Normal file
132
crates/pile-value/src/extract/item/pdf/pdf_meta.rs
Normal file
@@ -0,0 +1,132 @@
|
||||
use pdf::file::FileOptions;
|
||||
use pdf::primitive::{Date, TimeRel};
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
/// Extracts the PDF info dictionary (title, author, subject, dates,
/// ...) plus the page count, exposing them as object fields.
pub struct PdfMetaExtractor {
    // Handle to the underlying stored item; cloned cheaply.
    item: Item,
    // Parse result, computed at most once. On parse failure an empty
    // map is cached, so a broken PDF is not re-parsed on every access.
    output: OnceLock<HashMap<Label, PileValue>>,
}
|
||||
|
||||
impl PdfMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
error.to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let page_count = file.num_pages();
|
||||
|
||||
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
|
||||
|
||||
if let Some(info) = &file.trailer.info_dict {
|
||||
use pdf::primitive::PdfString;
|
||||
let fields: &[(&'static str, Option<&PdfString>)] = &[
|
||||
("title", info.title.as_ref()),
|
||||
("author", info.author.as_ref()),
|
||||
("subject", info.subject.as_ref()),
|
||||
("keywords", info.keywords.as_ref()),
|
||||
("creator", info.creator.as_ref()),
|
||||
("producer", info.producer.as_ref()),
|
||||
];
|
||||
|
||||
for (key, val) in fields {
|
||||
meta.push((key, val.map(|s| s.to_string_lossy())));
|
||||
}
|
||||
|
||||
meta.push((
|
||||
"creation_date",
|
||||
info.creation_date.as_ref().map(format_date),
|
||||
));
|
||||
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>((page_count, meta))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let (page_count, raw_meta) = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
output.insert(
|
||||
Label::new("pages").unwrap(),
|
||||
PileValue::U64(page_count as u64),
|
||||
);
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for (key, val) in raw_meta {
|
||||
let label = Label::new(key).unwrap();
|
||||
let value = match val {
|
||||
Some(s) => PileValue::String(Arc::new(s.into())),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
output.insert(label, value);
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
fn format_date(d: &Date) -> String {
|
||||
let tz = match d.rel {
|
||||
TimeRel::Universal => "Z".to_owned(),
|
||||
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
};
|
||||
format!(
|
||||
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
|
||||
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
|
||||
)
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfMetaExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
107
crates/pile-value/src/extract/item/pdf/pdf_pages.rs
Normal file
107
crates/pile-value/src/extract/item/pdf/pdf_pages.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use image::ImageFormat;
|
||||
use pdfium_render::prelude::*;
|
||||
use std::{
|
||||
io::{BufReader, Cursor},
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ListExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
/// Renders PDF pages to PNG images via pdfium (only compiled in with
/// the `pdfium` feature; see the cfg gates in the parent module).
pub struct PdfPagesExtractor {
    // Handle to the stored item; the PDF bytes are re-read and the
    // document re-loaded for every request (no caching here).
    item: Item,
}
|
||||
|
||||
impl PdfPagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut b = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
|
||||
Ok::<_, std::io::Error>(b)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for PdfPagesExtractor {
|
||||
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
let bytes = self.get_bytes().await?;
|
||||
let png = tokio::task::spawn_blocking(move || {
|
||||
let pdfium = Pdfium::default();
|
||||
let doc = pdfium
|
||||
.load_pdf_from_byte_slice(&bytes, None)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
if idx >= doc.pages().len() as usize {
|
||||
return Ok::<_, std::io::Error>(None);
|
||||
}
|
||||
let render_config = PdfRenderConfig::new().set_target_width(1024);
|
||||
let page = doc
|
||||
.pages()
|
||||
.get(idx as u16)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let image = page
|
||||
.render_with_config(&render_config)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
|
||||
.as_image();
|
||||
let mut png_bytes = Vec::new();
|
||||
image
|
||||
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||
.map_err(|e| std::io::Error::other(e.to_string()))?;
|
||||
Ok(Some(png_bytes))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let value = match png {
|
||||
Ok(None) => return Ok(None),
|
||||
Ok(Some(bytes)) => PileValue::Blob {
|
||||
mime: mime::IMAGE_PNG,
|
||||
bytes: Arc::new(bytes),
|
||||
},
|
||||
Err(error) => {
|
||||
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||
PileValue::Null
|
||||
}
|
||||
};
|
||||
Ok(Some(value))
|
||||
}
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||
let bytes = self.get_bytes().await?;
|
||||
let count = tokio::task::spawn_blocking(move || {
|
||||
let pdfium = Pdfium::default();
|
||||
let doc = pdfium
|
||||
.load_pdf_from_byte_slice(&bytes, None)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
Ok::<_, std::io::Error>(doc.pages().len() as usize)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
match count {
|
||||
Ok(n) => Ok(n),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Override, extracting all pages is very slow,
|
||||
// and we can't display binary in json anyway
|
||||
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||
Ok(serde_json::Value::String(format!(
|
||||
"<PdfPages ({} pages)>",
|
||||
self.len().await?
|
||||
)))
|
||||
}
|
||||
}
|
||||
112
crates/pile-value/src/extract/item/pdf/pdf_text.rs
Normal file
112
crates/pile-value/src/extract/item/pdf/pdf_text.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
/// Extracts the plain text of a PDF by walking the content-stream
/// operators of every page, exposing it under a single `"text"` field.
pub struct PdfTextExtractor {
    // Handle to the underlying stored item; cloned cheaply.
    item: Item,
    // Extraction result, computed at most once. On parse failure an
    // empty map is cached so a broken PDF is not re-parsed.
    output: OnceLock<HashMap<Label, PileValue>>,
}
|
||||
|
||||
impl PdfTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
error.to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
for page in file.pages() {
|
||||
let page = page.map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
if let Some(content) = &page.contents {
|
||||
let ops = content.operations(&file.resolver()).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
Op::TextDraw { text } => {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
Op::TextDrawAdjusted { array } => {
|
||||
for item in array {
|
||||
if let TextDrawAdjusted::Text(text) = item {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::String(Arc::new(raw_text.into())),
|
||||
)]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfTextExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user