Extractor rewrite

This commit is contained in:
2026-03-11 10:12:36 -07:00
parent b789255ea9
commit 078801be40
51 changed files with 676 additions and 693 deletions

View File

@@ -0,0 +1,107 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use std::{
io::{BufReader, Cursor},
sync::Arc,
};
use tracing::trace;
use crate::{
extract::traits::ListExtractor,
value::{Item, PileValue, SyncReadBridge},
};
/// List extractor that exposes the pages of a PDF item as rendered PNG images.
///
/// Holds its own clone of the [`Item`] whose contents are read and parsed as a PDF.
pub struct PdfPagesExtractor {
/// The item whose bytes are loaded as a PDF document.
item: Item,
}
impl PdfPagesExtractor {
    /// Builds an extractor for `item`.
    ///
    /// The item is cloned, so the extractor does not borrow from the caller.
    pub fn new(item: &Item) -> Self {
        let item = item.clone();
        Self { item }
    }

    /// Reads the whole item into memory and returns its bytes.
    ///
    /// The item's reader is wrapped in a [`SyncReadBridge`] and drained with a
    /// synchronous `read_to_end` on tokio's blocking pool, so the async
    /// executor is not stalled while the data is read.
    ///
    /// # Errors
    ///
    /// Returns an error if the item cannot be opened, if reading fails, or if
    /// the blocking task cannot be joined.
    async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
        let source = self.item.read().await?;
        let bridge = SyncReadBridge::new_current(source);

        let read_all = move || -> Result<Vec<u8>, std::io::Error> {
            let mut buffered = BufReader::new(bridge);
            let mut contents = Vec::new();
            std::io::Read::read_to_end(&mut buffered, &mut contents)?;
            Ok(contents)
        };

        // The outer `Result` reports a join failure; the inner one is the
        // closure's own I/O result and is returned as-is.
        tokio::task::spawn_blocking(read_all)
            .await
            .map_err(std::io::Error::other)?
    }
}
/// Converts a displayable PDF-layer failure into an `InvalidData` I/O error
/// that carries only the failure's message.
fn invalid_pdf_data<E: std::fmt::Display>(error: E) -> std::io::Error {
    std::io::Error::new(std::io::ErrorKind::InvalidData, error.to_string())
}

/// Renders page `idx` of the PDF stored in `pdf` and encodes it as PNG.
///
/// This is synchronous, CPU-bound work; call it from a blocking context.
///
/// Returns `Ok(None)` when `idx` does not refer to a page of the document.
///
/// # Errors
///
/// Returns `InvalidData` if the document cannot be loaded or the page cannot
/// be fetched or rendered, and a generic I/O error if PNG encoding fails.
fn render_page_png(pdf: &[u8], idx: usize) -> Result<Option<Vec<u8>>, std::io::Error> {
    let pdfium = Pdfium::default();
    let doc = pdfium
        .load_pdf_from_byte_slice(pdf, None)
        .map_err(invalid_pdf_data)?;
    let pages = doc.pages();

    // Checked conversion instead of `idx as u16`: an index that does not fit
    // the page index type cannot name an existing page, so it is out of range.
    let Ok(page_index) = u16::try_from(idx) else {
        return Ok(None);
    };
    if page_index >= pages.len() {
        return Ok(None);
    }

    // Pages are rendered at a fixed target width of 1024.
    let render_config = PdfRenderConfig::new().set_target_width(1024);
    let page = pages.get(page_index).map_err(invalid_pdf_data)?;
    let image = page
        .render_with_config(&render_config)
        .map_err(invalid_pdf_data)?
        .as_image();

    let mut png_bytes = Vec::new();
    image
        .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
        .map_err(|e| std::io::Error::other(e.to_string()))?;
    Ok(Some(png_bytes))
}

/// Returns the number of pages in the PDF stored in `pdf`.
///
/// This is synchronous work; call it from a blocking context.
///
/// # Errors
///
/// Returns `InvalidData` if the document cannot be loaded.
fn count_pages(pdf: &[u8]) -> Result<usize, std::io::Error> {
    let pdfium = Pdfium::default();
    let doc = pdfium
        .load_pdf_from_byte_slice(pdf, None)
        .map_err(invalid_pdf_data)?;
    let count = doc.pages().len();
    Ok(usize::from(count))
}

#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor {
    /// Returns page `idx` of the PDF rendered as a PNG blob.
    ///
    /// * `Ok(None)` if `idx` is past the last page.
    /// * `Ok(Some(PileValue::Blob { .. }))` with MIME type `image/png` on success.
    /// * `Ok(Some(PileValue::Null))` if the document cannot be loaded or the
    ///   page cannot be rendered or encoded; the failure is logged at trace
    ///   level and not returned.
    ///
    /// NOTE(review): a PDF that fails to load yields `Null` here for every
    /// index, while `len` reports 0 for the same input. Confirm this
    /// asymmetry is intended.
    ///
    /// # Errors
    ///
    /// Returns an error if the item's bytes cannot be read or the blocking
    /// render task cannot be joined.
    async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
        let bytes = self.get_bytes().await?;

        // Loading and rendering are synchronous, so they run on the blocking pool.
        let rendered = tokio::task::spawn_blocking(move || render_page_png(&bytes, idx))
            .await
            .map_err(std::io::Error::other)?;

        match rendered {
            Ok(None) => Ok(None),
            Ok(Some(png)) => Ok(Some(PileValue::Blob {
                mime: mime::IMAGE_PNG,
                bytes: Arc::new(png),
            })),
            Err(error) => {
                // Best effort: a page that cannot be produced becomes `Null`.
                trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
                Ok(Some(PileValue::Null))
            }
        }
    }

    /// Returns the number of pages in the PDF.
    ///
    /// A document that cannot be loaded is reported as having 0 pages; the
    /// failure is logged at trace level and not returned.
    ///
    /// # Errors
    ///
    /// Returns an error if the item's bytes cannot be read or the blocking
    /// task cannot be joined.
    async fn len(&self) -> Result<usize, std::io::Error> {
        let bytes = self.get_bytes().await?;

        let counted = tokio::task::spawn_blocking(move || count_pages(&bytes))
            .await
            .map_err(std::io::Error::other)?;

        Ok(counted.unwrap_or_else(|error| {
            trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
            0
        }))
    }

    /// Returns a short placeholder string containing the page count.
    ///
    /// This overrides the default because extracting every page is very slow,
    /// and binary image data cannot be displayed in JSON anyway.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`len`](Self::len).
    async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
        let page_count = self.len().await?;
        Ok(serde_json::Value::String(format!(
            "<PdfPages ({page_count} pages)>"
        )))
    }
}