126 lines
3.4 KiB
Rust
126 lines
3.4 KiB
Rust
use image::ImageFormat;
|
|
use pdfium_render::prelude::*;
|
|
use pile_io::SyncReadBridge;
|
|
use std::{
|
|
io::{BufReader, Cursor},
|
|
sync::Arc,
|
|
};
|
|
use tracing::trace;
|
|
|
|
use crate::{
|
|
extract::traits::{ExtractState, ListExtractor},
|
|
value::{Item, PileValue},
|
|
};
|
|
|
|
pub struct PdfPagesExtractor {
|
|
item: Item,
|
|
}
|
|
|
|
impl PdfPagesExtractor {
|
|
pub fn new(item: &Item) -> Self {
|
|
Self { item: item.clone() }
|
|
}
|
|
|
|
async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
|
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
tokio::task::spawn_blocking(move || {
|
|
let mut b = Vec::new();
|
|
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
|
|
Ok::<_, std::io::Error>(b)
|
|
})
|
|
.await
|
|
.map_err(std::io::Error::other)?
|
|
}
|
|
}
|
|
|
|
#[async_trait::async_trait]
|
|
impl ListExtractor for PdfPagesExtractor {
|
|
async fn get(
|
|
&self,
|
|
state: &ExtractState,
|
|
idx: usize,
|
|
) -> Result<Option<PileValue>, std::io::Error> {
|
|
trace!(
|
|
key = self.item.key().as_str(),
|
|
"Getting index {idx} from PdfPagesExtractor",
|
|
);
|
|
|
|
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
return Ok(None);
|
|
}
|
|
|
|
let bytes = self.get_bytes().await?;
|
|
let png = tokio::task::spawn_blocking(move || {
|
|
let pdfium = Pdfium::default();
|
|
let doc = pdfium
|
|
.load_pdf_from_byte_slice(&bytes, None)
|
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
|
if idx >= doc.pages().len() as usize {
|
|
return Ok::<_, std::io::Error>(None);
|
|
}
|
|
let render_config = PdfRenderConfig::new().set_target_width(1024);
|
|
let page = doc
|
|
.pages()
|
|
.get(idx as u16)
|
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
|
let image = page
|
|
.render_with_config(&render_config)
|
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
|
|
.as_image();
|
|
let mut png_bytes = Vec::new();
|
|
image
|
|
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
|
|
.map_err(|e| std::io::Error::other(e.to_string()))?;
|
|
Ok(Some(png_bytes))
|
|
})
|
|
.await
|
|
.map_err(std::io::Error::other)?;
|
|
|
|
let value = match png {
|
|
Ok(None) => return Ok(None),
|
|
Ok(Some(bytes)) => PileValue::Blob {
|
|
mime: mime::IMAGE_PNG,
|
|
bytes: Arc::new(bytes),
|
|
},
|
|
Err(error) => {
|
|
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
|
PileValue::Null
|
|
}
|
|
};
|
|
Ok(Some(value))
|
|
}
|
|
|
|
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
|
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
return Ok(0);
|
|
}
|
|
|
|
let bytes = self.get_bytes().await?;
|
|
let count = tokio::task::spawn_blocking(move || {
|
|
let pdfium = Pdfium::default();
|
|
let doc = pdfium
|
|
.load_pdf_from_byte_slice(&bytes, None)
|
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
|
Ok::<_, std::io::Error>(doc.pages().len() as usize)
|
|
})
|
|
.await
|
|
.map_err(std::io::Error::other)?;
|
|
match count {
|
|
Ok(n) => Ok(n),
|
|
Err(error) => {
|
|
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
|
Ok(0)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Override, extracting all pages is very slow,
|
|
// and we can't display binary in json anyway
|
|
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
Ok(serde_json::Value::String(format!(
|
|
"<PdfPages ({} pages)>",
|
|
self.len(state).await?
|
|
)))
|
|
}
|
|
}
|