use image::ImageFormat; use pdfium_render::prelude::*; use pile_io::SyncReadBridge; use std::{ io::{BufReader, Cursor}, sync::Arc, }; use tracing::trace; use crate::{ extract::traits::{ExtractState, ListExtractor}, value::{Item, PileValue}, }; pub struct PdfPagesExtractor { item: Item, } impl PdfPagesExtractor { pub fn new(item: &Item) -> Self { Self { item: item.clone() } } async fn get_bytes(&self) -> Result, std::io::Error> { let reader = SyncReadBridge::new_current(self.item.read().await?); tokio::task::spawn_blocking(move || { let mut b = Vec::new(); std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?; Ok::<_, std::io::Error>(b) }) .await .map_err(std::io::Error::other)? } } #[async_trait::async_trait] impl ListExtractor for PdfPagesExtractor { async fn get( &self, state: &ExtractState, idx: usize, ) -> Result, std::io::Error> { trace!( key = self.item.key().as_str(), "Getting index {idx} from PdfPagesExtractor", ); if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" { return Ok(None); } let bytes = self.get_bytes().await?; let png = tokio::task::spawn_blocking(move || { let pdfium = Pdfium::default(); let doc = pdfium .load_pdf_from_byte_slice(&bytes, None) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; if idx >= doc.pages().len() as usize { return Ok::<_, std::io::Error>(None); } let render_config = PdfRenderConfig::new().set_target_width(1024); let page = doc .pages() .get(idx as u16) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; let image = page .render_with_config(&render_config) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))? .as_image(); let mut png_bytes = Vec::new(); image .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png) .map_err(|e| std::io::Error::other(e.to_string()))?; Ok(Some(png_bytes)) }) .await .map_err(std::io::Error::other)?; let value = match png { Ok(None) => return Ok(None), Ok(Some(bytes)) => PileValue::Blob { mime: mime::IMAGE_PNG, bytes: Arc::new(bytes), }, Err(error) => { trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key()); PileValue::Null } }; Ok(Some(value)) } async fn len(&self, state: &ExtractState) -> Result { if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" { return Ok(0); } let bytes = self.get_bytes().await?; let count = tokio::task::spawn_blocking(move || { let pdfium = Pdfium::default(); let doc = pdfium .load_pdf_from_byte_slice(&bytes, None) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; Ok::<_, std::io::Error>(doc.pages().len() as usize) }) .await .map_err(std::io::Error::other)?; match count { Ok(n) => Ok(n), Err(error) => { trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key()); Ok(0) } } } // Override, extracting all pages is very slow, // and we can't display binary in json anyway async fn to_json(&self, state: &ExtractState) -> Result { Ok(serde_json::Value::String(format!( "", self.len(state).await? ))) } }