Add pdf cover extraction
This commit is contained in:
@@ -1,6 +1,11 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
mod pdf_cover;
|
||||
#[cfg(feature = "pdfium")]
|
||||
pub use pdf_cover::*;
|
||||
|
||||
mod pdf_meta;
|
||||
pub use pdf_meta::*;
|
||||
|
||||
@@ -19,20 +24,24 @@ pub struct PdfExtractor<'a> {
|
||||
impl<'a> PdfExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
let mut inner_map = HashMap::new();
|
||||
inner_map.insert(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
|
||||
);
|
||||
inner_map.insert(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
|
||||
);
|
||||
#[cfg(feature = "pdfium")]
|
||||
inner_map.insert(
|
||||
Label::new("cover").unwrap(),
|
||||
PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))),
|
||||
);
|
||||
|
||||
Self { inner }
|
||||
Self {
|
||||
inner: MapExtractor { inner: inner_map },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,6 +59,15 @@ impl Extractor for PdfExtractor<'_> {
|
||||
};
|
||||
}
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
#[expect(clippy::unwrap_used)]
|
||||
if name.as_str() == "cover" {
|
||||
match self.inner.inner.get(name).unwrap() {
|
||||
PileValue::Extractor(x) => return x.field(name).await,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
@@ -58,6 +76,8 @@ impl Extractor for PdfExtractor<'_> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("cover").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
98
crates/pile-dataset/src/extract/pdf/pdf_cover.rs
Normal file
98
crates/pile-dataset/src/extract/pdf/pdf_cover.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
use image::ImageFormat;
|
||||
use pdfium_render::prelude::*;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{BufReader, Cursor},
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
||||
|
||||
pub struct PdfCoverExtractor<'a> {
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfCoverExtractor<'a> {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let cover = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let pdfium = Pdfium::default();
|
||||
|
||||
let document = pdfium
|
||||
.load_pdf_from_byte_slice(&bytes, None)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let render_config = PdfRenderConfig::new().set_target_width(1024);
|
||||
|
||||
let page = document
|
||||
.pages()
|
||||
.get(0)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let image = page
|
||||
.render_with_config(&render_config)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
|
||||
.as_image();
|
||||
|
||||
let mut png_bytes = Vec::new();
|
||||
image
|
||||
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||
.map_err(|e| std::io::Error::other(e.to_string()))?;
|
||||
|
||||
Ok::<_, std::io::Error>(png_bytes)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let output = match cover {
|
||||
Ok(data) => {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let label = Label::new("cover").unwrap();
|
||||
HashMap::from([(
|
||||
label,
|
||||
PileValue::Blob {
|
||||
mime: mime::IMAGE_PNG,
|
||||
bytes: Arc::new(data),
|
||||
},
|
||||
)])
|
||||
}
|
||||
Err(error) => {
|
||||
trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key());
|
||||
HashMap::new()
|
||||
}
|
||||
};
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for PdfCoverExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user