Add pdf cover extraction

This commit is contained in:
2026-03-10 18:15:12 -07:00
parent c2b80f8dbc
commit a576ac49cd
8 changed files with 384 additions and 14 deletions

View File

@@ -25,6 +25,8 @@ blake3 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
pdfium-render = { workspace = true, optional = true }
image = { workspace = true, optional = true }
id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
@@ -40,4 +42,5 @@ utoipa-swagger-ui = { workspace = true, optional = true }
[features]
default = []
pdfium = ["dep:pdfium-render", "dep:image"]
axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui"]

View File

@@ -0,0 +1,54 @@
use std::env;
use std::path::PathBuf;
/// Archive of prebuilt PDFium binaries (the `pdfium-linux-x64.tgz` asset of the
/// `chromium/7725` release) that `main` downloads with `curl` when
/// `libpdfium.so` is not already present in the profile directory.
const PDFIUM_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7725/pdfium-linux-x64.tgz";
/// Build-script entry point.
///
/// When the `pdfium` feature is enabled, makes sure `libpdfium.so` sits in the
/// Cargo profile directory (downloading the archive at `PDFIUM_URL` with
/// `curl` and unpacking it with `tar` if the library is missing) and then
/// emits the link directives for it. Without the feature it only registers
/// `build.rs` for change tracking and returns.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset or does not have the expected depth, if
/// `curl` or `tar` cannot be started or exit unsuccessfully, or if the
/// extracted library cannot be copied into place.
fn main() {
    println!("cargo:rerun-if-changed=build.rs");

    // Cargo exports CARGO_FEATURE_<NAME> for each enabled feature; without it
    // there is nothing to download or link.
    if env::var_os("CARGO_FEATURE_PDFIUM").is_none() {
        return;
    }

    let out_dir = PathBuf::from(env::var_os("OUT_DIR").expect("OUT_DIR is not set"));

    // OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
    // Go up 3 levels to reach target/<profile>/
    let profile_dir = out_dir
        .ancestors()
        .nth(3)
        .expect("unexpected OUT_DIR structure")
        .to_path_buf();

    let lib_path = profile_dir.join("libpdfium.so");
    if !lib_path.exists() {
        let tgz_path = out_dir.join("pdfium.tgz");

        // Cargo only interprets `cargo:` directives printed to stdout, so the
        // warning has to go through `println!` to be shown to the user.
        println!("cargo:warning=Downloading PDFium from {PDFIUM_URL}");

        // Paths are handed over as OS strings so a non-UTF-8 target directory
        // does not abort the build.
        let status = std::process::Command::new("curl")
            .args(["-L", "--fail", "-o"])
            .arg(&tgz_path)
            .arg(PDFIUM_URL)
            .status()
            .expect("failed to run curl");
        assert!(status.success(), "curl failed to download PDFium");

        let status = std::process::Command::new("tar")
            .arg("-xzf")
            .arg(&tgz_path)
            .arg("-C")
            .arg(&out_dir)
            .status()
            .expect("failed to run tar");
        assert!(status.success(), "tar failed to extract PDFium");

        // The archive unpacks the shared library under `lib/`.
        std::fs::copy(out_dir.join("lib").join("libpdfium.so"), &lib_path)
            .expect("failed to copy libpdfium.so");
    }

    println!("cargo:rustc-link-search=native={}", profile_dir.display());
    println!("cargo:rustc-link-lib=dylib=pdfium");
    // Have the resulting binary look for libpdfium.so next to itself at runtime.
    println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
}

View File

@@ -1,6 +1,11 @@
use pile_config::Label;
use std::{collections::HashMap, sync::Arc};
#[cfg(feature = "pdfium")]
mod pdf_cover;
#[cfg(feature = "pdfium")]
pub use pdf_cover::*;
mod pdf_meta;
pub use pdf_meta::*;
@@ -19,20 +24,24 @@ pub struct PdfExtractor<'a> {
impl<'a> PdfExtractor<'a> {
/// Creates the PDF extractor for `item`, registering one sub-extractor per
/// label: `text`, `meta`, and `cover` when the `pdfium` feature is enabled.
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("text").unwrap(),
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
),
(
Label::new("meta").unwrap(),
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
),
]),
};
// Entries are inserted one at a time; the `cover` insert below is only
// compiled when the `pdfium` feature is enabled.
let mut inner_map = HashMap::new();
inner_map.insert(
Label::new("text").unwrap(),
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
);
inner_map.insert(
Label::new("meta").unwrap(),
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
);
#[cfg(feature = "pdfium")]
inner_map.insert(
Label::new("cover").unwrap(),
PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))),
);
Self { inner }
Self {
inner: MapExtractor { inner: inner_map },
}
}
}
@@ -50,6 +59,15 @@ impl Extractor for PdfExtractor<'_> {
};
}
#[cfg(feature = "pdfium")]
#[expect(clippy::unwrap_used)]
if name.as_str() == "cover" {
match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
self.inner.field(name).await
}
@@ -58,6 +76,8 @@ impl Extractor for PdfExtractor<'_> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("cover").unwrap(),
])
}
}

View File

@@ -0,0 +1,98 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use pile_config::Label;
use std::{
collections::HashMap,
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
/// Extractor that renders the first page of a PDF item to a PNG image and
/// exposes it under the `cover` label.
pub struct PdfCoverExtractor<'a> {
/// Item whose contents are read as the PDF to render.
item: &'a Item,
/// Field map filled on first access; it stays empty when the cover could
/// not be rendered.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfCoverExtractor<'a> {
    /// Creates an extractor for `item`. Nothing is read or rendered until a
    /// field is first requested.
    pub fn new(item: &'a Item) -> Self {
        let output = OnceLock::new();
        Self { item, output }
    }

    /// Returns the cached field map, rendering the cover on first use.
    ///
    /// The item is read fully into memory and its first page is rendered at a
    /// target width of 1024 and encoded as PNG on a blocking task.
    ///
    /// # Errors
    ///
    /// Fails when the item cannot be opened for reading or when the blocking
    /// task cannot be joined. Failures while reading, parsing or rendering
    /// the document are logged at trace level and cached as an empty map.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        /// Wraps a rendering error as an `InvalidData` I/O error.
        fn invalid_data<E: ToString>(err: E) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidData, err.to_string())
        }

        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let source = SyncReadBridge::new_current(self.item.read().await?);
        let render_task = tokio::task::spawn_blocking(
            move || -> Result<Vec<u8>, std::io::Error> {
                // Slurp the whole document; the reader is released as soon as
                // the bytes are in memory.
                let pdf_bytes = {
                    let mut buf = Vec::new();
                    std::io::Read::read_to_end(&mut BufReader::new(source), &mut buf)?;
                    buf
                };

                let render_config = PdfRenderConfig::new().set_target_width(1024);
                let pdfium = Pdfium::default();
                let document = pdfium
                    .load_pdf_from_byte_slice(&pdf_bytes, None)
                    .map_err(invalid_data)?;
                let first_page = document.pages().get(0).map_err(invalid_data)?;
                let image = first_page
                    .render_with_config(&render_config)
                    .map_err(invalid_data)?
                    .as_image();

                let mut png = Vec::new();
                image
                    .write_to(&mut Cursor::new(&mut png), ImageFormat::Png)
                    .map_err(|e| std::io::Error::other(e.to_string()))?;
                Ok(png)
            },
        );
        let rendered = render_task.await.map_err(std::io::Error::other)?;

        let mut fields = HashMap::new();
        match rendered {
            Ok(png) => {
                #[expect(clippy::unwrap_used)]
                let label = Label::new("cover").unwrap();
                fields.insert(
                    label,
                    PileValue::Blob {
                        mime: mime::IMAGE_PNG,
                        bytes: Arc::new(png),
                    },
                );
            }
            Err(error) => {
                trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key());
            }
        }

        // If another caller initialised the cell first, its map is returned.
        Ok(self.output.get_or_init(|| fields))
    }
}
#[async_trait::async_trait]
impl Extractor for PdfCoverExtractor<'_> {
    /// Looks up `name` in the field map, rendering the cover first if that
    /// has not happened yet.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let fields = self.get_inner().await?;
        Ok(fields.get(name))
    }

    /// Lists the labels present in the field map, rendering the cover first
    /// if that has not happened yet.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let fields = self.get_inner().await?;
        let labels: Vec<Label> = fields.keys().cloned().collect();
        Ok(labels)
    }
}