Add pdf cover extraction

This commit is contained in:
2026-03-10 18:15:12 -07:00
parent 899b47b169
commit 80d248787c
8 changed files with 387 additions and 14 deletions

2
.cargo/config.toml Normal file
View File

@@ -0,0 +1,2 @@
[target.x86_64-unknown-linux-gnu]
rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]

191
Cargo.lock generated
View File

@@ -667,12 +667,24 @@ version = "3.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510"
[[package]]
name = "bytemuck"
version = "1.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "byteorder-lite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
[[package]]
name = "bytes"
version = "1.11.0"
@@ -813,6 +825,26 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "console_error_panic_hook"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
dependencies = [
"cfg-if",
"wasm-bindgen",
]
[[package]]
name = "console_log"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f"
dependencies = [
"log",
"web-sys",
]
[[package]]
name = "const-oid"
version = "0.9.6"
@@ -1241,6 +1273,15 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "fdeflate"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
dependencies = [
"simd-adler32",
]
[[package]]
name = "ff"
version = "0.12.1"
@@ -1854,6 +1895,21 @@ dependencies = [
"icu_properties",
]
[[package]]
name = "image"
version = "0.25.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104"
dependencies = [
"bytemuck",
"byteorder-lite",
"moxcms",
"num-traits",
"png",
"zune-core",
"zune-jpeg",
]
[[package]]
name = "indexmap"
version = "2.13.0"
@@ -2019,6 +2075,16 @@ dependencies = [
"rle-decode-fast",
]
[[package]]
name = "libloading"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
dependencies = [
"cfg-if",
"windows-link",
]
[[package]]
name = "libm"
version = "0.2.16"
@@ -2091,6 +2157,12 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
[[package]]
name = "maybe-owned"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4"
[[package]]
name = "md-5"
version = "0.10.6"
@@ -2174,6 +2246,16 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "moxcms"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b"
dependencies = [
"num-traits",
"pxfm",
]
[[package]]
name = "multer"
version = "3.1.0"
@@ -2365,6 +2447,32 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "pdfium-render"
version = "0.8.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6553f6604a52b3203db7b4e9d51eb4dd193cf455af9e56d40cab6575b547b679"
dependencies = [
"bitflags",
"bytemuck",
"bytes",
"chrono",
"console_error_panic_hook",
"console_log",
"image",
"itertools 0.14.0",
"js-sys",
"libloading",
"log",
"maybe-owned",
"once_cell",
"utf16string",
"vecmath",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "percent-encoding"
version = "2.3.2"
@@ -2414,11 +2522,13 @@ dependencies = [
"chrono",
"epub",
"id3",
"image",
"itertools 0.14.0",
"kamadak-exif",
"mime",
"mime_guess",
"pdf",
"pdfium-render",
"pile-config",
"pile-flac",
"pile-toolbox",
@@ -2470,6 +2580,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "piston-float"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
[[package]]
name = "pkcs8"
version = "0.9.0"
@@ -2486,6 +2602,19 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "png"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
dependencies = [
"bitflags",
"crc32fast",
"fdeflate",
"flate2",
"miniz_oxide",
]
[[package]]
name = "portable-atomic"
version = "1.13.0"
@@ -2535,6 +2664,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "pxfm"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d"
[[package]]
name = "quote"
version = "1.0.44"
@@ -3821,6 +3956,15 @@ dependencies = [
"serde",
]
[[package]]
name = "utf16string"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
dependencies = [
"byteorder",
]
[[package]]
name = "utf8-ranges"
version = "1.0.5"
@@ -3908,6 +4052,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "vecmath"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a"
dependencies = [
"piston-float",
]
[[package]]
name = "version_check"
version = "0.9.5"
@@ -3997,6 +4150,19 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c"
dependencies = [
"cfg-if",
"js-sys",
"once_cell",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.106"
@@ -4063,6 +4229,16 @@ dependencies = [
"semver",
]
[[package]]
name = "web-sys"
version = "0.3.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
@@ -4632,3 +4808,18 @@ dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "zune-core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
[[package]]
name = "zune-jpeg"
version = "0.5.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe"
dependencies = [
"zune-core",
]

View File

@@ -112,6 +112,8 @@ pdf = "0.10.0"
id3 = "1.16.4"
epub = "1.2.2"
kamadak-exif = "0.6.1"
pdfium-render = "0.8"
image = { version = "0.25", default-features = false, features = ["png"] }
# Misc helpers
thiserror = "2.0.18"

View File

@@ -25,6 +25,8 @@ blake3 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
pdfium-render = { workspace = true, optional = true }
image = { workspace = true, optional = true }
id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
@@ -40,4 +42,5 @@ utoipa-swagger-ui = { workspace = true, optional = true }
[features]
default = []
pdfium = ["dep:pdfium-render", "dep:image"]
axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui"]

View File

@@ -0,0 +1,57 @@
use std::env;
use std::path::PathBuf;
const PDFIUM_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7725/pdfium-linux-x64.tgz";
/// Build-script entry point.
///
/// When the `pdfium` feature is enabled (Cargo sets `CARGO_FEATURE_PDFIUM`),
/// makes sure `libpdfium.so` sits in the profile directory, downloading and
/// unpacking the prebuilt archive at `PDFIUM_URL` if it is missing, and then
/// emits the link directives for it. With the feature disabled only the
/// `rerun-if-changed` directive is emitted.
///
/// NOTE(review): the archive named by `PDFIUM_URL` is a Linux x86-64 build;
/// other targets are not handled here.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset or does not have the expected depth, if `curl`
/// or `tar` cannot be spawned or exit unsuccessfully, or if the extracted
/// library cannot be copied into the profile directory.
#[expect(clippy::expect_used)]
#[expect(clippy::unwrap_used)]
fn main() {
    println!("cargo:rerun-if-changed=build.rs");

    if env::var("CARGO_FEATURE_PDFIUM").is_err() {
        return;
    }

    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());

    // OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
    // Go up 3 levels to reach target/<profile>/
    let profile_dir = out_dir
        .ancestors()
        .nth(3)
        .expect("unexpected OUT_DIR structure")
        .to_path_buf();

    let lib_path = profile_dir.join("libpdfium.so");

    if !lib_path.exists() {
        let tgz_path = out_dir.join("pdfium.tgz");

        // Cargo only interprets build-script directives written to stdout;
        // on stderr this line would never be surfaced as a warning.
        println!("cargo:warning=Downloading PDFium from {PDFIUM_URL}");

        // Paths are handed over as `OsStr` arguments so a non-UTF-8 target
        // directory does not cause a panic.
        let status = std::process::Command::new("curl")
            .args(["-L", "--fail", "-o"])
            .arg(&tgz_path)
            .arg(PDFIUM_URL)
            .status()
            .expect("failed to run curl");
        assert!(status.success(), "curl failed to download PDFium");

        let status = std::process::Command::new("tar")
            .arg("-xzf")
            .arg(&tgz_path)
            .arg("-C")
            .arg(&out_dir)
            .status()
            .expect("failed to run tar");
        assert!(status.success(), "tar failed to extract PDFium");

        // The archive places the shared library under `lib/`.
        std::fs::copy(out_dir.join("lib").join("libpdfium.so"), &lib_path)
            .expect("failed to copy libpdfium.so");
    }

    println!("cargo:rustc-link-search=native={}", profile_dir.display());
    println!("cargo:rustc-link-lib=dylib=pdfium");
    println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
}

View File

@@ -1,6 +1,11 @@
use pile_config::Label;
use std::{collections::HashMap, sync::Arc};
#[cfg(feature = "pdfium")]
mod pdf_cover;
#[cfg(feature = "pdfium")]
pub use pdf_cover::*;
mod pdf_meta;
pub use pdf_meta::*;
@@ -19,20 +24,24 @@ pub struct PdfExtractor<'a> {
impl<'a> PdfExtractor<'a> {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a Item) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
let mut inner_map = HashMap::new();
inner_map.insert(
Label::new("text").unwrap(),
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
),
(
);
inner_map.insert(
Label::new("meta").unwrap(),
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
),
]),
};
);
#[cfg(feature = "pdfium")]
inner_map.insert(
Label::new("cover").unwrap(),
PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))),
);
Self { inner }
Self {
inner: MapExtractor { inner: inner_map },
}
}
}
@@ -50,6 +59,15 @@ impl Extractor for PdfExtractor<'_> {
};
}
#[cfg(feature = "pdfium")]
#[expect(clippy::unwrap_used)]
if name.as_str() == "cover" {
match self.inner.inner.get(name).unwrap() {
PileValue::Extractor(x) => return x.field(name).await,
_ => unreachable!(),
};
}
self.inner.field(name).await
}
@@ -58,6 +76,8 @@ impl Extractor for PdfExtractor<'_> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("cover").unwrap(),
])
}
}

View File

@@ -0,0 +1,98 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use pile_config::Label;
use std::{
collections::HashMap,
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
/// Extractor that renders the first page of a PDF item as a PNG cover image.
pub struct PdfCoverExtractor<'a> {
    /// Item whose contents are read as the PDF to render.
    item: &'a Item,
    /// Field map computed on first access and reused for later lookups.
    output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> PdfCoverExtractor<'a> {
    /// Creates an extractor for `item`. Nothing is read or rendered until a
    /// field is first requested.
    pub fn new(item: &'a Item) -> Self {
        let output = OnceLock::new();
        Self { item, output }
    }

    /// Returns the field map for this item, computing it on first use.
    ///
    /// The item is read fully into memory and its first page is rendered to a
    /// PNG on a blocking task. On success the map holds a single `cover` blob;
    /// if reading or rendering fails inside that task, the failure is logged
    /// at trace level and an empty map is stored instead.
    ///
    /// # Errors
    ///
    /// Returns an error if opening the item for reading fails or if the
    /// blocking task cannot be joined.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        /// Wraps a PDF loading/rendering failure as an `InvalidData` I/O error.
        fn invalid_data<E: std::fmt::Display>(err: E) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidData, err.to_string())
        }

        /// Reads the whole PDF from `source` and encodes its first page as PNG.
        fn render_first_page_png<R: std::io::Read>(source: R) -> Result<Vec<u8>, std::io::Error> {
            use std::io::Read as _;

            let mut pdf_bytes = Vec::new();
            BufReader::new(source).read_to_end(&mut pdf_bytes)?;

            let pdfium = Pdfium::default();
            let document = pdfium
                .load_pdf_from_byte_slice(&pdf_bytes, None)
                .map_err(invalid_data)?;

            let render_config = PdfRenderConfig::new().set_target_width(1024);
            let first_page = document.pages().get(0).map_err(invalid_data)?;
            let rendered = first_page
                .render_with_config(&render_config)
                .map_err(invalid_data)?
                .as_image();

            let mut png_bytes = Vec::new();
            rendered
                .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
                .map_err(|e| std::io::Error::other(e.to_string()))?;
            Ok(png_bytes)
        }

        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);
        // Rendering is synchronous work, so it runs off the async executor.
        let cover = tokio::task::spawn_blocking(move || render_first_page_png(reader))
            .await
            .map_err(std::io::Error::other)?;

        let mut output = HashMap::new();
        match cover {
            Ok(data) => {
                #[expect(clippy::unwrap_used)]
                let label = Label::new("cover").unwrap();
                let blob = PileValue::Blob {
                    mime: mime::IMAGE_PNG,
                    bytes: Arc::new(data),
                };
                output.insert(label, blob);
            }
            Err(error) => {
                trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key());
            }
        }

        Ok(self.output.get_or_init(|| output))
    }
}
#[async_trait::async_trait]
impl Extractor for PdfCoverExtractor<'_> {
    /// Looks up `name` in the field map, computing the map first if needed.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let values = self.get_inner().await?;
        Ok(values.get(name))
    }

    /// Lists the labels currently present in the field map.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let values = self.get_inner().await?;
        let labels = values.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -9,7 +9,7 @@ workspace = true
[dependencies]
pile-toolbox = { workspace = true }
pile-dataset = { workspace = true, features = ["axum"] }
pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
pile-config = { workspace = true }
tracing = { workspace = true }