From a576ac49cd9d8dfadfc5e1d46860b6e5b089d39e Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Tue, 10 Mar 2026 18:15:12 -0700 Subject: [PATCH] Add pdf cover extraction --- .cargo/config.toml | 2 + Cargo.lock | 191 ++++++++++++++++++ Cargo.toml | 2 + crates/pile-dataset/Cargo.toml | 3 + crates/pile-dataset/build.rs | 54 +++++ crates/pile-dataset/src/extract/pdf/mod.rs | 46 +++-- .../pile-dataset/src/extract/pdf/pdf_cover.rs | 98 +++++++++ crates/pile/Cargo.toml | 2 +- 8 files changed, 384 insertions(+), 14 deletions(-) create mode 100644 .cargo/config.toml create mode 100644 crates/pile-dataset/build.rs create mode 100644 crates/pile-dataset/src/extract/pdf/pdf_cover.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..42aa400 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,2 @@ +[target.x86_64-unknown-linux-gnu] +rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"] diff --git a/Cargo.lock b/Cargo.lock index ed884a9..bb0ea99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -667,12 +667,24 @@ version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.11.0" @@ -813,6 +825,26 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "console_log" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" +dependencies = [ + "log", + "web-sys", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1241,6 +1273,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + [[package]] name = "ff" version = "0.12.1" @@ -1854,6 +1895,21 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "moxcms", + "num-traits", + "png", + "zune-core", + "zune-jpeg", +] + [[package]] name = "indexmap" version = "2.13.0" @@ -2019,6 +2075,16 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libm" version = "0.2.16" @@ -2091,6 +2157,12 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +[[package]] +name = "maybe-owned" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" + [[package]] name = "md-5" version = "0.10.6" @@ -2174,6 +2246,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + [[package]] name = "multer" version = "3.1.0" @@ -2365,6 +2447,32 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "pdfium-render" +version = "0.8.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6553f6604a52b3203db7b4e9d51eb4dd193cf455af9e56d40cab6575b547b679" +dependencies = [ + "bitflags", + "bytemuck", + "bytes", + "chrono", + "console_error_panic_hook", + "console_log", + "image", + "itertools 0.14.0", + "js-sys", + "libloading", + "log", + "maybe-owned", + "once_cell", + "utf16string", + "vecmath", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -2414,11 +2522,13 @@ dependencies = [ "chrono", "epub", "id3", + "image", "itertools 0.14.0", "kamadak-exif", "mime", "mime_guess", "pdf", + "pdfium-render", "pile-config", "pile-flac", "pile-toolbox", @@ -2470,6 +2580,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piston-float" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" + [[package]] name = "pkcs8" version = "0.9.0" @@ -2486,6 +2602,19 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -2535,6 +2664,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pxfm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" + [[package]] name = "quote" version = "1.0.44" @@ -3821,6 +3956,15 @@ dependencies = [ "serde", ] +[[package]] +name = "utf16string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" +dependencies = [ + "byteorder", +] + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -3908,6 +4052,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vecmath" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" +dependencies = [ + "piston-float", +] + [[package]] name = "version_check" version = "0.9.5" @@ -3997,6 +4150,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.106" @@ -4063,6 +4229,16 @@ dependencies = [ "semver", ] +[[package]] +name = "web-sys" +version = "0.3.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "web-time" version = "1.1.0" @@ -4632,3 +4808,18 @@ dependencies = [ "cc", "pkg-config", ] + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml index 1b81f37..2889a85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -112,6 +112,8 @@ pdf = "0.10.0" id3 = "1.16.4" epub = "1.2.2" kamadak-exif = "0.6.1" +pdfium-render = "0.8" +image = { version = "0.25", default-features = false, features = ["png"] } # Misc helpers thiserror = "2.0.18" diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml index 2888064..113f6fa 100644 --- a/crates/pile-dataset/Cargo.toml +++ b/crates/pile-dataset/Cargo.toml @@ -25,6 +25,8 @@ blake3 = { workspace = true } epub = { workspace = true } kamadak-exif = { workspace = true } pdf = { workspace = true } +pdfium-render = { workspace = true, optional = true } +image = { workspace = true, optional = true } id3 = { workspace = true } tokio = { workspace = true } tokio-stream = { workspace = true } @@ -40,4 +42,5 @@ utoipa-swagger-ui = { workspace = true, optional = true } [features] default = [] +pdfium = ["dep:pdfium-render", "dep:image"] axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui"] diff --git a/crates/pile-dataset/build.rs b/crates/pile-dataset/build.rs new file mode 100644 index 0000000..1677e05 --- /dev/null +++ b/crates/pile-dataset/build.rs @@ -0,0 +1,54 @@ +use std::env; +use std::path::PathBuf; + +const PDFIUM_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7725/pdfium-linux-x64.tgz"; + +fn main() { + println!("cargo:rerun-if-changed=build.rs"); + + if env::var("CARGO_FEATURE_PDFIUM").is_err() { + return; + } + + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + + // OUT_DIR is target//build/-/out + // Go up 3 levels to reach target// + let profile_dir = out_dir + .ancestors() + .nth(3) + .expect("unexpected OUT_DIR structure") + .to_path_buf(); + + let lib_path = profile_dir.join("libpdfium.so"); + + if !lib_path.exists() { + let tgz_path = out_dir.join("pdfium.tgz"); + + eprintln!("cargo:warning=Downloading PDFium from {PDFIUM_URL}"); + + let status = std::process::Command::new("curl") + .args(["-L", "--fail", "-o", tgz_path.to_str().unwrap(), PDFIUM_URL]) + .status() + .expect("failed to run curl"); + assert!(status.success(), "curl failed to download PDFium"); + + let status = std::process::Command::new("tar") + .args([ + "-xzf", + tgz_path.to_str().unwrap(), + "-C", + out_dir.to_str().unwrap(), + ]) + .status() + .expect("failed to run tar"); + assert!(status.success(), "tar failed to extract PDFium"); + + std::fs::copy(out_dir.join("lib").join("libpdfium.so"), &lib_path) + .expect("failed to copy libpdfium.so"); + } + + println!("cargo:rustc-link-search=native={}", profile_dir.display()); + println!("cargo:rustc-link-lib=dylib=pdfium"); + println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN"); +} diff --git a/crates/pile-dataset/src/extract/pdf/mod.rs b/crates/pile-dataset/src/extract/pdf/mod.rs index d8c4d7e..ca374c1 100644 --- a/crates/pile-dataset/src/extract/pdf/mod.rs +++ b/crates/pile-dataset/src/extract/pdf/mod.rs @@ -1,6 +1,11 @@ use pile_config::Label; use std::{collections::HashMap, sync::Arc}; +#[cfg(feature = "pdfium")] +mod pdf_cover; +#[cfg(feature = "pdfium")] +pub use pdf_cover::*; + mod pdf_meta; pub use pdf_meta::*; @@ -19,20 +24,24 @@ pub struct PdfExtractor<'a> { impl<'a> PdfExtractor<'a> { #[expect(clippy::unwrap_used)] pub fn new(item: &'a Item) -> Self { - let inner = MapExtractor { - inner: HashMap::from([ - ( - Label::new("text").unwrap(), - PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))), - ), - ( - Label::new("meta").unwrap(), - PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))), - ), - ]), - }; + let mut inner_map = HashMap::new(); + inner_map.insert( + Label::new("text").unwrap(), + PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))), + ); + inner_map.insert( + Label::new("meta").unwrap(), + PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))), + ); + #[cfg(feature = "pdfium")] + inner_map.insert( + Label::new("cover").unwrap(), + PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))), + ); - Self { inner } + Self { + inner: MapExtractor { inner: inner_map }, + } } } @@ -50,6 +59,15 @@ impl Extractor for PdfExtractor<'_> { }; } + #[cfg(feature = "pdfium")] + #[expect(clippy::unwrap_used)] + if name.as_str() == "cover" { + match self.inner.inner.get(name).unwrap() { + PileValue::Extractor(x) => return x.field(name).await, + _ => unreachable!(), + }; + } + self.inner.field(name).await } @@ -58,6 +76,8 @@ impl Extractor for PdfExtractor<'_> { Ok(vec![ Label::new("text").unwrap(), Label::new("meta").unwrap(), + #[cfg(feature = "pdfium")] + Label::new("cover").unwrap(), ]) } } diff --git a/crates/pile-dataset/src/extract/pdf/pdf_cover.rs b/crates/pile-dataset/src/extract/pdf/pdf_cover.rs new file mode 100644 index 0000000..4bf900a --- /dev/null +++ b/crates/pile-dataset/src/extract/pdf/pdf_cover.rs @@ -0,0 +1,98 @@ +use image::ImageFormat; +use pdfium_render::prelude::*; +use pile_config::Label; +use std::{ + collections::HashMap, + io::{BufReader, Cursor}, + sync::{Arc, OnceLock}, +}; +use tracing::trace; + +use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; + +pub struct PdfCoverExtractor<'a> { + item: &'a Item, + output: OnceLock>>, +} + +impl<'a> PdfCoverExtractor<'a> { + pub fn new(item: &'a Item) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + let reader = SyncReadBridge::new_current(self.item.read().await?); + let cover = tokio::task::spawn_blocking(move || { + let mut bytes = Vec::new(); + std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?; + + let pdfium = Pdfium::default(); + + let document = pdfium + .load_pdf_from_byte_slice(&bytes, None) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + let render_config = PdfRenderConfig::new().set_target_width(1024); + + let page = document + .pages() + .get(0) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + let image = page + .render_with_config(&render_config) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))? + .as_image(); + + let mut png_bytes = Vec::new(); + image + .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png) + .map_err(|e| std::io::Error::other(e.to_string()))?; + + Ok::<_, std::io::Error>(png_bytes) + }) + .await + .map_err(std::io::Error::other)?; + + let output = match cover { + Ok(data) => { + #[expect(clippy::unwrap_used)] + let label = Label::new("cover").unwrap(); + HashMap::from([( + label, + PileValue::Blob { + mime: mime::IMAGE_PNG, + bytes: Arc::new(data), + }, + )]) + } + Err(error) => { + trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key()); + HashMap::new() + } + }; + + return Ok(self.output.get_or_init(|| output)); + } +} + +#[async_trait::async_trait] +impl Extractor for PdfCoverExtractor<'_> { + async fn field<'a>( + &'a self, + name: &Label, + ) -> Result>, std::io::Error> { + Ok(self.get_inner().await?.get(name)) + } + + async fn fields(&self) -> Result, std::io::Error> { + Ok(self.get_inner().await?.keys().cloned().collect()) + } +} diff --git a/crates/pile/Cargo.toml b/crates/pile/Cargo.toml index 93687fd..3e70249 100644 --- a/crates/pile/Cargo.toml +++ b/crates/pile/Cargo.toml @@ -9,7 +9,7 @@ workspace = true [dependencies] pile-toolbox = { workspace = true } -pile-dataset = { workspace = true, features = ["axum"] } +pile-dataset = { workspace = true, features = ["axum", "pdfium"] } pile-config = { workspace = true } tracing = { workspace = true }