From 32c611186f6dcf552c7b73ca3f86480f97461fbe Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:26:09 -0800 Subject: [PATCH] Add pdf extractors --- Cargo.lock | 443 ++++++++++++++++-- Cargo.toml | 1 + crates/pile-dataset/Cargo.toml | 1 + crates/pile-dataset/src/extract/flac.rs | 5 + crates/pile-dataset/src/extract/map.rs | 2 +- crates/pile-dataset/src/extract/mod.rs | 8 + crates/pile-dataset/src/extract/pdf/mod.rs | 61 +++ .../pile-dataset/src/extract/pdf/pdf_meta.rs | 98 ++++ .../pile-dataset/src/extract/pdf/pdf_text.rs | 79 ++++ 9 files changed, 669 insertions(+), 29 deletions(-) create mode 100644 crates/pile-dataset/src/extract/pdf/mod.rs create mode 100644 crates/pile-dataset/src/extract/pdf/pdf_meta.rs create mode 100644 crates/pile-dataset/src/extract/pdf/pdf_text.rs diff --git a/Cargo.lock b/Cargo.lock index e64ccd2..2306592 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,23 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -111,7 +128,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -155,6 +172,15 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + 
[[package]] name = "block-buffer" version = "0.11.0" @@ -164,6 +190,15 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + [[package]] name = "bon" version = "3.9.0" @@ -186,7 +221,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.117", ] [[package]] @@ -207,6 +242,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + [[package]] name = "cc" version = "1.2.56" @@ -244,6 +288,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common 0.1.7", + "inout", +] + [[package]] name = "clap" version = "4.5.60" @@ -275,7 +329,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -321,6 +375,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -379,6 +442,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "crypto-common" version = "0.2.0" @@ -408,7 +481,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn", + "syn 2.0.117", ] [[package]] @@ -419,7 +492,42 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "dary_heap" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04" + +[[package]] +name = "datasize" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e65c07d59e45d77a8bda53458c24a828893a99ac6cdd9c84111e09176ab739a2" +dependencies = [ + "datasize_derive", +] + +[[package]] +name = "datasize_derive" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613e4ee15899913285b7612004bbd490abd605be7b11d35afada5902fb6b91d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "deflate" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c86f7e25f518f4b81808a2cf1c50996a61f5c2eb394b2393bd87f2a4780a432f" +dependencies = [ + "adler32", ] [[package]] @@ -432,15 +540,25 @@ dependencies = [ "serde_core", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common 0.1.7", +] + [[package]] name = "digest" version = "0.11.0" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8bf3682cdec91817be507e4aa104314898b95b84d74f3d43882210101a545b6" dependencies = [ - "block-buffer", + "block-buffer 0.11.0", "const-oid", - "crypto-common", + "crypto-common 0.2.0", ] [[package]] @@ -489,6 +607,26 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fax" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab" +dependencies = [ + "fax_derive", +] + +[[package]] +name = "fax_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -507,6 +645,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "fs4" version = "0.13.1" @@ -531,7 +675,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -553,6 +697,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -589,6 +743,18 @@ dependencies = [ 
"wasip3", ] +[[package]] +name = "globalcache" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f07c02868ebe3ffab0c273801b815f19a9fc05743b5b9da971449dd40604fe30" +dependencies = [ + "async-trait", + "rustc-hash", + "tuple", + "web-time", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -597,7 +763,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -605,6 +771,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -699,12 +870,40 @@ dependencies = [ "web-time", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "istring" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875cc6fb9aecbc1a9bd736f2d18b12e0756b4c80c5e35e28262154abcb077a39" +dependencies = [ + "datasize", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -730,6 +929,12 @@ dependencies = [ "libc", ] +[[package]] +name = "jpeg-decoder" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" + [[package]] name = "js-sys" version = "0.3.83" @@ -764,6 +969,30 @@ version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +[[package]] +name = "libflate" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3248b8d211bd23a104a42d81b4fa8bb8ac4a3b75e7a43d85d2c9ccb6179cd74" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a599cb10a9cd92b1300debcef28da8f70b935ec937f44fcd1b70a7c986a11c5c" +dependencies = [ + "core2", + "hashbrown 0.16.1", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.16" @@ -815,6 +1044,12 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "measure_time" version = "0.9.0" @@ -959,6 +1194,45 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pdf" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de084796ae1744b0ae6b2b1e9368ed87bd43043ca410363edb72155ea68245db" +dependencies = [ + "aes", + "bitflags", + "cbc", + "datasize", + "deflate", + "fax", + "globalcache", + "indexmap", + "istring", + "itertools 0.13.0", + "jpeg-decoder", + "libflate", + "log", + "md5", + "once_cell", + "pdf_derive", + "sha2 0.10.9", + "snafu", + "stringprep", + "weezl", +] + +[[package]] +name = "pdf_derive" +version 
= "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66198aa8cbc8fb36b61c32b50fc2ba11cf70cf177c7a36b15f9444a2fce0267" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pile" version = "0.0.1" @@ -996,7 +1270,8 @@ version = "0.0.1" dependencies = [ "blake3", "chrono", - "itertools", + "itertools 0.14.0", + "pdf", "pile-config", "pile-flac", "pile-toolbox", @@ -1016,10 +1291,10 @@ name = "pile-flac" version = "0.0.1" dependencies = [ "base64", - "itertools", + "itertools 0.14.0", "mime", "paste", - "sha2", + "sha2 0.11.0-rc.5", "smartstring", "strum", "thiserror", @@ -1073,7 +1348,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.117", ] [[package]] @@ -1198,6 +1473,12 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -1281,7 +1562,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1306,6 +1587,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.11.0-rc.5" @@ -1314,7 +1606,7 @@ checksum = "7c5f3b1e2dc8aad28310d8410bd4d7e180eca65fca176c52ab00d364475d0024" dependencies = [ 
"cfg-if", "cpufeatures", - "digest", + "digest 0.11.0", ] [[package]] @@ -1384,6 +1676,27 @@ dependencies = [ "version_check", ] +[[package]] +name = "snafu" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e84b3f4eacbf3a1ce05eac6763b4d629d60cbc94d632e4092c54ade71f1e1a2" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "socket2" version = "0.6.2" @@ -1406,6 +1719,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.11.1" @@ -1430,7 +1754,18 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] @@ -1465,7 +1800,7 @@ dependencies = [ "fs4", "htmlescape", "hyperloglogplus", - "itertools", + "itertools 0.14.0", "levenshtein_automata", "log", "lru", @@ -1513,7 +1848,7 @@ checksum = "8b628488ae936c83e92b5c4056833054ca56f76c0e616aee8339e24ac89119cd" dependencies = [ "downcast-rs", "fastdivide", - "itertools", + "itertools 0.14.0", "serde", "tantivy-bitpacker", "tantivy-common", @@ -1563,7 +1898,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f8292095d1a8a2c2b36380ec455f910ab52dde516af36321af332c93f20ab7d5" dependencies = [ "futures-util", - "itertools", + "itertools 0.14.0", "tantivy-bitpacker", "tantivy-common", "tantivy-fst", @@ -1620,7 +1955,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1663,6 +1998,21 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.49.0" @@ -1688,7 +2038,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1762,7 +2112,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1829,18 +2179,49 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tuple" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb9f6bd73479481158ba8ee3edf17aca93354623d13f02e96a2014fdbc1c37e" +dependencies = [ + "num-traits", + "serde", +] + [[package]] name = "typenum" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -1988,7 +2369,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -2045,6 +2426,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + [[package]] name = "winapi" version = "0.3.9" @@ -2097,7 +2484,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2108,7 +2495,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2336,7 +2723,7 @@ dependencies = [ "heck", "indexmap", "prettyplease", - "syn", + "syn 2.0.117", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -2352,7 +2739,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -2411,7 +2798,7 @@ checksum = "0c15e1b46eff7c6c91195752e0eeed8ef040e391cdece7c25376957d5f15df22" dependencies = [ 
"proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f137f73..34b1856 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,6 +90,7 @@ toml = "1.0.3" toml_edit = "0.25.4" sha2 = "0.11.0-rc.5" blake3 = "1.8.3" +pdf = "0.10.0" # Misc helpers thiserror = "2.0.18" diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml index b65142a..c87f45d 100644 --- a/crates/pile-dataset/Cargo.toml +++ b/crates/pile-dataset/Cargo.toml @@ -24,3 +24,4 @@ rayon = { workspace = true } smartstring = { workspace = true } blake3 = { workspace = true } toml_edit = { workspace = true } +pdf = { workspace = true } diff --git a/crates/pile-dataset/src/extract/flac.rs b/crates/pile-dataset/src/extract/flac.rs index 65c7ee4..223ea48 100644 --- a/crates/pile-dataset/src/extract/flac.rs +++ b/crates/pile-dataset/src/extract/flac.rs @@ -22,6 +22,11 @@ impl<'a> FlacExtractor<'a> { return Ok(x); } + // If this isn't a flac file, ignore it. + if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") { + return Ok(self.output.get_or_init(|| HashMap::new())); + } + let file = File::open(&self.item.path)?; let reader = FlacReader::new(BufReader::new(file)); diff --git a/crates/pile-dataset/src/extract/map.rs b/crates/pile-dataset/src/extract/map.rs index e774c52..6f68e65 100644 --- a/crates/pile-dataset/src/extract/map.rs +++ b/crates/pile-dataset/src/extract/map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use crate::{Item, PileValue, extract::Extractor}; pub struct MapExtractor<'a, I: Item> { - pub(super) inner: HashMap>, + pub(crate) inner: HashMap>, } impl Extractor for MapExtractor<'_, I> { diff --git a/crates/pile-dataset/src/extract/mod.rs b/crates/pile-dataset/src/extract/mod.rs index ee3aa9d..fa3ac7c 100644 --- a/crates/pile-dataset/src/extract/mod.rs +++ b/crates/pile-dataset/src/extract/mod.rs @@ -7,6 +7,9 @@ pub use flac::*; mod fs; pub use fs::*; +mod pdf; +pub use pdf::*; + mod sidecar; pub use 
sidecar::*; @@ -49,6 +52,10 @@ impl<'a> MetaExtractor<'a, crate::FileItem> { Label::new("fs").unwrap(), crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))), ), + ( + Label::new("pdf").unwrap(), + crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))), + ), ( Label::new("sidecar").unwrap(), crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))), @@ -73,6 +80,7 @@ impl Extractor for MetaExtractor<'_, crate::FileItem> { return Ok(vec![ Label::new("flac").unwrap(), Label::new("fs").unwrap(), + Label::new("pdf").unwrap(), Label::new("sidecar").unwrap(), ]); } diff --git a/crates/pile-dataset/src/extract/pdf/mod.rs b/crates/pile-dataset/src/extract/pdf/mod.rs new file mode 100644 index 0000000..21f6d0c --- /dev/null +++ b/crates/pile-dataset/src/extract/pdf/mod.rs @@ -0,0 +1,63 @@ +use pile_config::Label; +use std::{collections::HashMap, rc::Rc}; + +mod pdf_meta; +pub use pdf_meta::*; + +mod pdf_text; +pub use pdf_text::*; + +use crate::{ + FileItem, PileValue, + extract::{Extractor, MapExtractor}, +}; + +/// Groups the PDF sub-extractors of a file under two labels: +/// `text` ([`PdfTextExtractor`]) and `meta` ([`PdfMetaExtractor`]). +pub struct PdfExtractor<'a> { + inner: MapExtractor<'a, FileItem>, +} + +impl<'a> PdfExtractor<'a> { + #[expect(clippy::unwrap_used)] + pub fn new(item: &'a FileItem) -> Self { + let inner = MapExtractor { + inner: HashMap::from([ + ( + Label::new("text").unwrap(), + PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))), + ), + ( + Label::new("meta").unwrap(), + PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))), + ), + ]), + }; + + Self { inner } + } +} + +impl Extractor for PdfExtractor<'_> { + fn field<'a>( + &'a self, + name: &pile_config::Label, + ) -> Result>, std::io::Error> { + if name.as_str() == "text" { + // Flatten: `pdf.text` resolves straight to the text extractor's own `text` field. + if let Some(PileValue::Extractor(x)) = self.inner.inner.get(name) { + return x.field(name); + } + } + + self.inner.field(name) + } + + #[expect(clippy::unwrap_used)] + fn fields(&self) -> Result, std::io::Error> { + Ok(vec![ + Label::new("text").unwrap(), + Label::new("meta").unwrap(), + ]) + 
} +} diff --git a/crates/pile-dataset/src/extract/pdf/pdf_meta.rs b/crates/pile-dataset/src/extract/pdf/pdf_meta.rs new file mode 100644 index 0000000..6a6e4aa --- /dev/null +++ b/crates/pile-dataset/src/extract/pdf/pdf_meta.rs @@ -0,0 +1,104 @@ +use pdf::file::FileOptions; +use pdf::primitive::{Date, TimeRel}; +use pile_config::Label; +use std::{collections::HashMap, sync::OnceLock}; + +use crate::{FileItem, PileValue, extract::Extractor}; + +/// Exposes the entries of a PDF's information dictionary (title, author, dates, ...), computed lazily and cached. +pub struct PdfMetaExtractor<'a> { + item: &'a FileItem, + output: OnceLock>>, +} + +impl<'a> PdfMetaExtractor<'a> { + pub fn new(item: &'a FileItem) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + // If this isn't a pdf file, ignore it. + if self.item.path.extension().and_then(|x| x.to_str()) != Some("pdf") { + return Ok(self.output.get_or_init(HashMap::new)); + } + + let file = FileOptions::cached() + .open(&self.item.path) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + let mut output: HashMap> = HashMap::new(); + + if let Some(info) = &file.trailer.info_dict { + let fields: &[(&str, Option<&_>)] = &[ + ("title", info.title.as_ref()), + ("author", info.author.as_ref()), + ("subject", info.subject.as_ref()), + ("keywords", info.keywords.as_ref()), + ("creator", info.creator.as_ref()), + ("producer", info.producer.as_ref()), + ]; + + #[expect(clippy::unwrap_used)] + for (key, val) in fields { + let label = Label::new(*key).unwrap(); + let value = match val { + Some(s) => PileValue::String(s.to_string_lossy().into()), + None => PileValue::Null, + }; + output.insert(label, value); + } + + #[expect(clippy::unwrap_used)] + { + output.insert( + Label::new("creation_date").unwrap(), + info.creation_date + .as_ref() + .map(|d| PileValue::String(format_date(d).into())) + .unwrap_or(PileValue::Null), + ); + output.insert( + Label::new("mod_date").unwrap(), + info.mod_date + .as_ref() + .map(|d| PileValue::String(format_date(d).into())) + .unwrap_or(PileValue::Null), + ); + } + } + + return Ok(self.output.get_or_init(|| 
output)); + } +} + +fn format_date(d: &Date) -> String { + let tz = match d.rel { + TimeRel::Universal => "Z".to_owned(), + TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute), + TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute), + }; + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}", + d.year, d.month, d.day, d.hour, d.minute, d.second, tz + ) +} + +impl Extractor for PdfMetaExtractor<'_> { + fn field<'a>( + &'a self, + name: &Label, + ) -> Result>, std::io::Error> { + Ok(self.get_inner()?.get(name)) + } + + fn fields(&self) -> Result, std::io::Error> { + Ok(self.get_inner()?.keys().cloned().collect()) + } +} diff --git a/crates/pile-dataset/src/extract/pdf/pdf_text.rs b/crates/pile-dataset/src/extract/pdf/pdf_text.rs new file mode 100644 index 0000000..31a0a4d --- /dev/null +++ b/crates/pile-dataset/src/extract/pdf/pdf_text.rs @@ -0,0 +1,85 @@ +use pdf::content::{Op, TextDrawAdjusted}; +use pdf::file::FileOptions; +use pile_config::Label; +use std::{collections::HashMap, sync::OnceLock}; + +use crate::{FileItem, PileValue, extract::Extractor}; + +/// Exposes the text drawn on a PDF's pages as a single `text` field, computed lazily and cached. +pub struct PdfTextExtractor<'a> { + item: &'a FileItem, + output: OnceLock>>, +} + +impl<'a> PdfTextExtractor<'a> { + pub fn new(item: &'a FileItem) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + // If this isn't a pdf file, ignore it. + if self.item.path.extension().and_then(|x| x.to_str()) != Some("pdf") { + return Ok(self.output.get_or_init(HashMap::new)); + } + + let file = FileOptions::cached() + .open(&self.item.path) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + let mut text_parts: Vec = Vec::new(); + + for page in file.pages() { + let page = page + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + if let Some(content) = &page.contents { + let ops = content.operations(&file.resolver()).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) + })?; + + for op in ops { + match op { + Op::TextDraw { text } 
=> { + text_parts.push(text.to_string_lossy()); + } + Op::TextDrawAdjusted { array } => { + for item in array { + if let TextDrawAdjusted::Text(text) = item { + text_parts.push(text.to_string_lossy()); + } + } + } + _ => {} + } + } + } + } + + let text = text_parts.join(" "); + + #[expect(clippy::unwrap_used)] + let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]); + + return Ok(self.output.get_or_init(|| output)); + } +} + +impl Extractor for PdfTextExtractor<'_> { + fn field<'a>( + &'a self, + name: &Label, + ) -> Result>, std::io::Error> { + Ok(self.get_inner()?.get(name)) + } + + fn fields(&self) -> Result, std::io::Error> { + Ok(self.get_inner()?.keys().cloned().collect()) + } +}