Add pdf extractors

This commit is contained in:
2026-03-06 16:26:09 -08:00
parent d51b8b51bf
commit 32c611186f
9 changed files with 669 additions and 29 deletions

443
Cargo.lock generated
View File

@@ -2,6 +2,23 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "adler32"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@@ -111,7 +128,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -155,6 +172,15 @@ dependencies = [
"cpufeatures",
]
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "block-buffer"
version = "0.11.0"
@@ -164,6 +190,15 @@ dependencies = [
"hybrid-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "bon"
version = "3.9.0"
@@ -186,7 +221,7 @@ dependencies = [
"proc-macro2",
"quote",
"rustversion",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -207,6 +242,15 @@ version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.2.56"
@@ -244,6 +288,16 @@ dependencies = [
"windows-link",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common 0.1.7",
"inout",
]
[[package]]
name = "clap"
version = "4.5.60"
@@ -275,7 +329,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -321,6 +375,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "core2"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
dependencies = [
"memchr",
]
[[package]]
name = "cpufeatures"
version = "0.2.17"
@@ -379,6 +442,16 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
[[package]]
name = "crypto-common"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "crypto-common"
version = "0.2.0"
@@ -408,7 +481,7 @@ dependencies = [
"proc-macro2",
"quote",
"strsim",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -419,7 +492,42 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
dependencies = [
"darling_core",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
name = "dary_heap"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
[[package]]
name = "datasize"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e65c07d59e45d77a8bda53458c24a828893a99ac6cdd9c84111e09176ab739a2"
dependencies = [
"datasize_derive",
]
[[package]]
name = "datasize_derive"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613e4ee15899913285b7612004bbd490abd605be7b11d35afada5902fb6b91d5"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "deflate"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c86f7e25f518f4b81808a2cf1c50996a61f5c2eb394b2393bd87f2a4780a432f"
dependencies = [
"adler32",
]
[[package]]
@@ -432,15 +540,25 @@ dependencies = [
"serde_core",
]
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer 0.10.4",
"crypto-common 0.1.7",
]
[[package]]
name = "digest"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8bf3682cdec91817be507e4aa104314898b95b84d74f3d43882210101a545b6"
dependencies = [
"block-buffer",
"block-buffer 0.11.0",
"const-oid",
"crypto-common",
"crypto-common 0.2.0",
]
[[package]]
@@ -489,6 +607,26 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "fax"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab"
dependencies = [
"fax_derive",
]
[[package]]
name = "fax_derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
@@ -507,6 +645,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "fs4"
version = "0.13.1"
@@ -531,7 +675,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -553,6 +697,16 @@ dependencies = [
"slab",
]
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "getrandom"
version = "0.2.17"
@@ -589,6 +743,18 @@ dependencies = [
"wasip3",
]
[[package]]
name = "globalcache"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f07c02868ebe3ffab0c273801b815f19a9fc05743b5b9da971449dd40604fe30"
dependencies = [
"async-trait",
"rustc-hash",
"tuple",
"web-time",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
@@ -597,7 +763,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@@ -605,6 +771,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
@@ -699,12 +870,40 @@ dependencies = [
"web-time",
]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"block-padding",
"generic-array",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "istring"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "875cc6fb9aecbc1a9bd736f2d18b12e0756b4c80c5e35e28262154abcb077a39"
dependencies = [
"datasize",
]
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.14.0"
@@ -730,6 +929,12 @@ dependencies = [
"libc",
]
[[package]]
name = "jpeg-decoder"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07"
[[package]]
name = "js-sys"
version = "0.3.83"
@@ -764,6 +969,30 @@ version = "0.2.182"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"
[[package]]
name = "libflate"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3248b8d211bd23a104a42d81b4fa8bb8ac4a3b75e7a43d85d2c9ccb6179cd74"
dependencies = [
"adler32",
"core2",
"crc32fast",
"dary_heap",
"libflate_lz77",
]
[[package]]
name = "libflate_lz77"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a599cb10a9cd92b1300debcef28da8f70b935ec937f44fcd1b70a7c986a11c5c"
dependencies = [
"core2",
"hashbrown 0.16.1",
"rle-decode-fast",
]
[[package]]
name = "libm"
version = "0.2.16"
@@ -815,6 +1044,12 @@ dependencies = [
"regex-automata",
]
[[package]]
name = "md5"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measure_time"
version = "0.9.0"
@@ -959,6 +1194,45 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "pdf"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de084796ae1744b0ae6b2b1e9368ed87bd43043ca410363edb72155ea68245db"
dependencies = [
"aes",
"bitflags",
"cbc",
"datasize",
"deflate",
"fax",
"globalcache",
"indexmap",
"istring",
"itertools 0.13.0",
"jpeg-decoder",
"libflate",
"log",
"md5",
"once_cell",
"pdf_derive",
"sha2 0.10.9",
"snafu",
"stringprep",
"weezl",
]
[[package]]
name = "pdf_derive"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66198aa8cbc8fb36b61c32b50fc2ba11cf70cf177c7a36b15f9444a2fce0267"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "pile"
version = "0.0.1"
@@ -996,7 +1270,8 @@ version = "0.0.1"
dependencies = [
"blake3",
"chrono",
"itertools",
"itertools 0.14.0",
"pdf",
"pile-config",
"pile-flac",
"pile-toolbox",
@@ -1016,10 +1291,10 @@ name = "pile-flac"
version = "0.0.1"
dependencies = [
"base64",
"itertools",
"itertools 0.14.0",
"mime",
"paste",
"sha2",
"sha2 0.11.0-rc.5",
"smartstring",
"strum",
"thiserror",
@@ -1073,7 +1348,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
dependencies = [
"proc-macro2",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -1198,6 +1473,12 @@ version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rle-decode-fast"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
[[package]]
name = "rust-stemmers"
version = "1.2.0"
@@ -1281,7 +1562,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -1306,6 +1587,17 @@ dependencies = [
"serde_core",
]
[[package]]
name = "sha2"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [
"cfg-if",
"cpufeatures",
"digest 0.10.7",
]
[[package]]
name = "sha2"
version = "0.11.0-rc.5"
@@ -1314,7 +1606,7 @@ checksum = "7c5f3b1e2dc8aad28310d8410bd4d7e180eca65fca176c52ab00d364475d0024"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
"digest 0.11.0",
]
[[package]]
@@ -1384,6 +1676,27 @@ dependencies = [
"version_check",
]
[[package]]
name = "snafu"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e84b3f4eacbf3a1ce05eac6763b4d629d60cbc94d632e4092c54ade71f1e1a2"
dependencies = [
"snafu-derive",
]
[[package]]
name = "snafu-derive"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "socket2"
version = "0.6.2"
@@ -1406,6 +1719,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "stringprep"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1"
dependencies = [
"unicode-bidi",
"unicode-normalization",
"unicode-properties",
]
[[package]]
name = "strsim"
version = "0.11.1"
@@ -1430,7 +1754,18 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
@@ -1465,7 +1800,7 @@ dependencies = [
"fs4",
"htmlescape",
"hyperloglogplus",
"itertools",
"itertools 0.14.0",
"levenshtein_automata",
"log",
"lru",
@@ -1513,7 +1848,7 @@ checksum = "8b628488ae936c83e92b5c4056833054ca56f76c0e616aee8339e24ac89119cd"
dependencies = [
"downcast-rs",
"fastdivide",
"itertools",
"itertools 0.14.0",
"serde",
"tantivy-bitpacker",
"tantivy-common",
@@ -1563,7 +1898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8292095d1a8a2c2b36380ec455f910ab52dde516af36321af332c93f20ab7d5"
dependencies = [
"futures-util",
"itertools",
"itertools 0.14.0",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-fst",
@@ -1620,7 +1955,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -1663,6 +1998,21 @@ dependencies = [
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.49.0"
@@ -1688,7 +2038,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -1762,7 +2112,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -1829,18 +2179,49 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "tuple"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bb9f6bd73479481158ba8ee3edf17aca93354623d13f02e96a2014fdbc1c37e"
dependencies = [
"num-traits",
"serde",
]
[[package]]
name = "typenum"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
[[package]]
name = "unicode-bidi"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "unicode-normalization"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-properties"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
[[package]]
name = "unicode-segmentation"
version = "1.12.0"
@@ -1988,7 +2369,7 @@ dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
"wasm-bindgen-shared",
]
@@ -2045,6 +2426,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "weezl"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
[[package]]
name = "winapi"
version = "0.3.9"
@@ -2097,7 +2484,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -2108,7 +2495,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]
@@ -2336,7 +2723,7 @@ dependencies = [
"heck",
"indexmap",
"prettyplease",
"syn",
"syn 2.0.117",
"wasm-metadata",
"wit-bindgen-core",
"wit-component",
@@ -2352,7 +2739,7 @@ dependencies = [
"prettyplease",
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
"wit-bindgen-core",
"wit-bindgen-rust",
]
@@ -2411,7 +2798,7 @@ checksum = "0c15e1b46eff7c6c91195752e0eeed8ef040e391cdece7c25376957d5f15df22"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.117",
]
[[package]]

View File

@@ -90,6 +90,7 @@ toml = "1.0.3"
toml_edit = "0.25.4"
sha2 = "0.11.0-rc.5"
blake3 = "1.8.3"
pdf = "0.10.0"
# Misc helpers
thiserror = "2.0.18"

View File

@@ -24,3 +24,4 @@ rayon = { workspace = true }
smartstring = { workspace = true }
blake3 = { workspace = true }
toml_edit = { workspace = true }
pdf = { workspace = true }

View File

@@ -22,6 +22,11 @@ impl<'a> FlacExtractor<'a> {
return Ok(x);
}
// If this isn't a flac file, ignore it.
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") {
return Ok(self.output.get_or_init(|| HashMap::new()));
}
let file = File::open(&self.item.path)?;
let reader = FlacReader::new(BufReader::new(file));

View File

@@ -4,7 +4,7 @@ use std::collections::HashMap;
use crate::{Item, PileValue, extract::Extractor};
pub struct MapExtractor<'a, I: Item> {
pub(super) inner: HashMap<Label, PileValue<'a, I>>,
pub(crate) inner: HashMap<Label, PileValue<'a, I>>,
}
impl<I: Item> Extractor<I> for MapExtractor<'_, I> {

View File

@@ -7,6 +7,9 @@ pub use flac::*;
mod fs;
pub use fs::*;
mod pdf;
pub use pdf::*;
mod sidecar;
pub use sidecar::*;
@@ -49,6 +52,10 @@ impl<'a> MetaExtractor<'a, crate::FileItem> {
Label::new("fs").unwrap(),
crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))),
),
(
Label::new("sidecar").unwrap(),
crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))),
@@ -73,6 +80,7 @@ impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> {
return Ok(vec![
Label::new("flac").unwrap(),
Label::new("fs").unwrap(),
Label::new("pdf").unwrap(),
Label::new("sidecar").unwrap(),
]);
}

View File

@@ -0,0 +1,61 @@
use pile_config::Label;
use std::{collections::HashMap, rc::Rc};
mod pdf_meta;
pub use pdf_meta::*;
mod pdf_text;
pub use pdf_text::*;
use crate::{
FileItem, PileValue,
extract::{Extractor, MapExtractor},
};
pub struct PdfExtractor<'a> {
inner: MapExtractor<'a, FileItem>,
}
impl<'a> PdfExtractor<'a> {
#[expect(clippy::unwrap_used)]
pub fn new(item: &'a FileItem) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("text").unwrap(),
PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))),
),
(
Label::new("meta").unwrap(),
PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))),
),
]),
};
Self { inner }
}
}
impl Extractor<FileItem> for PdfExtractor<'_> {
    /// Looks up `name` among the PDF sub-extractors.
    ///
    /// The `text` label is special-cased: instead of returning the nested
    /// extractor itself, the request is forwarded to that extractor's own
    /// `text` field, so the caller receives whatever it yields for `text`.
    /// Every other label is resolved by the underlying map.
    ///
    /// # Errors
    /// Propagates any `std::io::Error` returned by the nested extractor.
    fn field<'a>(
        &'a self,
        name: &pile_config::Label,
    ) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
        // Pattern-match instead of `unwrap()` + `unreachable!()`: if the
        // `text` entry were ever missing or not an extractor, we fall back to
        // the plain map lookup rather than panicking.
        match self.inner.inner.get(name) {
            Some(PileValue::Extractor(text)) if name.as_str() == "text" => text.field(name),
            _ => self.inner.field(name),
        }
    }

    /// Lists the labels this extractor exposes: `text` and `meta`.
    #[expect(clippy::unwrap_used)]
    fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(vec![
            Label::new("text").unwrap(),
            Label::new("meta").unwrap(),
        ])
    }
}

View File

@@ -0,0 +1,98 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
/// Lazily extracts the document-information dictionary of a PDF file.
///
/// Nothing is read until a field is requested; the resulting map is then
/// cached in `output`.
pub struct PdfMetaExtractor<'a> {
    /// File item whose `path` is opened for parsing.
    item: &'a FileItem,
    /// Cached extraction result, filled by the first `get_inner` call that
    /// does not fail.
    output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
}

impl<'a> PdfMetaExtractor<'a> {
    /// Creates an extractor for `item` with an empty cache.
    pub fn new(item: &'a FileItem) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the cached metadata map, building it on first use.
    ///
    /// When the file has an info dictionary the map contains `title`,
    /// `author`, `subject`, `keywords`, `creator`, `producer`,
    /// `creation_date` and `mod_date`; entries absent from the dictionary are
    /// stored as `PileValue::Null`. Without an info dictionary the map is
    /// empty.
    ///
    /// Files whose extension is not `pdf` (compared ASCII case-insensitively)
    /// are not opened at all and yield an empty map.
    ///
    /// # Errors
    /// Returns an `InvalidData` error carrying the parser's message when the
    /// file cannot be opened or parsed as a PDF.
    fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        // If this isn't a pdf file, ignore it instead of reporting a parse
        // error for a file that was never meant to be a PDF.
        let is_pdf = self
            .item
            .path
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"));
        if !is_pdf {
            return Ok(self.output.get_or_init(HashMap::new));
        }

        let file = FileOptions::cached()
            .open(&self.item.path)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

        let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();

        if let Some(info) = &file.trailer.info_dict {
            // Plain string entries of the info dictionary.
            let fields: &[(&str, Option<&_>)] = &[
                ("title", info.title.as_ref()),
                ("author", info.author.as_ref()),
                ("subject", info.subject.as_ref()),
                ("keywords", info.keywords.as_ref()),
                ("creator", info.creator.as_ref()),
                ("producer", info.producer.as_ref()),
            ];

            #[expect(clippy::unwrap_used)]
            for (key, val) in fields {
                let label = Label::new(*key).unwrap();
                let value = match val {
                    Some(s) => PileValue::String(s.to_string_lossy().into()),
                    None => PileValue::Null,
                };
                output.insert(label, value);
            }

            // Date entries are rendered as text through `format_date`.
            let dates = [
                ("creation_date", info.creation_date.as_ref()),
                ("mod_date", info.mod_date.as_ref()),
            ];

            #[expect(clippy::unwrap_used)]
            for (key, date) in dates {
                output.insert(
                    Label::new(key).unwrap(),
                    date.map_or(PileValue::Null, |d| {
                        PileValue::String(format_date(d).into())
                    }),
                );
            }
        }

        return Ok(self.output.get_or_init(|| output));
    }
}
/// Renders a PDF date as `YYYY-MM-DDThh:mm:ss` followed by a zone suffix:
/// `Z` for `TimeRel::Universal`, otherwise `+hh:mm` (`TimeRel::Later`) or
/// `-hh:mm` (`TimeRel::Earlier`) built from `tz_hour` and `tz_minute`.
fn format_date(d: &Date) -> String {
    // Shared formatter for the two signed-offset cases.
    let offset = |sign: char| format!("{sign}{:02}:{:02}", d.tz_hour, d.tz_minute);

    let zone = match d.rel {
        TimeRel::Universal => String::from("Z"),
        TimeRel::Later => offset('+'),
        TimeRel::Earlier => offset('-'),
    };

    let stamp = format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}",
        d.year, d.month, d.day, d.hour, d.minute, d.second
    );

    stamp + &zone
}
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
    /// Returns the metadata value stored under `name`, or `None` when the
    /// extracted map has no such key.
    ///
    /// # Errors
    /// Forwards any error from building the metadata map.
    fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
        let metadata = self.get_inner()?;
        Ok(metadata.get(name))
    }

    /// Returns every key present in the extracted metadata map.
    ///
    /// # Errors
    /// Forwards any error from building the metadata map.
    fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        self.get_inner()
            .map(|metadata| metadata.keys().cloned().collect())
    }
}

View File

@@ -0,0 +1,79 @@
use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use crate::{FileItem, PileValue, extract::Extractor};
/// Lazily extracts the text drawn on the pages of a PDF file.
///
/// Nothing is read until a field is requested; the resulting single-entry
/// map (`text`) is then cached in `output`.
pub struct PdfTextExtractor<'a> {
    /// File item whose `path` is opened for parsing.
    item: &'a FileItem,
    /// Cached extraction result, filled by the first `get_inner` call that
    /// does not fail.
    output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
}

impl<'a> PdfTextExtractor<'a> {
    /// Creates an extractor for `item` with an empty cache.
    pub fn new(item: &'a FileItem) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the cached map, building it on first use.
    ///
    /// For a PDF, the map holds one `text` entry: the strings of every
    /// `TextDraw` operation and every text element of every
    /// `TextDrawAdjusted` operation, in page and operation order, joined with
    /// single spaces. Other operations and non-text array elements are
    /// skipped.
    ///
    /// Files whose extension is not `pdf` (compared ASCII case-insensitively)
    /// are not opened at all and yield an empty map.
    ///
    /// # Errors
    /// Returns an `InvalidData` error carrying the parser's message when the
    /// file, one of its pages, or a page's content stream cannot be parsed.
    fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        // If this isn't a pdf file, ignore it instead of reporting a parse
        // error for a file that was never meant to be a PDF.
        let is_pdf = self
            .item
            .path
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"));
        if !is_pdf {
            return Ok(self.output.get_or_init(HashMap::new));
        }

        let file = FileOptions::cached()
            .open(&self.item.path)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

        // The resolver does not change between pages, so build it once.
        let resolver = file.resolver();

        let mut text_parts: Vec<String> = Vec::new();

        for page in file.pages() {
            let page = page
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

            if let Some(content) = &page.contents {
                let ops = content.operations(&resolver).map_err(|e| {
                    std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
                })?;

                for op in ops {
                    match op {
                        Op::TextDraw { text } => {
                            text_parts.push(text.to_string_lossy());
                        }
                        Op::TextDrawAdjusted { array } => {
                            // Keep only the text elements of the array.
                            for item in array {
                                if let TextDrawAdjusted::Text(text) = item {
                                    text_parts.push(text.to_string_lossy());
                                }
                            }
                        }
                        _ => {}
                    }
                }
            }
        }

        let text = text_parts.join(" ");

        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);

        return Ok(self.output.get_or_init(|| output));
    }
}
impl Extractor<FileItem> for PdfTextExtractor<'_> {
    /// Returns the extracted value stored under `name`, or `None` when the
    /// extracted map has no such key.
    ///
    /// # Errors
    /// Forwards any error from building the text map.
    fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
        let extracted = self.get_inner()?;
        Ok(extracted.get(name))
    }

    /// Returns every key present in the extracted text map.
    ///
    /// # Errors
    /// Forwards any error from building the text map.
    fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        self.get_inner()
            .map(|extracted| extracted.keys().cloned().collect())
    }
}