diff --git a/.gitignore b/.gitignore index ac2fab7..9eba5bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /target -*.ignore \ No newline at end of file +*.ignore + +/data +/tokenizer.json \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..4761000 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,2889 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow-array" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-data" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", +] + +[[package]] +name = "arrow-schema" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" + +[[package]] +name = "arrow-select" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" + +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cc" +version = "1.2.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link 0.2.1", +] + +[[package]] +name = "clap" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + +[[package]] +name = "console" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width 0.2.2", + "windows-sys 0.61.2", +] + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "dary_heap" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04" + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + +[[package]] +name = "flatbuffers" +version = "25.9.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +dependencies = [ + "crc32fast", + "libz-rs-sys", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "h2" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "hyper" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd" +dependencies = [ + "console", + "portable-atomic", + "unicode-segmentation", + "unicode-width 0.2.2", + "unit-prefix", + "vt100", + "web-time", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "libz-rs-sys" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" +dependencies = [ + "zlib-rs", +] + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + +[[package]] +name = "lz4_flex" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.59.0", +] + +[[package]] +name = "nanochat-rs" +version = "0.1.0" +dependencies = [ + "ahash", + "anstyle", + "anyhow", + "clap", + "compact_str", + "dary_heap", + "fancy-regex", + "futures-util", + "indicatif", + "parking_lot", + "parquet", + "rayon", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "tracing-indicatif", + "tracing-subscriber", + "url", +] + +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "openssl" +version = "0.10.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a9f0075ba3c21b09f8e8b2026584b1d18d49388648f2fbbf3c97ea8deced8e2" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link 0.2.1", +] + +[[package]] +name = "parquet" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "potential_utf" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +dependencies = [ + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "reqwest" +version = "0.12.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "slab" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom 0.3.4", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-indicatif" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d4e11e0e27acef25a47f27e9435355fecdc488867fa2bc90e75b0700d2823d" +dependencies = [ + "indicatif", + "tracing", + "tracing-core", + "tracing-subscriber", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unit-prefix" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vt100" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84cd863bf0db7e392ba3bd04994be3473491b31e66340672af5d11943c6274de" +dependencies = [ + "itoa", + "log", + "unicode-width 0.1.14", + "vte", +] + +[[package]] +name = "vte" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5022b5fbf9407086c180e9557be968742d839e68346af7792b8592489732197" +dependencies = [ + "arrayvec", + "utf8parse", + "vte_generate_state_changes", +] + +[[package]] +name = "vte_generate_state_changes" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e369bee1b05d510a7b4ed645f5faa90619e05437111783ea5848f28d97d3c2e" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link 0.2.1", + "windows-result 0.4.1", + "windows-strings 0.5.1", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-registry" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" +dependencies = [ + "windows-link 0.1.3", + "windows-result 0.3.4", + "windows-strings 0.4.2", +] + +[[package]] +name = "windows-result" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +dependencies = [ + "windows-link 0.1.3", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link 0.2.1", +] + +[[package]] +name = "windows-strings" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +dependencies = [ + "windows-link 0.1.3", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link 0.2.1", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link 0.2.1", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zlib-rs" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..062a381 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,79 @@ +[package] +name = "nanochat-rs" +version = "0.1.0" +edition = "2024" + +[lints.rust] +unused_import_braces = "deny" +unit_bindings = "deny" +single_use_lifetimes = "deny" +non_ascii_idents = "deny" +macro_use_extern_crate = "deny" +elided_lifetimes_in_paths = "deny" +absolute_paths_not_starting_with_crate = "deny" +explicit_outlives_requirements = "warn" +unused_crate_dependencies = "warn" +redundant_lifetimes = "warn" +missing_docs = "allow" + +[lints.clippy] +todo = "deny" +uninlined_format_args = "allow" +result_large_err = "allow" +too_many_arguments = "allow" +upper_case_acronyms = "deny" +needless_return = "allow" +new_without_default = "allow" +tabs_in_doc_comments = "allow" +dbg_macro = "deny" +allow_attributes = "deny" +create_dir = "deny" +filetype_is_file = "deny" +integer_division = "allow" +lossy_float_literal = "deny" +map_err_ignore = "deny" +mutex_atomic = "deny" +needless_raw_strings = "deny" +str_to_string = "deny" +string_add = "deny" +string_to_string = "deny" +use_debug = "allow" +verbose_file_reads = "deny" +large_types_passed_by_value = "deny" +wildcard_dependencies = "deny" +negative_feature_names = "deny" +redundant_feature_names = "deny" +multiple_crate_versions = "allow" +missing_safety_doc = "warn" +identity_op = "allow" +print_stderr = "deny" +print_stdout = "deny" +comparison_chain = "allow" +unimplemented = "deny" +unwrap_used = "warn" +expect_used = "warn" +type_complexity = "allow" + + +[dependencies] +ahash = "0.8.12" +anstyle = "1.0.13" +anyhow = "1.0.100" +clap = { version = "4.5.49", features = ["derive"] } +compact_str = "0.9.0" +dary_heap = "0.3.8" +fancy-regex = "0.16.2" +futures-util = "0.3.31" +indicatif = { version = "0.18.0", features = ["improved_unicode"] } +parking_lot = "0.12.5" +parquet = "56.2.0" +rayon = "1.11.0" +reqwest = { version = "0.12.24", features = ["json", "stream"] } +serde = { version = "1.0.228", features = ["derive"] } +serde_json = "1.0.145" +thiserror = "2.0.17" +tokio = { version = "1.48.0", features = ["full"] } +tracing = "0.1.41" +tracing-indicatif = "0.3.13" +tracing-subscriber = { version = "0.3.20", features = ["env-filter", "json"] } +url = "2.5.7" diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..218e203 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +hard_tabs = true diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..cfab229 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,64 @@ +use anstyle::{AnsiColor, Color, Style}; +use indicatif::ProgressStyle; + +pub fn clap_styles() -> clap::builder::Styles { + clap::builder::Styles::styled() + .usage( + Style::new() + .bold() + .fg_color(Some(Color::Ansi(AnsiColor::Blue))), + ) + .header( + Style::new() + .bold() + .fg_color(Some(Color::Ansi(AnsiColor::Blue))), + ) + .literal( + Style::new() + .bold() + .fg_color(Some(Color::Ansi(AnsiColor::BrightBlack))), + ) + .invalid( + Style::new() + .bold() + .fg_color(Some(Color::Ansi(AnsiColor::Red))), + ) + .error( + Style::new() + .bold() + .fg_color(Some(Color::Ansi(AnsiColor::Red))), + ) + .valid( + Style::new() + .bold() + .underline() + .fg_color(Some(Color::Ansi(AnsiColor::Green))), + ) + .placeholder(Style::new().fg_color(Some(Color::Ansi(AnsiColor::White)))) +} + +#[expect(clippy::unwrap_used)] +pub fn progress_big() -> ProgressStyle { + return ProgressStyle::default_bar() + .template( + " {spinner:.green} [{elapsed_precise}] [{bar:40.green/dim}] {pos:>7}/{len:7} {msg:.dim}", + ) + .unwrap() + .progress_chars("=>-") + .tick_strings(&[ + "⠉⠉", "⠈⠙", "⠀⠹", "⠀⢸", "⠀⣰", "⢀⣠", "⣀⣀", "⣄⡀", "⣆⠀", "⡇⠀", "⠏⠀", "⠋⠁", "⣏⣹", + ]); +} + +#[expect(clippy::unwrap_used)] +pub fn progress_bytes() -> ProgressStyle { + return ProgressStyle::default_bar() + .template( + " {bar:16.red/white.dim} {elapsed_precise:.dim} {bytes:>10}/{total_bytes:>10} {msg:.dim} ({eta})", + ) + .unwrap() + .progress_chars("---") + .tick_strings(&[ + "⠉⠉", "⠈⠙", "⠀⠹", "⠀⢸", "⠀⣰", "⢀⣠", "⣀⣀", "⣄⡀", "⣆⠀", "⡇⠀", "⠏⠀", "⠋⠁", "⣏⣹", + ]); +} diff --git a/src/command/download.rs b/src/command/download.rs new file mode 100644 index 0000000..9674cb4 --- /dev/null +++ b/src/command/download.rs @@ -0,0 +1,280 @@ +use anyhow::Result; +use clap::Args; +use futures_util::StreamExt; +use indicatif::MultiProgress; +use indicatif::ProgressBar; +use rayon::prelude::*; +use std::fs; +use std::path::{Path, PathBuf}; +use std::time::Duration; +use tokio::runtime::Runtime; +use tracing::{debug, error, info}; +use url::Url; + +use crate::cli::{progress_big, progress_bytes}; + +const BASE_URL: &str = + "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"; + +const MAX_SHARD: usize = 1822; + +#[derive(Debug, Args, Clone)] + +pub struct DownloadArgs { + /// Training data dir + #[clap(default_value = "data")] + data_dir: PathBuf, + + /// Number of shards to download (-1 for all) + #[arg(short = 'n', long, default_value = "-1")] + num_files: isize, + + /// Number of parallel downloads + #[arg(short = 't', long, default_value = "8")] + threads: usize, +} + +impl DownloadArgs { + pub fn run(self, mp: Option) -> Result<()> { + info!("Downloading files from {BASE_URL}"); + fs::create_dir_all(&self.data_dir)?; + + let num_shards_to_download = if self.num_files == -1 { + MAX_SHARD + 1 + } else { + self.num_files.min((MAX_SHARD + 1) as isize) as usize + }; + + let ids_to_download: Vec = (0..num_shards_to_download).collect(); + + info!("Downloading {} shards...", ids_to_download.len(),); + info!("Target directory: {}", self.data_dir.display()); + + let main_pb = mp.as_ref().map(|mp| { + let pb = mp.add(ProgressBar::new(ids_to_download.len() as u64)); + pb.set_style(progress_big()); + pb.set_message("Downloading training data"); + pb.enable_steady_tick(Duration::from_millis(100)); + pb + }); + + let rt = Runtime::new()?; + + let (tx, rx) = std::sync::mpsc::channel(); + + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(self.threads) + .build()?; + + pool.install(|| { + ids_to_download + .into_par_iter() + .for_each_with(tx, |tx, index| { + let target = self.data_dir.clone(); + let main_pb = main_pb.clone(); + let mp_clone = mp.clone(); + let rt_handle = rt.handle().clone(); // Clone the runtime handle for each thread + + let result = rt_handle.block_on(async { + download_single_file(index, &target, main_pb, mp_clone).await + }); + + // Send the result back to the main thread for aggregation + #[expect(clippy::unwrap_used)] + tx.send(result).unwrap(); + }); + }); + + // Wait for all downloads to finish and collect results + let mut successful_downloads = 0; + for _ in 0..num_shards_to_download { + if let Ok(Ok(_)) = rx.recv() { + // Receive the Result<(), String> wrapped in a Result from MPSC + successful_downloads += 1; + } + } + + if let Some(pb) = main_pb.as_ref() { + pb.finish_and_clear(); + info!("Downloads complete ({successful_downloads} successful)"); + } + + return Ok(()); + } +} + +async fn download_single_file( + index: usize, + target: &Path, + progress_bar: Option, + mp: Option, +) -> Result<(), String> { + let filename = format!("shard_{:05}.parquet", index); + let filepath = target.join(&filename); + + if filepath.exists() { + info!("Skipping {} (already exists)", filepath.display()); + if let Some(pb) = progress_bar.as_ref() { + pb.inc(1); + } + return Ok(()); + } + + #[expect(clippy::unwrap_used)] + let url = Url::parse(&format!("{BASE_URL}/{filename}")).unwrap(); + + info!("Downloading {} from {}", filename, url); + + let max_attempts = 5; + 'attempt_loop: for attempt in 1..=max_attempts { + let temp_filepath = filepath.with_extension("parquet.tmp"); + let client = reqwest::Client::new(); + + match client.get(url.clone()).send().await { + Ok(response) => { + if !response.status().is_success() { + error!( + "Attempt {}/{}: Server responded with status {} for {}", + attempt, + max_attempts, + response.status(), + url + ); + tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))).await; + continue; + } + + let total_size = response.content_length().unwrap_or(0); + debug!("Total size for {}: {}", filename, total_size); + + // Create file progress bar + let file_pb = if total_size > 0 + && let Some(mp) = mp.as_ref() + { + Some({ + let pb = mp.add(ProgressBar::new(total_size)); + pb.set_style(progress_bytes()); + pb.set_message(format!("Downloading {}", filename)); + pb + }) + } else { + None + }; + + let mut file = match tokio::fs::File::create(&temp_filepath).await { + Ok(file) => file, + Err(e) => { + error!( + "Attempt {}/{}: Failed to create temporary file {}: {}", + attempt, + max_attempts, + temp_filepath.display(), + e + ); + tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))).await; + continue; + } + }; + + let mut stream = response.bytes_stream(); + let mut downloaded: u64 = 0; + + while let Some(chunk_result) = StreamExt::next(&mut stream).await { + match chunk_result { + Ok(chunk) => { + match tokio::io::AsyncWriteExt::write_all(&mut file, &chunk).await { + Ok(_) => { + downloaded += chunk.len() as u64; + if let Some(pb) = &file_pb { + pb.set_position(downloaded); + } + } + Err(e) => { + error!( + "Attempt {}/{}: Failed to write to temporary file {}: {}", + attempt, + max_attempts, + temp_filepath.display(), + e + ); + + // Clean up + let _ = tokio::fs::remove_file(&temp_filepath).await; + if let Some(pb) = &file_pb { + pb.finish_and_clear(); + } + tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))) + .await; + continue 'attempt_loop; + } + } + } + + Err(e) => { + error!( + "Attempt {}/{}: Error reading chunk for {}: {}", + attempt, max_attempts, filename, e + ); + + let _ = tokio::fs::remove_file(&temp_filepath).await; + if let Some(pb) = &file_pb { + pb.finish_and_clear(); + } + tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))).await; + continue 'attempt_loop; + } + } + } + + if let Some(pb) = &file_pb { + pb.finish_and_clear(); + } + + // Atomically rename the temporary file + match tokio::fs::rename(&temp_filepath, &filepath).await { + Ok(_) => { + info!("Successfully downloaded {}", filename); + if let Some(pb) = progress_bar.as_ref() { + pb.inc(1); + } + return Ok(()); + } + Err(e) => { + error!( + "Attempt {}/{}: Failed to rename temporary file {} to {}: {}", + attempt, + max_attempts, + temp_filepath.display(), + filepath.display(), + e + ); + let _ = tokio::fs::remove_file(&temp_filepath).await; // Clean up + } + } + } + Err(e) => { + error!( + "Attempt {}/{}: Failed to download {}: {}", + attempt, max_attempts, filename, e + ); + + // Clean up any partial files + let _ = tokio::fs::remove_file(&temp_filepath).await; + } + } + + if attempt < max_attempts { + let wait_time = 2u64.pow(attempt); + info!("Waiting {} seconds before retry...", wait_time); + tokio::time::sleep(Duration::from_secs(wait_time)).await; + } else { + error!( + "Failed to download {} after {} attempts", + filename, max_attempts + ); + return Err(format!("Failed to download {}", filename)); + } + } + + Err(format!("Failed to download {}", filename)) +} diff --git a/src/command/mod.rs b/src/command/mod.rs new file mode 100644 index 0000000..290a636 --- /dev/null +++ b/src/command/mod.rs @@ -0,0 +1,26 @@ +mod download; +mod train_tokenizer; + +#[derive(Debug, clap::Subcommand)] +pub enum SubCommand { + /// Download training set + Download { + #[command(flatten)] + args: download::DownloadArgs, + }, + + /// Train tokenizer + TrainTokenizer { + #[command(flatten)] + args: train_tokenizer::TrainTokenizerArgs, + }, +} + +impl SubCommand { + pub fn run(self, mp: Option) -> anyhow::Result<()> { + match self { + Self::Download { args } => args.run(mp), + Self::TrainTokenizer { args } => args.run(mp), + } + } +} diff --git a/src/command/train_tokenizer.rs b/src/command/train_tokenizer.rs new file mode 100644 index 0000000..eccbb71 --- /dev/null +++ b/src/command/train_tokenizer.rs @@ -0,0 +1,57 @@ +use anyhow::{Context, Result}; +use clap::Args; +use indicatif::MultiProgress; +use rayon::ThreadPoolBuilder; +use std::fs::File; +use std::path::PathBuf; +use tracing::info; + +use crate::data_reader::DataReader; +use crate::tokenizer::Tokenizer; + +#[derive(Debug, Args, Clone)] + +pub struct TrainTokenizerArgs { + /// Path to training data + #[clap(default_value = "data")] + data_dir: PathBuf, + + /// Where to save tokenizer + #[clap(default_value = "tokenizer.json")] + target: PathBuf, + + /// Only train on the first n texts + #[clap(long)] + first_n: Option, + + /// Number of threads to use for training + #[clap(long, default_value = "0")] + threads: usize, +} + +impl TrainTokenizerArgs { + pub fn run(self, mp: Option) -> Result<()> { + let iter = DataReader::new(5, &self.data_dir).context("while initializing data reader")?; + + #[expect(clippy::unwrap_used)] // Lazy error handling + let iter = iter.map(|x| x.unwrap()); + + let pool = ThreadPoolBuilder::new() + .num_threads(self.threads) + .build() + .context("while building thread pool")?; + + let tokenizer = pool + .install(|| match self.first_n { + Some(n) => Tokenizer::train(mp, iter.take(n), 1024), + None => Tokenizer::train(mp, iter, 1024), + }) + .context("while training tokenizer")?; + + info!("Saving to {}", self.target.display()); + let target = File::create(&self.target).context("while creating tokenizer file")?; + serde_json::to_writer(target, &tokenizer).context("while saving tokenizer")?; + + Ok(()) + } +} diff --git a/src/data_reader.rs b/src/data_reader.rs new file mode 100644 index 0000000..7e1188d --- /dev/null +++ b/src/data_reader.rs @@ -0,0 +1,187 @@ +use anyhow::Result; +use parking_lot::Mutex; +use parquet::errors::ParquetError; +use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::record::RowAccessor; +use std::fs::File; +use std::path::Path; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::mpsc::Receiver; +use std::sync::{Arc, mpsc}; +use thiserror::Error; +use tracing::{debug, trace}; + +#[derive(Debug, Error)] +pub enum DataReaderError { + #[error("i/o error: {0}")] + IoError(#[from] std::io::Error), + + #[error("parquet error: {0}")] + ParquetError(#[from] ParquetError), +} + +/// A data reader that uses `n` threads to read +/// rows from a directory of parquet files. +/// +/// All parquet files have exactly one text column. +/// No promises about this struct's behavior if this is not the case. +pub struct DataReader { + rx: Receiver>, + total_rows: usize, + consumed_rows: AtomicUsize, +} + +impl DataReader { + /// Create a new [DataReader] that reads all `*.parquet` files in the given directory. + /// All other files are ignored. + pub fn new(threads: usize, data_dir: impl AsRef) -> Result { + let (files, total_rows) = { + let mut files = Vec::new(); + let mut total_rows = 0usize; + let entries = std::fs::read_dir(data_dir)?; + for entry in entries { + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("parquet") { + // Count rows in this file + let file = File::open(&path)?; + let reader = SerializedFileReader::new(file) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + let metadata = reader.metadata(); + + for row_group_index in 0..metadata.num_row_groups() { + let row_group_metadata = metadata.row_group(row_group_index); + total_rows += row_group_metadata.num_rows() as usize; + } + + files.push(path); + } + } + (Arc::new(Mutex::new(files)), total_rows) + }; + + #[expect(clippy::unwrap_used)] + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .thread_name(move |i| format!("DataReader-{i}-of-{threads}")) + .build() + .unwrap(); + + debug!( + message = "Starting DataReader", + threads, + n_files = files.lock().len() + ); + + let buffer = threads.min(10) * 200; + let (tx, rx) = mpsc::sync_channel::>(buffer); + pool.spawn_broadcast(move |_ctx| { + let tx = tx.clone(); + let files = files.clone(); + + 'outer: loop { + let file = match files.lock().pop() { + None => break 'outer, + Some(x) => x, + }; + + trace!("Reading rows from {}", file.display()); + let file = match File::open(&file) { + Ok(x) => x, + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + let reader = match SerializedFileReader::new(file) { + Ok(x) => x, + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + let metadata = reader.metadata(); + + for row_group_index in 0..metadata.num_row_groups() { + let row_group_reader = match reader.get_row_group(row_group_index) { + Ok(x) => x, + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + let row_iter = match row_group_reader.get_row_iter(None) { + Ok(x) => x, + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + for row_result in row_iter { + let row = match row_result { + Ok(x) => x, + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + let text = match row.get_string(0) { + Ok(x) => x.clone(), + Err(err) => { + let _ = tx.send(Err(err.into())); + break 'outer; + } + }; + + if tx.send(Ok(text)).is_err() { + return; + } + } + } + } + }); + + Ok(Self { + rx, + total_rows, + consumed_rows: AtomicUsize::new(0), + }) + } + + /// Get the next available row. + /// Order is arbitrary. + /// Returns `None` when all rows have been read. + pub fn recv(&self) -> Option> { + self.rx.recv().ok() + } + + //pub fn try_recv(&self) -> Result, TryRecvError> { + // self.rx.try_recv() + //} +} + +impl Iterator for DataReader { + type Item = Result; + fn next(&mut self) -> Option { + match self.recv() { + Some(item) => { + self.consumed_rows.fetch_add(1, Ordering::Relaxed); + Some(item) + } + None => None, + } + } + + fn size_hint(&self) -> (usize, Option) { + let consumed = self.consumed_rows.load(Ordering::Relaxed); + let len = self.total_rows.saturating_sub(consumed); + return (len, Some(len)); + } +} + +impl ExactSizeIterator for DataReader {} diff --git a/src/logging.rs b/src/logging.rs new file mode 100644 index 0000000..705ebdb --- /dev/null +++ b/src/logging.rs @@ -0,0 +1,349 @@ +use anyhow::Result; +use clap::{Parser, ValueEnum}; +use indicatif::MultiProgress; +use serde::Deserialize; +use std::{fmt::Display, str::FromStr}; +use tracing_indicatif::IndicatifWriter; +use tracing_subscriber::{ + EnvFilter, Layer, fmt::MakeWriter, layer::SubscriberExt, util::SubscriberInitExt, +}; + +// +// MARK: loglevel +// + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, ValueEnum)] +pub enum LogLevel { + Trace, + Debug, + Info, + Warn, + Error, +} + +impl Default for LogLevel { + fn default() -> Self { + Self::Info + } +} + +impl Display for LogLevel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Trace => write!(f, "trace"), + Self::Debug => write!(f, "debug"), + Self::Info => write!(f, "info"), + Self::Warn => write!(f, "warn"), + Self::Error => write!(f, "error"), + } + } +} + +// +// MARK: logconfig +// + +/// Configures log levels for known sources +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize)] +pub struct LoggingConfig { + pub other: LogLevel, + pub silence: LogLevel, + + // Bins + pub nanochat: LogLevel, +} + +impl From for EnvFilter { + fn from(conf: LoggingConfig) -> Self { + // Should never fail + #[expect(clippy::unwrap_used)] + EnvFilter::from_str( + &[ + // + // Silence + // + format!("hyper_util={}", conf.silence), + format!("h2={}", conf.silence), + format!("rustls={}", conf.silence), + format!("tower={}", conf.silence), + format!("html5ever={}", conf.silence), + format!("selectors={}", conf.silence), + format!("wgpu_core={}", conf.silence), + format!("naga={}", conf.silence), + format!("cubecl={}", conf.silence), + // + // Bins + // + format!("nanochat_rs={}", conf.nanochat), + conf.other.to_string(), + ] + .join(","), + ) + .unwrap() + } +} + +// +// MARK: LogCliVQ +// + +/// Provides global -v and -q cli arguments. +/// Use with `#[arg(flatten)]`. +#[derive(Parser, Debug, Clone)] +pub struct LogCliVQ { + /// Increase verbosity (can be repeated) + #[arg(default_value = "0")] + #[arg(short, action = clap::ArgAction::Count,global = true)] + v: u8, + + /// Decrease verbosity (can be repeated) + #[arg(default_value = "0")] + #[arg(short, action = clap::ArgAction::Count, global = true)] + q: u8, +} + +impl LogCliVQ { + pub fn into_preset(self) -> LogFilterPreset { + let level_i: i16 = self.v as i16 - self.q as i16; + + let preset; + if level_i <= -2 { + preset = LogFilterPreset::Error + } else if level_i == -1 { + preset = LogFilterPreset::Warn + } else if level_i == 0 { + preset = LogFilterPreset::Info + } else if level_i == 1 { + preset = LogFilterPreset::Debug + } else if level_i >= 2 { + preset = LogFilterPreset::Trace + } else { + unreachable!() + } + + return preset; + } +} + +// +// MARK: logpreset +// + +/// Provides preset configurations of [LoggingConfig] +#[derive(Debug, Deserialize, Clone, Copy)] +pub enum LogFilterPreset { + /// Standard "error" log level + Error, + + /// Standard "warn" log level + Warn, + + /// Standard "info" log level. + /// This is the default. + Info, + + /// Standard "debug" log level + Debug, + + /// Standard "trace" log level + Trace, +} + +impl Default for LogFilterPreset { + fn default() -> Self { + return Self::Info; + } +} + +impl From for LoggingConfig { + fn from(val: LogFilterPreset) -> Self { + val.get_config() + } +} + +impl LogFilterPreset { + pub fn get_config(&self) -> LoggingConfig { + match self { + Self::Error => LoggingConfig { + other: LogLevel::Error, + silence: LogLevel::Error, + nanochat: LogLevel::Error, + }, + + Self::Warn => LoggingConfig { + other: LogLevel::Warn, + silence: LogLevel::Warn, + nanochat: LogLevel::Warn, + }, + + Self::Info => LoggingConfig { + other: LogLevel::Warn, + silence: LogLevel::Warn, + nanochat: LogLevel::Info, + }, + + Self::Debug => LoggingConfig { + other: LogLevel::Warn, + silence: LogLevel::Warn, + nanochat: LogLevel::Debug, + }, + + Self::Trace => LoggingConfig { + other: LogLevel::Trace, + silence: LogLevel::Warn, + nanochat: LogLevel::Trace, + }, + } + } +} + +// +// MARK: initializer +// + +/// Where to print logs +#[expect(clippy::allow_attributes)] +#[allow(dead_code)] +pub enum LoggingTarget { + /// Send logs to stdout + Stdout { format: LoggingFormat }, + + /// Send logs to stderr + Stderr { format: LoggingFormat }, + + /// Send logs to an IndicatifWriter. + /// + /// This is the same as Stderr { format: Ansi {color:true} }, + /// but uses an indicatifwriter with the given multiprogress. + Indicatif(MultiProgress), +} + +/// How to print logs +#[derive(Debug, Clone, Copy, Deserialize)] +pub enum LoggingFormat { + Ansi, + AnsiNoColor, + Json, +} + +impl Default for LoggingFormat { + fn default() -> Self { + Self::Ansi + } +} + +pub struct LoggingInitializer { + /// Log filter for printed logs + pub preset: LogFilterPreset, + + /// Where to print logs + pub target: LoggingTarget, +} + +impl LoggingInitializer { + pub fn initialize(self) -> Result<()> { + let mut stderr_ansi_layer = None; + let mut stderr_json_layer = None; + let mut stdout_ansi_layer = None; + let mut stdout_json_layer = None; + let mut indicatif_layer = None; + match self.target { + LoggingTarget::Stderr { + format: LoggingFormat::Ansi, + } => { + stderr_ansi_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(true) + .with_writer(std::io::stderr) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Stderr { + format: LoggingFormat::AnsiNoColor, + } => { + stderr_ansi_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Stderr { + format: LoggingFormat::Json, + } => { + stderr_json_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(false) + .json() + .with_writer(std::io::stderr) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Stdout { + format: LoggingFormat::Ansi, + } => { + stdout_ansi_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(true) + .with_writer(std::io::stdout) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Stdout { + format: LoggingFormat::AnsiNoColor, + } => { + stdout_ansi_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(false) + .with_writer(std::io::stdout) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Stdout { + format: LoggingFormat::Json, + } => { + stdout_json_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(false) + .json() + .with_writer(std::io::stdout) + .with_filter::(self.preset.get_config().into()), + ) + } + + LoggingTarget::Indicatif(mp) => { + let writer: IndicatifWriter = + IndicatifWriter::new(mp); + + indicatif_layer = Some( + tracing_subscriber::fmt::Layer::default() + .without_time() + .with_ansi(true) + .with_writer(writer.make_writer()) + .with_filter::(self.preset.get_config().into()), + ) + } + } + + tracing_subscriber::registry() + .with(stdout_ansi_layer) + .with(stdout_json_layer) + .with(stderr_ansi_layer) + .with(stderr_json_layer) + .with(indicatif_layer) + .init(); + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..45922f0 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,63 @@ +use clap::Parser; +use indicatif::MultiProgress; +use tracing::error; + +use crate::{ + command::SubCommand, + logging::{LogCliVQ, LoggingFormat, LoggingInitializer, LoggingTarget}, +}; + +mod cli; +mod command; +mod data_reader; +mod logging; +mod tokenizer; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None, styles=crate::cli::clap_styles())] +struct Cli { + #[clap(flatten)] + vq: LogCliVQ, + + /// If true, never show progress bars. + #[arg(long)] + noprogress: bool, + + #[command(subcommand)] + command: SubCommand, +} + +fn main() { + let cli = Cli::parse(); + let mp = (!cli.noprogress).then(MultiProgress::new); + + { + let res = LoggingInitializer { + preset: cli.vq.into_preset(), + target: match mp.clone() { + None => LoggingTarget::Stderr { + format: LoggingFormat::Ansi, + }, + Some(mp) => LoggingTarget::Indicatif(mp.clone()), + }, + } + .initialize(); + + if let Err(e) = res { + #[expect(clippy::print_stderr)] + for e in e.chain() { + eprintln!("{e}"); + } + + std::process::exit(1); + } + } + + if let Err(e) = cli.command.run(mp) { + error!("Error: {e:?}"); + for e in e.chain() { + error!(e); + } + std::process::exit(1); + } +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..67077e7 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,538 @@ +use ahash::{AHashMap, AHashSet}; +use anyhow::{Result, bail}; +use compact_str::CompactString; +use dary_heap::DaryHeap; +use fancy_regex::Regex; +use indicatif::{MultiProgress, ProgressBar}; +use rayon::iter::{ + IndexedParallelIterator, IntoParallelRefIterator, ParallelBridge, ParallelIterator, +}; +use serde::{Deserialize, Serialize}; +use std::{cmp::Ordering, collections::HashMap, sync::atomic::AtomicUsize, time::Instant}; +use tracing::{debug, info}; + +use crate::cli::progress_big; + +// Notes: +// +// ## Why ahash? +// See docs. +// Ahash provides a uses a higher performance hash fn, +// but is not resistant to DOS and thus cannot be used +// with untrusted input. +// +// ## Changes from original impl +// - idiomatic Rust +// - significantly less memory usage + +/// A pair of adjacent tokens +type Pair = (u32, u32); + +// +// MARK: word +// + +/// A segment of text that is undergoing tokenization. +/// +/// `ids` contains the ids of the tokens this word consists of. +/// Initially, these are ascii u8s. More are added as we merge pairs. +/// +/// This implementation of BPE does not cross word boundaries. +/// - one token cannot represent more than one word, which may be a feature. +/// - this keeps multi-byte unicode together, since the split regex handles it correctly. +#[derive(Clone, Debug)] +struct Word { + ids: Vec, +} + +impl Word { + #[inline] + fn new(ids: Vec) -> Self { + Self { ids } + } + + /// Return an iterator over all adjacent pairs + /// of tokens in this word + #[inline] + fn pairs<'a>(&'a self) -> impl Iterator + 'a { + self.ids.windows(2).map(|w| (w[0], w[1])) + } + + /// Merge all non-overlapping occurrences of pair -> new_id. + /// Returns a small Vec of local pair-count deltas for THIS word only: + /// -1 for removed pairs, +1 for newly created pairs. + /// + /// NOTE: this version deliberately avoids a HashMap in the hot loop. + fn merge_pair(&mut self, pair: Pair, new_id: u32) -> Vec<(Pair, i32)> { + let (a, b) = pair; + let n = self.ids.len(); + if n < 2 { + return Vec::new(); + } + + let mut out: Vec = Vec::with_capacity(n); + let mut deltas: Vec<(Pair, i32)> = Vec::with_capacity(6); + + let mut i = 0; + while i < n { + if i + 1 < n && self.ids[i] == a && self.ids[i + 1] == b { + let left = out.last().copied(); + let right = if i + 2 < n { + Some(self.ids[i + 2]) + } else { + None + }; + + // remove old pairs + if let Some(x) = left { + deltas.push(((x, a), -1)); + deltas.push(((x, new_id), 1)); + } + deltas.push(((a, b), -1)); + if let Some(y) = right { + deltas.push(((b, y), -1)); + deltas.push(((new_id, y), 1)); + } + + // write merged token + out.push(new_id); + i += 2; // skip 'a' and 'b' + } else { + out.push(self.ids[i]); + i += 1; + } + } + + self.ids = out; + deltas + } +} + +// +// MARK: mergejob +// + +#[derive(Debug, Eq)] +struct MergeJob { + pair: Pair, + count: u64, + + /// set of word indices where this pair may occur and needs processing + pos: AHashSet, +} + +impl PartialEq for MergeJob { + fn eq(&self, other: &Self) -> bool { + self.count == other.count && self.pair == other.pair + } +} + +impl PartialOrd for MergeJob { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for MergeJob { + fn cmp(&self, other: &Self) -> Ordering { + self.count + .cmp(&other.count) + .then_with(|| other.pair.cmp(&self.pair)) + } +} + +// +// MARK: tokenizer +// + +/// A Byte Pair Encoding tokenizer that matches the GPT-4 style implementation +/// +/// BPE's goal is to build compound tokens for a text corpus. +/// It starts with a simple initial state---in this case, each ascii u8 +/// is a token. We then examine all adjacent pairs of tokens, replacing +/// common pairs with a new token id. This is repeated until our vocabulary +/// is as large as it needs to be. +pub struct Tokenizer { + /// Maps pairs of token IDs to their merged token ID + merges: HashMap, + + /// The regex pattern used for text splitting + #[expect(dead_code)] + split_regex: Regex, + + /// Source of split_regex + /// (debug info) + split_regex_string: String, + + /// The number of texts this tokenizer was trained on + /// (debug info) + n_train_texts: u64, + + /// The number of words this tokenizer was trained on + /// (debug info) + n_train_words: u64, +} + +impl Serialize for Tokenizer { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + + let mut state = serializer.serialize_struct("Tokenizer", 4)?; + + // Convert HashMap to Vec for serialization + // (only string keys are valid in json) + let merges_vec: Vec<(Pair, u32)> = self.merges.iter().map(|(&k, &v)| (k, v)).collect(); + state.serialize_field("merges", &merges_vec)?; + + state.serialize_field("split_regex_string", &self.split_regex_string)?; + state.serialize_field("n_train_texts", &self.n_train_texts)?; + state.serialize_field("n_train_words", &self.n_train_words)?; + state.end() + } +} + +// Custom deserializer that automatically compiles split_regex +impl<'de> Deserialize<'de> for Tokenizer { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + struct TokenizerData { + merges: Vec<(Pair, u32)>, + split_regex_string: String, + n_train_texts: u64, + n_train_words: u64, + } + + let data = TokenizerData::deserialize(deserializer)?; + let split_regex = Regex::new(&data.split_regex_string).map_err(serde::de::Error::custom)?; + let merges: HashMap = data.merges.into_iter().collect(); + + Ok(Tokenizer { + merges, + split_regex, + split_regex_string: data.split_regex_string, + n_train_texts: data.n_train_texts, + n_train_words: data.n_train_words, + }) + } +} + +impl Tokenizer { + /// Default regex pattern for splitting text + const DEFAULT_REGEX: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"; + + /// Return the regex pattern used to split words + #[inline] + pub fn get_regex(&self) -> &str { + &self.split_regex_string + } + + /// Tokenize a string + pub fn tokenize(&self, text: &str) -> Vec { + let mut all_ids = Vec::new(); + + for m in self.split_regex.find_iter(text) { + #[expect(clippy::unwrap_used)] // Shouldn't ever fail + let word = m.unwrap().as_str(); + let mut word_tokens = word.bytes().map(|b| b as u32).collect::>(); + + // Apply merges + while word_tokens.len() >= 2 { + // Merge the pair with the largest token idx + // (pair_start_idx, replace_with) + let mut best_pair: Option<(usize, u32)> = None; + + for (i, pair) in word_tokens.windows(2).map(|x| (x[0], x[1])).enumerate() { + let new_id = match self.merges.get(&pair) { + None => continue, + Some(x) => *x, + }; + + #[expect(clippy::unwrap_used)] + if best_pair.is_none() || new_id < best_pair.unwrap().1 { + best_pair = Some((i, new_id)); + } + } + + match best_pair { + Some((idx, new_id)) => { + word_tokens[idx] = new_id; + word_tokens.remove(idx + 1); + } + + None => { + // No merges possible + break; + } + } + } + + all_ids.extend(word_tokens); + } + + all_ids + } + + // + // MARK: main training code + // + + // TODO: pool config + + /// Given an array of words an an array of counts, + /// - count the number of occurrences of each token pair + /// - map each pair to the indices of the words that contain it + /// + /// ## Notes: + /// - will panic if `words.len() != counts.len()` + /// - will not behave correctly if words are repeated + #[inline] + fn count_pairs( + words: &[Word], + counts: &[i32], + ) -> (AHashMap, AHashMap>) { + words + .par_iter() + .enumerate() + .map(|(i, w)| { + let mut pair_counts: AHashMap = AHashMap::new(); + let mut word_map: AHashMap> = AHashMap::new(); + + if w.ids.len() >= 2 && counts[i] != 0 { + for (a, b) in w.pairs() { + *pair_counts.entry((a, b)).or_default() += counts[i]; + word_map.entry((a, b)).or_default().insert(i); + } + } + + (pair_counts, word_map) + }) + .reduce( + || (AHashMap::new(), AHashMap::new()), + |(mut acc_pc, mut acc_wtu), (pc, wtu)| { + for (k, v) in pc { + *acc_pc.entry(k).or_default() += v; + } + for (k, s) in wtu { + acc_wtu.entry(k).or_default().extend(s); + } + (acc_pc, acc_wtu) + }, + ) + } + + /// Core incremental BPE training given unique words and their counts. + /// - `words`: one entry per unique chunk (Vec of token-ids/bytes). + /// - `counts`: same length as `words`, count per chunk. + /// + /// ## Notes: + /// - vocab size must be >= 256 + /// - will panic if `words.len() != counts.len()` + /// - will not behave correctly if words are repeated + fn train_core( + mp: Option, + mut words: Vec, + counts: Vec, + vocab_size: u32, + ) -> HashMap { + assert!(vocab_size >= 256, "vocab_size must be at least 256"); + + let num_merges = vocab_size - 256; + let mut merges = HashMap::with_capacity(num_merges as usize); + info!(message = "Training tokenizer", num_merges); + let now = Instant::now(); + + info!( + message = "Computing initial pair counts", + n_words = words.len() + ); + let (mut pair_counts, mut where_to_update) = Self::count_pairs(&words, &counts); + + let mut heap = { + debug!("Building heap with {} unique pairs", pair_counts.len()); + + let mut heap = DaryHeap::<_, 8>::with_capacity(pair_counts.len()); + for (pair, pos) in where_to_update.drain() { + let c = *pair_counts.get(&pair).unwrap_or(&0); + if c > 0 { + heap.push(MergeJob { + pair, + count: c as u64, + pos, + }); + } + } + + heap + }; + + let pb = mp.as_ref().map(|mp| { + let pb = mp.add(ProgressBar::new(num_merges as u64)); + pb.set_style(progress_big()); + pb.set_message("Computing merges"); + pb + }); + + let mut merges_done = 0u32; + + while merges_done < num_merges { + let Some(mut top) = heap.pop() else { + break; + }; + + // Lazy refresh + let current = *pair_counts.get(&top.pair).unwrap_or(&0); + if top.count != current as u64 { + top.count = current as u64; + if top.count > 0 { + heap.push(top); + } + continue; + } + if top.count == 0 { + break; + } + + // Record merge + let new_id = 256 + merges_done; + merges.insert(top.pair, new_id); + + // Merge this pair in all words where it occurs + let mut local_pos_updates: AHashMap> = AHashMap::new(); + for &word_idx in &top.pos { + // Apply merge to this word and collect pair-count deltas + let changes = words[word_idx].merge_pair(top.pair, new_id); + // Update global pair counts based on this word's count + for (pair, delta) in changes { + let delta_total = delta * counts[word_idx]; + if delta_total != 0 { + *pair_counts.entry(pair).or_default() += delta_total; + if delta > 0 { + local_pos_updates.entry(pair).or_default().insert(word_idx); + } + } + } + } + + // Add the updated pair counts back to the heap + for (pair, pos) in local_pos_updates { + let cnt = *pair_counts.get(&pair).unwrap_or(&0); + if cnt > 0 { + heap.push(MergeJob { + pair, + count: cnt as u64, + pos, + }); + } + } + + merges_done += 1; + + if let Some(pb) = pb.as_ref() { + pb.set_position(merges_done as u64); + } + } + + if let Some(pb) = pb { + pb.finish_and_clear(); + } + info!("Computed merges in {:.03?}", now.elapsed()); + + return merges; + } + + // + // MARK: train + // + + /// Train a new tokenizer from an iterator of texts. + pub fn train(mp: Option, iterator: I, vocab_size: u32) -> Result + where + I: Iterator + ExactSizeIterator, + I: ParallelBridge + Send, + { + if vocab_size < 256 { + bail!("vocab_size must be at least 256, but it is {vocab_size}"); + } + + let split_regex_string = Self::DEFAULT_REGEX.to_owned(); + #[expect(clippy::unwrap_used)] // Default regex must be valid + let split_regex = Regex::new(&split_regex_string).unwrap(); + + let now = Instant::now(); + let n_train_texts = iterator.len() as u64; + debug!("Counting words in {} texts", n_train_texts); + let pb = mp.as_ref().map(|mp| { + let pb = mp.add(ProgressBar::new(iterator.len() as u64)); + pb.set_style(progress_big()); + pb.set_message("Counting words"); + pb + }); + + // Process texts in parallel and collect chunk counts + let counter = AtomicUsize::new(0); + let counts: AHashMap = iterator + .par_bridge() + .map(|text| { + let mut local_counts: AHashMap = AHashMap::new(); + for mat in split_regex.find_iter(&text) { + #[expect(clippy::unwrap_used)] + let piece = mat.unwrap().as_str(); + + *local_counts.entry(CompactString::from(piece)).or_default() += 1; + } + + let count = counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if let Some(ref pb) = pb { + if count % 1000 == 0 { + pb.inc(1000); + } + } + + local_counts + }) + .reduce(AHashMap::new, |mut acc, local_counts| { + for (k, v) in local_counts { + *acc.entry(k).or_default() += v; + } + + acc + }); + + if let Some(pb) = pb { + pb.finish_and_clear(); + } + info!( + "Counted {} unique words in {:.03?}", + counts.len(), + now.elapsed() + ); + + // Materialize words & counts + let mut words = Vec::with_capacity(counts.len()); + let mut cvec = Vec::with_capacity(counts.len()); + let mut n_train_words = 0u64; + for (chunk, c) in counts.into_iter() { + words.push(Word::new( + chunk.as_bytes().iter().map(|&b| b as u32).collect(), + )); + cvec.push(c); + n_train_words += c as u64; + } + + let merges = Self::train_core(mp, words, cvec, vocab_size); + + Ok(Self { + merges, + split_regex, + split_regex_string, + n_train_texts, + n_train_words, + }) + } +}