From 8ab4ea53ece63ad322db28f9f4dba2b70d7dd875 Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Sat, 21 Feb 2026 13:53:30 -0800 Subject: [PATCH] Validate label names --- Cargo.lock | 107 ++++++++++++++++++ Cargo.toml | 21 +--- crates/pile-config/Cargo.toml | 1 + crates/pile-config/src/lib.rs | 40 ++----- crates/pile-config/src/misc.rs | 122 +++++++++++++++++++++ crates/pile-dataset/Cargo.toml | 2 +- crates/pile-dataset/src/index/index_fts.rs | 18 +-- crates/pile-dataset/src/lib.rs | 3 + crates/pile-dataset/src/misc.rs | 61 +++++++++++ crates/pile/src/command/index.rs | 4 +- crates/pile/src/command/lookup.rs | 2 +- 11 files changed, 318 insertions(+), 63 deletions(-) create mode 100644 crates/pile-config/src/misc.rs create mode 100644 crates/pile-dataset/src/misc.rs diff --git a/Cargo.lock b/Cargo.lock index cf9274b..f2d957b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,15 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.21" @@ -211,6 +220,19 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "clap" version = "4.5.53" @@ -276,6 +298,12 @@ version = "0.10.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0dabb6555f92fb9ee4140454eb5dcd14c7960e1225c6d1a6cc361f032947713e" +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -621,6 +649,30 @@ dependencies = [ "serde", ] +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "id-arena" version = "2.3.0" @@ -1019,6 +1071,7 @@ dependencies = [ "itertools", "serde", "serde_json", + "smartstring", "toml", ] @@ -1026,6 +1079,7 @@ dependencies = [ name = "pile-dataset" version = "0.0.1" dependencies = [ + "chrono", "itertools", "jsonpath-rust", "pile-audio", @@ -2120,12 +2174,65 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.59.0" diff --git a/Cargo.toml b/Cargo.toml index 3508d0f..fa4f37f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,18 +70,10 @@ pile-audio = { path = "crates/pile-audio" } pile-dataset = { path = "crates/pile-dataset" } # Clients -reqwest = { version = "0.12.15", features = [ - "multipart", - "json", - "rustls-tls", -] } -librqbit = "8.1.1" -librqbit-core = "5.0.0" tantivy = "0.25.0" # Async & Parallelism tokio = { version = "1.44.1", features = ["full"] } -tokio-stream = { version = "0.1.17" } # CLI & logging tracing = "0.1.41" @@ -91,17 +83,10 @@ tracing-indicatif = "0.3.13" anstyle = "1.0.10" clap = { version = "4.5.37", features = ["derive"] } -# Extra types -url = { version = "2.5.4", features = ["serde"] } - # Serialization & formats serde = { version = "1.0.219", features = ["derive"] } serde_json = "1.0.140" -blake3 = "1.8.2" -flate2 = "1.1.2" 
base64 = "0.22.1" -binrw = "0.15.0" -brotli = "8.0.2" toml = "0.9.8" jsonpath-rust = "1.0.4" sha2 = "0.11.0-rc.3" @@ -110,15 +95,11 @@ sha2 = "0.11.0-rc.3" thiserror = "2.0.12" anyhow = "1.0.97" itertools = "0.14.0" -tempfile = "3.21.0" signal-hook = "0.3.18" -parking_lot = "0.12.5" -lru = "0.16.1" -rayon = "1.11.0" rand = "0.9.2" -regex = "1.12.2" strum = { version = "0.27.2", features = ["derive"] } walkdir = "2.5.0" mime = "0.3.17" paste = "1.0.15" smartstring = "1.0.1" +chrono = "0.4.43" diff --git a/crates/pile-config/Cargo.toml b/crates/pile-config/Cargo.toml index 9b637d1..67b39ac 100644 --- a/crates/pile-config/Cargo.toml +++ b/crates/pile-config/Cargo.toml @@ -11,6 +11,7 @@ workspace = true serde = { workspace = true } itertools = { workspace = true } serde_json = { workspace = true } +smartstring = { workspace = true } [dev-dependencies] toml = { workspace = true } diff --git a/crates/pile-config/src/lib.rs b/crates/pile-config/src/lib.rs index cd671a5..67525c6 100644 --- a/crates/pile-config/src/lib.rs +++ b/crates/pile-config/src/lib.rs @@ -1,56 +1,36 @@ use serde::Deserialize; -use std::{collections::HashMap, fmt::Debug, path::PathBuf, slice}; - -pub static INIT_DB_TOML: &str = include_str!("./config.toml"); +use std::{collections::HashMap, fmt::Debug, path::PathBuf}; mod post; pub use post::*; +mod misc; +pub use misc::*; + +pub static INIT_DB_TOML: &str = include_str!("./config.toml"); + #[test] fn init_db_toml_valid() { toml::from_str::(INIT_DB_TOML).unwrap(); } -#[derive(Debug, Clone, Deserialize)] -#[serde(untagged)] -pub enum OneOrMany { - One(T), - Many(Vec), -} - -impl OneOrMany { - pub fn to_vec(self) -> Vec { - match self { - Self::One(x) => vec![x], - Self::Many(x) => x, - } - } - - pub fn as_slice(&self) -> &[T] { - match self { - Self::One(x) => slice::from_ref(&x), - Self::Many(x) => &x[..], - } - } -} - #[derive(Debug, Clone, Deserialize)] pub struct ConfigToml { pub dataset: DatasetConfig, - pub schema: HashMap, + pub schema: HashMap, 
pub fts: Option, } #[derive(Debug, Clone, Deserialize)] pub struct DatasetConfig { /// Must be unique - pub name: String, + pub name: Label, /// Root dir for indices pub working_dir: Option, /// Where to find this field - pub source: HashMap, + pub source: HashMap, /// How to post-process this field #[serde(default)] @@ -95,7 +75,7 @@ pub enum FieldType { #[derive(Debug, Clone, Deserialize, Default)] pub struct DatasetFts { #[serde(alias = "field")] - pub fields: HashMap, + pub fields: HashMap, } #[derive(Debug, Clone, Deserialize)] diff --git a/crates/pile-config/src/misc.rs b/crates/pile-config/src/misc.rs new file mode 100644 index 0000000..831c2f9 --- /dev/null +++ b/crates/pile-config/src/misc.rs @@ -0,0 +1,122 @@ +use core::slice; +use std::fmt::{Debug, Display}; +use std::ops::Deref; + +use serde::{Deserialize, Serialize}; +use smartstring::{LazyCompact, SmartString}; + +#[derive(Debug, Clone, Deserialize)] +#[serde(untagged)] +pub enum OneOrMany { + One(T), + Many(Vec), +} + +impl OneOrMany { + pub fn to_vec(self) -> Vec { + match self { + Self::One(x) => vec![x], + Self::Many(x) => x, + } + } + + pub fn as_slice(&self) -> &[T] { + match self { + Self::One(x) => slice::from_ref(&x), + Self::Many(x) => &x[..], + } + } +} + +// +// MARK: Label +// + +/// A sanitized [String], guaranteed to only contain +/// chars in `a-z`, `A-Z`, `0-9`, and `-_`. +/// +/// Used for names of datasets, fields, etc. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(try_from = "String", into = "String")] +pub struct Label(SmartString); + +impl Label { + pub const VALID_CHARS: &str = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_"; + + pub fn new(str: impl Into) -> Option { + let str: String = str.into(); + for c in str.chars() { + if !Self::VALID_CHARS.contains(c) { + return None; + } + } + + return Some(Self(str.into())); + } + + pub fn as_str(&self) -> &str { + &self.0 + } + + pub fn into_string(self) -> String { + self.0.into() + } +} + +impl Display for Label { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From