From 08003a3fbe02cd64e2c5fa96ef49458ff40437d1 Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Wed, 3 Dec 2025 12:57:10 -0800 Subject: [PATCH] v0.0.3 --- Cargo.lock | 186 +++++++++++++- Cargo.toml | 10 +- crates/datapath-macro/src/lib.rs | 9 +- crates/datapath/Cargo.toml | 11 + crates/datapath/src/datapath.rs | 4 + crates/datapath/src/index/mod.rs | 395 ++++++++++++++++++++++++++++++ crates/datapath/src/index/rule.rs | 381 ++++++++++++++++++++++++++++ crates/datapath/src/lib.rs | 10 + 8 files changed, 1000 insertions(+), 6 deletions(-) create mode 100644 crates/datapath/src/index/mod.rs create mode 100644 crates/datapath/src/index/rule.rs diff --git a/Cargo.lock b/Cargo.lock index 1cdb44d..1b8a2c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -14,23 +23,68 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "datapath" -version = "0.0.2" +version = "0.0.3" dependencies = [ "datapath-macro", + "itertools", + "regex", + "tokio", + "tracing", + "trie-rs", "uuid", ] [[package]] name = "datapath-macro" -version = "0.0.2" +version = "0.0.3" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "fid-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6956a1e60e2d1412b44b4169d44a03dae518f8583d3e10090c912c105e48447" +dependencies = [ + "rayon", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -43,6 +97,15 @@ dependencies = [ "wasip2", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "js-sys" version = "0.3.83" @@ -59,12 +122,33 @@ version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +[[package]] +name = "louds-rs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936de6c22f08e7135a921f8ada907acd0d88880c4f42b5591f634b9f1dd8e07f" +dependencies = [ + "fid-rs", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = 
"pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + [[package]] name = "proc-macro2" version = "1.0.103" @@ -89,6 +173,55 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + [[package]] name = "rustversion" version = "1.0.22" @@ -106,6 +239,55 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tokio" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" 
+dependencies = [ + "pin-project-lite", +] + +[[package]] +name = "tracing" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "trie-rs" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f88f4b0a1ebd6c3d16be3e45eb0e8089372ccadd88849b7ca162ba64b5e6f6" +dependencies = [ + "louds-rs", +] + [[package]] name = "unicode-ident" version = "1.0.22" diff --git a/Cargo.toml b/Cargo.toml index a054c4e..122c791 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ readme = "README.md" authors = ["rm-dr"] # Don't forget to bump datapath-macro below! 
-version = "0.0.2" +version = "0.0.3" [workspace.lints.rust] unused_import_braces = "deny" @@ -70,12 +70,16 @@ cargo_common_metadata = "deny" # [workspace.dependencies] -datapath-macro = { path = "crates/datapath-macro", version = "0.0.2" } +datapath-macro = { path = "crates/datapath-macro", version = "0.0.3" } datapath = { path = "crates/datapath" } chrono = "0.4.42" +itertools = "0.14.0" proc-macro2 = "1.0.103" quote = "1.0.42" +regex = "1.12.2" syn = "2.0.111" - +tracing = "0.1" +trie-rs = "0.4.2" uuid = "1.19.0" +tokio = { version = "1.48.0", features = ["sync"] } diff --git a/crates/datapath-macro/src/lib.rs b/crates/datapath-macro/src/lib.rs index 76c28f7..8bdad65 100644 --- a/crates/datapath-macro/src/lib.rs +++ b/crates/datapath-macro/src/lib.rs @@ -549,7 +549,7 @@ fn generate_common_impls( } // Extract just the field names for struct construction - let field_names = typed_fields.iter().map(|(name, _)| name); + let field_names: Vec<_> = typed_fields.iter().map(|(name, _)| name).collect(); let datapath_impl = quote! 
{ impl ::datapath::Datapath for #struct_name { @@ -600,6 +600,13 @@ fn generate_common_impls( file, }) } + + fn field(&self, name: &str) -> Option<::std::string::String> { + match name { + #(stringify!(#field_names) => Some(self.#field_names.to_string()),)* + _ => None, + } + } } }; diff --git a/crates/datapath/Cargo.toml b/crates/datapath/Cargo.toml index 1e67666..4e626a0 100644 --- a/crates/datapath/Cargo.toml +++ b/crates/datapath/Cargo.toml @@ -17,5 +17,16 @@ workspace = true [dependencies] datapath-macro = { workspace = true } +regex = { workspace = true, optional = true } +tracing = { workspace = true, optional = true } +trie-rs = { workspace = true, optional = true } +itertools = { workspace = true, optional = true } +tokio = { workspace = true, optional = true } + [dev-dependencies] uuid = { version = "1", features = ["v4"] } + +[features] +default = [] +index = ["dep:regex", "dep:trie-rs", "dep:tracing", "dep:itertools"] +tokio = ["dep:tokio"] diff --git a/crates/datapath/src/datapath.rs b/crates/datapath/src/datapath.rs index ff42aff..7bd9132 100644 --- a/crates/datapath/src/datapath.rs +++ b/crates/datapath/src/datapath.rs @@ -33,4 +33,8 @@ where /// Parse a string as this datapath with a (possibly empty-string) /// file, returning `None` if this string is invalid. fn parse(path: &str) -> Option>; + + /// Get the string value of the field with the given name, + /// if it exists. 
/// A single `/`-delimited segment of a datapath string.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
enum PathSegment {
    /// A constant value, like `web`
    Constant(String),

    /// A key=value partition, like `domain=gouletpens.com`
    Value { key: String, value: String },
}

impl Display for PathSegment {
    /// Writes the segment back in its textual form:
    /// the bare constant, or `key=value`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            PathSegment::Constant(x) => write!(f, "{x}"),
            PathSegment::Value { key, value } => write!(f, "{key}={value}"),
        }
    }
}

impl FromStr for PathSegment {
    type Err = ();

    /// Parse one path segment.
    ///
    /// A segment containing `=` becomes a [`PathSegment::Value`], split at
    /// the FIRST `=` (so `k=a=b` has key `k` and value `a=b`). Anything else
    /// becomes a [`PathSegment::Constant`].
    ///
    /// # Errors
    /// Returns `Err(())` if the segment is empty or contains a newline.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.is_empty() || s.contains('\n') {
            return Err(());
        }

        // `split_once` splits at the first `=` only, which is exactly what
        // "take the first piece as the key, re-join the rest" produces,
        // without the intermediate iterator and join.
        Ok(match s.split_once('=') {
            Some((key, value)) => Self::Value {
                key: key.to_owned(),
                value: value.to_owned(),
            },
            None => Self::Constant(s.to_owned()),
        })
    }
}
+ fn query_to_key(query: &str) -> String { + let trimmed = query.trim().trim_end_matches("**").trim_matches('/'); + let mut segments = Vec::new(); + for seg in trimmed.split('/') { + let segment = match PathSegment::from_str(&seg) { + Ok(x) => x, + Err(_) => continue, + }; + + // Stop at wildcard constants - can't use for trie prefix search + if matches!(segment, PathSegment::Constant(ref s) if s == "*") { + break; + } + + segments.push(segment); + } + + segments.iter_mut().for_each(|x| match x { + PathSegment::Constant(_) => {} + PathSegment::Value { value, .. } => *value = "*".into(), + }); + + segments.iter().join("/") + } + + pub fn new_empty() -> Self { + Self { + patterns: TrieBuilder::new().build(), + len: 0, + } + } + + pub fn new, I: Iterator>(paths: I) -> Self { + let mut len = 0; + let mut patterns = HashMap::new(); + + for s in paths { + let s: String = s.into(); + let mut segments = Vec::new(); + for seg in s.split('/') { + segments.push(match PathSegment::from_str(&seg) { + Ok(x) => x, + Err(_) => continue, + }); + } + + segments.iter_mut().for_each(|x| match x { + PathSegment::Constant(_) => {} + PathSegment::Value { value, .. } => *value = "*".into(), + }); + + let pattern = segments.iter().join("/"); + + patterns.entry(pattern).or_insert(Vec::new()).push(s); + len += 1; + } + + let mut builder = TrieBuilder::new(); + for (k, v) in patterns { + builder.push(k, v); + } + + Self { + len, + patterns: builder.build(), + } + } + + #[cfg(feature = "tokio")] + pub async fn async_new>(mut paths: tokio::sync::mpsc::Receiver) -> Self { + let mut len = 0; + let mut patterns = HashMap::new(); + + while let Some(s) = paths.recv().await { + let s: String = s.into(); + let mut segments = Vec::new(); + for seg in s.split('/') { + segments.push(match PathSegment::from_str(&seg) { + Ok(x) => x, + Err(_) => continue, + }); + } + + segments.iter_mut().for_each(|x| match x { + PathSegment::Constant(_) => {} + PathSegment::Value { value, .. 
} => *value = "*".into(), + }); + + let pattern = segments.iter().join("/"); + + patterns.entry(pattern).or_insert(Vec::new()).push(s); + len += 1; + } + + let mut builder = TrieBuilder::new(); + for (k, v) in patterns { + builder.push(k, v); + } + + Self { + len, + patterns: builder.build(), + } + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.len + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Given a datapath (that may contain wildcards) as a query, + /// return all known datapaths that match it. + /// + /// Returns an empty iterator if no paths match. + /// Returns `None` if the query was invalid. + pub fn query(&self, query: impl Into) -> Option + '_> { + let query: String = query.into(); + let regex = rule::Rule::new(query.clone()).regex()?; + let key = Self::query_to_key(&query); + + Some( + self.patterns + .predictive_search::(&key) + .flat_map(|(_, strings)| strings.iter()) + .filter(move |s| regex.is_match(s)) + .cloned(), + ) + } + + pub fn query_match(&self, query: impl Into) -> Option { + let query: String = query.into(); + let regex = rule::Rule::new(query.clone()).regex()?; + let key = Self::query_to_key(&query); + + for (_, strings) in self.patterns.predictive_search::(&key) { + for s in strings { + if regex.is_match(s) { + return Some(true); + } + } + } + + return Some(false); + } +} + +// MARK: index tests + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod index_tests { + use super::*; + + #[test] + fn datapath_index_empty() { + let idx = DatapathIndex::new(std::iter::empty::()); + let query = "web/domain=example.com"; + assert_eq!(idx.query(query).unwrap().count(), 0); + assert!(idx.is_empty()); + assert_eq!(idx.len(), 0); + } + + #[test] + fn insert_and_lookup_exact_match() { + let paths = vec!["web/domain=example.com/ts=1234"]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Exact match + let results: Vec<_> = idx + .query("web/domain=example.com/ts=1234") + .unwrap() + 
.collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], "web/domain=example.com/ts=1234"); + + // No match + let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect(); + assert_eq!(results.len(), 0); + + assert_eq!(idx.len(), 1); + } + + #[test] + fn wildcard_constant_match() { + let paths = vec![ + "web/domain=example.com/ts=1234", + "api/domain=example.com/ts=1234", + ]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Wildcard first segment + let results: Vec<_> = idx.query("*/domain=example.com/ts=1234").unwrap().collect(); + assert_eq!(results.len(), 2); + + assert_eq!(idx.len(), 2); + } + + #[test] + fn wildcard_value_match() { + let paths = vec![ + "web/domain=example.com/ts=1234", + "web/domain=other.com/ts=1234", + ]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Wildcard domain + let results: Vec<_> = idx.query("web/domain=*/ts=1234").unwrap().collect(); + assert_eq!(results.len(), 2); + } + + #[test] + fn multiple_datapaths() { + let paths = vec![ + "web/domain=example.com/ts=1234", + "web/domain=other.com/ts=1234", + "api/domain=example.com/ts=5678", + ]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Specific lookup + let results: Vec<_> = idx + .query("web/domain=example.com/ts=1234") + .unwrap() + .collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], "web/domain=example.com/ts=1234"); + + // Wildcard time lookup + let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], "web/domain=example.com/ts=1234"); + + // Double wildcard lookup + let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect(); + assert_eq!(results.len(), 2); + + assert_eq!(idx.len(), 3); + } + + #[test] + fn nested_wildcards() { + let paths = vec![ + "web/domain=example.com/ts=1234/crawl/2.5", + "web/domain=other.com/ts=5678/crawl/2.5", + "web/domain=example.com/ts=9999/crawl/3.0", + ]; + let idx = 
DatapathIndex::new(paths.into_iter()); + + // Multiple wildcards in path + let results: Vec<_> = idx.query("web/domain=*/ts=*/crawl/*").unwrap().collect(); + assert_eq!(results.len(), 3); + + // Selective wildcards + let results: Vec<_> = idx + .query("web/domain=example.com/ts=*/crawl/*") + .unwrap() + .collect(); + assert_eq!(results.len(), 2); + } + + #[test] + fn partial_path_query() { + let paths = vec!["web/domain=example.com/ts=1234/crawl/2.5"]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Query with fewer segments than the stored path + let results: Vec<_> = idx.query("web/domain=example.com").unwrap().collect(); + assert_eq!(results.len(), 0); + } + + #[test] + fn longer_path_query() { + let paths = vec!["web/domain=example.com"]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Query with more segments than the stored path + let results: Vec<_> = idx + .query("web/domain=example.com/ts=1234/crawl/2.5") + .unwrap() + .collect(); + assert_eq!(results.len(), 0); + } + + #[test] + fn query_match() { + let paths = vec![ + "web/domain=example.com/ts=1234", + "web/domain=other.com/ts=5678", + ]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Match exists + assert_eq!( + idx.query_match("web/domain=example.com/ts=1234").unwrap(), + true + ); + assert_eq!(idx.query_match("web/domain=*/ts=*").unwrap(), true); + + // No match + assert_eq!( + idx.query_match("api/domain=example.com/ts=1234").unwrap(), + false + ); + assert_eq!( + idx.query_match("web/domain=missing.com/ts=9999").unwrap(), + false + ); + } + + #[test] + fn suffix_wildcard() { + let paths = vec![ + "web/domain=example.com/ts=1234/file1.json", + "web/domain=example.com/ts=1234/file2.json", + "web/domain=example.com/ts=5678/file3.json", + ]; + let idx = DatapathIndex::new(paths.into_iter()); + + // Query with suffix wildcard + let results: Vec<_> = idx.query("web/domain=example.com/**").unwrap().collect(); + assert_eq!(results.len(), 3); + + let results: Vec<_> = idx + 
/// One piece of a rule pattern after it has been split on `/` and `**`.
#[derive(Debug)]
enum RegexSegment {
    /// A single path segment, already converted to regex source
    Single(String),

    /// An optional doublestar segment
    DoubleStar,
}

impl RegexSegment {
    /// Returns the regex source for this segment, given its neighbors.
    ///
    /// Slash placement depends on the neighbors: a single segment appends
    /// `[/]` only when another single segment follows it, while a doublestar
    /// emits the slashes on whichever side it has a single-segment neighbor.
    ///
    /// # Panics
    /// Panics if a doublestar is adjacent to another doublestar; callers must
    /// collapse consecutive doublestars first.
    fn to_regex_part(&self, prev: Option<&Self>, next: Option<&Self>) -> String {
        match self {
            Self::Single(body) => match next {
                // Consecutive single segments need a separating slash
                Some(Self::Single(_)) => format!("{body}[/]"),

                // Terminal segment, or the neighboring doublestar owns the slash
                Some(Self::DoubleStar) | None => body.to_owned(),
            },

            Self::DoubleStar => {
                let fragment = match (prev, next) {
                    // Doublestars cannot be neighbors
                    (Some(Self::DoubleStar), _) | (_, Some(Self::DoubleStar)) => {
                        unreachable!("consecutive doublestars must be reduced")
                    }

                    // No additional slashes
                    (None, None) => "((?:.*)?)",

                    // Leading slash
                    (Some(Self::Single(_)), None) => "((?:[/].*)?)",

                    // Trailing slash
                    (None, Some(Self::Single(_))) => "((?:.*[/])?)",

                    // Leading and trailing slash;
                    // collapses to a lone [/] when it spans nothing.
                    (Some(Self::Single(_)), Some(Self::Single(_))) => "((?:[/].*[/])|[/])",
                };

                fragment.to_owned()
            }
        }
    }
}
+ /// Returns `None` if this rule was invalid. + pub fn regex(&self) -> Option { + let pattern = &self.pattern; + + if pattern.ends_with("/") { + warn!("Pattern `{pattern}` has a trailing slash which will be ignored") + } + + if pattern.starts_with("/") { + warn!("Pattern `{pattern}` has a leading slash which will be ignored") + } + + // Split on slashes or stars + // This is a lot like .split("/"), but handles + // the edge case where ** is not delimited by slashes + // (`root**test` is equivalent to `root/**/test`) + let segments = { + #[expect(clippy::unwrap_used)] + let re = Regex::new("[*]{2,}|[/]").unwrap(); + let split = re.find_iter(pattern); + + let bounds = split + .into_iter() + .flat_map(|x| { + let r = x.range(); + let a = r.start; + let b = r.end; + [a, b] + }) + .chain([pattern.len()]) + .collect::>(); + + let mut parts = Vec::new(); + let mut last = 0; + for next in bounds { + let seg = &pattern[last..next]; + // Consecutive slashes are identical to a single slash + if seg != "/" && !seg.is_empty() { + parts.push(seg); + } + last = next; + } + + parts + }; + + let mut rebuilt_segments = Vec::new(); + let mut last_was_doublestar = false; + for segment in segments { + // This is a wildcard regex + // (**, ***, etc) + if segment.len() > 1 && segment.chars().all(|x| x == '*') { + match segment { + "**" => { + // Consecutive doublestars are meaningless + if !last_was_doublestar { + rebuilt_segments.push(RegexSegment::DoubleStar); + } + last_was_doublestar = true; + } + _ => return None, + } + continue; + } + last_was_doublestar = false; + + let parts = segment.split("*").collect::>(); + + let mut rebuilt = String::new(); + for (i, part) in parts.into_iter().enumerate() { + if i != 0 { + rebuilt.push_str("([^/]*)") + } + + rebuilt.push_str(®ex::escape(part)); + } + + rebuilt_segments.push(RegexSegment::Single(rebuilt)); + } + + let mut re_built = String::new(); + let mut prev = None; + for (i, seg) in rebuilt_segments.iter().enumerate() { + let next = 
rebuilt_segments.get(i + 1); + re_built.push_str(&seg.to_regex_part(prev, next)); + prev = Some(seg); + } + + let re_built = format!("^{re_built}$"); + // This regex should always be valid + #[expect(clippy::unwrap_used)] + Some(Regex::new(&re_built).unwrap()) + } +} + +// +// MARK: tests +// + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod rule_tests { + use super::*; + + fn rule_regex(pattern: &str) -> Regex { + let rule = Rule::new(pattern); + return rule.regex().unwrap(); + } + + #[test] + fn simple() { + let regex = rule_regex("file.txt"); + + assert!(regex.is_match("file.txt")); + assert!(!regex.is_match("other.txt")); + assert!(!regex.is_match("path/file.txt")); + } + + #[test] + fn simple_dir() { + let regex = rule_regex("dir/file.txt"); + + assert!(regex.is_match("dir/file.txt")); + assert!(!regex.is_match("file.txt")); + assert!(!regex.is_match("other/file.txt")); + } + + #[test] + fn simple_star() { + let regex = rule_regex("*.txt"); + + assert!(regex.is_match("file.txt")); + assert!(regex.is_match("other.txt")); + assert!(!regex.is_match("file.jpg")); + assert!(!regex.is_match("nested/file.txt")); + } + + #[test] + fn simple_doublestar() { + let regex = rule_regex("**/*.txt"); + + assert!(regex.is_match("file.txt")); + assert!(regex.is_match("dir/file.txt")); + assert!(regex.is_match("dir/subdir/file.txt")); + assert!(!regex.is_match("file.jpg")); + assert!(!regex.is_match("dir/file.jpg")); + } + + #[test] + fn consecutive_doublestar() { + let regex = rule_regex("**/**/**/*.txt"); + + assert!(regex.is_match("file.txt")); + assert!(regex.is_match("dir/file.txt")); + assert!(regex.is_match("dir/subdir/file.txt")); + assert!(!regex.is_match("file.jpg")); + assert!(!regex.is_match("dir/file.jpg")); + } + + #[test] + fn dual_star() { + let regex = rule_regex("**/*a*"); + + assert!(regex.is_match("fileafile")); + assert!(regex.is_match("dir/fileafile")); + assert!(regex.is_match("filea")); + assert!(regex.is_match("dir/filea")); + 
assert!(regex.is_match("afile")); + assert!(regex.is_match("dir/afile")); + assert!(!regex.is_match("noletter")); + assert!(!regex.is_match("dir/noletter")); + } + + #[test] + fn single_end() { + let regex = rule_regex("**/*"); + + assert!(regex.is_match("file")); + assert!(regex.is_match("dir/file")); + assert!(regex.is_match("a/b/c/dir/file")); + } + + #[test] + fn doublestar_end() { + let regex = rule_regex("root/**"); + + assert!(regex.is_match("root/file")); + assert!(!regex.is_match("dir/file")); + } + + #[test] + fn doublestar_start() { + let regex = rule_regex("**/dir"); + + assert!(regex.is_match("dir")); + assert!(regex.is_match("a/b/dir")); + assert!(!regex.is_match("dir/file")); + } + + #[test] + fn doublestar_adjacent_before() { + let regex = rule_regex("root/**test"); + + assert!(regex.is_match("root/test")); + assert!(regex.is_match("root/a/test")); + assert!(regex.is_match("root/a/b/c/test")); + assert!(!regex.is_match("root/file")); + assert!(!regex.is_match("root/xxtest")); + } + + #[test] + fn doublestar_adjacent_after() { + let regex = rule_regex("root/test**"); + + assert!(regex.is_match("root/test")); + assert!(regex.is_match("root/test/a")); + assert!(regex.is_match("root/test/a/b/c")); + assert!(!regex.is_match("root/testxx")); + assert!(!regex.is_match("root/file")); + } + + #[test] + fn doublestar_adjacent_middle() { + let regex = rule_regex("root/test**file"); + + assert!(regex.is_match("root/test/file")); + assert!(regex.is_match("root/test/a/b/c/file")); + assert!(!regex.is_match("root/test")); + assert!(!regex.is_match("root/file")); + assert!(!regex.is_match("root/testfile")); + assert!(!regex.is_match("root/testxxfile")); + } + + #[test] + fn doublestar_nullable() { + let regex = rule_regex("root/**/file"); + + assert!(regex.is_match("root/test/file")); + assert!(regex.is_match("root/file")); + assert!(!regex.is_match("rootfile")); + } + + #[test] + fn doublestar_nullable_post() { + let regex = rule_regex("root/**"); + + 
assert!(regex.is_match("root")); + assert!(regex.is_match("root/file")); + assert!(!regex.is_match("rootfile")); + } + + #[test] + fn doublestar_nullable_pre() { + let regex = rule_regex("**/file"); + + assert!(regex.is_match("file")); + assert!(regex.is_match("root/file")); + assert!(!regex.is_match("rootfile")); + } + + #[test] + fn doublestar_bad_extension() { + let regex = rule_regex("**.flac"); + + assert!(regex.is_match("root/.flac")); + assert!(regex.is_match("root/a/.flac")); + assert!(!regex.is_match("root/test.flac")); + assert!(!regex.is_match("test.flac")); + assert!(!regex.is_match("root/test/a/b/c.flac")); + assert!(!regex.is_match("root/testflac")); + assert!(!regex.is_match("test.mp3")); + } + + #[test] + fn doublestar_good_extension() { + let regex = rule_regex("**/*.flac"); + + assert!(regex.is_match("root/.flac")); + assert!(regex.is_match("root/a/.flac")); + assert!(regex.is_match("root/test.flac")); + assert!(regex.is_match("test.flac")); + assert!(regex.is_match("root/test/a/b/c.flac")); + assert!(!regex.is_match("root/testflac")); + assert!(!regex.is_match("test.mp3")); + } + + #[test] + fn multi_slash_a() { + let regex = rule_regex("dir//file.txt"); + + assert!(regex.is_match("dir/file.txt")); + assert!(!regex.is_match("dirfile.txt")); + assert!(!regex.is_match("dir/other.txt")); + } + + #[test] + fn multi_slash_b() { + let regex = rule_regex("**///*.txt"); + + assert!(regex.is_match("dir/file.txt")); + assert!(regex.is_match("dir/subdir/file.txt")); + assert!(!regex.is_match("file.jpg")); + } + + #[test] + fn multi_slash_c() { + let regex = rule_regex("///dir//**//*.txt//"); + + assert!(regex.is_match("dir/subdir/file.txt")); + assert!(regex.is_match("dir/sub1/sub2/file.txt")); + assert!(!regex.is_match("other/sub/file.txt")); + assert!(!regex.is_match("dir/file.jpg")); + } +} diff --git a/crates/datapath/src/lib.rs b/crates/datapath/src/lib.rs index e0da51a..9e19902 100644 --- a/crates/datapath/src/lib.rs +++ b/crates/datapath/src/lib.rs 
@@ -7,6 +7,10 @@ #[cfg(test)] use uuid as _; +// silence linter, used by fns in index.rs +#[cfg(feature = "tokio")] +use tokio as _; + mod datapath; pub use datapath::*; @@ -19,4 +23,10 @@ pub use schema::*; mod wildcardable; pub use wildcardable::*; +#[cfg(feature = "index")] +mod index; + +#[cfg(feature = "index")] +pub use index::*; + pub use datapath_macro::datapath;