Mirror of https://github.com/rm-dr/datapath.git (synced 2025-12-07 20:04:13 -08:00)
v0.0.3
Cargo.lock (generated): 186 lines changed
@@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 4

[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
 "memchr",
]

[[package]]
name = "bumpalo"
version = "3.19.0"
@@ -14,23 +23,68 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"

[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
 "crossbeam-epoch",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

[[package]]
name = "datapath"
version = "0.0.2"
version = "0.0.3"
dependencies = [
 "datapath-macro",
 "itertools",
 "regex",
 "tokio",
 "tracing",
 "trie-rs",
 "uuid",
]

[[package]]
name = "datapath-macro"
version = "0.0.2"
version = "0.0.3"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"

[[package]]
name = "fid-rs"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6956a1e60e2d1412b44b4169d44a03dae518f8583d3e10090c912c105e48447"
dependencies = [
 "rayon",
]

[[package]]
name = "getrandom"
version = "0.3.4"
@@ -43,6 +97,15 @@ dependencies = [
 "wasip2",
]

[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
 "either",
]

[[package]]
name = "js-sys"
version = "0.3.83"
@@ -59,12 +122,33 @@ version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"

[[package]]
name = "louds-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936de6c22f08e7135a921f8ada907acd0d88880c4f42b5591f634b9f1dd8e07f"
dependencies = [
 "fid-rs",
]

[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"

[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"

[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"

[[package]]
name = "proc-macro2"
version = "1.0.103"
@@ -89,6 +173,55 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"

[[package]]
name = "rayon"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
dependencies = [
 "either",
 "rayon-core",
]

[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
dependencies = [
 "crossbeam-deque",
 "crossbeam-utils",
]

[[package]]
name = "regex"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-automata",
 "regex-syntax",
]

[[package]]
name = "regex-automata"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"

[[package]]
name = "rustversion"
version = "1.0.22"
@@ -106,6 +239,55 @@ dependencies = [
 "unicode-ident",
]

[[package]]
name = "tokio"
version = "1.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
dependencies = [
 "pin-project-lite",
]

[[package]]
name = "tracing"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647"
dependencies = [
 "pin-project-lite",
 "tracing-attributes",
 "tracing-core",
]

[[package]]
name = "tracing-attributes"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tracing-core"
version = "0.1.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c"
dependencies = [
 "once_cell",
]

[[package]]
name = "trie-rs"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f88f4b0a1ebd6c3d16be3e45eb0e8089372ccadd88849b7ca162ba64b5e6f6"
dependencies = [
 "louds-rs",
]

[[package]]
name = "unicode-ident"
version = "1.0.22"
Cargo.toml: 10 lines changed
@@ -11,7 +11,7 @@ readme = "README.md"
authors = ["rm-dr"]

# Don't forget to bump datapath-macro below!
version = "0.0.2"
version = "0.0.3"

[workspace.lints.rust]
unused_import_braces = "deny"
@@ -70,12 +70,16 @@ cargo_common_metadata = "deny"
#

[workspace.dependencies]
datapath-macro = { path = "crates/datapath-macro", version = "0.0.2" }
datapath-macro = { path = "crates/datapath-macro", version = "0.0.3" }
datapath = { path = "crates/datapath" }

chrono = "0.4.42"
itertools = "0.14.0"
proc-macro2 = "1.0.103"
quote = "1.0.42"
regex = "1.12.2"
syn = "2.0.111"

tracing = "0.1"
trie-rs = "0.4.2"
uuid = "1.19.0"
tokio = { version = "1.48.0", features = ["sync"] }
@@ -549,7 +549,7 @@ fn generate_common_impls(
    }

    // Extract just the field names for struct construction
    let field_names = typed_fields.iter().map(|(name, _)| name);
    let field_names: Vec<_> = typed_fields.iter().map(|(name, _)| name).collect();

    let datapath_impl = quote! {
        impl ::datapath::Datapath for #struct_name {
@@ -600,6 +600,13 @@ fn generate_common_impls(
                    file,
                })
            }

            fn field(&self, name: &str) -> Option<::std::string::String> {
                match name {
                    #(stringify!(#field_names) => Some(self.#field_names.to_string()),)*
                    _ => None,
                }
            }
        }
    };
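The new method added to the quote! block above gives every generated impl a string-keyed field accessor. As a rough sketch of what it expands to, assume a hypothetical struct processed by the macro whose typed fields are `domain` and `ts` (names chosen purely for illustration); the generated method would look roughly like this:

fn field(&self, name: &str) -> Option<::std::string::String> {
    match name {
        "domain" => Some(self.domain.to_string()),
        "ts" => Some(self.ts.to_string()),
        _ => None,
    }
}

Each field name is stringified at macro-expansion time, so the lookup is a plain match on string literals rather than any runtime reflection.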
@@ -17,5 +17,16 @@ workspace = true
[dependencies]
datapath-macro = { workspace = true }

regex = { workspace = true, optional = true }
tracing = { workspace = true, optional = true }
trie-rs = { workspace = true, optional = true }
itertools = { workspace = true, optional = true }
tokio = { workspace = true, optional = true }

[dev-dependencies]
uuid = { version = "1", features = ["v4"] }

[features]
default = []
index = ["dep:regex", "dep:trie-rs", "dep:tracing", "dep:itertools"]
tokio = ["dep:tokio"]
@@ -33,4 +33,8 @@ where
    /// Parse a string as this datapath with a (possibly empty-string)
    /// file, returning `None` if this string is invalid.
    fn parse(path: &str) -> Option<DatapathFile<Self>>;

    /// Get the string value of the field with the given name,
    /// if it exists.
    fn field(&self, name: &str) -> Option<String>;
}
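The trait now exposes `field` alongside `parse`. A minimal usage sketch, assuming some concrete type `P` that implements `Datapath` and a hypothetical field named `domain` (the helper below is illustrative and not part of the crate):

use datapath::Datapath;

fn describe<P: Datapath>(p: &P) -> String {
    // Fall back to a placeholder when the type has no field with this name.
    p.field("domain").unwrap_or_else(|| "<no domain field>".to_string())
}

Because `field` returns `Option<String>`, callers can probe for a field by name without matching on the concrete struct's fields themselves.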
crates/datapath/src/index/mod.rs (new file): 395 lines
@@ -0,0 +1,395 @@
use itertools::Itertools;
use std::{collections::HashMap, fmt::Display, str::FromStr};
use trie_rs::map::{Trie, TrieBuilder};

mod rule;

/// A path segment in an [`AnyDatapath`]
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
enum PathSegment {
    /// A constant value, like `web`
    Constant(String),

    /// A key=value partition, like `domain=gouletpens.com`
    Value { key: String, value: String },
}

impl Display for PathSegment {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            PathSegment::Constant(x) => write!(f, "{x}"),
            PathSegment::Value { key, value } => write!(f, "{key}={value}"),
        }
    }
}

impl FromStr for PathSegment {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.contains("\n") {
            return Err(());
        }

        if s.is_empty() {
            return Err(());
        }

        return Ok(if s.contains("=") {
            let mut s = s.split("=");
            let key = s.next().ok_or(())?.to_owned();
            let value = s.join("=");
            Self::Value { key, value }
        } else {
            Self::Constant(s.to_owned())
        });
    }
}

//
// MARK: index
//

/// An in-memory cache of s3 paths.
#[derive(Debug)]
pub struct DatapathIndex {
    patterns: Trie<u8, Vec<String>>,
    len: usize,
}

impl DatapathIndex {
    /// Convert a query string to a trie search key by normalizing values to `*`.
    /// Stops at the first wildcard constant since it can't be used for prefix matching.
    fn query_to_key(query: &str) -> String {
        let trimmed = query.trim().trim_end_matches("**").trim_matches('/');
        let mut segments = Vec::new();
        for seg in trimmed.split('/') {
            let segment = match PathSegment::from_str(&seg) {
                Ok(x) => x,
                Err(_) => continue,
            };

            // Stop at wildcard constants - can't use for trie prefix search
            if matches!(segment, PathSegment::Constant(ref s) if s == "*") {
                break;
            }

            segments.push(segment);
        }

        segments.iter_mut().for_each(|x| match x {
            PathSegment::Constant(_) => {}
            PathSegment::Value { value, .. } => *value = "*".into(),
        });

        segments.iter().join("/")
    }

    pub fn new_empty() -> Self {
        Self {
            patterns: TrieBuilder::new().build(),
            len: 0,
        }
    }

    pub fn new<S: Into<String>, I: Iterator<Item = S>>(paths: I) -> Self {
        let mut len = 0;
        let mut patterns = HashMap::new();

        for s in paths {
            let s: String = s.into();
            let mut segments = Vec::new();
            for seg in s.split('/') {
                segments.push(match PathSegment::from_str(&seg) {
                    Ok(x) => x,
                    Err(_) => continue,
                });
            }

            segments.iter_mut().for_each(|x| match x {
                PathSegment::Constant(_) => {}
                PathSegment::Value { value, .. } => *value = "*".into(),
            });

            let pattern = segments.iter().join("/");

            patterns.entry(pattern).or_insert(Vec::new()).push(s);
            len += 1;
        }

        let mut builder = TrieBuilder::new();
        for (k, v) in patterns {
            builder.push(k, v);
        }

        Self {
            len,
            patterns: builder.build(),
        }
    }

    #[cfg(feature = "tokio")]
    pub async fn async_new<S: Into<String>>(mut paths: tokio::sync::mpsc::Receiver<S>) -> Self {
        let mut len = 0;
        let mut patterns = HashMap::new();

        while let Some(s) = paths.recv().await {
            let s: String = s.into();
            let mut segments = Vec::new();
            for seg in s.split('/') {
                segments.push(match PathSegment::from_str(&seg) {
                    Ok(x) => x,
                    Err(_) => continue,
                });
            }

            segments.iter_mut().for_each(|x| match x {
                PathSegment::Constant(_) => {}
                PathSegment::Value { value, .. } => *value = "*".into(),
            });

            let pattern = segments.iter().join("/");

            patterns.entry(pattern).or_insert(Vec::new()).push(s);
            len += 1;
        }

        let mut builder = TrieBuilder::new();
        for (k, v) in patterns {
            builder.push(k, v);
        }

        Self {
            len,
            patterns: builder.build(),
        }
    }

    #[inline(always)]
    pub fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Given a datapath (that may contain wildcards) as a query,
    /// return all known datapaths that match it.
    ///
    /// Returns an empty iterator if no paths match.
    /// Returns `None` if the query was invalid.
    pub fn query(&self, query: impl Into<String>) -> Option<impl Iterator<Item = String> + '_> {
        let query: String = query.into();
        let regex = rule::Rule::new(query.clone()).regex()?;
        let key = Self::query_to_key(&query);

        Some(
            self.patterns
                .predictive_search::<String, _>(&key)
                .flat_map(|(_, strings)| strings.iter())
                .filter(move |s| regex.is_match(s))
                .cloned(),
        )
    }

    pub fn query_match(&self, query: impl Into<String>) -> Option<bool> {
        let query: String = query.into();
        let regex = rule::Rule::new(query.clone()).regex()?;
        let key = Self::query_to_key(&query);

        for (_, strings) in self.patterns.predictive_search::<String, _>(&key) {
            for s in strings {
                if regex.is_match(s) {
                    return Some(true);
                }
            }
        }

        return Some(false);
    }
}

// MARK: index tests

#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod index_tests {
    use super::*;

    #[test]
    fn datapath_index_empty() {
        let idx = DatapathIndex::new(std::iter::empty::<String>());
        let query = "web/domain=example.com";
        assert_eq!(idx.query(query).unwrap().count(), 0);
        assert!(idx.is_empty());
        assert_eq!(idx.len(), 0);
    }

    #[test]
    fn insert_and_lookup_exact_match() {
        let paths = vec!["web/domain=example.com/ts=1234"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Exact match
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // No match
        let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 0);

        assert_eq!(idx.len(), 1);
    }

    #[test]
    fn wildcard_constant_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "api/domain=example.com/ts=1234",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Wildcard first segment
        let results: Vec<_> = idx.query("*/domain=example.com/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 2);

        assert_eq!(idx.len(), 2);
    }

    #[test]
    fn wildcard_value_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Wildcard domain
        let results: Vec<_> = idx.query("web/domain=*/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn multiple_datapaths() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
            "api/domain=example.com/ts=5678",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Specific lookup
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Wildcard time lookup
        let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Double wildcard lookup
        let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
        assert_eq!(results.len(), 2);

        assert_eq!(idx.len(), 3);
    }

    #[test]
    fn nested_wildcards() {
        let paths = vec![
            "web/domain=example.com/ts=1234/crawl/2.5",
            "web/domain=other.com/ts=5678/crawl/2.5",
            "web/domain=example.com/ts=9999/crawl/3.0",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Multiple wildcards in path
        let results: Vec<_> = idx.query("web/domain=*/ts=*/crawl/*").unwrap().collect();
        assert_eq!(results.len(), 3);

        // Selective wildcards
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=*/crawl/*")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn partial_path_query() {
        let paths = vec!["web/domain=example.com/ts=1234/crawl/2.5"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with fewer segments than the stored path
        let results: Vec<_> = idx.query("web/domain=example.com").unwrap().collect();
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn longer_path_query() {
        let paths = vec!["web/domain=example.com"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with more segments than the stored path
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234/crawl/2.5")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn query_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=5678",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Match exists
        assert_eq!(
            idx.query_match("web/domain=example.com/ts=1234").unwrap(),
            true
        );
        assert_eq!(idx.query_match("web/domain=*/ts=*").unwrap(), true);

        // No match
        assert_eq!(
            idx.query_match("api/domain=example.com/ts=1234").unwrap(),
            false
        );
        assert_eq!(
            idx.query_match("web/domain=missing.com/ts=9999").unwrap(),
            false
        );
    }

    #[test]
    fn suffix_wildcard() {
        let paths = vec![
            "web/domain=example.com/ts=1234/file1.json",
            "web/domain=example.com/ts=1234/file2.json",
            "web/domain=example.com/ts=5678/file3.json",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with suffix wildcard
        let results: Vec<_> = idx.query("web/domain=example.com/**").unwrap().collect();
        assert_eq!(results.len(), 3);

        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234/**")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 2);
    }
}
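A usage sketch of the new index, mirroring the tests above; it assumes the crate's `index` feature is enabled so that `DatapathIndex` is re-exported at the crate root, and the paths are illustrative:

use datapath::DatapathIndex;

fn main() {
    // Build an index from an iterator of known paths.
    let idx = DatapathIndex::new(
        ["web/domain=example.com/ts=1234", "web/domain=other.com/ts=5678"].into_iter(),
    );
    assert_eq!(idx.len(), 2);

    // A `*` value matches any key=value partition in that position.
    let hits: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
    assert_eq!(hits.len(), 2);

    // `query_match` stops at the first matching path.
    assert_eq!(idx.query_match("api/**").unwrap(), false);
}

Internally, `query_to_key` rewrites every `key=value` segment of the query to `key=*`, so the trie's predictive search narrows candidates by pattern shape before the per-path regex filter runs.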
crates/datapath/src/index/rule.rs (new file): 381 lines
@@ -0,0 +1,381 @@
use regex::Regex;
use tracing::warn;

//
// MARK: rule
//

#[derive(Debug)]
enum RegexSegment {
    /// A single segment
    Single(String),

    /// An optional doublestar segment
    DoubleStar,
}

impl RegexSegment {
    /// Returns the regex pattern of this part,
    /// prefixed with a /.
    fn to_regex_part(&self, prev: Option<&Self>, next: Option<&Self>) -> String {
        match (prev, self, next) {
            // Consecutive single segments need a trailing slash
            (_, Self::Single(x), Some(Self::Single(_))) => format!("{x}[/]"),

            // Terminal single segments don't need a trailing slash
            (_, Self::Single(x), None) => x.to_owned(),

            // Neighboring doublestar is always responsible for slashes
            (_, Self::Single(x), Some(Self::DoubleStar)) => x.to_owned(),

            // No additional slashes
            (None, Self::DoubleStar, None) => "((?:.*)?)".into(),

            // Leading slash
            (Some(Self::Single(_)), Self::DoubleStar, None) => "((?:[/].*)?)".into(),

            // Trailing slash
            (None, Self::DoubleStar, Some(Self::Single(_))) => "((?:.*[/])?)".into(),

            // Leading and trailing slash.
            // Also, replace self with a [/] when empty.
            (Some(Self::Single(_)), Self::DoubleStar, Some(Self::Single(_))) => {
                "((?:[/].*[/])|[/])".into()
            }

            // Doublestars cannot be neighbors
            (_, Self::DoubleStar, Some(Self::DoubleStar))
            | (Some(Self::DoubleStar), Self::DoubleStar, _) => {
                unreachable!("consecutive doublestars must be reduced")
            }
        }
    }
}

#[derive(Debug, Clone)]
pub struct Rule {
    pub pattern: String,
}

impl Rule {
    pub fn new(pattern: impl Into<String>) -> Self {
        Self {
            pattern: pattern.into(),
        }
    }

    /// Turn this rule into a regex pattern.
    /// Returns `None` if this rule was invalid.
    pub fn regex(&self) -> Option<Regex> {
        let pattern = &self.pattern;

        if pattern.ends_with("/") {
            warn!("Pattern `{pattern}` has a trailing slash which will be ignored")
        }

        if pattern.starts_with("/") {
            warn!("Pattern `{pattern}` has a leading slash which will be ignored")
        }

        // Split on slashes or stars
        // This is a lot like .split("/"), but handles
        // the edge case where ** is not delimited by slashes
        // (`root**test` is equivalent to `root/**/test`)
        let segments = {
            #[expect(clippy::unwrap_used)]
            let re = Regex::new("[*]{2,}|[/]").unwrap();
            let split = re.find_iter(pattern);

            let bounds = split
                .into_iter()
                .flat_map(|x| {
                    let r = x.range();
                    let a = r.start;
                    let b = r.end;
                    [a, b]
                })
                .chain([pattern.len()])
                .collect::<Vec<_>>();

            let mut parts = Vec::new();
            let mut last = 0;
            for next in bounds {
                let seg = &pattern[last..next];
                // Consecutive slashes are identical to a single slash
                if seg != "/" && !seg.is_empty() {
                    parts.push(seg);
                }
                last = next;
            }

            parts
        };

        let mut rebuilt_segments = Vec::new();
        let mut last_was_doublestar = false;
        for segment in segments {
            // This is a wildcard regex
            // (**, ***, etc)
            if segment.len() > 1 && segment.chars().all(|x| x == '*') {
                match segment {
                    "**" => {
                        // Consecutive doublestars are meaningless
                        if !last_was_doublestar {
                            rebuilt_segments.push(RegexSegment::DoubleStar);
                        }
                        last_was_doublestar = true;
                    }
                    _ => return None,
                }
                continue;
            }
            last_was_doublestar = false;

            let parts = segment.split("*").collect::<Vec<_>>();

            let mut rebuilt = String::new();
            for (i, part) in parts.into_iter().enumerate() {
                if i != 0 {
                    rebuilt.push_str("([^/]*)")
                }

                rebuilt.push_str(&regex::escape(part));
            }

            rebuilt_segments.push(RegexSegment::Single(rebuilt));
        }

        let mut re_built = String::new();
        let mut prev = None;
        for (i, seg) in rebuilt_segments.iter().enumerate() {
            let next = rebuilt_segments.get(i + 1);
            re_built.push_str(&seg.to_regex_part(prev, next));
            prev = Some(seg);
        }

        let re_built = format!("^{re_built}$");
        // This regex should always be valid
        #[expect(clippy::unwrap_used)]
        Some(Regex::new(&re_built).unwrap())
    }
}

//
// MARK: tests
//

#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod rule_tests {
    use super::*;

    fn rule_regex(pattern: &str) -> Regex {
        let rule = Rule::new(pattern);
        return rule.regex().unwrap();
    }

    #[test]
    fn simple() {
        let regex = rule_regex("file.txt");

        assert!(regex.is_match("file.txt"));
        assert!(!regex.is_match("other.txt"));
        assert!(!regex.is_match("path/file.txt"));
    }

    #[test]
    fn simple_dir() {
        let regex = rule_regex("dir/file.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(!regex.is_match("file.txt"));
        assert!(!regex.is_match("other/file.txt"));
    }

    #[test]
    fn simple_star() {
        let regex = rule_regex("*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("other.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("nested/file.txt"));
    }

    #[test]
    fn simple_doublestar() {
        let regex = rule_regex("**/*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("dir/file.jpg"));
    }

    #[test]
    fn consecutive_doublestar() {
        let regex = rule_regex("**/**/**/*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("dir/file.jpg"));
    }

    #[test]
    fn dual_star() {
        let regex = rule_regex("**/*a*");

        assert!(regex.is_match("fileafile"));
        assert!(regex.is_match("dir/fileafile"));
        assert!(regex.is_match("filea"));
        assert!(regex.is_match("dir/filea"));
        assert!(regex.is_match("afile"));
        assert!(regex.is_match("dir/afile"));
        assert!(!regex.is_match("noletter"));
        assert!(!regex.is_match("dir/noletter"));
    }

    #[test]
    fn single_end() {
        let regex = rule_regex("**/*");

        assert!(regex.is_match("file"));
        assert!(regex.is_match("dir/file"));
        assert!(regex.is_match("a/b/c/dir/file"));
    }

    #[test]
    fn doublestar_end() {
        let regex = rule_regex("root/**");

        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("dir/file"));
    }

    #[test]
    fn doublestar_start() {
        let regex = rule_regex("**/dir");

        assert!(regex.is_match("dir"));
        assert!(regex.is_match("a/b/dir"));
        assert!(!regex.is_match("dir/file"));
    }

    #[test]
    fn doublestar_adjacent_before() {
        let regex = rule_regex("root/**test");

        assert!(regex.is_match("root/test"));
        assert!(regex.is_match("root/a/test"));
        assert!(regex.is_match("root/a/b/c/test"));
        assert!(!regex.is_match("root/file"));
        assert!(!regex.is_match("root/xxtest"));
    }

    #[test]
    fn doublestar_adjacent_after() {
        let regex = rule_regex("root/test**");

        assert!(regex.is_match("root/test"));
        assert!(regex.is_match("root/test/a"));
        assert!(regex.is_match("root/test/a/b/c"));
        assert!(!regex.is_match("root/testxx"));
        assert!(!regex.is_match("root/file"));
    }

    #[test]
    fn doublestar_adjacent_middle() {
        let regex = rule_regex("root/test**file");

        assert!(regex.is_match("root/test/file"));
        assert!(regex.is_match("root/test/a/b/c/file"));
        assert!(!regex.is_match("root/test"));
        assert!(!regex.is_match("root/file"));
        assert!(!regex.is_match("root/testfile"));
        assert!(!regex.is_match("root/testxxfile"));
    }

    #[test]
    fn doublestar_nullable() {
        let regex = rule_regex("root/**/file");

        assert!(regex.is_match("root/test/file"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_nullable_post() {
        let regex = rule_regex("root/**");

        assert!(regex.is_match("root"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_nullable_pre() {
        let regex = rule_regex("**/file");

        assert!(regex.is_match("file"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_bad_extension() {
        let regex = rule_regex("**.flac");

        assert!(regex.is_match("root/.flac"));
        assert!(regex.is_match("root/a/.flac"));
        assert!(!regex.is_match("root/test.flac"));
        assert!(!regex.is_match("test.flac"));
        assert!(!regex.is_match("root/test/a/b/c.flac"));
        assert!(!regex.is_match("root/testflac"));
        assert!(!regex.is_match("test.mp3"));
    }

    #[test]
    fn doublestar_good_extension() {
        let regex = rule_regex("**/*.flac");

        assert!(regex.is_match("root/.flac"));
        assert!(regex.is_match("root/a/.flac"));
        assert!(regex.is_match("root/test.flac"));
        assert!(regex.is_match("test.flac"));
        assert!(regex.is_match("root/test/a/b/c.flac"));
        assert!(!regex.is_match("root/testflac"));
        assert!(!regex.is_match("test.mp3"));
    }

    #[test]
    fn multi_slash_a() {
        let regex = rule_regex("dir//file.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(!regex.is_match("dirfile.txt"));
        assert!(!regex.is_match("dir/other.txt"));
    }

    #[test]
    fn multi_slash_b() {
        let regex = rule_regex("**///*.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
    }

    #[test]
    fn multi_slash_c() {
        let regex = rule_regex("///dir//**//*.txt//");

        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(regex.is_match("dir/sub1/sub2/file.txt"));
        assert!(!regex.is_match("other/sub/file.txt"));
        assert!(!regex.is_match("dir/file.jpg"));
    }
}
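`Rule` stays internal to the index module (`mod rule` is not re-exported), so the sketch below mirrors how `DatapathIndex` itself uses it; the pattern and paths are illustrative:

// A query pattern is compiled once, then used to filter candidate paths.
let re = Rule::new("web/domain=*/**").regex().unwrap();
assert!(re.is_match("web/domain=example.com"));
assert!(re.is_match("web/domain=example.com/ts=1234/file.json"));
assert!(!re.is_match("api/domain=example.com"));

For this pattern the builder emits roughly `^web[/]domain=([^/]*)((?:[/].*)?)$`: a single `*` becomes `([^/]*)` and stays within one segment, while a trailing `**` becomes an optional group that may consume any number of further segments.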
@@ -7,6 +7,10 @@
#[cfg(test)]
use uuid as _;

// silence linter, used by fns in index.rs
#[cfg(feature = "tokio")]
use tokio as _;

mod datapath;
pub use datapath::*;

@@ -19,4 +23,10 @@ pub use schema::*;
mod wildcardable;
pub use wildcardable::*;

#[cfg(feature = "index")]
mod index;

#[cfg(feature = "index")]
pub use index::*;

pub use datapath_macro::datapath;
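A sketch of the feature-gated pieces working together, assuming a downstream crate that enables the `index` and `tokio` features of datapath and has its own tokio dependency (everything below is illustrative, not part of this commit):

use datapath::DatapathIndex;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // Feed paths through a channel; `async_new` drains it and builds the index.
    let (tx, rx) = tokio::sync::mpsc::channel::<String>(16);
    tx.send("web/domain=example.com/ts=1234".to_string()).await.unwrap();
    drop(tx); // close the channel so `async_new` can finish
    let idx = DatapathIndex::async_new(rx).await;
    assert_eq!(idx.len(), 1);
}

Note that datapath itself pulls in tokio only with the `sync` feature; the `#[tokio::main]` attribute requires the downstream crate's own tokio dependency with its `macros` and `rt` features.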