1
0
mirror of https://github.com/rm-dr/datapath.git synced 2025-12-10 05:14:13 -08:00
This commit is contained in:
2025-12-03 12:57:10 -08:00
committed by Mark
parent f51162478b
commit 08003a3fbe
8 changed files with 1000 additions and 6 deletions

View File

@@ -549,7 +549,7 @@ fn generate_common_impls(
}
// Extract just the field names for struct construction
let field_names = typed_fields.iter().map(|(name, _)| name);
let field_names: Vec<_> = typed_fields.iter().map(|(name, _)| name).collect();
let datapath_impl = quote! {
impl ::datapath::Datapath for #struct_name {
@@ -600,6 +600,13 @@ fn generate_common_impls(
file,
})
}
fn field(&self, name: &str) -> Option<::std::string::String> {
match name {
#(stringify!(#field_names) => Some(self.#field_names.to_string()),)*
_ => None,
}
}
}
};

View File

@@ -17,5 +17,16 @@ workspace = true
[dependencies]
datapath-macro = { workspace = true }
regex = { workspace = true, optional = true }
tracing = { workspace = true, optional = true }
trie-rs = { workspace = true, optional = true }
itertools = { workspace = true, optional = true }
tokio = { workspace = true, optional = true }
[dev-dependencies]
uuid = { version = "1", features = ["v4"] }
[features]
default = []
index = ["dep:regex", "dep:trie-rs", "dep:tracing", "dep:itertools"]
tokio = ["dep:tokio"]

View File

@@ -33,4 +33,8 @@ where
/// Parse a string as this datapath with a (possibly empty-string)
/// file, returning `None` if this string is invalid.
fn parse(path: &str) -> Option<DatapathFile<Self>>;
/// Get the string value of the field with the given name,
/// if it exists.
fn field(&self, name: &str) -> Option<String>;
}

View File

@@ -0,0 +1,395 @@
use itertools::Itertools;
use std::{collections::HashMap, fmt::Display, str::FromStr};
use trie_rs::map::{Trie, TrieBuilder};
mod rule;
/// A path segment in an [`AnyDatapath`]
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
enum PathSegment {
    /// A constant value, like `web`
    Constant(String),

    /// A key=value partition, like `domain=gouletpens.com`
    Value { key: String, value: String },
}

impl Display for PathSegment {
    /// Writes the segment exactly as it appears in a path:
    /// the bare constant, or `key=value`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            PathSegment::Constant(x) => write!(f, "{x}"),
            PathSegment::Value { key, value } => write!(f, "{key}={value}"),
        }
    }
}

impl FromStr for PathSegment {
    type Err = ();

    /// Parse a single path segment (the text between two slashes).
    ///
    /// A segment containing `=` becomes a [`PathSegment::Value`], split at
    /// the *first* `=`; any later `=` stays inside the value. Everything
    /// else becomes a [`PathSegment::Constant`].
    ///
    /// # Errors
    /// Returns `Err(())` if the segment is empty or contains a newline.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.is_empty() || s.contains('\n') {
            return Err(());
        }

        Ok(match s.split_once('=') {
            Some((key, value)) => Self::Value {
                key: key.to_owned(),
                value: value.to_owned(),
            },
            None => Self::Constant(s.to_owned()),
        })
    }
}
//
// MARK: index
//
/// An in-memory cache of s3 paths.
#[derive(Debug)]
pub struct DatapathIndex {
patterns: Trie<u8, Vec<String>>,
len: usize,
}
impl DatapathIndex {
/// Convert a query string to a trie search key by normalizing values to `*`.
/// Stops at the first wildcard constant since it can't be used for prefix matching.
fn query_to_key(query: &str) -> String {
let trimmed = query.trim().trim_end_matches("**").trim_matches('/');
let mut segments = Vec::new();
for seg in trimmed.split('/') {
let segment = match PathSegment::from_str(&seg) {
Ok(x) => x,
Err(_) => continue,
};
// Stop at wildcard constants - can't use for trie prefix search
if matches!(segment, PathSegment::Constant(ref s) if s == "*") {
break;
}
segments.push(segment);
}
segments.iter_mut().for_each(|x| match x {
PathSegment::Constant(_) => {}
PathSegment::Value { value, .. } => *value = "*".into(),
});
segments.iter().join("/")
}
pub fn new_empty() -> Self {
Self {
patterns: TrieBuilder::new().build(),
len: 0,
}
}
pub fn new<S: Into<String>, I: Iterator<Item = S>>(paths: I) -> Self {
let mut len = 0;
let mut patterns = HashMap::new();
for s in paths {
let s: String = s.into();
let mut segments = Vec::new();
for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) {
Ok(x) => x,
Err(_) => continue,
});
}
segments.iter_mut().for_each(|x| match x {
PathSegment::Constant(_) => {}
PathSegment::Value { value, .. } => *value = "*".into(),
});
let pattern = segments.iter().join("/");
patterns.entry(pattern).or_insert(Vec::new()).push(s);
len += 1;
}
let mut builder = TrieBuilder::new();
for (k, v) in patterns {
builder.push(k, v);
}
Self {
len,
patterns: builder.build(),
}
}
#[cfg(feature = "tokio")]
pub async fn async_new<S: Into<String>>(mut paths: tokio::sync::mpsc::Receiver<S>) -> Self {
let mut len = 0;
let mut patterns = HashMap::new();
while let Some(s) = paths.recv().await {
let s: String = s.into();
let mut segments = Vec::new();
for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) {
Ok(x) => x,
Err(_) => continue,
});
}
segments.iter_mut().for_each(|x| match x {
PathSegment::Constant(_) => {}
PathSegment::Value { value, .. } => *value = "*".into(),
});
let pattern = segments.iter().join("/");
patterns.entry(pattern).or_insert(Vec::new()).push(s);
len += 1;
}
let mut builder = TrieBuilder::new();
for (k, v) in patterns {
builder.push(k, v);
}
Self {
len,
patterns: builder.build(),
}
}
#[inline(always)]
pub fn len(&self) -> usize {
self.len
}
#[inline(always)]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Given a datapath (that may contain wildcards) as a query,
/// return all known datapaths that match it.
///
/// Returns an empty iterator if no paths match.
/// Returns `None` if the query was invalid.
pub fn query(&self, query: impl Into<String>) -> Option<impl Iterator<Item = String> + '_> {
let query: String = query.into();
let regex = rule::Rule::new(query.clone()).regex()?;
let key = Self::query_to_key(&query);
Some(
self.patterns
.predictive_search::<String, _>(&key)
.flat_map(|(_, strings)| strings.iter())
.filter(move |s| regex.is_match(s))
.cloned(),
)
}
pub fn query_match(&self, query: impl Into<String>) -> Option<bool> {
let query: String = query.into();
let regex = rule::Rule::new(query.clone()).regex()?;
let key = Self::query_to_key(&query);
for (_, strings) in self.patterns.predictive_search::<String, _>(&key) {
for s in strings {
if regex.is_match(s) {
return Some(true);
}
}
}
return Some(false);
}
}
// MARK: index tests
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod index_tests {
    use super::*;

    /// Build an index over the given paths.
    fn index_of(paths: &[&str]) -> DatapathIndex {
        DatapathIndex::new(paths.iter().copied())
    }

    /// Run a query that is expected to be valid and collect every match.
    fn hits(idx: &DatapathIndex, query: &str) -> Vec<String> {
        idx.query(query).unwrap().collect()
    }

    #[test]
    fn datapath_index_empty() {
        let idx = DatapathIndex::new(std::iter::empty::<String>());

        assert_eq!(hits(&idx, "web/domain=example.com").len(), 0);
        assert!(idx.is_empty());
        assert_eq!(idx.len(), 0);
    }

    #[test]
    fn insert_and_lookup_exact_match() {
        let idx = index_of(&["web/domain=example.com/ts=1234"]);

        // Exact match
        let results = hits(&idx, "web/domain=example.com/ts=1234");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // No match
        assert_eq!(hits(&idx, "web/domain=other.com/ts=1234").len(), 0);

        assert_eq!(idx.len(), 1);
    }

    #[test]
    fn wildcard_constant_match() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234",
            "api/domain=example.com/ts=1234",
        ]);

        // Wildcard first segment
        assert_eq!(hits(&idx, "*/domain=example.com/ts=1234").len(), 2);
        assert_eq!(idx.len(), 2);
    }

    #[test]
    fn wildcard_value_match() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
        ]);

        // Wildcard domain
        assert_eq!(hits(&idx, "web/domain=*/ts=1234").len(), 2);
    }

    #[test]
    fn multiple_datapaths() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
            "api/domain=example.com/ts=5678",
        ]);

        // Specific lookup
        let results = hits(&idx, "web/domain=example.com/ts=1234");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Wildcard time lookup
        let results = hits(&idx, "web/domain=example.com/ts=*");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Double wildcard lookup
        assert_eq!(hits(&idx, "web/domain=*/ts=*").len(), 2);

        assert_eq!(idx.len(), 3);
    }

    #[test]
    fn nested_wildcards() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234/crawl/2.5",
            "web/domain=other.com/ts=5678/crawl/2.5",
            "web/domain=example.com/ts=9999/crawl/3.0",
        ]);

        // Multiple wildcards in path
        assert_eq!(hits(&idx, "web/domain=*/ts=*/crawl/*").len(), 3);

        // Selective wildcards
        assert_eq!(hits(&idx, "web/domain=example.com/ts=*/crawl/*").len(), 2);
    }

    #[test]
    fn partial_path_query() {
        let idx = index_of(&["web/domain=example.com/ts=1234/crawl/2.5"]);

        // Query with fewer segments than the stored path
        assert_eq!(hits(&idx, "web/domain=example.com").len(), 0);
    }

    #[test]
    fn longer_path_query() {
        let idx = index_of(&["web/domain=example.com"]);

        // Query with more segments than the stored path
        assert_eq!(
            hits(&idx, "web/domain=example.com/ts=1234/crawl/2.5").len(),
            0
        );
    }

    #[test]
    fn query_match() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=5678",
        ]);

        // Match exists
        assert!(idx.query_match("web/domain=example.com/ts=1234").unwrap());
        assert!(idx.query_match("web/domain=*/ts=*").unwrap());

        // No match
        assert!(!idx.query_match("api/domain=example.com/ts=1234").unwrap());
        assert!(!idx.query_match("web/domain=missing.com/ts=9999").unwrap());
    }

    #[test]
    fn suffix_wildcard() {
        let idx = index_of(&[
            "web/domain=example.com/ts=1234/file1.json",
            "web/domain=example.com/ts=1234/file2.json",
            "web/domain=example.com/ts=5678/file3.json",
        ]);

        // Query with suffix wildcard
        assert_eq!(hits(&idx, "web/domain=example.com/**").len(), 3);
        assert_eq!(hits(&idx, "web/domain=example.com/ts=1234/**").len(), 2);
    }
}

View File

@@ -0,0 +1,381 @@
use regex::Regex;
use tracing::warn;
//
// MARK: rule
//
#[derive(Debug)]
enum RegexSegment {
    /// One slash-delimited segment, already translated to regex syntax
    Single(String),

    /// A `**` segment, which may match nothing at all
    DoubleStar,
}

impl RegexSegment {
    /// Returns the regex fragment for this segment, given its neighbors.
    ///
    /// Slash handling depends on the neighbors:
    /// - a single segment followed by another single segment appends `[/]`;
    /// - a single segment that is last, or is followed by a doublestar,
    ///   appends nothing;
    /// - a doublestar emits the slashes on whichever sides touch a single
    ///   segment.
    ///
    /// # Panics
    /// Panics if a doublestar has a doublestar neighbor; the caller is
    /// expected to collapse consecutive doublestars first.
    fn to_regex_part(&self, prev: Option<&Self>, next: Option<&Self>) -> String {
        let body = match self {
            Self::Single(body) => body,
            Self::DoubleStar => return Self::doublestar_part(prev, next),
        };

        if matches!(next, Some(Self::Single(_))) {
            format!("{body}[/]")
        } else {
            body.to_owned()
        }
    }

    /// Regex fragment for a doublestar sitting between `prev` and `next`.
    fn doublestar_part(prev: Option<&Self>, next: Option<&Self>) -> String {
        // `true` when the neighbor is a single segment, `false` when absent.
        let touches_single = |neighbor: Option<&Self>| match neighbor {
            None => false,
            Some(Self::Single(_)) => true,
            Some(Self::DoubleStar) => {
                unreachable!("consecutive doublestars must be reduced")
            }
        };

        let fragment = match (touches_single(prev), touches_single(next)) {
            // No additional slashes
            (false, false) => "((?:.*)?)",
            // Leading slash
            (true, false) => "((?:[/].*)?)",
            // Trailing slash
            (false, true) => "((?:.*[/])?)",
            // Leading and trailing slash; collapses to a lone [/] when empty
            (true, true) => "((?:[/].*[/])|[/])",
        };

        fragment.to_owned()
    }
}
/// A glob-like path pattern that can be compiled to a [`Regex`].
///
/// Within a segment, `*` matches any run of characters other than `/`.
/// A `**` matches any number of whole segments, including none.
#[derive(Debug, Clone)]
pub struct Rule {
    /// The raw, uncompiled pattern string
    pub pattern: String,
}

impl Rule {
    /// Wrap a pattern string. No validation happens until [`Rule::regex`].
    pub fn new(pattern: impl Into<String>) -> Self {
        Self {
            pattern: pattern.into(),
        }
    }

    /// Turn this rule into a regex pattern.
    /// Returns `None` if this rule was invalid.
    ///
    /// A rule is invalid if it contains a run of three or more stars.
    /// Leading, trailing and repeated slashes are ignored (the first two
    /// with a warning), and consecutive `**` segments collapse into one.
    /// The resulting regex is anchored at both ends.
    pub fn regex(&self) -> Option<Regex> {
        // Matches every separator: a run of two or more stars, or a slash.
        // Compiled once rather than on every call.
        #[expect(clippy::unwrap_used)]
        static SEPARATOR: std::sync::LazyLock<Regex> =
            std::sync::LazyLock::new(|| Regex::new("[*]{2,}|[/]").unwrap());

        let pattern = self.pattern.as_str();

        if pattern.ends_with('/') {
            warn!("Pattern `{pattern}` has a trailing slash which will be ignored");
        }

        if pattern.starts_with('/') {
            warn!("Pattern `{pattern}` has a leading slash which will be ignored");
        }

        // Split on slashes or stars
        // This is a lot like .split("/"), but handles
        // the edge case where ** is not delimited by slashes
        // (`root**test` is equivalent to `root/**/test`)
        let mut segments: Vec<&str> = Vec::new();
        let mut cursor = 0;
        for separator in SEPARATOR.find_iter(pattern) {
            // Literal text between the previous separator and this one.
            // Empty text means consecutive separators, which is the same
            // as a single one.
            if separator.start() > cursor {
                segments.push(&pattern[cursor..separator.start()]);
            }

            // Star runs are segments in their own right; slashes are not.
            if separator.as_str() != "/" {
                segments.push(separator.as_str());
            }

            cursor = separator.end();
        }
        if cursor < pattern.len() {
            segments.push(&pattern[cursor..]);
        }

        let mut rebuilt_segments = Vec::new();
        let mut last_was_doublestar = false;
        for segment in segments {
            // This is a wildcard regex
            // (**, ***, etc)
            if segment.len() > 1 && segment.chars().all(|c| c == '*') {
                // Only exactly two stars are meaningful
                if segment != "**" {
                    return None;
                }

                // Consecutive doublestars are meaningless
                if !last_was_doublestar {
                    rebuilt_segments.push(RegexSegment::DoubleStar);
                }
                last_was_doublestar = true;
                continue;
            }
            last_was_doublestar = false;

            // Each single star becomes "anything but a slash";
            // everything else is matched literally.
            let mut rebuilt = String::new();
            for (i, part) in segment.split('*').enumerate() {
                if i != 0 {
                    rebuilt.push_str("([^/]*)");
                }
                rebuilt.push_str(&regex::escape(part));
            }

            rebuilt_segments.push(RegexSegment::Single(rebuilt));
        }

        let mut re_built = String::from("^");
        let mut prev = None;
        for (i, seg) in rebuilt_segments.iter().enumerate() {
            let next = rebuilt_segments.get(i + 1);
            re_built.push_str(&seg.to_regex_part(prev, next));
            prev = Some(seg);
        }
        re_built.push('$');

        // This regex should always be syntactically valid, but compilation
        // can still fail (e.g. on the compiled-size limit for a huge
        // pattern). Report that as an invalid rule instead of panicking.
        Regex::new(&re_built).ok()
    }
}
//
// MARK: tests
//
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod rule_tests {
    use super::*;

    /// Compile `pattern`, then assert that every path in `accepted` matches
    /// and every path in `rejected` does not. Failures name the offending
    /// pattern and path.
    fn check(pattern: &str, accepted: &[&str], rejected: &[&str]) {
        let regex = Rule::new(pattern).regex().unwrap();

        for path in accepted {
            assert!(regex.is_match(path), "`{pattern}` should match `{path}`");
        }

        for path in rejected {
            assert!(
                !regex.is_match(path),
                "`{pattern}` should not match `{path}`"
            );
        }
    }

    #[test]
    fn simple() {
        check("file.txt", &["file.txt"], &["other.txt", "path/file.txt"]);
    }

    #[test]
    fn simple_dir() {
        check(
            "dir/file.txt",
            &["dir/file.txt"],
            &["file.txt", "other/file.txt"],
        );
    }

    #[test]
    fn simple_star() {
        check(
            "*.txt",
            &["file.txt", "other.txt"],
            &["file.jpg", "nested/file.txt"],
        );
    }

    #[test]
    fn simple_doublestar() {
        check(
            "**/*.txt",
            &["file.txt", "dir/file.txt", "dir/subdir/file.txt"],
            &["file.jpg", "dir/file.jpg"],
        );
    }

    #[test]
    fn consecutive_doublestar() {
        check(
            "**/**/**/*.txt",
            &["file.txt", "dir/file.txt", "dir/subdir/file.txt"],
            &["file.jpg", "dir/file.jpg"],
        );
    }

    #[test]
    fn dual_star() {
        check(
            "**/*a*",
            &[
                "fileafile",
                "dir/fileafile",
                "filea",
                "dir/filea",
                "afile",
                "dir/afile",
            ],
            &["noletter", "dir/noletter"],
        );
    }

    #[test]
    fn single_end() {
        check("**/*", &["file", "dir/file", "a/b/c/dir/file"], &[]);
    }

    #[test]
    fn doublestar_end() {
        check("root/**", &["root/file"], &["dir/file"]);
    }

    #[test]
    fn doublestar_start() {
        check("**/dir", &["dir", "a/b/dir"], &["dir/file"]);
    }

    #[test]
    fn doublestar_adjacent_before() {
        check(
            "root/**test",
            &["root/test", "root/a/test", "root/a/b/c/test"],
            &["root/file", "root/xxtest"],
        );
    }

    #[test]
    fn doublestar_adjacent_after() {
        check(
            "root/test**",
            &["root/test", "root/test/a", "root/test/a/b/c"],
            &["root/testxx", "root/file"],
        );
    }

    #[test]
    fn doublestar_adjacent_middle() {
        check(
            "root/test**file",
            &["root/test/file", "root/test/a/b/c/file"],
            &[
                "root/test",
                "root/file",
                "root/testfile",
                "root/testxxfile",
            ],
        );
    }

    #[test]
    fn doublestar_nullable() {
        check(
            "root/**/file",
            &["root/test/file", "root/file"],
            &["rootfile"],
        );
    }

    #[test]
    fn doublestar_nullable_post() {
        check("root/**", &["root", "root/file"], &["rootfile"]);
    }

    #[test]
    fn doublestar_nullable_pre() {
        check("**/file", &["file", "root/file"], &["rootfile"]);
    }

    #[test]
    fn doublestar_bad_extension() {
        check(
            "**.flac",
            &["root/.flac", "root/a/.flac"],
            &[
                "root/test.flac",
                "test.flac",
                "root/test/a/b/c.flac",
                "root/testflac",
                "test.mp3",
            ],
        );
    }

    #[test]
    fn doublestar_good_extension() {
        check(
            "**/*.flac",
            &[
                "root/.flac",
                "root/a/.flac",
                "root/test.flac",
                "test.flac",
                "root/test/a/b/c.flac",
            ],
            &["root/testflac", "test.mp3"],
        );
    }

    #[test]
    fn multi_slash_a() {
        check(
            "dir//file.txt",
            &["dir/file.txt"],
            &["dirfile.txt", "dir/other.txt"],
        );
    }

    #[test]
    fn multi_slash_b() {
        check(
            "**///*.txt",
            &["dir/file.txt", "dir/subdir/file.txt"],
            &["file.jpg"],
        );
    }

    #[test]
    fn multi_slash_c() {
        check(
            "///dir//**//*.txt//",
            &["dir/subdir/file.txt", "dir/sub1/sub2/file.txt"],
            &["other/sub/file.txt", "dir/file.jpg"],
        );
    }
}

View File

@@ -7,6 +7,10 @@
#[cfg(test)]
use uuid as _;
// silence linter, used by fns in index.rs
#[cfg(feature = "tokio")]
use tokio as _;
mod datapath;
pub use datapath::*;
@@ -19,4 +23,10 @@ pub use schema::*;
mod wildcardable;
pub use wildcardable::*;
#[cfg(feature = "index")]
mod index;
#[cfg(feature = "index")]
pub use index::*;
pub use datapath_macro::datapath;