Mirror of https://github.com/rm-dr/datapath.git (synced 2025-12-07 20:04:13 -08:00)
v0.0.3
Cargo.lock (generated): 186 lines changed
@@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 4

[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
 "memchr",
]

[[package]]
name = "bumpalo"
version = "3.19.0"
@@ -14,23 +23,68 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"

[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
 "crossbeam-epoch",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

[[package]]
name = "datapath"
version = "0.0.2"
version = "0.0.3"
dependencies = [
 "datapath-macro",
 "itertools",
 "regex",
 "tokio",
 "tracing",
 "trie-rs",
 "uuid",
]

[[package]]
name = "datapath-macro"
version = "0.0.2"
version = "0.0.3"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"

[[package]]
name = "fid-rs"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6956a1e60e2d1412b44b4169d44a03dae518f8583d3e10090c912c105e48447"
dependencies = [
 "rayon",
]

[[package]]
name = "getrandom"
version = "0.3.4"
@@ -43,6 +97,15 @@ dependencies = [
 "wasip2",
]

[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
 "either",
]

[[package]]
name = "js-sys"
version = "0.3.83"
@@ -59,12 +122,33 @@ version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"

[[package]]
name = "louds-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936de6c22f08e7135a921f8ada907acd0d88880c4f42b5591f634b9f1dd8e07f"
dependencies = [
 "fid-rs",
]

[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"

[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"

[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"

[[package]]
name = "proc-macro2"
version = "1.0.103"
@@ -89,6 +173,55 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"

[[package]]
name = "rayon"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
dependencies = [
 "either",
 "rayon-core",
]

[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
dependencies = [
 "crossbeam-deque",
 "crossbeam-utils",
]

[[package]]
name = "regex"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-automata",
 "regex-syntax",
]

[[package]]
name = "regex-automata"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"

[[package]]
name = "rustversion"
version = "1.0.22"
@@ -106,6 +239,55 @@ dependencies = [
 "unicode-ident",
]

[[package]]
name = "tokio"
version = "1.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
dependencies = [
 "pin-project-lite",
]

[[package]]
name = "tracing"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647"
dependencies = [
 "pin-project-lite",
 "tracing-attributes",
 "tracing-core",
]

[[package]]
name = "tracing-attributes"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tracing-core"
version = "0.1.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c"
dependencies = [
 "once_cell",
]

[[package]]
name = "trie-rs"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f88f4b0a1ebd6c3d16be3e45eb0e8089372ccadd88849b7ca162ba64b5e6f6"
dependencies = [
 "louds-rs",
]

[[package]]
name = "unicode-ident"
version = "1.0.22"
Cargo.toml: 10 lines changed
@@ -11,7 +11,7 @@ readme = "README.md"
authors = ["rm-dr"]

# Don't forget to bump datapath-macro below!
version = "0.0.2"
version = "0.0.3"

[workspace.lints.rust]
unused_import_braces = "deny"
@@ -70,12 +70,16 @@ cargo_common_metadata = "deny"
#

[workspace.dependencies]
datapath-macro = { path = "crates/datapath-macro", version = "0.0.2" }
datapath-macro = { path = "crates/datapath-macro", version = "0.0.3" }
datapath = { path = "crates/datapath" }

chrono = "0.4.42"
itertools = "0.14.0"
proc-macro2 = "1.0.103"
quote = "1.0.42"
regex = "1.12.2"
syn = "2.0.111"

tracing = "0.1"
trie-rs = "0.4.2"
uuid = "1.19.0"
tokio = { version = "1.48.0", features = ["sync"] }
@@ -549,7 +549,7 @@ fn generate_common_impls(
    }

    // Extract just the field names for struct construction
    let field_names = typed_fields.iter().map(|(name, _)| name);
    let field_names: Vec<_> = typed_fields.iter().map(|(name, _)| name).collect();

    let datapath_impl = quote! {
        impl ::datapath::Datapath for #struct_name {
@@ -600,6 +600,13 @@ fn generate_common_impls(
                    file,
                })
            }

            fn field(&self, name: &str) -> Option<::std::string::String> {
                match name {
                    #(stringify!(#field_names) => Some(self.#field_names.to_string()),)*
                    _ => None,
                }
            }
        }
    };
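The new method added to the quote! block above gives every generated impl a string-keyed field accessor. As a rough sketch of what it expands to, assume a hypothetical struct processed by the macro whose typed fields are `domain` and `ts` (names chosen purely for illustration); the generated method would look roughly like this:

fn field(&self, name: &str) -> Option<::std::string::String> {
    match name {
        "domain" => Some(self.domain.to_string()),
        "ts" => Some(self.ts.to_string()),
        _ => None,
    }
}

Each field name is stringified at macro-expansion time, so the lookup is a plain match on string literals rather than any runtime reflection.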
@@ -17,5 +17,16 @@ workspace = true
[dependencies]
datapath-macro = { workspace = true }

regex = { workspace = true, optional = true }
tracing = { workspace = true, optional = true }
trie-rs = { workspace = true, optional = true }
itertools = { workspace = true, optional = true }
tokio = { workspace = true, optional = true }

[dev-dependencies]
uuid = { version = "1", features = ["v4"] }

[features]
default = []
index = ["dep:regex", "dep:trie-rs", "dep:tracing", "dep:itertools"]
tokio = ["dep:tokio"]
@@ -33,4 +33,8 @@ where
    /// Parse a string as this datapath with a (possibly empty-string)
    /// file, returning `None` if this string is invalid.
    fn parse(path: &str) -> Option<DatapathFile<Self>>;

    /// Get the string value of the field with the given name,
    /// if it exists.
    fn field(&self, name: &str) -> Option<String>;
}
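The trait now exposes `field` alongside `parse`. A minimal usage sketch, assuming some concrete type `P` that implements `Datapath` and a hypothetical field named `domain` (the helper below is illustrative and not part of the crate):

use datapath::Datapath;

fn describe<P: Datapath>(p: &P) -> String {
    // Fall back to a placeholder when the type has no field with this name.
    p.field("domain").unwrap_or_else(|| "<no domain field>".to_string())
}

Because `field` returns `Option<String>`, callers can probe for a field by name without matching on the concrete struct's fields themselves.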
crates/datapath/src/index/mod.rs (new file): 395 lines
@@ -0,0 +1,395 @@
use itertools::Itertools;
use std::{collections::HashMap, fmt::Display, str::FromStr};
use trie_rs::map::{Trie, TrieBuilder};

mod rule;

/// A path segment in an [`AnyDatapath`]
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
enum PathSegment {
    /// A constant value, like `web`
    Constant(String),

    /// A key=value partition, like `domain=gouletpens.com`
    Value { key: String, value: String },
}

impl Display for PathSegment {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            PathSegment::Constant(x) => write!(f, "{x}"),
            PathSegment::Value { key, value } => write!(f, "{key}={value}"),
        }
    }
}

impl FromStr for PathSegment {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.contains("\n") {
            return Err(());
        }

        if s.is_empty() {
            return Err(());
        }

        return Ok(if s.contains("=") {
            let mut s = s.split("=");
            let key = s.next().ok_or(())?.to_owned();
            let value = s.join("=");
            Self::Value { key, value }
        } else {
            Self::Constant(s.to_owned())
        });
    }
}

//
// MARK: index
//

/// An in-memory cache of s3 paths.
#[derive(Debug)]
pub struct DatapathIndex {
    patterns: Trie<u8, Vec<String>>,
    len: usize,
}

impl DatapathIndex {
    /// Convert a query string to a trie search key by normalizing values to `*`.
    /// Stops at the first wildcard constant since it can't be used for prefix matching.
    fn query_to_key(query: &str) -> String {
        let trimmed = query.trim().trim_end_matches("**").trim_matches('/');
        let mut segments = Vec::new();
        for seg in trimmed.split('/') {
            let segment = match PathSegment::from_str(&seg) {
                Ok(x) => x,
                Err(_) => continue,
            };

            // Stop at wildcard constants - can't use for trie prefix search
            if matches!(segment, PathSegment::Constant(ref s) if s == "*") {
                break;
            }

            segments.push(segment);
        }

        segments.iter_mut().for_each(|x| match x {
            PathSegment::Constant(_) => {}
            PathSegment::Value { value, .. } => *value = "*".into(),
        });

        segments.iter().join("/")
    }

    pub fn new_empty() -> Self {
        Self {
            patterns: TrieBuilder::new().build(),
            len: 0,
        }
    }

    pub fn new<S: Into<String>, I: Iterator<Item = S>>(paths: I) -> Self {
        let mut len = 0;
        let mut patterns = HashMap::new();

        for s in paths {
            let s: String = s.into();
            let mut segments = Vec::new();
            for seg in s.split('/') {
                segments.push(match PathSegment::from_str(&seg) {
                    Ok(x) => x,
                    Err(_) => continue,
                });
            }

            segments.iter_mut().for_each(|x| match x {
                PathSegment::Constant(_) => {}
                PathSegment::Value { value, .. } => *value = "*".into(),
            });

            let pattern = segments.iter().join("/");

            patterns.entry(pattern).or_insert(Vec::new()).push(s);
            len += 1;
        }

        let mut builder = TrieBuilder::new();
        for (k, v) in patterns {
            builder.push(k, v);
        }

        Self {
            len,
            patterns: builder.build(),
        }
    }

    #[cfg(feature = "tokio")]
    pub async fn async_new<S: Into<String>>(mut paths: tokio::sync::mpsc::Receiver<S>) -> Self {
        let mut len = 0;
        let mut patterns = HashMap::new();

        while let Some(s) = paths.recv().await {
            let s: String = s.into();
            let mut segments = Vec::new();
            for seg in s.split('/') {
                segments.push(match PathSegment::from_str(&seg) {
                    Ok(x) => x,
                    Err(_) => continue,
                });
            }

            segments.iter_mut().for_each(|x| match x {
                PathSegment::Constant(_) => {}
                PathSegment::Value { value, .. } => *value = "*".into(),
            });

            let pattern = segments.iter().join("/");

            patterns.entry(pattern).or_insert(Vec::new()).push(s);
            len += 1;
        }

        let mut builder = TrieBuilder::new();
        for (k, v) in patterns {
            builder.push(k, v);
        }

        Self {
            len,
            patterns: builder.build(),
        }
    }

    #[inline(always)]
    pub fn len(&self) -> usize {
        self.len
    }

    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Given a datapath (that may contain wildcards) as a query,
    /// return all known datapaths that match it.
    ///
    /// Returns an empty iterator if no paths match.
    /// Returns `None` if the query was invalid.
    pub fn query(&self, query: impl Into<String>) -> Option<impl Iterator<Item = String> + '_> {
        let query: String = query.into();
        let regex = rule::Rule::new(query.clone()).regex()?;
        let key = Self::query_to_key(&query);

        Some(
            self.patterns
                .predictive_search::<String, _>(&key)
                .flat_map(|(_, strings)| strings.iter())
                .filter(move |s| regex.is_match(s))
                .cloned(),
        )
    }

    pub fn query_match(&self, query: impl Into<String>) -> Option<bool> {
        let query: String = query.into();
        let regex = rule::Rule::new(query.clone()).regex()?;
        let key = Self::query_to_key(&query);

        for (_, strings) in self.patterns.predictive_search::<String, _>(&key) {
            for s in strings {
                if regex.is_match(s) {
                    return Some(true);
                }
            }
        }

        return Some(false);
    }
}

// MARK: index tests

#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod index_tests {
    use super::*;

    #[test]
    fn datapath_index_empty() {
        let idx = DatapathIndex::new(std::iter::empty::<String>());
        let query = "web/domain=example.com";
        assert_eq!(idx.query(query).unwrap().count(), 0);
        assert!(idx.is_empty());
        assert_eq!(idx.len(), 0);
    }

    #[test]
    fn insert_and_lookup_exact_match() {
        let paths = vec!["web/domain=example.com/ts=1234"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Exact match
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // No match
        let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 0);

        assert_eq!(idx.len(), 1);
    }

    #[test]
    fn wildcard_constant_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "api/domain=example.com/ts=1234",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Wildcard first segment
        let results: Vec<_> = idx.query("*/domain=example.com/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 2);

        assert_eq!(idx.len(), 2);
    }

    #[test]
    fn wildcard_value_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Wildcard domain
        let results: Vec<_> = idx.query("web/domain=*/ts=1234").unwrap().collect();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn multiple_datapaths() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=1234",
            "api/domain=example.com/ts=5678",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Specific lookup
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Wildcard time lookup
        let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], "web/domain=example.com/ts=1234");

        // Double wildcard lookup
        let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
        assert_eq!(results.len(), 2);

        assert_eq!(idx.len(), 3);
    }

    #[test]
    fn nested_wildcards() {
        let paths = vec![
            "web/domain=example.com/ts=1234/crawl/2.5",
            "web/domain=other.com/ts=5678/crawl/2.5",
            "web/domain=example.com/ts=9999/crawl/3.0",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Multiple wildcards in path
        let results: Vec<_> = idx.query("web/domain=*/ts=*/crawl/*").unwrap().collect();
        assert_eq!(results.len(), 3);

        // Selective wildcards
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=*/crawl/*")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn partial_path_query() {
        let paths = vec!["web/domain=example.com/ts=1234/crawl/2.5"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with fewer segments than the stored path
        let results: Vec<_> = idx.query("web/domain=example.com").unwrap().collect();
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn longer_path_query() {
        let paths = vec!["web/domain=example.com"];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with more segments than the stored path
        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234/crawl/2.5")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn query_match() {
        let paths = vec![
            "web/domain=example.com/ts=1234",
            "web/domain=other.com/ts=5678",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Match exists
        assert_eq!(
            idx.query_match("web/domain=example.com/ts=1234").unwrap(),
            true
        );
        assert_eq!(idx.query_match("web/domain=*/ts=*").unwrap(), true);

        // No match
        assert_eq!(
            idx.query_match("api/domain=example.com/ts=1234").unwrap(),
            false
        );
        assert_eq!(
            idx.query_match("web/domain=missing.com/ts=9999").unwrap(),
            false
        );
    }

    #[test]
    fn suffix_wildcard() {
        let paths = vec![
            "web/domain=example.com/ts=1234/file1.json",
            "web/domain=example.com/ts=1234/file2.json",
            "web/domain=example.com/ts=5678/file3.json",
        ];
        let idx = DatapathIndex::new(paths.into_iter());

        // Query with suffix wildcard
        let results: Vec<_> = idx.query("web/domain=example.com/**").unwrap().collect();
        assert_eq!(results.len(), 3);

        let results: Vec<_> = idx
            .query("web/domain=example.com/ts=1234/**")
            .unwrap()
            .collect();
        assert_eq!(results.len(), 2);
    }
}
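A usage sketch of the new index, mirroring the tests above; it assumes the crate's `index` feature is enabled so that `DatapathIndex` is re-exported at the crate root, and the paths are illustrative:

use datapath::DatapathIndex;

fn main() {
    // Build an index from an iterator of known paths.
    let idx = DatapathIndex::new(
        ["web/domain=example.com/ts=1234", "web/domain=other.com/ts=5678"].into_iter(),
    );
    assert_eq!(idx.len(), 2);

    // A `*` value matches any key=value partition in that position.
    let hits: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
    assert_eq!(hits.len(), 2);

    // `query_match` stops at the first matching path.
    assert_eq!(idx.query_match("api/**").unwrap(), false);
}

Internally, `query_to_key` rewrites every `key=value` segment of the query to `key=*`, so the trie's predictive search narrows candidates by pattern shape before the per-path regex filter runs.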
crates/datapath/src/index/rule.rs (new file): 381 lines
@@ -0,0 +1,381 @@
use regex::Regex;
use tracing::warn;

//
// MARK: rule
//

#[derive(Debug)]
enum RegexSegment {
    /// A single segment
    Single(String),

    /// An optional doublestar segment
    DoubleStar,
}

impl RegexSegment {
    /// Returns the regex pattern of this part,
    /// prefixed with a /.
    fn to_regex_part(&self, prev: Option<&Self>, next: Option<&Self>) -> String {
        match (prev, self, next) {
            // Consecutive single segments need a trailing slash
            (_, Self::Single(x), Some(Self::Single(_))) => format!("{x}[/]"),

            // Terminal single segments don't need a trailing slash
            (_, Self::Single(x), None) => x.to_owned(),

            // Neighboring doublestar is always responsible for slashes
            (_, Self::Single(x), Some(Self::DoubleStar)) => x.to_owned(),

            // No additional slashes
            (None, Self::DoubleStar, None) => "((?:.*)?)".into(),

            // Leading slash
            (Some(Self::Single(_)), Self::DoubleStar, None) => "((?:[/].*)?)".into(),

            // Trailing slash
            (None, Self::DoubleStar, Some(Self::Single(_))) => "((?:.*[/])?)".into(),

            // Leading and trailing slash.
            // Also, replace self with a [/] when empty.
            (Some(Self::Single(_)), Self::DoubleStar, Some(Self::Single(_))) => {
                "((?:[/].*[/])|[/])".into()
            }

            // Doublestars cannot be neighbors
            (_, Self::DoubleStar, Some(Self::DoubleStar))
            | (Some(Self::DoubleStar), Self::DoubleStar, _) => {
                unreachable!("consecutive doublestars must be reduced")
            }
        }
    }
}

#[derive(Debug, Clone)]
pub struct Rule {
    pub pattern: String,
}

impl Rule {
    pub fn new(pattern: impl Into<String>) -> Self {
        Self {
            pattern: pattern.into(),
        }
    }

    /// Turn this rule into a regex pattern.
    /// Returns `None` if this rule was invalid.
    pub fn regex(&self) -> Option<Regex> {
        let pattern = &self.pattern;

        if pattern.ends_with("/") {
            warn!("Pattern `{pattern}` has a trailing slash which will be ignored")
        }

        if pattern.starts_with("/") {
            warn!("Pattern `{pattern}` has a leading slash which will be ignored")
        }

        // Split on slashes or stars
        // This is a lot like .split("/"), but handles
        // the edge case where ** is not delimited by slashes
        // (`root**test` is equivalent to `root/**/test`)
        let segments = {
            #[expect(clippy::unwrap_used)]
            let re = Regex::new("[*]{2,}|[/]").unwrap();
            let split = re.find_iter(pattern);

            let bounds = split
                .into_iter()
                .flat_map(|x| {
                    let r = x.range();
                    let a = r.start;
                    let b = r.end;
                    [a, b]
                })
                .chain([pattern.len()])
                .collect::<Vec<_>>();

            let mut parts = Vec::new();
            let mut last = 0;
            for next in bounds {
                let seg = &pattern[last..next];
                // Consecutive slashes are identical to a single slash
                if seg != "/" && !seg.is_empty() {
                    parts.push(seg);
                }
                last = next;
            }

            parts
        };

        let mut rebuilt_segments = Vec::new();
        let mut last_was_doublestar = false;
        for segment in segments {
            // This is a wildcard regex
            // (**, ***, etc)
            if segment.len() > 1 && segment.chars().all(|x| x == '*') {
                match segment {
                    "**" => {
                        // Consecutive doublestars are meaningless
                        if !last_was_doublestar {
                            rebuilt_segments.push(RegexSegment::DoubleStar);
                        }
                        last_was_doublestar = true;
                    }
                    _ => return None,
                }
                continue;
            }
            last_was_doublestar = false;

            let parts = segment.split("*").collect::<Vec<_>>();

            let mut rebuilt = String::new();
            for (i, part) in parts.into_iter().enumerate() {
                if i != 0 {
                    rebuilt.push_str("([^/]*)")
                }

                rebuilt.push_str(&regex::escape(part));
            }

            rebuilt_segments.push(RegexSegment::Single(rebuilt));
        }

        let mut re_built = String::new();
        let mut prev = None;
        for (i, seg) in rebuilt_segments.iter().enumerate() {
            let next = rebuilt_segments.get(i + 1);
            re_built.push_str(&seg.to_regex_part(prev, next));
            prev = Some(seg);
        }

        let re_built = format!("^{re_built}$");
        // This regex should always be valid
        #[expect(clippy::unwrap_used)]
        Some(Regex::new(&re_built).unwrap())
    }
}

//
// MARK: tests
//

#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod rule_tests {
    use super::*;

    fn rule_regex(pattern: &str) -> Regex {
        let rule = Rule::new(pattern);
        return rule.regex().unwrap();
    }

    #[test]
    fn simple() {
        let regex = rule_regex("file.txt");

        assert!(regex.is_match("file.txt"));
        assert!(!regex.is_match("other.txt"));
        assert!(!regex.is_match("path/file.txt"));
    }

    #[test]
    fn simple_dir() {
        let regex = rule_regex("dir/file.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(!regex.is_match("file.txt"));
        assert!(!regex.is_match("other/file.txt"));
    }

    #[test]
    fn simple_star() {
        let regex = rule_regex("*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("other.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("nested/file.txt"));
    }

    #[test]
    fn simple_doublestar() {
        let regex = rule_regex("**/*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("dir/file.jpg"));
    }

    #[test]
    fn consecutive_doublestar() {
        let regex = rule_regex("**/**/**/*.txt");

        assert!(regex.is_match("file.txt"));
        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
        assert!(!regex.is_match("dir/file.jpg"));
    }

    #[test]
    fn dual_star() {
        let regex = rule_regex("**/*a*");

        assert!(regex.is_match("fileafile"));
        assert!(regex.is_match("dir/fileafile"));
        assert!(regex.is_match("filea"));
        assert!(regex.is_match("dir/filea"));
        assert!(regex.is_match("afile"));
        assert!(regex.is_match("dir/afile"));
        assert!(!regex.is_match("noletter"));
        assert!(!regex.is_match("dir/noletter"));
    }

    #[test]
    fn single_end() {
        let regex = rule_regex("**/*");

        assert!(regex.is_match("file"));
        assert!(regex.is_match("dir/file"));
        assert!(regex.is_match("a/b/c/dir/file"));
    }

    #[test]
    fn doublestar_end() {
        let regex = rule_regex("root/**");

        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("dir/file"));
    }

    #[test]
    fn doublestar_start() {
        let regex = rule_regex("**/dir");

        assert!(regex.is_match("dir"));
        assert!(regex.is_match("a/b/dir"));
        assert!(!regex.is_match("dir/file"));
    }

    #[test]
    fn doublestar_adjacent_before() {
        let regex = rule_regex("root/**test");

        assert!(regex.is_match("root/test"));
        assert!(regex.is_match("root/a/test"));
        assert!(regex.is_match("root/a/b/c/test"));
        assert!(!regex.is_match("root/file"));
        assert!(!regex.is_match("root/xxtest"));
    }

    #[test]
    fn doublestar_adjacent_after() {
        let regex = rule_regex("root/test**");

        assert!(regex.is_match("root/test"));
        assert!(regex.is_match("root/test/a"));
        assert!(regex.is_match("root/test/a/b/c"));
        assert!(!regex.is_match("root/testxx"));
        assert!(!regex.is_match("root/file"));
    }

    #[test]
    fn doublestar_adjacent_middle() {
        let regex = rule_regex("root/test**file");

        assert!(regex.is_match("root/test/file"));
        assert!(regex.is_match("root/test/a/b/c/file"));
        assert!(!regex.is_match("root/test"));
        assert!(!regex.is_match("root/file"));
        assert!(!regex.is_match("root/testfile"));
        assert!(!regex.is_match("root/testxxfile"));
    }

    #[test]
    fn doublestar_nullable() {
        let regex = rule_regex("root/**/file");

        assert!(regex.is_match("root/test/file"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_nullable_post() {
        let regex = rule_regex("root/**");

        assert!(regex.is_match("root"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_nullable_pre() {
        let regex = rule_regex("**/file");

        assert!(regex.is_match("file"));
        assert!(regex.is_match("root/file"));
        assert!(!regex.is_match("rootfile"));
    }

    #[test]
    fn doublestar_bad_extension() {
        let regex = rule_regex("**.flac");

        assert!(regex.is_match("root/.flac"));
        assert!(regex.is_match("root/a/.flac"));
        assert!(!regex.is_match("root/test.flac"));
        assert!(!regex.is_match("test.flac"));
        assert!(!regex.is_match("root/test/a/b/c.flac"));
        assert!(!regex.is_match("root/testflac"));
        assert!(!regex.is_match("test.mp3"));
    }

    #[test]
    fn doublestar_good_extension() {
        let regex = rule_regex("**/*.flac");

        assert!(regex.is_match("root/.flac"));
        assert!(regex.is_match("root/a/.flac"));
        assert!(regex.is_match("root/test.flac"));
        assert!(regex.is_match("test.flac"));
        assert!(regex.is_match("root/test/a/b/c.flac"));
        assert!(!regex.is_match("root/testflac"));
        assert!(!regex.is_match("test.mp3"));
    }

    #[test]
    fn multi_slash_a() {
        let regex = rule_regex("dir//file.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(!regex.is_match("dirfile.txt"));
        assert!(!regex.is_match("dir/other.txt"));
    }

    #[test]
    fn multi_slash_b() {
        let regex = rule_regex("**///*.txt");

        assert!(regex.is_match("dir/file.txt"));
        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(!regex.is_match("file.jpg"));
    }

    #[test]
    fn multi_slash_c() {
        let regex = rule_regex("///dir//**//*.txt//");

        assert!(regex.is_match("dir/subdir/file.txt"));
        assert!(regex.is_match("dir/sub1/sub2/file.txt"));
        assert!(!regex.is_match("other/sub/file.txt"));
        assert!(!regex.is_match("dir/file.jpg"));
    }
}
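`Rule` stays internal to the index module (`mod rule` is not re-exported), so the sketch below mirrors how `DatapathIndex` itself uses it; the pattern and paths are illustrative:

// A query pattern is compiled once, then used to filter candidate paths.
let re = Rule::new("web/domain=*/**").regex().unwrap();
assert!(re.is_match("web/domain=example.com"));
assert!(re.is_match("web/domain=example.com/ts=1234/file.json"));
assert!(!re.is_match("api/domain=example.com"));

For this pattern the builder emits roughly `^web[/]domain=([^/]*)((?:[/].*)?)$`: a single `*` becomes `([^/]*)` and stays within one segment, while a trailing `**` becomes an optional group that may consume any number of further segments.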
@@ -7,6 +7,10 @@
#[cfg(test)]
use uuid as _;

// silence linter, used by fns in index.rs
#[cfg(feature = "tokio")]
use tokio as _;

mod datapath;
pub use datapath::*;

@@ -19,4 +23,10 @@ pub use schema::*;
mod wildcardable;
pub use wildcardable::*;

#[cfg(feature = "index")]
mod index;

#[cfg(feature = "index")]
pub use index::*;

pub use datapath_macro::datapath;
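A sketch of the feature-gated pieces working together, assuming a downstream crate that enables the `index` and `tokio` features of datapath and has its own tokio dependency (everything below is illustrative, not part of this commit):

use datapath::DatapathIndex;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // Feed paths through a channel; `async_new` drains it and builds the index.
    let (tx, rx) = tokio::sync::mpsc::channel::<String>(16);
    tx.send("web/domain=example.com/ts=1234".to_string()).await.unwrap();
    drop(tx); // close the channel so `async_new` can finish
    let idx = DatapathIndex::async_new(rx).await;
    assert_eq!(idx.len(), 1);
}

Note that datapath itself pulls in tokio only with the `sync` feature; the `#[tokio::main]` attribute requires the downstream crate's own tokio dependency with its `macros` and `rt` features.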