From 37e55e5165937a2ecb3d2adf12102710491fd154 Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:57:40 -0800 Subject: [PATCH] Wrap strings in Arc (greatly reduces memory usage) --- Cargo.lock | 4 +- Cargo.toml | 4 +- crates/datapath/src/arcsubstr.rs | 127 +++++++++++++++++++++++++++++++ crates/datapath/src/index/mod.rs | 28 ++++--- crates/datapath/src/lib.rs | 3 + 5 files changed, 152 insertions(+), 14 deletions(-) create mode 100644 crates/datapath/src/arcsubstr.rs diff --git a/Cargo.lock b/Cargo.lock index c7079d8..2264090 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -50,7 +50,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "datapath" -version = "0.0.5" +version = "0.0.6" dependencies = [ "datapath-macro", "itertools", @@ -63,7 +63,7 @@ dependencies = [ [[package]] name = "datapath-macro" -version = "0.0.5" +version = "0.0.6" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index e9be832..779eb8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ readme = "README.md" authors = ["rm-dr"] # Don't forget to bump datapath-macro below! -version = "0.0.5" +version = "0.0.6" [workspace.lints.rust] unused_import_braces = "deny" @@ -70,7 +70,7 @@ cargo_common_metadata = "deny" # [workspace.dependencies] -datapath-macro = { path = "crates/datapath-macro", version = "0.0.5" } +datapath-macro = { path = "crates/datapath-macro", version = "0.0.6" } datapath = { path = "crates/datapath" } chrono = "0.4.42" diff --git a/crates/datapath/src/arcsubstr.rs b/crates/datapath/src/arcsubstr.rs new file mode 100644 index 0000000..5266d6b --- /dev/null +++ b/crates/datapath/src/arcsubstr.rs @@ -0,0 +1,127 @@ +use std::sync::Arc; + +// +// MARK: str +// + +/// A reference to a substring of an [Arc] +pub struct ArcSubstr<'a> { + pub string: &'a Arc, + pub start: usize, + pub end: usize, +} + +impl<'a> ArcSubstr<'a> { + pub fn as_str(&self) -> &str { + &self.string[self.start..self.end] + } + + pub fn from_string(string: &'a Arc) -> Self { + Self { + start: 0, + end: string.len(), + string, + } + } +} + +impl PartialEq for ArcSubstr<'_> { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for ArcSubstr<'_> {} + +impl std::hash::Hash for ArcSubstr<'_> { + fn hash(&self, state: &mut H) { + self.as_str().hash(state); + } +} + +impl PartialOrd for ArcSubstr<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ArcSubstr<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.as_str().cmp(other.as_str()) + } +} + +impl std::fmt::Debug for ArcSubstr<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl std::fmt::Display for ArcSubstr<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +// +// MARK: string +// + +/// An owned [ArcSubstr] +pub struct ArcSubstring { + pub string: Arc, + pub start: usize, + pub end: usize, +} + +impl ArcSubstring { + pub fn as_str(&self) -> &str { + &self.string[self.start..self.end] + } + + pub fn from_string(string: Arc) -> Self { + Self { + start: 0, + end: string.len(), + string, + } + } +} + +impl PartialEq for ArcSubstring { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for ArcSubstring {} + +impl std::hash::Hash for ArcSubstring { + fn hash(&self, state: &mut H) { + self.as_str().hash(state); + } +} + +impl PartialOrd for ArcSubstring { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ArcSubstring { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.as_str().cmp(other.as_str()) + } +} + +impl std::fmt::Debug for ArcSubstring { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl std::fmt::Display for ArcSubstring { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} diff --git a/crates/datapath/src/index/mod.rs b/crates/datapath/src/index/mod.rs index cf3b660..d6ae56d 100644 --- a/crates/datapath/src/index/mod.rs +++ b/crates/datapath/src/index/mod.rs @@ -1,5 +1,5 @@ use itertools::Itertools; -use std::{collections::HashMap, fmt::Display, str::FromStr}; +use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc}; use tracing::trace; use trie_rs::map::{Trie, TrieBuilder}; @@ -54,7 +54,7 @@ impl FromStr for PathSegment { /// An in-memory cache of s3 paths. #[derive(Debug)] pub struct DatapathIndex { - patterns: Trie>, + patterns: Trie>>, len: usize, } @@ -99,6 +99,8 @@ impl DatapathIndex { for s in paths { let s: String = s.into(); + let s = Arc::new(s); + let mut segments = Vec::new(); for seg in s.split('/') { segments.push(match PathSegment::from_str(&seg) { @@ -136,6 +138,8 @@ impl DatapathIndex { while let Some(s) = paths.recv().await { let s: String = s.into(); + let s = Arc::new(s); + let mut segments = Vec::new(); for seg in s.split('/') { segments.push(match PathSegment::from_str(&seg) { @@ -181,7 +185,10 @@ impl DatapathIndex { /// /// Returns an empty iterator if no paths match. /// Returns `None` if the query was invalid. - pub fn query(&self, query: impl Into) -> Option + '_> { + pub fn query( + &self, + query: impl Into, + ) -> Option> + '_> { let query: String = query.into(); let regex = rule::Rule::new(query.clone())?; let key = Self::query_to_key(&query); @@ -191,13 +198,15 @@ impl DatapathIndex { self.patterns .predictive_search::(&key) .flat_map(|(_, strings)| strings.iter()) - .filter(move |s| regex.is_match(s)) - .cloned(), + .filter(move |s| regex.is_match(s)), ) } /// Like [Self::query], but with a precompiled rule - pub fn query_rule<'a>(&'a self, rule: &'a rule::Rule) -> impl Iterator + 'a { + pub fn query_rule<'a>( + &'a self, + rule: &'a rule::Rule, + ) -> impl Iterator> + 'a { let key = Self::query_to_key(rule.pattern()); trace!("DatapathIndex key is {key}"); @@ -205,7 +214,6 @@ impl DatapathIndex { .predictive_search::(&key) .flat_map(|(_, strings)| strings.iter()) .filter(move |s| rule.is_match(s)) - .cloned() } /// Like [Self::query], but returns `true` if any paths match @@ -270,7 +278,7 @@ mod index_tests { .unwrap() .collect(); assert_eq!(results.len(), 1); - assert_eq!(results[0], "web/domain=example.com/ts=1234"); + assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234"); // No match let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect(); @@ -322,12 +330,12 @@ mod index_tests { .unwrap() .collect(); assert_eq!(results.len(), 1); - assert_eq!(results[0], "web/domain=example.com/ts=1234"); + assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234"); // Wildcard time lookup let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect(); assert_eq!(results.len(), 1); - assert_eq!(results[0], "web/domain=example.com/ts=1234"); + assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234"); // Double wildcard lookup let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect(); diff --git a/crates/datapath/src/lib.rs b/crates/datapath/src/lib.rs index 9e19902..ade3bed 100644 --- a/crates/datapath/src/lib.rs +++ b/crates/datapath/src/lib.rs @@ -23,6 +23,9 @@ pub use schema::*; mod wildcardable; pub use wildcardable::*; +mod arcsubstr; +pub use arcsubstr::*; + #[cfg(feature = "index")] mod index;