1
0
mirror of https://github.com/rm-dr/datapath.git synced 2026-05-16 00:08:59 -07:00

Wrap strings in Arc (greatly reduces memory usage)

This commit is contained in:
2026-02-20 16:57:40 -08:00
committed by Mark
parent efac8fd2d5
commit 37e55e5165
5 changed files with 152 additions and 14 deletions
Generated
+2 -2
View File
@@ -50,7 +50,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]] [[package]]
name = "datapath" name = "datapath"
version = "0.0.5" version = "0.0.6"
dependencies = [ dependencies = [
"datapath-macro", "datapath-macro",
"itertools", "itertools",
@@ -63,7 +63,7 @@ dependencies = [
[[package]] [[package]]
name = "datapath-macro" name = "datapath-macro"
version = "0.0.5" version = "0.0.6"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
+2 -2
View File
@@ -11,7 +11,7 @@ readme = "README.md"
authors = ["rm-dr"] authors = ["rm-dr"]
# Don't forget to bump datapath-macro below! # Don't forget to bump datapath-macro below!
version = "0.0.5" version = "0.0.6"
[workspace.lints.rust] [workspace.lints.rust]
unused_import_braces = "deny" unused_import_braces = "deny"
@@ -70,7 +70,7 @@ cargo_common_metadata = "deny"
# #
[workspace.dependencies] [workspace.dependencies]
datapath-macro = { path = "crates/datapath-macro", version = "0.0.5" } datapath-macro = { path = "crates/datapath-macro", version = "0.0.6" }
datapath = { path = "crates/datapath" } datapath = { path = "crates/datapath" }
chrono = "0.4.42" chrono = "0.4.42"
+127
View File
@@ -0,0 +1,127 @@
use std::sync::Arc;
//
// MARK: str
//
/// A borrowed view of a substring of an [`Arc<String>`].
///
/// The view covers the byte range `start..end` of `string`. It holds a
/// reference to the `Arc` instead of a clone of it, so building or copying a
/// view never touches the reference count.
#[derive(Clone, Copy)]
pub struct ArcSubstr<'a> {
    /// The shared string this view points into.
    pub string: &'a Arc<String>,

    /// Byte offset of the first byte of the view (inclusive).
    pub start: usize,

    /// Byte offset one past the last byte of the view (exclusive).
    pub end: usize,
}

impl<'a> ArcSubstr<'a> {
    /// Returns the text covered by this view.
    ///
    /// The returned slice borrows from the underlying [`Arc<String>`] for the
    /// whole lifetime `'a` rather than from this view, so it remains usable
    /// after the `ArcSubstr` value itself has gone out of scope.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid range of `string`, i.e. if
    /// `start > end`, `end > string.len()`, or either offset does not fall on
    /// a UTF-8 character boundary.
    pub fn as_str(&self) -> &'a str {
        // Take the full string with an explicit `'a` borrow first, so the
        // result is tied to the borrowed `Arc` and not to `&self`.
        let whole: &'a str = self.string.as_str();
        &whole[self.start..self.end]
    }

    /// Creates a view that covers all of `string`.
    pub fn from_string(string: &'a Arc<String>) -> Self {
        Self {
            string,
            start: 0,
            end: string.len(),
        }
    }
}
/// Views compare by the text they cover: the identity of the backing `Arc`
/// and the raw offsets play no part in equality.
impl PartialEq for ArcSubstr<'_> {
    fn eq(&self, rhs: &Self) -> bool {
        let (lhs_text, rhs_text) = (self.as_str(), rhs.as_str());
        lhs_text == rhs_text
    }
}

// String equality is a total equivalence relation, so `Eq` holds as well.
impl Eq for ArcSubstr<'_> {}
/// Hashes only the covered text, which keeps `Hash` consistent with the
/// text-based `PartialEq` implementation of this type.
impl std::hash::Hash for ArcSubstr<'_> {
    fn hash<H>(&self, hasher: &mut H)
    where
        H: std::hash::Hasher,
    {
        std::hash::Hash::hash(self.as_str(), hasher);
    }
}
/// Ordering is total, so `partial_cmp` always yields `Some`.
impl PartialOrd for ArcSubstr<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Defer to `Ord` so the two orderings can never disagree.
        Some(self.cmp(other))
    }
}

/// Views are ordered like the `str` slices they cover.
impl Ord for ArcSubstr<'_> {
    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
        Ord::cmp(self.as_str(), rhs.as_str())
    }
}
/// Formats exactly like the `Debug` output of the covered `str`; the
/// backing string and the offsets are not shown.
impl std::fmt::Debug for ArcSubstr<'_> {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self.as_str(), formatter)
    }
}
/// Formats exactly like the `Display` output of the covered `str`.
impl std::fmt::Display for ArcSubstr<'_> {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(self.as_str(), formatter)
    }
}
//
// MARK: string
//
/// An owned [`ArcSubstr`]: a byte range `start..end` of a shared
/// [`Arc<String>`].
///
/// Cloning is cheap. It clones the `Arc` handle and copies the two offsets;
/// the string data itself is never copied.
#[derive(Clone)]
pub struct ArcSubstring {
    /// The shared string this value points into.
    pub string: Arc<String>,

    /// Byte offset of the first byte of the substring (inclusive).
    pub start: usize,

    /// Byte offset one past the last byte of the substring (exclusive).
    pub end: usize,
}

impl ArcSubstring {
    /// Returns the text covered by this substring.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid range of `string`, i.e. if
    /// `start > end`, `end > string.len()`, or either offset does not fall on
    /// a UTF-8 character boundary.
    pub fn as_str(&self) -> &str {
        &self.string[self.start..self.end]
    }

    /// Creates a substring that covers all of `string`.
    pub fn from_string(string: Arc<String>) -> Self {
        // Read the length before `string` is moved into the struct.
        let end = string.len();
        Self {
            string,
            start: 0,
            end,
        }
    }
}
/// Substrings compare by the text they cover: the identity of the backing
/// `Arc` and the raw offsets play no part in equality.
impl PartialEq for ArcSubstring {
    fn eq(&self, rhs: &Self) -> bool {
        let (lhs_text, rhs_text) = (self.as_str(), rhs.as_str());
        lhs_text == rhs_text
    }
}

// String equality is a total equivalence relation, so `Eq` holds as well.
impl Eq for ArcSubstring {}
/// Hashes only the covered text, which keeps `Hash` consistent with the
/// text-based `PartialEq` implementation of this type.
impl std::hash::Hash for ArcSubstring {
    fn hash<H>(&self, hasher: &mut H)
    where
        H: std::hash::Hasher,
    {
        std::hash::Hash::hash(self.as_str(), hasher);
    }
}
/// Ordering is total, so `partial_cmp` always yields `Some`.
impl PartialOrd for ArcSubstring {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Defer to `Ord` so the two orderings can never disagree.
        Some(self.cmp(other))
    }
}

/// Substrings are ordered like the `str` slices they cover.
impl Ord for ArcSubstring {
    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
        Ord::cmp(self.as_str(), rhs.as_str())
    }
}
/// Formats exactly like the `Debug` output of the covered `str`; the
/// backing string and the offsets are not shown.
impl std::fmt::Debug for ArcSubstring {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self.as_str(), formatter)
    }
}
/// Formats exactly like the `Display` output of the covered `str`.
impl std::fmt::Display for ArcSubstring {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(self.as_str(), formatter)
    }
}
+18 -10
View File
@@ -1,5 +1,5 @@
use itertools::Itertools; use itertools::Itertools;
use std::{collections::HashMap, fmt::Display, str::FromStr}; use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc};
use tracing::trace; use tracing::trace;
use trie_rs::map::{Trie, TrieBuilder}; use trie_rs::map::{Trie, TrieBuilder};
@@ -54,7 +54,7 @@ impl FromStr for PathSegment {
/// An in-memory cache of s3 paths. /// An in-memory cache of s3 paths.
#[derive(Debug)] #[derive(Debug)]
pub struct DatapathIndex { pub struct DatapathIndex {
patterns: Trie<u8, Vec<String>>, patterns: Trie<u8, Vec<Arc<String>>>,
len: usize, len: usize,
} }
@@ -99,6 +99,8 @@ impl DatapathIndex {
for s in paths { for s in paths {
let s: String = s.into(); let s: String = s.into();
let s = Arc::new(s);
let mut segments = Vec::new(); let mut segments = Vec::new();
for seg in s.split('/') { for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) { segments.push(match PathSegment::from_str(&seg) {
@@ -136,6 +138,8 @@ impl DatapathIndex {
while let Some(s) = paths.recv().await { while let Some(s) = paths.recv().await {
let s: String = s.into(); let s: String = s.into();
let s = Arc::new(s);
let mut segments = Vec::new(); let mut segments = Vec::new();
for seg in s.split('/') { for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) { segments.push(match PathSegment::from_str(&seg) {
@@ -181,7 +185,10 @@ impl DatapathIndex {
/// ///
/// Returns an empty iterator if no paths match. /// Returns an empty iterator if no paths match.
/// Returns `None` if the query was invalid. /// Returns `None` if the query was invalid.
pub fn query(&self, query: impl Into<String>) -> Option<impl Iterator<Item = String> + '_> { pub fn query(
&self,
query: impl Into<String>,
) -> Option<impl Iterator<Item = &Arc<String>> + '_> {
let query: String = query.into(); let query: String = query.into();
let regex = rule::Rule::new(query.clone())?; let regex = rule::Rule::new(query.clone())?;
let key = Self::query_to_key(&query); let key = Self::query_to_key(&query);
@@ -191,13 +198,15 @@ impl DatapathIndex {
self.patterns self.patterns
.predictive_search::<String, _>(&key) .predictive_search::<String, _>(&key)
.flat_map(|(_, strings)| strings.iter()) .flat_map(|(_, strings)| strings.iter())
.filter(move |s| regex.is_match(s)) .filter(move |s| regex.is_match(s)),
.cloned(),
) )
} }
/// Like [Self::query], but with a precompiled rule /// Like [Self::query], but with a precompiled rule
pub fn query_rule<'a>(&'a self, rule: &'a rule::Rule) -> impl Iterator<Item = String> + 'a { pub fn query_rule<'a>(
&'a self,
rule: &'a rule::Rule,
) -> impl Iterator<Item = &'a Arc<String>> + 'a {
let key = Self::query_to_key(rule.pattern()); let key = Self::query_to_key(rule.pattern());
trace!("DatapathIndex key is {key}"); trace!("DatapathIndex key is {key}");
@@ -205,7 +214,6 @@ impl DatapathIndex {
.predictive_search::<String, _>(&key) .predictive_search::<String, _>(&key)
.flat_map(|(_, strings)| strings.iter()) .flat_map(|(_, strings)| strings.iter())
.filter(move |s| rule.is_match(s)) .filter(move |s| rule.is_match(s))
.cloned()
} }
/// Like [Self::query], but returns `true` if any paths match /// Like [Self::query], but returns `true` if any paths match
@@ -270,7 +278,7 @@ mod index_tests {
.unwrap() .unwrap()
.collect(); .collect();
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234"); assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// No match // No match
let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect(); let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect();
@@ -322,12 +330,12 @@ mod index_tests {
.unwrap() .unwrap()
.collect(); .collect();
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234"); assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// Wildcard time lookup // Wildcard time lookup
let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect(); let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect();
assert_eq!(results.len(), 1); assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234"); assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// Double wildcard lookup // Double wildcard lookup
let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect(); let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
+3
View File
@@ -23,6 +23,9 @@ pub use schema::*;
mod wildcardable; mod wildcardable;
pub use wildcardable::*; pub use wildcardable::*;
mod arcsubstr;
pub use arcsubstr::*;
#[cfg(feature = "index")] #[cfg(feature = "index")]
mod index; mod index;