1
0
mirror of https://github.com/rm-dr/datapath.git synced 2026-05-16 00:08:59 -07:00

Wrap strings in Arc (greatly reduces memory usage)

This commit is contained in:
2026-02-20 16:57:40 -08:00
committed by Mark
parent efac8fd2d5
commit 37e55e5165
5 changed files with 152 additions and 14 deletions
Generated
+2 -2
View File
@@ -50,7 +50,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "datapath"
version = "0.0.5"
version = "0.0.6"
dependencies = [
"datapath-macro",
"itertools",
@@ -63,7 +63,7 @@ dependencies = [
[[package]]
name = "datapath-macro"
version = "0.0.5"
version = "0.0.6"
dependencies = [
"proc-macro2",
"quote",
+2 -2
View File
@@ -11,7 +11,7 @@ readme = "README.md"
authors = ["rm-dr"]
# Don't forget to bump datapath-macro below!
version = "0.0.5"
version = "0.0.6"
[workspace.lints.rust]
unused_import_braces = "deny"
@@ -70,7 +70,7 @@ cargo_common_metadata = "deny"
#
[workspace.dependencies]
datapath-macro = { path = "crates/datapath-macro", version = "0.0.5" }
datapath-macro = { path = "crates/datapath-macro", version = "0.0.6" }
datapath = { path = "crates/datapath" }
chrono = "0.4.42"
+127
View File
@@ -0,0 +1,127 @@
use std::sync::Arc;
//
// MARK: str
//
/// A borrowed view of a substring of an [`Arc<String>`].
///
/// The view covers the byte range `start..end` of `string`. Equality,
/// ordering, hashing and formatting all operate on the viewed text only, so
/// two views over different backing strings are equal when their text is.
///
/// The fields are public and are not validated on construction; see
/// [`ArcSubstr::as_str`] for when an invalid range panics.
///
/// The type only holds a shared reference and two offsets, so it is `Copy`.
#[derive(Clone, Copy)]
pub struct ArcSubstr<'a> {
    /// The backing string this view borrows from.
    pub string: &'a Arc<String>,
    /// Byte offset of the start of the view (inclusive).
    pub start: usize,
    /// Byte offset of the end of the view (exclusive).
    pub end: usize,
}

impl<'a> ArcSubstr<'a> {
    /// Returns the viewed text.
    ///
    /// The returned slice borrows from the backing string (lifetime `'a`),
    /// not from this view, so it may outlive the `ArcSubstr` value itself.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid range of the backing string:
    /// `start > end`, `end` past the end of the string, or either offset not
    /// on a UTF-8 character boundary.
    pub fn as_str(&self) -> &'a str {
        // Copy the reference out first so the slice is tied to `'a` rather
        // than to the (possibly shorter) borrow of `self`.
        let backing: &'a Arc<String> = self.string;
        &backing.as_str()[self.start..self.end]
    }

    /// Creates a view that covers all of `string`.
    pub fn from_string(string: &'a Arc<String>) -> Self {
        Self {
            string,
            start: 0,
            end: string.len(),
        }
    }
}

impl PartialEq for ArcSubstr<'_> {
    /// Two views are equal when their viewed text is equal.
    fn eq(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl Eq for ArcSubstr<'_> {}

impl std::hash::Hash for ArcSubstr<'_> {
    /// Hashes the viewed text only, keeping `Hash` consistent with `Eq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(self.as_str(), state);
    }
}

impl PartialOrd for ArcSubstr<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ArcSubstr<'_> {
    /// Orders views by their viewed text.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.as_str().cmp(other.as_str())
    }
}

impl std::fmt::Debug for ArcSubstr<'_> {
    /// Formats exactly like the viewed `str` (quoted and escaped).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self.as_str(), f)
    }
}

impl std::fmt::Display for ArcSubstr<'_> {
    /// Writes the viewed text.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(self.as_str(), f)
    }
}
//
// MARK: string
//
/// An owned [ArcSubstr]: a view of the byte range `start..end` of a shared
/// [`Arc<String>`].
///
/// Cloning bumps the reference count of the backing string and copies the two
/// offsets; the text itself is never copied.
///
/// Equality, ordering, hashing and formatting all operate on the viewed text
/// only, so two values over different backing strings are equal when their
/// text is.
///
/// The fields are public and are not validated on construction; see
/// [`ArcSubstring::as_str`] for when an invalid range panics.
#[derive(Clone)]
pub struct ArcSubstring {
    /// The shared backing string.
    pub string: Arc<String>,
    /// Byte offset of the start of the view (inclusive).
    pub start: usize,
    /// Byte offset of the end of the view (exclusive).
    pub end: usize,
}

impl ArcSubstring {
    /// Returns the viewed text.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid range of the backing string:
    /// `start > end`, `end` past the end of the string, or either offset not
    /// on a UTF-8 character boundary.
    pub fn as_str(&self) -> &str {
        let backing: &str = self.string.as_str();
        &backing[self.start..self.end]
    }

    /// Creates a view that covers all of `string`, taking ownership of the
    /// given `Arc` handle.
    pub fn from_string(string: Arc<String>) -> Self {
        // Read the length before the handle is moved into the struct.
        let end = string.len();
        Self {
            string,
            start: 0,
            end,
        }
    }
}

impl PartialEq for ArcSubstring {
    /// Two values are equal when their viewed text is equal.
    fn eq(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl Eq for ArcSubstring {}

impl std::hash::Hash for ArcSubstring {
    /// Hashes the viewed text only, keeping `Hash` consistent with `Eq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(self.as_str(), state);
    }
}

impl PartialOrd for ArcSubstring {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ArcSubstring {
    /// Orders values by their viewed text.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.as_str().cmp(other.as_str())
    }
}

impl std::fmt::Debug for ArcSubstring {
    /// Formats exactly like the viewed `str` (quoted and escaped).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self.as_str(), f)
    }
}

impl std::fmt::Display for ArcSubstring {
    /// Writes the viewed text.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(self.as_str(), f)
    }
}
+18 -10
View File
@@ -1,5 +1,5 @@
use itertools::Itertools;
use std::{collections::HashMap, fmt::Display, str::FromStr};
use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc};
use tracing::trace;
use trie_rs::map::{Trie, TrieBuilder};
@@ -54,7 +54,7 @@ impl FromStr for PathSegment {
/// An in-memory cache of s3 paths.
#[derive(Debug)]
pub struct DatapathIndex {
patterns: Trie<u8, Vec<String>>,
patterns: Trie<u8, Vec<Arc<String>>>,
len: usize,
}
@@ -99,6 +99,8 @@ impl DatapathIndex {
for s in paths {
let s: String = s.into();
let s = Arc::new(s);
let mut segments = Vec::new();
for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) {
@@ -136,6 +138,8 @@ impl DatapathIndex {
while let Some(s) = paths.recv().await {
let s: String = s.into();
let s = Arc::new(s);
let mut segments = Vec::new();
for seg in s.split('/') {
segments.push(match PathSegment::from_str(&seg) {
@@ -181,7 +185,10 @@ impl DatapathIndex {
///
/// Returns an empty iterator if no paths match.
/// Returns `None` if the query was invalid.
pub fn query(&self, query: impl Into<String>) -> Option<impl Iterator<Item = String> + '_> {
pub fn query(
&self,
query: impl Into<String>,
) -> Option<impl Iterator<Item = &Arc<String>> + '_> {
let query: String = query.into();
let regex = rule::Rule::new(query.clone())?;
let key = Self::query_to_key(&query);
@@ -191,13 +198,15 @@ impl DatapathIndex {
self.patterns
.predictive_search::<String, _>(&key)
.flat_map(|(_, strings)| strings.iter())
.filter(move |s| regex.is_match(s))
.cloned(),
.filter(move |s| regex.is_match(s)),
)
}
/// Like [Self::query], but with a precompiled rule
pub fn query_rule<'a>(&'a self, rule: &'a rule::Rule) -> impl Iterator<Item = String> + 'a {
pub fn query_rule<'a>(
&'a self,
rule: &'a rule::Rule,
) -> impl Iterator<Item = &'a Arc<String>> + 'a {
let key = Self::query_to_key(rule.pattern());
trace!("DatapathIndex key is {key}");
@@ -205,7 +214,6 @@ impl DatapathIndex {
.predictive_search::<String, _>(&key)
.flat_map(|(_, strings)| strings.iter())
.filter(move |s| rule.is_match(s))
.cloned()
}
/// Like [Self::query], but returns `true` if any paths match
@@ -270,7 +278,7 @@ mod index_tests {
.unwrap()
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234");
assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// No match
let results: Vec<_> = idx.query("web/domain=other.com/ts=1234").unwrap().collect();
@@ -322,12 +330,12 @@ mod index_tests {
.unwrap()
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234");
assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// Wildcard time lookup
let results: Vec<_> = idx.query("web/domain=example.com/ts=*").unwrap().collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0], "web/domain=example.com/ts=1234");
assert_eq!(results[0].as_ref(), "web/domain=example.com/ts=1234");
// Double wildcard lookup
let results: Vec<_> = idx.query("web/domain=*/ts=*").unwrap().collect();
+3
View File
@@ -23,6 +23,9 @@ pub use schema::*;
mod wildcardable;
pub use wildcardable::*;
mod arcsubstr;
pub use arcsubstr::*;
#[cfg(feature = "index")]
mod index;