mirror of
https://github.com/rm-dr/datapath.git
synced 2026-05-16 00:08:59 -07:00
Re-use Arc<String> for new indices
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
use std::borrow::Borrow;
|
||||||
|
use std::ops::Deref;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -5,6 +7,7 @@ use std::sync::Arc;
|
|||||||
//
|
//
|
||||||
|
|
||||||
/// A reference to a substring of an [Arc<String>]
|
/// A reference to a substring of an [Arc<String>]
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
pub struct ArcSubstr<'a> {
|
pub struct ArcSubstr<'a> {
|
||||||
pub string: &'a Arc<String>,
|
pub string: &'a Arc<String>,
|
||||||
pub start: usize,
|
pub start: usize,
|
||||||
@@ -23,6 +26,14 @@ impl<'a> ArcSubstr<'a> {
|
|||||||
string,
|
string,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn to_owned(&self) -> ArcSubstring {
|
||||||
|
ArcSubstring {
|
||||||
|
string: self.string.clone(),
|
||||||
|
start: self.start,
|
||||||
|
end: self.end,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for ArcSubstr<'_> {
|
impl PartialEq for ArcSubstr<'_> {
|
||||||
@@ -63,11 +74,50 @@ impl std::fmt::Display for ArcSubstr<'_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Deref for ArcSubstr<'_> {
|
||||||
|
type Target = str;
|
||||||
|
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AsRef<str> for ArcSubstr<'_> {
|
||||||
|
fn as_ref(&self) -> &str {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Borrow<str> for ArcSubstr<'_> {
|
||||||
|
fn borrow(&self) -> &str {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<str> for ArcSubstr<'_> {
|
||||||
|
fn eq(&self, other: &str) -> bool {
|
||||||
|
self.as_str() == other
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<&str> for ArcSubstr<'_> {
|
||||||
|
fn eq(&self, other: &&str) -> bool {
|
||||||
|
self.as_str() == *other
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<String> for ArcSubstr<'_> {
|
||||||
|
fn eq(&self, other: &String) -> bool {
|
||||||
|
self.as_str() == other.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: string
|
// MARK: string
|
||||||
//
|
//
|
||||||
|
|
||||||
/// An owned [ArcSubstr]
|
/// An owned [ArcSubstr]
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct ArcSubstring {
|
pub struct ArcSubstring {
|
||||||
pub string: Arc<String>,
|
pub string: Arc<String>,
|
||||||
pub start: usize,
|
pub start: usize,
|
||||||
@@ -125,3 +175,65 @@ impl std::fmt::Display for ArcSubstring {
|
|||||||
self.as_str().fmt(f)
|
self.as_str().fmt(f)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Deref for ArcSubstring {
|
||||||
|
type Target = str;
|
||||||
|
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AsRef<str> for ArcSubstring {
|
||||||
|
fn as_ref(&self) -> &str {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Borrow<str> for ArcSubstring {
|
||||||
|
fn borrow(&self) -> &str {
|
||||||
|
self.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<str> for ArcSubstring {
|
||||||
|
fn eq(&self, other: &str) -> bool {
|
||||||
|
self.as_str() == other
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<&str> for ArcSubstring {
|
||||||
|
fn eq(&self, other: &&str) -> bool {
|
||||||
|
self.as_str() == *other
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq<String> for ArcSubstring {
|
||||||
|
fn eq(&self, other: &String) -> bool {
|
||||||
|
self.as_str() == other.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<String> for ArcSubstring {
|
||||||
|
fn from(s: String) -> Self {
|
||||||
|
Self::from_string(Arc::new(s))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<Arc<String>> for ArcSubstring {
|
||||||
|
fn from(s: Arc<String>) -> Self {
|
||||||
|
Self::from_string(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<&'a ArcSubstr<'a>> for ArcSubstring {
|
||||||
|
fn from(s: &'a ArcSubstr<'a>) -> Self {
|
||||||
|
s.to_owned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<ArcSubstr<'a>> for ArcSubstring {
|
||||||
|
fn from(s: ArcSubstr<'a>) -> Self {
|
||||||
|
s.to_owned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,5 +1,10 @@
|
|||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc};
|
use std::{
|
||||||
|
collections::{HashMap, HashSet},
|
||||||
|
fmt::Display,
|
||||||
|
str::FromStr,
|
||||||
|
sync::Arc,
|
||||||
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
use trie_rs::map::{Trie, TrieBuilder};
|
use trie_rs::map::{Trie, TrieBuilder};
|
||||||
|
|
||||||
@@ -55,6 +60,7 @@ impl FromStr for PathSegment {
|
|||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct DatapathIndex {
|
pub struct DatapathIndex {
|
||||||
patterns: Trie<u8, Vec<Arc<String>>>,
|
patterns: Trie<u8, Vec<Arc<String>>>,
|
||||||
|
paths: HashSet<Arc<String>>,
|
||||||
len: usize,
|
len: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -90,16 +96,33 @@ impl DatapathIndex {
|
|||||||
Self {
|
Self {
|
||||||
patterns: TrieBuilder::new().build(),
|
patterns: TrieBuilder::new().build(),
|
||||||
len: 0,
|
len: 0,
|
||||||
|
paths: HashSet::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new<S: Into<String>, I: Iterator<Item = S>>(paths: I) -> Self {
|
pub fn new<S: Into<String>, I: Iterator<Item = S>>(sources: I, old: Option<&Self>) -> Self {
|
||||||
let mut len = 0;
|
let mut len = 0;
|
||||||
let mut patterns = HashMap::new();
|
let mut patterns = HashMap::new();
|
||||||
|
let mut paths = HashSet::new();
|
||||||
|
|
||||||
for s in paths {
|
for s in sources {
|
||||||
let s: String = s.into();
|
let s: String = s.into();
|
||||||
let s = Arc::new(s);
|
|
||||||
|
let s = {
|
||||||
|
let mut s = Arc::new(s);
|
||||||
|
|
||||||
|
// Reuse existing Arc<Strings>, if they are available.
|
||||||
|
// This GREATLY reduces memory usage when updating a dpi
|
||||||
|
// while keeping an older "snapshot" around.
|
||||||
|
if let Some(o) = old {
|
||||||
|
if let Some(existing) = o.paths.get(&s) {
|
||||||
|
s = existing.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
paths.insert(s.clone());
|
||||||
|
s
|
||||||
|
};
|
||||||
|
|
||||||
let mut segments = Vec::new();
|
let mut segments = Vec::new();
|
||||||
for seg in s.split('/') {
|
for seg in s.split('/') {
|
||||||
@@ -128,17 +151,37 @@ impl DatapathIndex {
|
|||||||
Self {
|
Self {
|
||||||
len,
|
len,
|
||||||
patterns: builder.build(),
|
patterns: builder.build(),
|
||||||
|
paths,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "tokio")]
|
#[cfg(feature = "tokio")]
|
||||||
pub async fn async_new<S: Into<String>>(mut paths: tokio::sync::mpsc::Receiver<S>) -> Self {
|
pub async fn async_new<S: Into<String>>(
|
||||||
|
mut sources: tokio::sync::mpsc::Receiver<S>,
|
||||||
|
old: Option<&Self>,
|
||||||
|
) -> Self {
|
||||||
let mut len = 0;
|
let mut len = 0;
|
||||||
let mut patterns = HashMap::new();
|
let mut patterns = HashMap::new();
|
||||||
|
let mut paths = HashSet::new();
|
||||||
|
|
||||||
while let Some(s) = paths.recv().await {
|
while let Some(s) = sources.recv().await {
|
||||||
let s: String = s.into();
|
let s: String = s.into();
|
||||||
let s = Arc::new(s);
|
|
||||||
|
let s = {
|
||||||
|
let mut s = Arc::new(s);
|
||||||
|
|
||||||
|
// Reuse existing Arc<Strings>, if they are available.
|
||||||
|
// This GREATLY reduces memory usage when updating a dpi
|
||||||
|
// while keeping an older "snapshot" around.
|
||||||
|
if let Some(o) = old {
|
||||||
|
if let Some(existing) = o.paths.get(&s) {
|
||||||
|
s = existing.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
paths.insert(s.clone());
|
||||||
|
s
|
||||||
|
};
|
||||||
|
|
||||||
let mut segments = Vec::new();
|
let mut segments = Vec::new();
|
||||||
for seg in s.split('/') {
|
for seg in s.split('/') {
|
||||||
@@ -167,6 +210,7 @@ impl DatapathIndex {
|
|||||||
Self {
|
Self {
|
||||||
len,
|
len,
|
||||||
patterns: builder.build(),
|
patterns: builder.build(),
|
||||||
|
paths,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +304,7 @@ mod index_tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn datapath_index_empty() {
|
fn datapath_index_empty() {
|
||||||
let idx = DatapathIndex::new(std::iter::empty::<String>());
|
let idx = DatapathIndex::new(std::iter::empty::<String>(), None);
|
||||||
let query = "web/domain=example.com";
|
let query = "web/domain=example.com";
|
||||||
assert_eq!(idx.query(query).unwrap().count(), 0);
|
assert_eq!(idx.query(query).unwrap().count(), 0);
|
||||||
assert!(idx.is_empty());
|
assert!(idx.is_empty());
|
||||||
@@ -270,7 +314,7 @@ mod index_tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn insert_and_lookup_exact_match() {
|
fn insert_and_lookup_exact_match() {
|
||||||
let paths = vec!["web/domain=example.com/ts=1234"];
|
let paths = vec!["web/domain=example.com/ts=1234"];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Exact match
|
// Exact match
|
||||||
let results: Vec<_> = idx
|
let results: Vec<_> = idx
|
||||||
@@ -293,7 +337,7 @@ mod index_tests {
|
|||||||
"web/domain=example.com/ts=1234",
|
"web/domain=example.com/ts=1234",
|
||||||
"api/domain=example.com/ts=1234",
|
"api/domain=example.com/ts=1234",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Wildcard first segment
|
// Wildcard first segment
|
||||||
let results: Vec<_> = idx.query("*/domain=example.com/ts=1234").unwrap().collect();
|
let results: Vec<_> = idx.query("*/domain=example.com/ts=1234").unwrap().collect();
|
||||||
@@ -308,7 +352,7 @@ mod index_tests {
|
|||||||
"web/domain=example.com/ts=1234",
|
"web/domain=example.com/ts=1234",
|
||||||
"web/domain=other.com/ts=1234",
|
"web/domain=other.com/ts=1234",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Wildcard domain
|
// Wildcard domain
|
||||||
let results: Vec<_> = idx.query("web/domain=*/ts=1234").unwrap().collect();
|
let results: Vec<_> = idx.query("web/domain=*/ts=1234").unwrap().collect();
|
||||||
@@ -322,7 +366,7 @@ mod index_tests {
|
|||||||
"web/domain=other.com/ts=1234",
|
"web/domain=other.com/ts=1234",
|
||||||
"api/domain=example.com/ts=5678",
|
"api/domain=example.com/ts=5678",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Specific lookup
|
// Specific lookup
|
||||||
let results: Vec<_> = idx
|
let results: Vec<_> = idx
|
||||||
@@ -351,7 +395,7 @@ mod index_tests {
|
|||||||
"web/domain=other.com/ts=5678/crawl/2.5",
|
"web/domain=other.com/ts=5678/crawl/2.5",
|
||||||
"web/domain=example.com/ts=9999/crawl/3.0",
|
"web/domain=example.com/ts=9999/crawl/3.0",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Multiple wildcards in path
|
// Multiple wildcards in path
|
||||||
let results: Vec<_> = idx.query("web/domain=*/ts=*/crawl/*").unwrap().collect();
|
let results: Vec<_> = idx.query("web/domain=*/ts=*/crawl/*").unwrap().collect();
|
||||||
@@ -368,7 +412,7 @@ mod index_tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn partial_path_query() {
|
fn partial_path_query() {
|
||||||
let paths = vec!["web/domain=example.com/ts=1234/crawl/2.5"];
|
let paths = vec!["web/domain=example.com/ts=1234/crawl/2.5"];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Query with fewer segments than the stored path
|
// Query with fewer segments than the stored path
|
||||||
let results: Vec<_> = idx.query("web/domain=example.com").unwrap().collect();
|
let results: Vec<_> = idx.query("web/domain=example.com").unwrap().collect();
|
||||||
@@ -378,7 +422,7 @@ mod index_tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn longer_path_query() {
|
fn longer_path_query() {
|
||||||
let paths = vec!["web/domain=example.com"];
|
let paths = vec!["web/domain=example.com"];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Query with more segments than the stored path
|
// Query with more segments than the stored path
|
||||||
let results: Vec<_> = idx
|
let results: Vec<_> = idx
|
||||||
@@ -394,7 +438,7 @@ mod index_tests {
|
|||||||
"web/domain=example.com/ts=1234",
|
"web/domain=example.com/ts=1234",
|
||||||
"web/domain=other.com/ts=5678",
|
"web/domain=other.com/ts=5678",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Match exists
|
// Match exists
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -421,7 +465,7 @@ mod index_tests {
|
|||||||
"web/domain=example.com/ts=1234/file2.json",
|
"web/domain=example.com/ts=1234/file2.json",
|
||||||
"web/domain=example.com/ts=5678/file3.json",
|
"web/domain=example.com/ts=5678/file3.json",
|
||||||
];
|
];
|
||||||
let idx = DatapathIndex::new(paths.into_iter());
|
let idx = DatapathIndex::new(paths.into_iter(), None);
|
||||||
|
|
||||||
// Query with suffix wildcard
|
// Query with suffix wildcard
|
||||||
let results: Vec<_> = idx.query("web/domain=example.com/**").unwrap().collect();
|
let results: Vec<_> = idx.query("web/domain=example.com/**").unwrap().collect();
|
||||||
|
|||||||
Reference in New Issue
Block a user