Tokenizer

2025-12-11 17:37:33 -08:00
parent 62fcf781c1
commit 1805b7f430
10 changed files with 7678 additions and 0 deletions

@@ -0,0 +1,25 @@
use fancy_regex::Regex;

/// Split text using a regex while keeping both the matched parts and the parts between matches.
pub fn regex_segment<'a>(re: &Regex, text: &'a str) -> Vec<&'a str> {
    let mut result = Vec::new();
    let mut last = 0;
    for mat in re.find_iter(text) {
        // fancy_regex's find_iter yields Result<Match, Error> because match
        // evaluation can fail at runtime (e.g. backtracking limits), hence the unwrap.
        #[expect(clippy::unwrap_used)]
        let mat = mat.unwrap();
        // Push the unmatched gap before this match, if any.
        if mat.start() > last {
            result.push(&text[last..mat.start()]);
        }
        // Push the match itself, so delimiters are preserved in the output.
        result.push(mat.as_str());
        last = mat.end();
    }
    // Push the trailing remainder after the final match.
    if last < text.len() {
        result.push(&text[last..]);
    }
    result.retain(|x| !x.is_empty());
    result
}
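
For context, a minimal usage sketch; the pattern and input below are illustrative, not from the commit. The fancy_regex crate is presumably chosen because tokenizer split patterns often rely on look-around, which the standard regex crate does not support.

fn main() {
    // Illustrative pattern: split on runs of whitespace while keeping the whitespace.
    let re = Regex::new(r"\s+").unwrap();
    let parts = regex_segment(&re, "hello  world foo");
    assert_eq!(parts, vec!["hello", "  ", "world", " ", "foo"]);
}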