1
0
Files
llmfs/crates/tokenizer/src/split.rs
2025-12-13 08:44:38 -08:00

26 lines
562 B
Rust

use fancy_regex::Regex;
/// Segments `text` around every match of `re`, returning the matched slices and the
/// unmatched slices between them, in the order they appear in `text`.
///
/// Every returned slice borrows from `text`. Empty slices (such as zero-width matches)
/// are never part of the output.
///
/// # Panics
///
/// Panics if the regex engine reports an error while searching `text`.
pub fn regex_segment<'a>(re: &Regex, text: &'a str) -> Vec<&'a str> {
    let mut pieces = Vec::new();
    // Byte offset just past the most recently consumed match.
    let mut cursor = 0;

    for found in re.find_iter(text) {
        #[expect(clippy::unwrap_used)]
        let found = found.unwrap();

        // Unmatched text sitting between the previous match and this one.
        let start = found.start();
        if cursor < start {
            pieces.push(&text[cursor..start]);
        }

        // Zero-width matches contribute nothing to the output.
        let matched = found.as_str();
        if !matched.is_empty() {
            pieces.push(matched);
        }

        cursor = found.end();
    }

    // Whatever follows the final match (or the whole text when nothing matched).
    let tail = &text[cursor..];
    if !tail.is_empty() {
        pieces.push(tail);
    }

    pieces
}