Tokenizer

2025-12-11 17:37:33 -08:00
parent 62fcf781c1
commit 1805b7f430
10 changed files with 7678 additions and 0 deletions

@@ -0,0 +1,25 @@
use fancy_regex::Regex;

/// Split text using a regex while keeping both the matched parts and the parts between matches.
pub fn regex_segment<'a>(re: &Regex, text: &'a str) -> Vec<&'a str> {
    let mut result = Vec::new();
    let mut last = 0;
    for mat in re.find_iter(text) {
        // fancy_regex's find_iter yields Result<Match, Error> because match
        // evaluation can fail at runtime (e.g. backtracking limits), hence the unwrap.
        #[expect(clippy::unwrap_used)]
        let mat = mat.unwrap();
        // Push the unmatched gap before this match, if any.
        if mat.start() > last {
            result.push(&text[last..mat.start()]);
        }
        // Push the match itself, so delimiters are preserved in the output.
        result.push(mat.as_str());
        last = mat.end();
    }
    // Push the trailing remainder after the final match.
    if last < text.len() {
        result.push(&text[last..]);
    }
    result.retain(|x| !x.is_empty());
    result
}
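
For context, a minimal usage sketch; the pattern and input below are illustrative, not from the commit. The fancy_regex crate is presumably chosen because tokenizer split patterns often rely on look-around, which the standard regex crate does not support.

fn main() {
    // Illustrative pattern: split on runs of whitespace while keeping the whitespace.
    let re = Regex::new(r"\s+").unwrap();
    let parts = regex_segment(&re, "hello  world foo");
    assert_eq!(parts, vec!["hello", "  ", "world", " ", "foo"]);
}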