Tokenizer

2025-12-11 17:37:33 -08:00
parent 62fcf781c1
commit 1805b7f430
10 changed files with 7678 additions and 0 deletions


@@ -0,0 +1,234 @@
use fancy_regex::Regex;
use crate::{Tokenizer, split::regex_segment};
#[test]
fn basic() {
let re = Regex::new(r"[,;]").unwrap();
let text = "apple,banana;cherry";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["apple", ",", "banana", ";", "cherry"]);
}
#[test]
fn empty_string() {
let re = Regex::new(r"[,;]").unwrap();
let text = "";
let result = regex_segment(&re, text);
assert_eq!(result, Vec::<&str>::new());
}
#[test]
fn no_matches() {
let re = Regex::new(r"[,;]").unwrap();
let text = "apple banana cherry";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["apple banana cherry"]);
}
#[test]
fn only_matches() {
let re = Regex::new(r"[,;]").unwrap();
let text = ",;,";
let result = regex_segment(&re, text);
assert_eq!(result, vec![",", ";", ","]);
}
#[test]
fn starts_with_match() {
let re = Regex::new(r"[,;]").unwrap();
let text = ",apple;banana";
let result = regex_segment(&re, text);
assert_eq!(result, vec![",", "apple", ";", "banana"]);
}
#[test]
fn ends_with_match() {
let re = Regex::new(r"[,;]").unwrap();
let text = "apple,banana;";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["apple", ",", "banana", ";"]);
}
#[test]
fn consecutive_matches() {
let re = Regex::new(r"[,;]").unwrap();
let text = "apple,,banana";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["apple", ",", ",", "banana"]);
}
#[test]
fn word_boundaries() {
let re = Regex::new(r"\b").unwrap();
let text = "hello world";
let result = regex_segment(&re, text);
// `\b` matches are zero-width, so their empty match strings are dropped and the text is simply split at each word boundary
assert_eq!(result, vec!["hello", " ", "world"]);
}
#[test]
fn digits() {
let re = Regex::new(r"\d+").unwrap();
let text = "abc123def456ghi";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["abc", "123", "def", "456", "ghi"]);
}
#[test]
fn special_tokens() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "Hello <|user_start|>world<|user_end|> test";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec!["Hello ", "<|user_start|>", "world", "<|user_end|>", " test"]
);
}
#[test]
fn unicode() {
let re = Regex::new(r"[=<3D>=<3D>]+").unwrap();
let text = "Hello=<3D>world=<3D>test";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["Hello", "=<3D>", "world", "=<3D>", "test"]);
}
#[test]
fn single_char() {
let re = Regex::new(r"x").unwrap();
let text = "x";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["x"]);
}
#[test]
fn multichar_match() {
let re = Regex::new(r"abc").unwrap();
let text = "123abc456abc789";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["123", "abc", "456", "abc", "789"]);
}
#[test]
fn bos_token() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "<|bos|>This is a document";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["<|bos|>", "This is a document"]);
}
#[test]
fn conversation_flow() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "<|user_start|>Hello<|user_end|><|assistant_start|>Hi there!<|assistant_end|>";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec![
"<|user_start|>",
"Hello",
"<|user_end|>",
"<|assistant_start|>",
"Hi there!",
"<|assistant_end|>"
]
);
}
#[test]
fn python_code_block() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "Code: <|python_start|>print('hello')<|python_end|> Output: <|output_start|>hello<|output_end|>";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec![
"Code: ",
"<|python_start|>",
"print('hello')",
"<|python_end|>",
" Output: ",
"<|output_start|>",
"hello",
"<|output_end|>"
]
);
}
#[test]
fn mixed_special_tokens() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text =
"<|bos|><|user_start|>Question<|user_end|><|assistant_start|>Answer<|assistant_end|>";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec![
"<|bos|>",
"<|user_start|>",
"Question",
"<|user_end|>",
"<|assistant_start|>",
"Answer",
"<|assistant_end|>"
]
);
}
#[test]
fn no_special_tokens() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "This is just regular text with no special tokens";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec!["This is just regular text with no special tokens"]
);
}
#[test]
fn malformed_special_tokens() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "This has <|invalid_token> and <user_start> which shouldn't match";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec!["This has <|invalid_token> and <user_start> which shouldn't match"]
);
}
#[test]
fn special_tokens_with_whitespace() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = " <|bos|> \n<|user_start|>\tHello\n<|user_end|> ";
let result = regex_segment(&re, text);
assert_eq!(
result,
vec![
" ",
"<|bos|>",
" \n",
"<|user_start|>",
"\tHello\n",
"<|user_end|>",
" "
]
);
}
#[test]
fn only_special_tokens() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "<|bos|><|user_start|><|user_end|>";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["<|bos|>", "<|user_start|>", "<|user_end|>"]);
}
#[test]
fn nested() {
let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
let text = "<|<|bos|>|>";
let result = regex_segment(&re, text);
assert_eq!(result, vec!["<|", "<|bos|>", "|>"]);
}