use fancy_regex::Regex;

use crate::{Tokenizer, split::regex_segment};

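// These tests pin down the observable contract of `regex_segment`: split
// `text` on matches of `re`, keeping each match as its own segment and the
// unmatched gaps between them, and dropping empty segments (see
// `word_boundaries` below). A minimal sketch of that contract, assuming
// fancy_regex's fallible `find_iter` iterator (not necessarily the actual
// implementation in `split`):
//
//     let mut segments: Vec<&str> = Vec::new();
//     let mut last = 0;
//     for m in re.find_iter(text).flatten() {
//         if m.start() > last {
//             segments.push(&text[last..m.start()]); // gap before the match
//         }
//         if !m.as_str().is_empty() {
//             segments.push(m.as_str()); // the match itself
//         }
//         last = m.end();
//     }
//     if last < text.len() {
//         segments.push(&text[last..]); // trailing gap
//     }
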
#[test]
fn basic() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,banana;cherry";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", "banana", ";", "cherry"]);
}

#[test]
fn empty_string() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "";
    let result = regex_segment(&re, text);
    assert_eq!(result, Vec::<&str>::new());
}

#[test]
fn no_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple banana cherry";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple banana cherry"]);
}

#[test]
fn only_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = ",;,";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec![",", ";", ","]);
}

#[test]
fn starts_with_match() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = ",apple;banana";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec![",", "apple", ";", "banana"]);
}

#[test]
fn ends_with_match() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,banana;";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", "banana", ";"]);
}

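// Back-to-back delimiters yield two adjacent match segments, with no empty
// gap segment inserted between them.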
#[test]
fn consecutive_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,,banana";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", ",", "banana"]);
}

#[test]
fn word_boundaries() {
    let re = Regex::new(r"\b").unwrap();
    let text = "hello world";
    let result = regex_segment(&re, text);
    // Word boundaries are zero-width, so every match is an empty string;
    // empty segments are dropped, leaving only the text between boundaries.
    assert_eq!(result, vec!["hello", " ", "world"]);
}

#[test]
fn digits() {
    let re = Regex::new(r"\d+").unwrap();
    let text = "abc123def456ghi";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["abc", "123", "def", "456", "ghi"]);
}

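// `Tokenizer::SPECIAL_REGEX` matches the `<|...|>` special-token markers
// (e.g. `<|bos|>`, `<|user_start|>`), so segmentation separates special
// tokens from the ordinary text around them.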
#[test]
fn special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "Hello <|user_start|>world<|user_end|> test";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec!["Hello ", "<|user_start|>", "world", "<|user_end|>", " test"]
    );
}

#[test]
fn unicode() {
    // Multi-byte (non-ASCII) delimiters; segment boundaries must fall on
    // character boundaries, not byte offsets.
    let re = Regex::new(r"[🌍🌎]+").unwrap();
    let text = "Hello🌍world🌎test";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["Hello", "🌍", "world", "🌎", "test"]);
}

#[test]
fn single_char() {
    let re = Regex::new(r"x").unwrap();
    let text = "x";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["x"]);
}

#[test]
fn multichar_match() {
    let re = Regex::new(r"abc").unwrap();
    let text = "123abc456abc789";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["123", "abc", "456", "abc", "789"]);
}

#[test]
fn bos_token() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|bos|>This is a document";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["<|bos|>", "This is a document"]);
}

#[test]
fn conversation_flow() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|user_start|>Hello<|user_end|><|assistant_start|>Hi there!<|assistant_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            "<|user_start|>",
            "Hello",
            "<|user_end|>",
            "<|assistant_start|>",
            "Hi there!",
            "<|assistant_end|>"
        ]
    );
}

#[test]
fn python_code_block() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "Code: <|python_start|>print('hello')<|python_end|> Output: <|output_start|>hello<|output_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            "Code: ",
            "<|python_start|>",
            "print('hello')",
            "<|python_end|>",
            " Output: ",
            "<|output_start|>",
            "hello",
            "<|output_end|>"
        ]
    );
}

#[test]
fn mixed_special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text =
        "<|bos|><|user_start|>Question<|user_end|><|assistant_start|>Answer<|assistant_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            "<|bos|>",
            "<|user_start|>",
            "Question",
            "<|user_end|>",
            "<|assistant_start|>",
            "Answer",
            "<|assistant_end|>"
        ]
    );
}

#[test]
fn no_special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "This is just regular text with no special tokens";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec!["This is just regular text with no special tokens"]
    );
}

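// Near-miss markers: `<|invalid_token>` lacks its closing `|>` and
// `<user_start>` lacks the pipes, so neither is recognized and the whole
// string comes back as a single segment.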
#[test]
fn malformed_special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "This has <|invalid_token> and <user_start> which shouldn't match";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec!["This has <|invalid_token> and <user_start> which shouldn't match"]
    );
}

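// Whitespace around the markers is preserved verbatim in the gap segments.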
#[test]
fn special_tokens_with_whitespace() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = " <|bos|> \n<|user_start|>\tHello\n<|user_end|> ";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            " ",
            "<|bos|>",
            " \n",
            "<|user_start|>",
            "\tHello\n",
            "<|user_end|>",
            " "
        ]
    );
}

#[test]
fn only_special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|bos|><|user_start|><|user_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["<|bos|>", "<|user_start|>", "<|user_end|>"]);
}

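// The literal `<|bos|>` is matched at its offset inside the stray brackets;
// the surrounding `<|` and `|>` fall through as ordinary text segments.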
#[test]
fn nested() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|<|bos|>|>";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["<|", "<|bos|>", "|>"]);
}