use fancy_regex::Regex;

use crate::{Tokenizer, split::regex_segment};

#[test]
fn basic() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,banana;cherry";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", "banana", ";", "cherry"]);
}

#[test]
fn empty_string() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "";
    let result = regex_segment(&re, text);
    assert_eq!(result, Vec::<&str>::new());
}

#[test]
fn no_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple banana cherry";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple banana cherry"]);
}

#[test]
fn only_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = ",;,";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec![",", ";", ","]);
}

#[test]
fn starts_with_match() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = ",apple;banana";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec![",", "apple", ";", "banana"]);
}

#[test]
fn ends_with_match() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,banana;";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", "banana", ";"]);
}

#[test]
fn consecutive_matches() {
    let re = Regex::new(r"[,;]").unwrap();
    let text = "apple,,banana";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["apple", ",", ",", "banana"]);
}

#[test]
fn word_boundaries() {
    let re = Regex::new(r"\b").unwrap();
    let text = "hello world";
    let result = regex_segment(&re, text);
    // \b matches are zero-width; empty matches are dropped, so only the
    // runs of text between the boundaries remain.
    assert_eq!(result, vec!["hello", " ", "world"]);
}

#[test]
fn digits() {
    let re = Regex::new(r"\d+").unwrap();
    let text = "abc123def456ghi";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["abc", "123", "def", "456", "ghi"]);
}

#[test]
fn special_tokens() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "Hello <|user_start|>world<|user_end|> test";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec!["Hello ", "<|user_start|>", "world", "<|user_end|>", " test"]
    );
}

#[test]
fn unicode() {
    // Multi-byte UTF-8 (emoji) must segment cleanly on character boundaries.
    let re = Regex::new(r"[🌍🚀]+").unwrap();
    let text = "Hello🌍world🚀test";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["Hello", "🌍", "world", "🚀", "test"]);
}

#[test]
fn single_char() {
    let re = Regex::new(r"x").unwrap();
    let text = "x";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["x"]);
}

#[test]
fn multichar_match() {
    let re = Regex::new(r"abc").unwrap();
    let text = "123abc456abc789";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["123", "abc", "456", "abc", "789"]);
}

#[test]
fn bos_token() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|bos|>This is a document";
    let result = regex_segment(&re, text);
    assert_eq!(result, vec!["<|bos|>", "This is a document"]);
}

#[test]
fn conversation_flow() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "<|user_start|>Hello<|user_end|><|assistant_start|>Hi there!<|assistant_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            "<|user_start|>",
            "Hello",
            "<|user_end|>",
            "<|assistant_start|>",
            "Hi there!",
            "<|assistant_end|>"
        ]
    );
}

#[test]
fn python_code_block() {
    let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap();
    let text = "Code: <|python_start|>print('hello')<|python_end|> Output: <|output_start|>hello<|output_end|>";
    let result = regex_segment(&re, text);
    assert_eq!(
        result,
        vec![
            "Code: ",
            "<|python_start|>",
"print('hello')", "<|python_end|>", " Output: ", "<|output_start|>", "hello", "<|output_end|>" ] ); } #[test] fn mixed_special_tokens() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = "<|bos|><|user_start|>Question<|user_end|><|assistant_start|>Answer<|assistant_end|>"; let result = regex_segment(&re, text); assert_eq!( result, vec![ "<|bos|>", "<|user_start|>", "Question", "<|user_end|>", "<|assistant_start|>", "Answer", "<|assistant_end|>" ] ); } #[test] fn no_special_tokens() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = "This is just regular text with no special tokens"; let result = regex_segment(&re, text); assert_eq!( result, vec!["This is just regular text with no special tokens"] ); } #[test] fn malformed_special_tokens() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = "This has <|invalid_token> and which shouldn't match"; let result = regex_segment(&re, text); assert_eq!( result, vec!["This has <|invalid_token> and which shouldn't match"] ); } #[test] fn special_tokens_with_whitespace() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = " <|bos|> \n<|user_start|>\tHello\n<|user_end|> "; let result = regex_segment(&re, text); assert_eq!( result, vec![ " ", "<|bos|>", " \n", "<|user_start|>", "\tHello\n", "<|user_end|>", " " ] ); } #[test] fn only_special_tokens() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = "<|bos|><|user_start|><|user_end|>"; let result = regex_segment(&re, text); assert_eq!(result, vec!["<|bos|>", "<|user_start|>", "<|user_end|>"]); } #[test] fn nested() { let re = Regex::new(Tokenizer::SPECIAL_REGEX).unwrap(); let text = "<|<|bos|>|>"; let result = regex_segment(&re, text); assert_eq!(result, vec!["<|", "<|bos|>", "|>"]); }