use crate::objectpath::PathParseError;

#[cfg_attr(test, derive(PartialEq))]
#[derive(Debug)]
pub enum Token<'a> {
    Root,
    Ident(&'a str),
    Dot,
    SqbOpen,
    SqbClose,
}

pub struct Tokenizer {}

impl Tokenizer {
    pub fn new() -> Self {
        Self {}
    }

    pub fn tokenize(self, source: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
        let mut tokens = Vec::new();
        // Byte offset where the current identifier run began, if one is open.
        let mut window_start = None;
        for (i, c) in source.char_indices() {
            match c {
                '$' | '.' | '[' | ']' => {
                    // A structural character terminates any pending identifier.
                    if let Some(s) = window_start.take() {
                        tokens.push((s, Token::Ident(&source[s..i])));
                    }
                    let token = match c {
                        '$' => Token::Root,
                        '.' => Token::Dot,
                        '[' => Token::SqbOpen,
                        _ => Token::SqbClose,
                    };
                    tokens.push((i, token));
                }
                // Any other ASCII character starts or extends an identifier run.
                c if c.is_ascii() => {
                    if window_start.is_none() {
                        window_start = Some(i);
                    }
                }
                char => return Err(PathParseError::NonAsciiChar { position: i, char }),
            }
        }
        // Flush an identifier that runs to the end of the input.
        if let Some(s) = window_start.take() {
            tokens.push((s, Token::Ident(&source[s..])));
        }
        Ok(tokens)
    }
}

#[expect(clippy::expect_used)]
#[cfg(test)]
mod tests {
    use super::*;

    fn tokenize(source: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
        Tokenizer::new().tokenize(source)
    }

    fn tok_ok(source: &str) -> Vec<(usize, Token<'_>)> {
        tokenize(source).expect("expected tokenization to succeed")
    }

    #[test]
    fn empty() {
        assert_eq!(tok_ok(""), vec![]);
    }

    #[test]
    fn root_only() {
        assert_eq!(tok_ok("$"), vec![(0, Token::Root)]);
    }

    #[test]
    fn dot_only() {
        assert_eq!(tok_ok("."), vec![(0, Token::Dot)]);
    }

    #[test]
    fn sqb_open_only() {
        assert_eq!(tok_ok("["), vec![(0, Token::SqbOpen)]);
    }

    #[test]
    fn sqb_close_only() {
        assert_eq!(tok_ok("]"), vec![(0, Token::SqbClose)]);
    }

    #[test]
    fn ident_only() {
        assert_eq!(tok_ok("foo"), vec![(0, Token::Ident("foo"))]);
    }

    #[test]
    fn ident_with_digits() {
        assert_eq!(tok_ok("abc123"), vec![(0, Token::Ident("abc123"))]);
    }

    #[test]
    fn root_dot_ident() {
        assert_eq!(
            tok_ok("$.foo"),
            vec![(0, Token::Root), (1, Token::Dot), (2, Token::Ident("foo"))]
        );
    }

    #[test]
    fn ident_flushed_before_delimiter() {
        assert_eq!(
            tok_ok("foo.bar"),
            vec![
                (0, Token::Ident("foo")),
                (3, Token::Dot),
                (4, Token::Ident("bar"))
            ]
        );
    }

    #[test]
    fn root_after_ident_flushes() {
        // The ident window should flush before the Root token.
        assert_eq!(
            tok_ok("foo$"),
            vec![(0, Token::Ident("foo")), (3, Token::Root)]
        );
    }

    #[test]
    fn full_path() {
        assert_eq!(
            tok_ok("$.foo[0]"),
            vec![
                (0, Token::Root),
                (1, Token::Dot),
                (2, Token::Ident("foo")),
                (5, Token::SqbOpen),
                (6, Token::Ident("0")),
                (7, Token::SqbClose),
            ]
        );
    }

    #[test]
    fn complex_nested() {
        assert_eq!(
            tok_ok("$.a[1].b"),
            vec![
                (0, Token::Root),
                (1, Token::Dot),
                (2, Token::Ident("a")),
                (3, Token::SqbOpen),
                (4, Token::Ident("1")),
                (5, Token::SqbClose),
                (6, Token::Dot),
                (7, Token::Ident("b")),
            ]
        );
    }

    #[test]
    fn negative_number_ident() {
        // '-' is ASCII, so "-1" is a single ident.
        assert_eq!(
            tok_ok("[-1]"),
            vec![
                (0, Token::SqbOpen),
                (1, Token::Ident("-1")),
                (3, Token::SqbClose)
            ]
        );
    }

    #[test]
    fn root_immediately_followed_by_ident() {
        // "$foo" with no dot produces Root then Ident.
        assert_eq!(
            tok_ok("$foo"),
            vec![(0, Token::Root), (1, Token::Ident("foo"))]
        );
    }

    #[test]
    fn consecutive_delimiters() {
        assert_eq!(tok_ok(".."), vec![(0, Token::Dot), (1, Token::Dot)]);
    }

    #[test]
    fn non_ascii_error() {
        assert_eq!(
            tokenize("$.fé"),
            Err(PathParseError::NonAsciiChar {
                position: 3,
                char: 'é'
            })
        );
    }

    #[test]
    fn non_ascii_at_start() {
        assert_eq!(
            tokenize("é"),
            Err(PathParseError::NonAsciiChar {
                position: 0,
                char: 'é'
            })
        );
    }

    #[test]
    fn non_ascii_with_pending_ident() {
        // "abé": the pending ident "ab" is never flushed; the non-ASCII
        // char produces an error first.
        assert_eq!(
            tokenize("abé"),
            Err(PathParseError::NonAsciiChar {
                position: 2,
                char: 'é'
            })
        );
    }
}
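
// A minimal end-to-end sketch of driving the tokenizer from calling code;
// the module and test names here are illustrative, not part of the API above.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn tokenizes_a_realistic_path() {
        // Each token carries the byte offset at which it starts in the source.
        assert_eq!(
            Tokenizer::new().tokenize("$.items[0].name"),
            Ok(vec![
                (0, Token::Root),
                (1, Token::Dot),
                (2, Token::Ident("items")),
                (7, Token::SqbOpen),
                (8, Token::Ident("0")),
                (9, Token::SqbClose),
                (10, Token::Dot),
                (11, Token::Ident("name")),
            ])
        );
    }
}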