//! Tokenizer for object-path expressions such as `$.foo[0]`.
use crate::objectpath::PathParseError;
|
|
|
|
/// A single lexical token of an object-path expression.
///
/// Identifier tokens borrow their text from the source string, so a
/// `Token<'a>` is tied to the lifetime of the tokenized input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token<'a> {
    /// The root marker `$`.
    Root,
    /// A maximal run of ASCII characters that are not delimiters.
    Ident(&'a str),
    /// The member-access separator `.`.
    Dot,
    /// The opening square bracket `[`.
    SqbOpen,
    /// The closing square bracket `]`.
    SqbClose,
}
|
|
|
|
/// Stateless tokenizer for object-path source strings.
///
/// Carries no configuration; `Default` is provided alongside
/// [`Tokenizer::new`] so it composes with derive chains and builders.
#[derive(Debug, Default)]
pub struct Tokenizer {}
|
|
|
|
impl Tokenizer {
|
|
pub fn new() -> Self {
|
|
Self {}
|
|
}
|
|
|
|
pub fn tokenize(self, source: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
|
|
let mut tokens = Vec::new();
|
|
|
|
let mut window_start = None;
|
|
for (i, c) in source.char_indices() {
|
|
match c {
|
|
'$' => {
|
|
if let Some(s) = window_start.take() {
|
|
tokens.push((s, Token::Ident(&source[s..i])));
|
|
}
|
|
tokens.push((i, Token::Root));
|
|
}
|
|
|
|
'.' => {
|
|
if let Some(s) = window_start.take() {
|
|
tokens.push((s, Token::Ident(&source[s..i])));
|
|
}
|
|
tokens.push((i, Token::Dot));
|
|
}
|
|
|
|
'[' => {
|
|
if let Some(s) = window_start.take() {
|
|
tokens.push((s, Token::Ident(&source[s..i])));
|
|
}
|
|
tokens.push((i, Token::SqbOpen));
|
|
}
|
|
|
|
']' => {
|
|
if let Some(s) = window_start.take() {
|
|
tokens.push((s, Token::Ident(&source[s..i])));
|
|
}
|
|
tokens.push((i, Token::SqbClose));
|
|
}
|
|
|
|
x if x.is_ascii() => match window_start {
|
|
None => window_start = Some(i),
|
|
Some(_) => continue,
|
|
},
|
|
|
|
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
|
}
|
|
}
|
|
|
|
if let Some(s) = window_start.take() {
|
|
tokens.push((s, Token::Ident(&source[s..])));
|
|
}
|
|
|
|
return Ok(tokens);
|
|
}
|
|
}
|
|
|
|
#[expect(clippy::expect_used)]
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a fresh [`Tokenizer`] over `src`.
    fn lex(src: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
        Tokenizer::new().tokenize(src)
    }

    /// Like [`lex`], but panics if tokenization fails.
    fn lex_ok(src: &str) -> Vec<(usize, Token<'_>)> {
        lex(src).expect("expected tokenization to succeed")
    }

    #[test]
    fn empty() {
        assert_eq!(lex_ok(""), vec![]);
    }

    #[test]
    fn root_only() {
        let expected = vec![(0, Token::Root)];
        assert_eq!(lex_ok("$"), expected);
    }

    #[test]
    fn dot_only() {
        let expected = vec![(0, Token::Dot)];
        assert_eq!(lex_ok("."), expected);
    }

    #[test]
    fn sqb_open_only() {
        let expected = vec![(0, Token::SqbOpen)];
        assert_eq!(lex_ok("["), expected);
    }

    #[test]
    fn sqb_close_only() {
        let expected = vec![(0, Token::SqbClose)];
        assert_eq!(lex_ok("]"), expected);
    }

    #[test]
    fn ident_only() {
        let expected = vec![(0, Token::Ident("foo"))];
        assert_eq!(lex_ok("foo"), expected);
    }

    #[test]
    fn ident_with_digits() {
        let expected = vec![(0, Token::Ident("abc123"))];
        assert_eq!(lex_ok("abc123"), expected);
    }

    #[test]
    fn root_dot_ident() {
        let expected = vec![(0, Token::Root), (1, Token::Dot), (2, Token::Ident("foo"))];
        assert_eq!(lex_ok("$.foo"), expected);
    }

    #[test]
    fn ident_flushed_before_delimiter() {
        let expected = vec![
            (0, Token::Ident("foo")),
            (3, Token::Dot),
            (4, Token::Ident("bar")),
        ];
        assert_eq!(lex_ok("foo.bar"), expected);
    }

    #[test]
    fn root_after_ident_flushes() {
        // The pending ident window must flush before the Root token.
        let expected = vec![(0, Token::Ident("foo")), (3, Token::Root)];
        assert_eq!(lex_ok("foo$"), expected);
    }

    #[test]
    fn full_path() {
        let expected = vec![
            (0, Token::Root),
            (1, Token::Dot),
            (2, Token::Ident("foo")),
            (5, Token::SqbOpen),
            (6, Token::Ident("0")),
            (7, Token::SqbClose),
        ];
        assert_eq!(lex_ok("$.foo[0]"), expected);
    }

    #[test]
    fn complex_nested() {
        let expected = vec![
            (0, Token::Root),
            (1, Token::Dot),
            (2, Token::Ident("a")),
            (3, Token::SqbOpen),
            (4, Token::Ident("1")),
            (5, Token::SqbClose),
            (6, Token::Dot),
            (7, Token::Ident("b")),
        ];
        assert_eq!(lex_ok("$.a[1].b"), expected);
    }

    #[test]
    fn negative_number_ident() {
        // '-' is ASCII, so "-1" lexes as a single ident.
        let expected = vec![
            (0, Token::SqbOpen),
            (1, Token::Ident("-1")),
            (3, Token::SqbClose),
        ];
        assert_eq!(lex_ok("[-1]"), expected);
    }

    #[test]
    fn root_immediately_followed_by_ident() {
        // "$foo" without a dot still yields Root then Ident.
        let expected = vec![(0, Token::Root), (1, Token::Ident("foo"))];
        assert_eq!(lex_ok("$foo"), expected);
    }

    #[test]
    fn consecutive_delimiters() {
        let expected = vec![(0, Token::Dot), (1, Token::Dot)];
        assert_eq!(lex_ok(".."), expected);
    }

    #[test]
    fn non_ascii_error() {
        // 'é' sits at byte offset 3 ("$", ".", "f" are one byte each).
        let err = PathParseError::NonAsciiChar { position: 3, char: 'é' };
        assert_eq!(lex("$.fé"), Err(err));
    }

    #[test]
    fn non_ascii_at_start() {
        let err = PathParseError::NonAsciiChar { position: 0, char: 'é' };
        assert_eq!(lex("é"), Err(err));
    }

    #[test]
    fn non_ascii_flushes_pending_ident_not_reached() {
        // In "abé" the ident "ab" is still pending when the error fires,
        // but the non-ASCII char is reported regardless.
        let err = PathParseError::NonAsciiChar { position: 2, char: 'é' };
        assert_eq!(lex("abé"), Err(err));
    }
}
|