Files
pile/crates/pile-config/src/objectpath/parser.rs
2026-03-21 10:20:41 -07:00

487 lines
11 KiB
Rust

use std::str::FromStr;
use smartstring::{LazyCompact, SmartString};
use crate::{
Label,
objectpath::{PathParseError, PathSegment, tokenizer::Token},
};
/// Returns the index of the first `(` in `bytes` that is not escaped with a
/// backslash, or `None` when no such paren exists.
fn first_unescaped_open_paren(bytes: &[u8]) -> Option<usize> {
    let mut cursor = 0;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'(' => return Some(cursor),
            // A backslash also consumes the byte that follows it.
            b'\\' => cursor += 2,
            _ => cursor += 1,
        }
    }
    None
}

/// Returns the index of the `)` that balances a paren opened just before
/// `start`, or `None` when `bytes` ends first. Escaped parens are ignored.
fn matching_close_paren(bytes: &[u8], start: usize) -> Option<usize> {
    // One paren is already open when scanning begins.
    let mut open_count: usize = 1;
    let mut cursor = start;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\\' => {
                // Skip the backslash together with the escaped byte.
                cursor += 2;
                continue;
            }
            b'(' => open_count += 1,
            b')' => {
                open_count -= 1;
                if open_count == 0 {
                    return Some(cursor);
                }
            }
            _ => {}
        }
        cursor += 1;
    }
    None
}

/// Builds a `PathSegment::Field` from an ident token of the form `name` or
/// `name(args)`.
///
/// The name is everything before the first unescaped `(`. The args are the
/// text between that paren and its matching `)`, kept verbatim (escape
/// backslashes included). Nested parens are balanced; `\(` and `\)` do not
/// take part in balancing.
///
/// `position` is added to in-ident byte indices when reporting errors
/// (presumably the offset of `ident` within the full path string).
///
/// # Errors
///
/// - `PathParseError::InvalidField` when `Label::new` rejects the name.
/// - `PathParseError::Syntax` when the matching `)` is missing (reported just
///   past the end of `ident`) or is followed by further characters (reported
///   at the first such character).
fn parse_field(ident: &str, position: usize) -> Result<PathSegment, PathParseError> {
    let open_paren = first_unescaped_open_paren(ident.as_bytes());

    // The name is validated before the args are even looked at.
    let name_str = match open_paren {
        Some(idx) => &ident[..idx],
        None => ident,
    };
    let Some(name) = Label::new(name_str) else {
        return Err(PathParseError::InvalidField {
            position,
            str: name_str.into(),
        });
    };

    let Some(open_pos) = open_paren else {
        return Ok(PathSegment::Field { name, args: None });
    };

    let args_start = open_pos + 1;
    match matching_close_paren(ident.as_bytes(), args_start) {
        // Ran off the end of the ident while a paren was still open.
        None => Err(PathParseError::Syntax {
            position: position + ident.len(),
        }),
        // The closing paren has to be the very last character.
        Some(close) if close + 1 != ident.len() => Err(PathParseError::Syntax {
            position: position + close + 1,
        }),
        Some(close) => {
            let args: SmartString<LazyCompact> = ident[args_start..close].into();
            Ok(PathSegment::Field {
                name,
                args: Some(args),
            })
        }
    }
}
/// States of the path parser's token-driven state machine.
///
/// Every variant is either a unit or carries a single `i64`, so the type is
/// cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// No token has been consumed yet
    Start,

    /// We are holding a pointer to an object
    Selected,

    /// We saw a `.` and are waiting for an identifier
    Dot,

    /// We saw `[` and are waiting for a number
    Index,

    /// We parsed the start index, waiting for `]` or the first `.` of `..`
    IndexAfterStart(i64),

    /// We saw one `.` after the start index, waiting for the second `.`
    IndexRangeDot1(i64),

    /// We saw `..`, waiting for the end index (optionally prefixed with `=`)
    IndexRangeDot2(i64),

    /// We parsed the end index of a range, waiting for the closing `]`
    IndexClose,
}
/// Token-driven state machine that turns a token list into a list of
/// [`PathSegment`]s. Create one with [`Parser::new`] and consume it with
/// [`Parser::parse`].
pub struct Parser {
/// Current state; decides which tokens are accepted next.
state: State,
/// Segments produced so far, in the order they were pushed.
segments: Vec<PathSegment>,
}
impl Parser {
    /// Creates a parser in its initial state with no segments collected.
    pub fn new() -> Self {
        Self {
            segments: Vec::new(),
            state: State::Start,
        }
    }

    /// Consumes the parser and converts `tokens` into path segments.
    ///
    /// Each entry of `tokens` pairs a position with a token; that position is
    /// copied into any error raised for the token. `source` is only consulted
    /// for its length, which is the position reported when the tokens run out
    /// in the middle of a segment.
    ///
    /// # Errors
    ///
    /// - `PathParseError::MustStartWithRoot` when the first token is an
    ///   identifier.
    /// - `PathParseError::InvalidIndexString` when an index or a range end is
    ///   not a valid `i64`.
    /// - Any error returned by `parse_field` for an identifier after a `.`.
    /// - `PathParseError::Syntax` for every other token that the current
    ///   state does not accept, for an empty token list (position `0`), and
    ///   for input that ends mid-segment (position `source.len()`).
    pub fn parse(
        mut self,
        source: &str,
        tokens: &[(usize, Token<'_>)],
    ) -> Result<Vec<PathSegment>, PathParseError> {
        for (pos, token) in tokens {
            match (self.state, token) {
                //
                // MARK: start
                //
                (State::Start, Token::Root) => {
                    self.segments.push(PathSegment::Root);
                    self.state = State::Selected;
                }
                (State::Start, Token::Ident(_)) => {
                    return Err(PathParseError::MustStartWithRoot { position: *pos });
                }

                //
                // MARK: selected
                //
                (State::Selected, Token::Dot) => self.state = State::Dot,
                (State::Selected, Token::SqbOpen) => self.state = State::Index,

                //
                // MARK: dot
                //
                (State::Dot, Token::Ident(ident)) => {
                    let segment = parse_field(ident, *pos)?;
                    self.segments.push(segment);
                    self.state = State::Selected;
                }

                //
                // MARK: index
                //
                (State::Index, Token::Ident(ident)) => {
                    let Ok(idx) = i64::from_str(ident) else {
                        return Err(PathParseError::InvalidIndexString {
                            position: *pos,
                            str: (*ident).into(),
                        });
                    };
                    self.state = State::IndexAfterStart(idx);
                }
                (State::IndexAfterStart(idx), Token::SqbClose) => {
                    self.segments.push(PathSegment::Index(idx));
                    self.state = State::Selected;
                }
                (State::IndexAfterStart(idx), Token::Dot) => {
                    self.state = State::IndexRangeDot1(idx);
                }
                (State::IndexRangeDot1(idx), Token::Dot) => {
                    self.state = State::IndexRangeDot2(idx);
                }
                (State::IndexRangeDot2(start), Token::Ident(ident)) => {
                    // A leading `=` marks the range as inclusive.
                    let (end_str, inclusive) = match ident.strip_prefix('=') {
                        Some(rest) => (rest, true),
                        None => (*ident, false),
                    };
                    let Ok(end) = i64::from_str(end_str) else {
                        // The error carries the whole ident, `=` included.
                        return Err(PathParseError::InvalidIndexString {
                            position: *pos,
                            str: (*ident).into(),
                        });
                    };
                    self.segments.push(PathSegment::Range {
                        start,
                        end,
                        inclusive,
                    });
                    self.state = State::IndexClose;
                }
                (State::IndexClose, Token::SqbClose) => self.state = State::Selected,

                //
                // MARK: rejected
                //
                // Every remaining (state, token) pair is a syntax error at the
                // token's position. The wildcard group must stay last so the
                // accepting arms above are tried first.
                (State::Start, Token::Dot | Token::SqbOpen | Token::SqbClose)
                | (State::Selected, Token::Root | Token::Ident(_) | Token::SqbClose)
                | (State::Dot, Token::Root | Token::Dot | Token::SqbOpen | Token::SqbClose)
                | (State::Index, Token::Root | Token::Dot | Token::SqbOpen | Token::SqbClose)
                | (
                    State::IndexAfterStart(_)
                    | State::IndexRangeDot1(_)
                    | State::IndexRangeDot2(_)
                    | State::IndexClose,
                    _,
                ) => {
                    return Err(PathParseError::Syntax { position: *pos });
                }
            }
        }

        // Only `Selected` is a valid place for the tokens to end.
        match self.state {
            State::Selected => Ok(self.segments),
            State::Start => Err(PathParseError::Syntax { position: 0 }),
            State::Dot
            | State::Index
            | State::IndexAfterStart(_)
            | State::IndexRangeDot1(_)
            | State::IndexRangeDot2(_)
            | State::IndexClose => Err(PathParseError::Syntax {
                position: source.len(),
            }),
        }
    }
}
//
// MARK: tests
//
#[expect(clippy::unwrap_used)]
#[cfg(test)]
mod tests {
    use crate::objectpath::tokenizer::Tokenizer;

    use super::*;

    // MARK: helpers

    /// Tokenizes and parses `source`, then checks the outcome against
    /// `expected`. Panics when one side succeeded and the other failed.
    fn parse_test(source: &str, expected: Result<&[PathSegment], PathParseError>) {
        let parsed = Tokenizer::new()
            .tokenize(source)
            .and_then(|tokens| Parser::new().parse(source, &tokens[..]));

        match expected {
            Ok(segs) => match parsed {
                Ok(segments) => assert_eq!(segments, segs),
                Err(e) => panic!("expected {:?}, got error {e}", segs),
            },
            Err(e) => match parsed {
                Err(actual) => assert_eq!(actual, e),
                Ok(segments) => panic!("expected error {e}, got {:?}", segments),
            },
        }
    }

    /// A field segment without args.
    fn field(name: &str) -> PathSegment {
        PathSegment::Field {
            name: Label::new(name).unwrap(),
            args: None,
        }
    }

    /// A field segment carrying `args`.
    fn field_args(name: &str, args: &str) -> PathSegment {
        PathSegment::Field {
            name: Label::new(name).unwrap(),
            args: Some(args.into()),
        }
    }

    /// A range segment.
    fn range(start: i64, end: i64, inclusive: bool) -> PathSegment {
        PathSegment::Range {
            start,
            end,
            inclusive,
        }
    }

    // MARK: fields and indices

    #[test]
    fn root_only() {
        parse_test("$", Ok(&[PathSegment::Root]));
    }

    #[test]
    fn single_field() {
        parse_test("$.foo", Ok(&[PathSegment::Root, field("foo")]));
    }

    #[test]
    fn nested_fields() {
        parse_test(
            "$.foo.bar.baz",
            Ok(&[PathSegment::Root, field("foo"), field("bar"), field("baz")]),
        );
    }

    #[test]
    fn array_index() {
        parse_test(
            "$.items[0]",
            Ok(&[PathSegment::Root, field("items"), PathSegment::Index(0)]),
        );
    }

    #[test]
    fn chained_indices() {
        parse_test(
            "$.a[1][2]",
            Ok(&[
                PathSegment::Root,
                field("a"),
                PathSegment::Index(1),
                PathSegment::Index(2),
            ]),
        );
    }

    #[test]
    fn field_after_index() {
        parse_test(
            "$.a[0].b",
            Ok(&[
                PathSegment::Root,
                field("a"),
                PathSegment::Index(0),
                field("b"),
            ]),
        );
    }

    #[test]
    fn negative_index() {
        parse_test(
            "$.a[-1]",
            Ok(&[PathSegment::Root, field("a"), PathSegment::Index(-1)]),
        );
    }

    // MARK: args

    #[test]
    fn field_with_simple_args() {
        parse_test(
            "$.foo(bar)",
            Ok(&[PathSegment::Root, field_args("foo", "bar")]),
        );
    }

    #[test]
    fn field_with_empty_args() {
        parse_test("$.foo()", Ok(&[PathSegment::Root, field_args("foo", "")]));
    }

    #[test]
    fn field_with_nested_parens_in_args() {
        parse_test(
            "$.foo(a(b)c)",
            Ok(&[PathSegment::Root, field_args("foo", "a(b)c")]),
        );
    }

    #[test]
    fn field_with_deeply_nested_parens_in_args() {
        parse_test(
            "$.foo(a(b(c))d)",
            Ok(&[PathSegment::Root, field_args("foo", "a(b(c))d")]),
        );
    }

    #[test]
    fn field_with_escaped_open_paren_in_args() {
        // The escaped '(' opens nothing, so the single ')' closes the args.
        parse_test(
            r"$.foo(a\(b)",
            Ok(&[PathSegment::Root, field_args("foo", r"a\(b")]),
        );
    }

    #[test]
    fn field_with_escaped_close_paren_in_args() {
        // The escaped ')' closes nothing; the final ')' ends the args.
        parse_test(
            r"$.foo(a\)b)",
            Ok(&[PathSegment::Root, field_args("foo", r"a\)b")]),
        );
    }

    #[test]
    fn field_with_both_escaped_parens_in_args() {
        parse_test(
            r"$.foo(a\(b\)c)",
            Ok(&[PathSegment::Root, field_args("foo", r"a\(b\)c")]),
        );
    }

    #[test]
    fn field_args_with_multiple_segments() {
        parse_test(
            "$.foo(x).bar(y)",
            Ok(&[
                PathSegment::Root,
                field_args("foo", "x"),
                field_args("bar", "y"),
            ]),
        );
    }

    #[test]
    fn field_args_unclosed_paren_error() {
        // No closing ')': the error points just past the end of the source.
        parse_test("$.foo(bar", Err(PathParseError::Syntax { position: 9 }));
    }

    #[test]
    fn field_args_trailing_chars_after_close_error() {
        // Text after the closing ')': the error points at the first extra char.
        parse_test(
            "$.foo(bar)baz",
            Err(PathParseError::Syntax { position: 10 }),
        );
    }

    #[test]
    fn non_ascii_error() {
        parse_test(
            "$.fé",
            Err(PathParseError::NonAsciiChar {
                position: 3,
                char: 'é',
            }),
        );
    }

    // MARK: range

    #[test]
    fn exclusive_range() {
        parse_test(
            "$.a[0..5]",
            Ok(&[PathSegment::Root, field("a"), range(0, 5, false)]),
        );
    }

    #[test]
    fn inclusive_range() {
        parse_test(
            "$.a[1..=2]",
            Ok(&[PathSegment::Root, field("a"), range(1, 2, true)]),
        );
    }

    #[test]
    fn range_with_negative_end() {
        parse_test(
            "$.a[0..-1]",
            Ok(&[PathSegment::Root, field("a"), range(0, -1, false)]),
        );
    }

    #[test]
    fn range_with_negative_start() {
        parse_test(
            "$.a[-3..-1]",
            Ok(&[PathSegment::Root, field("a"), range(-3, -1, false)]),
        );
    }
}