Add ObjectPath query language
Some checks failed
CI / Typos (push) Successful in 19s
CI / Build and test (push) Failing after 40s
CI / Clippy (push) Failing after 53s

2026-03-05 21:35:07 -08:00
parent 0053ed3a69
commit a9e402bc83
11 changed files with 657 additions and 48 deletions


@@ -10,6 +10,7 @@ workspace = true
[dependencies]
serde = { workspace = true }
smartstring = { workspace = true }
thiserror = { workspace = true }
[dev-dependencies]
toml = { workspace = true }


@@ -7,10 +7,14 @@ pub use post::*;
mod misc;
pub use misc::*;
use crate::objectpath::ObjectPath;
pub mod objectpath;
pub static INIT_DB_TOML: &str = include_str!("./config.toml");
#[test]
#[expect(clippy::expect_used)]
fn init_db_toml_valid() {
toml::from_str::<ConfigToml>(INIT_DB_TOML).expect("INIT_DB_TOML should be valid TOML");
}
@@ -56,7 +60,7 @@ pub struct FieldSpec {
pub r#type: FieldType,
/// How to find this field in a data entry
pub path: OneOrMany<String>,
pub path: OneOrMany<ObjectPath>,
/// How to post-process this field
#[serde(default)]
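Since `ObjectPath` deserializes from a plain string (see the new `objectpath` module below), a `path` value in the TOML config is still written as a string. A minimal, hypothetical sketch of that round trip, assuming `use crate::objectpath::ObjectPath` and the `toml` dev-dependency; the wrapper struct and field values are illustrative, not part of the crate:

#[test]
fn path_deserializes_from_toml_string() {
    // Hypothetical wrapper, only to show string-based deserialization of a path.
    #[derive(serde::Deserialize)]
    struct Example {
        path: ObjectPath,
    }

    let ex: Example = toml::from_str(r#"path = "$.foo[0].bar""#).unwrap();
    assert_eq!(ex.path.segments.len(), 4); // Root, foo, [0], bar
}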


@@ -0,0 +1,95 @@
use std::{fmt, str::FromStr};
use serde::{
Deserialize, Deserializer,
de::{self, Visitor},
};
use smartstring::{LazyCompact, SmartString};
use thiserror::Error;
use crate::Label;
mod parser;
mod tokenizer;
#[derive(Debug, Error, PartialEq)]
pub enum PathParseError {
#[error("invalid syntax at index {position}")]
Syntax { position: usize },
#[error("path string must start with $")]
MustStartWithRoot { position: usize },
#[error("invalid field {str:?} at {position}")]
InvalidField {
position: usize,
str: SmartString<LazyCompact>,
},
#[error("invalid index {str:?} at {position}")]
InvalidIndexString {
position: usize,
str: SmartString<LazyCompact>,
},
#[error("non-ascii character {char:?} at index {position}")]
NonAsciiChar { position: usize, char: char },
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PathSegment {
/// Go to root node (`$` identifier)
Root,
/// Go to a child of the current object
Field(Label),
/// Go to an element of the current list
Index(i64),
}
/// A path to a [`PathSegment::Field`] inside a nested object.
/// This is a subset of RFC 9535 JSONPath.
///
/// Format:
/// - `$` refers to the root object
/// - `.<name>` selects a [`PathSegment::Field`] of an object
/// - `[n]` selects an item of an array
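///
/// For example, `$.users[0].name` (illustrative names) selects the `name`
/// field of the first element of the `users` array under the root.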
#[derive(Debug, Clone)]
pub struct ObjectPath {
pub segments: Vec<PathSegment>,
}
impl<'de> Deserialize<'de> for ObjectPath {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
struct PathVisitor;
impl Visitor<'_> for PathVisitor {
type Value = ObjectPath;
fn expecting(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str("an objectpath")
}
fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> {
v.parse().map_err(de::Error::custom)
}
}
deserializer.deserialize_str(PathVisitor)
}
}
impl FromStr for ObjectPath {
type Err = PathParseError;
fn from_str(source: &str) -> Result<Self, Self::Err> {
let tk = tokenizer::Tokenizer::new();
let tk = tk.tokenize(source)?;
let ps = parser::Parser::new();
let segments = ps.parse(source, &tk)?;
Ok(Self { segments })
}
}
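For orientation, a minimal usage sketch (not part of this diff) showing how a path string parses into segments via the `FromStr` impl above; it assumes the `Label::new` constructor behaves as in the parser tests below:

#[test]
fn objectpath_usage_sketch() {
    // Illustrative only: parse a path string and inspect its segments.
    let path: ObjectPath = "$.items[0]".parse().expect("valid path");
    assert_eq!(
        path.segments,
        vec![
            PathSegment::Root,
            PathSegment::Field(Label::new("items").expect("valid label")),
            PathSegment::Index(0),
        ]
    );
}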


@@ -0,0 +1,248 @@
use std::str::FromStr;
use crate::{
Label,
objectpath::{PathParseError, PathSegment, tokenizer::Token},
};
enum State {
Start,
/// We are holding a pointer to an object
Selected,
/// We are waiting for an identifier
Dot,
/// We are indexing an array, waiting for a number
Index,
/// We are indexing an array, waiting for a close-bracket
IndexClose,
}
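// Illustrative walkthrough: parsing "$.a[0]" moves through
//   Start --'$'--> Selected --'.'--> Dot --"a"--> Selected
//   --'['--> Index --"0"--> IndexClose --']'--> Selected
// and ends in Selected, the only accepting state.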
pub struct Parser {
state: State,
segments: Vec<PathSegment>,
}
impl Parser {
pub fn new() -> Self {
Parser {
state: State::Start,
segments: Vec::new(),
}
}
pub fn parse(
mut self,
source: &str,
tokens: &[(usize, Token<'_>)],
) -> Result<Vec<PathSegment>, PathParseError> {
for t in tokens {
match (self.state, t) {
(State::Start, (_, Token::Root)) => {
self.segments.push(PathSegment::Root);
self.state = State::Selected
}
(State::Start, (p, Token::Ident(_))) => {
return Err(PathParseError::MustStartWithRoot { position: *p });
}
(State::Start, (p, Token::Dot))
| (State::Start, (p, Token::SqbOpen))
| (State::Start, (p, Token::SqbClose)) => {
return Err(PathParseError::Syntax { position: *p });
}
//
// MARK: selected
//
(State::Selected, (_, Token::Dot)) => self.state = State::Dot,
(State::Selected, (_, Token::SqbOpen)) => self.state = State::Index,
(State::Selected, (p, Token::Root))
| (State::Selected, (p, Token::Ident(_)))
| (State::Selected, (p, Token::SqbClose)) => {
return Err(PathParseError::Syntax { position: *p });
}
//
// MARK: dot
//
(State::Dot, (p, Token::Ident(ident))) => {
self.segments
.push(PathSegment::Field(Label::new(*ident).ok_or_else(|| {
PathParseError::InvalidField {
position: *p,
str: (*ident).into(),
}
})?));
self.state = State::Selected;
}
(State::Dot, (p, Token::Root))
| (State::Dot, (p, Token::Dot))
| (State::Dot, (p, Token::SqbOpen))
| (State::Dot, (p, Token::SqbClose)) => {
return Err(PathParseError::Syntax { position: *p });
}
//
// MARK: index
//
(State::Index, (p, Token::Ident(ident))) => {
let idx: i64 = i64::from_str(ident).map_err(|_err| {
PathParseError::InvalidIndexString {
position: *p,
str: (*ident).into(),
}
})?;
self.segments.push(PathSegment::Index(idx));
self.state = State::IndexClose;
}
(State::Index, (p, Token::Root))
| (State::Index, (p, Token::Dot))
| (State::Index, (p, Token::SqbOpen))
| (State::Index, (p, Token::SqbClose)) => {
return Err(PathParseError::Syntax { position: *p });
}
(State::IndexClose, (_, Token::SqbClose)) => self.state = State::Selected,
(State::IndexClose, (p, _)) => {
return Err(PathParseError::Syntax { position: *p });
}
}
}
let position = source.len();
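// Only Selected is an accepting final state: a trailing '.', an empty or
// unclosed index, or an empty input is a syntax error at end of input.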
match self.state {
State::Start => Err(PathParseError::Syntax { position: 0 }),
State::Dot => Err(PathParseError::Syntax { position }),
State::Index => Err(PathParseError::Syntax { position }),
State::IndexClose => Err(PathParseError::Syntax { position }),
State::Selected => Ok(()),
}?;
Ok(self.segments)
}
}
//
// MARK: tests
//
#[expect(clippy::unwrap_used)]
#[cfg(test)]
mod tests {
use crate::objectpath::tokenizer::Tokenizer;
use super::*;
fn parse_test(source: &str, expected: Result<&[PathSegment], PathParseError>) {
let parsed = Tokenizer::new()
.tokenize(source)
.and_then(|tokens| Parser::new().parse(source, &tokens[..]));
match (parsed, expected) {
(Ok(segments), Ok(segs)) => assert_eq!(segments, segs),
(Err(e), Err(expected_err)) => assert_eq!(e, expected_err),
(Ok(segments), Err(e)) => panic!("expected error {e}, got {:?}", segments),
(Err(e), Ok(segs)) => panic!("expected {:?}, got error {e}", segs),
}
}
#[test]
fn root_only() {
parse_test("$", Ok(&[PathSegment::Root]));
}
#[test]
fn single_field() {
parse_test(
"$.foo",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
]),
);
}
#[test]
fn nested_fields() {
parse_test(
"$.foo.bar.baz",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
PathSegment::Field(Label::new("bar").unwrap()),
PathSegment::Field(Label::new("baz").unwrap()),
]),
);
}
#[test]
fn array_index() {
parse_test(
"$.items[0]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("items").unwrap()),
PathSegment::Index(0),
]),
);
}
#[test]
fn chained_indices() {
parse_test(
"$.a[1][2]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
PathSegment::Index(1),
PathSegment::Index(2),
]),
);
}
#[test]
fn field_after_index() {
parse_test(
"$.a[0].b",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
PathSegment::Index(0),
PathSegment::Field(Label::new("b").unwrap()),
]),
);
}
#[test]
fn negative_index() {
parse_test(
"$.a[-1]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
PathSegment::Index(-1),
]),
);
}
#[test]
fn non_ascii_error() {
parse_test(
"$.fé",
Err(PathParseError::NonAsciiChar {
position: 3,
char: 'é',
}),
);
}
}


@@ -0,0 +1,241 @@
use crate::objectpath::PathParseError;
#[cfg_attr(test, derive(PartialEq))]
#[derive(Debug)]
pub enum Token<'a> {
Root,
Ident(&'a str),
Dot,
SqbOpen,
SqbClose,
}
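// '$' -> Root, '.' -> Dot, '[' -> SqbOpen, ']' -> SqbClose;
// any other run of ASCII characters becomes a single Ident.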
pub struct Tokenizer {}
impl Tokenizer {
pub fn new() -> Self {
Self {}
}
pub fn tokenize(self, source: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
let mut tokens = Vec::new();
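// window_start marks the byte offset where the current identifier run began;
// it is flushed into an Ident token whenever a delimiter (or end of input) is reached.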
let mut window_start = None;
for (i, c) in source.char_indices() {
match c {
'$' => {
if let Some(s) = window_start.take() {
tokens.push((s, Token::Ident(&source[s..i])));
}
tokens.push((i, Token::Root));
}
'.' => {
if let Some(s) = window_start.take() {
tokens.push((s, Token::Ident(&source[s..i])));
}
tokens.push((i, Token::Dot));
}
'[' => {
if let Some(s) = window_start.take() {
tokens.push((s, Token::Ident(&source[s..i])));
}
tokens.push((i, Token::SqbOpen));
}
']' => {
if let Some(s) = window_start.take() {
tokens.push((s, Token::Ident(&source[s..i])));
}
tokens.push((i, Token::SqbClose));
}
x if x.is_ascii() => match window_start {
None => window_start = Some(i),
Some(_) => continue,
},
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
}
}
if let Some(s) = window_start.take() {
tokens.push((s, Token::Ident(&source[s..])));
}
Ok(tokens)
}
}
#[expect(clippy::expect_used)]
#[cfg(test)]
mod tests {
use super::*;
fn tokenize(source: &str) -> Result<Vec<(usize, Token<'_>)>, PathParseError> {
Tokenizer::new().tokenize(source)
}
fn tok_ok(source: &str) -> Vec<(usize, Token<'_>)> {
tokenize(source).expect("expected tokenization to succeed")
}
#[test]
fn empty() {
assert_eq!(tok_ok(""), vec![]);
}
#[test]
fn root_only() {
assert_eq!(tok_ok("$"), vec![(0, Token::Root)]);
}
#[test]
fn dot_only() {
assert_eq!(tok_ok("."), vec![(0, Token::Dot)]);
}
#[test]
fn sqb_open_only() {
assert_eq!(tok_ok("["), vec![(0, Token::SqbOpen)]);
}
#[test]
fn sqb_close_only() {
assert_eq!(tok_ok("]"), vec![(0, Token::SqbClose)]);
}
#[test]
fn ident_only() {
assert_eq!(tok_ok("foo"), vec![(0, Token::Ident("foo"))]);
}
#[test]
fn ident_with_digits() {
assert_eq!(tok_ok("abc123"), vec![(0, Token::Ident("abc123"))]);
}
#[test]
fn root_dot_ident() {
assert_eq!(
tok_ok("$.foo"),
vec![(0, Token::Root), (1, Token::Dot), (2, Token::Ident("foo"))]
);
}
#[test]
fn ident_flushed_before_delimiter() {
assert_eq!(
tok_ok("foo.bar"),
vec![
(0, Token::Ident("foo")),
(3, Token::Dot),
(4, Token::Ident("bar"))
]
);
}
#[test]
fn root_after_ident_flushes() {
// ident window should flush before Root token
assert_eq!(
tok_ok("foo$"),
vec![(0, Token::Ident("foo")), (3, Token::Root)]
);
}
#[test]
fn full_path() {
assert_eq!(
tok_ok("$.foo[0]"),
vec![
(0, Token::Root),
(1, Token::Dot),
(2, Token::Ident("foo")),
(5, Token::SqbOpen),
(6, Token::Ident("0")),
(7, Token::SqbClose),
]
);
}
#[test]
fn complex_nested() {
assert_eq!(
tok_ok("$.a[1].b"),
vec![
(0, Token::Root),
(1, Token::Dot),
(2, Token::Ident("a")),
(3, Token::SqbOpen),
(4, Token::Ident("1")),
(5, Token::SqbClose),
(6, Token::Dot),
(7, Token::Ident("b")),
]
);
}
#[test]
fn negative_number_ident() {
// '-' is ASCII, so "-1" is a single ident
assert_eq!(
tok_ok("[-1]"),
vec![
(0, Token::SqbOpen),
(1, Token::Ident("-1")),
(3, Token::SqbClose)
]
);
}
#[test]
fn root_immediately_followed_by_ident() {
// "$foo" with no dot — produces Root then Ident
assert_eq!(
tok_ok("$foo"),
vec![(0, Token::Root), (1, Token::Ident("foo"))]
);
}
#[test]
fn consecutive_delimiters() {
assert_eq!(tok_ok(".."), vec![(0, Token::Dot), (1, Token::Dot)]);
}
#[test]
fn non_ascii_error() {
assert_eq!(
tokenize("$.fé"),
Err(PathParseError::NonAsciiChar {
position: 3,
char: 'é'
})
);
}
#[test]
fn non_ascii_at_start() {
assert_eq!(
tokenize("é"),
Err(PathParseError::NonAsciiChar {
position: 0,
char: 'é'
})
);
}
#[test]
fn non_ascii_flushes_pending_ident_not_reached() {
// "ab é" — the ident "ab" is not yet flushed when error occurs,
// but we still get an error for the non-ascii char
assert_eq!(
tokenize("abé"),
Err(PathParseError::NonAsciiChar {
position: 2,
char: 'é'
})
);
}
}