From a9e402bc83d700ebad3f989fa527f426d772eefb Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:35:07 -0800 Subject: [PATCH] Add `ObjectPath` query language --- Cargo.lock | 1 + Cargo.toml | 2 +- crates/pile-config/Cargo.toml | 1 + crates/pile-config/src/lib.rs | 8 +- crates/pile-config/src/objectpath/mod.rs | 95 +++++++ crates/pile-config/src/objectpath/parser.rs | 248 ++++++++++++++++++ .../pile-config/src/objectpath/tokenizer.rs | 241 +++++++++++++++++ crates/pile-dataset/src/index/index_fts.rs | 17 +- crates/pile-dataset/src/misc.rs | 26 +- crates/pile-dataset/src/value.rs | 43 ++- crates/pile/src/command/lookup.rs | 23 +- 11 files changed, 657 insertions(+), 48 deletions(-) create mode 100644 crates/pile-config/src/objectpath/mod.rs create mode 100644 crates/pile-config/src/objectpath/parser.rs create mode 100644 crates/pile-config/src/objectpath/tokenizer.rs diff --git a/Cargo.lock b/Cargo.lock index d9399a6..a8e9d46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -960,6 +960,7 @@ version = "0.0.1" dependencies = [ "serde", "smartstring", + "thiserror", "toml", ] diff --git a/Cargo.toml b/Cargo.toml index 9e6ea34..cd47aa3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/*"] resolver = "2" [workspace.package] -rust-version = "1.91.0" +rust-version = "1.94.0" edition = "2024" version = "0.0.1" diff --git a/crates/pile-config/Cargo.toml b/crates/pile-config/Cargo.toml index d2e66ec..0fe2c77 100644 --- a/crates/pile-config/Cargo.toml +++ b/crates/pile-config/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] serde = { workspace = true } smartstring = { workspace = true } +thiserror = { workspace = true } [dev-dependencies] toml = { workspace = true } diff --git a/crates/pile-config/src/lib.rs b/crates/pile-config/src/lib.rs index 8aaa828..819501b 100644 --- a/crates/pile-config/src/lib.rs +++ b/crates/pile-config/src/lib.rs @@ -7,10 +7,14 @@ pub use post::*; mod misc; pub use 
misc::*; +use crate::objectpath::ObjectPath; + +pub mod objectpath; + pub static INIT_DB_TOML: &str = include_str!("./config.toml"); #[test] -#[expect(clippy::unwrap_used)] +#[expect(clippy::expect_used)] fn init_db_toml_valid() { toml::from_str::(INIT_DB_TOML).expect("INIT_DB_TOML should be valid TOML"); } @@ -56,7 +60,7 @@ pub struct FieldSpec { pub r#type: FieldType, /// How to find this field in a data entry - pub path: OneOrMany, + pub path: OneOrMany, /// How to post-process this field #[serde(default)] diff --git a/crates/pile-config/src/objectpath/mod.rs b/crates/pile-config/src/objectpath/mod.rs new file mode 100644 index 0000000..d3598b9 --- /dev/null +++ b/crates/pile-config/src/objectpath/mod.rs @@ -0,0 +1,95 @@ +use std::{fmt, str::FromStr}; + +use serde::{ + Deserialize, Deserializer, + de::{self, Visitor}, +}; +use smartstring::{LazyCompact, SmartString}; +use thiserror::Error; + +use crate::Label; + +mod parser; +mod tokenizer; + +#[derive(Debug, Error, PartialEq)] +pub enum PathParseError { + #[error("invalid syntax at index {position}")] + Syntax { position: usize }, + + #[error("path string must start with $")] + MustStartWithRoot { position: usize }, + + #[error("invalid field {str:?} at {position}")] + InvalidField { + position: usize, + str: SmartString, + }, + + #[error("invalid index {str:?} at {position}")] + InvalidIndexString { + position: usize, + str: SmartString, + }, + + #[error("non-ascii character {char:?} at index {position}")] + NonAsciiChar { position: usize, char: char }, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum PathSegment { + /// Go to root node (`$` identifier) + Root, + + /// Go to a child of the current object + Field(Label), + + /// Go to an element of the current list + Index(i64), +} + +/// A path to a field inside a nested object. +/// This is a subset of RFC 9535 JSONPath. 
+/// +/// Format: +/// - `$` refers to the root object +/// - `.name` selects a field of an object +/// - `[n]` selects an item of an array +#[derive(Debug, Clone)] +pub struct ObjectPath { + pub segments: Vec, +} + +impl<'de> Deserialize<'de> for ObjectPath { + fn deserialize>(deserializer: D) -> Result { + struct PathVisitor; + + impl Visitor<'_> for PathVisitor { + type Value = ObjectPath; + + fn expecting(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("an objectpath") + } + + fn visit_str(self, v: &str) -> Result { + v.parse().map_err(de::Error::custom) + } + } + + deserializer.deserialize_str(PathVisitor) + } +} + +impl FromStr for ObjectPath { + type Err = PathParseError; + + fn from_str(source: &str) -> Result { + let tk = tokenizer::Tokenizer::new(); + let tk = tk.tokenize(source)?; + + let ps = parser::Parser::new(); + let segments = ps.parse(source, &tk)?; + + return Ok(Self { segments }); + } +} diff --git a/crates/pile-config/src/objectpath/parser.rs b/crates/pile-config/src/objectpath/parser.rs new file mode 100644 index 0000000..ea96acf --- /dev/null +++ b/crates/pile-config/src/objectpath/parser.rs @@ -0,0 +1,248 @@ +use std::str::FromStr; + +use crate::{ + Label, + objectpath::{PathParseError, PathSegment, tokenizer::Token}, +}; + +enum State { + Start, + + /// We are holding a pointer to an object + Selected, + + /// We are waiting for an identifier + Dot, + + /// We are indexing an array, waiting for a number + Index, + + /// We are indexing an array, waiting for a close-bracket + IndexClose, +} + +pub struct Parser { + state: State, + segments: Vec, +} + +impl Parser { + pub fn new() -> Self { + Parser { + state: State::Start, + segments: Vec::new(), + } + } + + pub fn parse( + mut self, + source: &str, + tokens: &[(usize, Token<'_>)], + ) -> Result, PathParseError> { + for t in tokens { + match (self.state, t) { + (State::Start, (_, Token::Root)) => { + self.segments.push(PathSegment::Root); + self.state = 
State::Selected + } + + (State::Start, (p, Token::Ident(_))) => { + return Err(PathParseError::MustStartWithRoot { position: *p }); + } + + (State::Start, (p, Token::Dot)) + | (State::Start, (p, Token::SqbOpen)) + | (State::Start, (p, Token::SqbClose)) => { + return Err(PathParseError::Syntax { position: *p }); + } + + // + // MARK: selected + // + (State::Selected, (_, Token::Dot)) => self.state = State::Dot, + (State::Selected, (_, Token::SqbOpen)) => self.state = State::Index, + + (State::Selected, (p, Token::Root)) + | (State::Selected, (p, Token::Ident(_))) + | (State::Selected, (p, Token::SqbClose)) => { + return Err(PathParseError::Syntax { position: *p }); + } + + // + // MARK: dot + // + (State::Dot, (p, Token::Ident(ident))) => { + self.segments + .push(PathSegment::Field(Label::new(*ident).ok_or_else(|| { + PathParseError::InvalidField { + position: *p, + str: (*ident).into(), + } + })?)); + + self.state = State::Selected; + } + + (State::Dot, (p, Token::Root)) + | (State::Dot, (p, Token::Dot)) + | (State::Dot, (p, Token::SqbOpen)) + | (State::Dot, (p, Token::SqbClose)) => { + return Err(PathParseError::Syntax { position: *p }); + } + + // + // MARK: index + // + (State::Index, (p, Token::Ident(ident))) => { + let idx: i64 = i64::from_str(ident).map_err(|_err| { + PathParseError::InvalidIndexString { + position: *p, + str: (*ident).into(), + } + })?; + + self.segments.push(PathSegment::Index(idx)); + self.state = State::IndexClose; + } + + (State::Index, (p, Token::Root)) + | (State::Index, (p, Token::Dot)) + | (State::Index, (p, Token::SqbOpen)) + | (State::Index, (p, Token::SqbClose)) => { + return Err(PathParseError::Syntax { position: *p }); + } + + (State::IndexClose, (_, Token::SqbClose)) => self.state = State::Selected, + (State::IndexClose, (p, _)) => { + return Err(PathParseError::Syntax { position: *p }); + } + } + } + + let position = source.len(); + match self.state { + State::Start => Err(PathParseError::Syntax { position: 0 }), + State::Dot 
=> Err(PathParseError::Syntax { position }), + State::Index => Err(PathParseError::Syntax { position }), + State::IndexClose => Err(PathParseError::Syntax { position }), + State::Selected => Ok(()), + }?; + + return Ok(self.segments); + } +} + +// +// MARK: tests +// + +#[expect(clippy::unwrap_used)] +#[cfg(test)] +mod tests { + use crate::objectpath::tokenizer::Tokenizer; + + use super::*; + + fn parse_test(source: &str, expected: Result<&[PathSegment], PathParseError>) { + let parsed = Tokenizer::new() + .tokenize(source) + .and_then(|tokens| Parser::new().parse(source, &tokens[..])); + + match (parsed, expected) { + (Ok(segments), Ok(segs)) => assert_eq!(segments, segs), + (Err(e), Err(expected_err)) => assert_eq!(e, expected_err), + (Ok(segments), Err(e)) => panic!("expected error {e}, got {:?}", segments), + (Err(e), Ok(segs)) => panic!("expected {:?}, got error {e}", segs), + } + } + + #[test] + fn root_only() { + parse_test("$", Ok(&[PathSegment::Root])); + } + + #[test] + fn single_field() { + parse_test( + "$.foo", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("foo").unwrap()), + ]), + ); + } + + #[test] + fn nested_fields() { + parse_test( + "$.foo.bar.baz", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("foo").unwrap()), + PathSegment::Field(Label::new("bar").unwrap()), + PathSegment::Field(Label::new("baz").unwrap()), + ]), + ); + } + + #[test] + fn array_index() { + parse_test( + "$.items[0]", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("items").unwrap()), + PathSegment::Index(0), + ]), + ); + } + + #[test] + fn chained_indices() { + parse_test( + "$.a[1][2]", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("a").unwrap()), + PathSegment::Index(1), + PathSegment::Index(2), + ]), + ); + } + + #[test] + fn field_after_index() { + parse_test( + "$.a[0].b", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("a").unwrap()), + PathSegment::Index(0), + 
PathSegment::Field(Label::new("b").unwrap()), + ]), + ); + } + + #[test] + fn negative_index() { + parse_test( + "$.a[-1]", + Ok(&[ + PathSegment::Root, + PathSegment::Field(Label::new("a").unwrap()), + PathSegment::Index(-1), + ]), + ); + } + + #[test] + fn non_ascii_error() { + parse_test( + "$.fé", + Err(PathParseError::NonAsciiChar { + position: 3, + char: 'é', + }), + ); + } +} diff --git a/crates/pile-config/src/objectpath/tokenizer.rs b/crates/pile-config/src/objectpath/tokenizer.rs new file mode 100644 index 0000000..efbf1f7 --- /dev/null +++ b/crates/pile-config/src/objectpath/tokenizer.rs @@ -0,0 +1,241 @@ +use crate::objectpath::PathParseError; + +#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug)] +pub enum Token<'a> { + Root, + Ident(&'a str), + Dot, + SqbOpen, + SqbClose, +} + +pub struct Tokenizer {} + +impl Tokenizer { + pub fn new() -> Self { + Self {} + } + + pub fn tokenize(self, source: &str) -> Result)>, PathParseError> { + let mut tokens = Vec::new(); + + let mut window_start = None; + for (i, c) in source.char_indices() { + match c { + '$' => { + if let Some(s) = window_start.take() { + tokens.push((s, Token::Ident(&source[s..i]))); + } + tokens.push((i, Token::Root)); + } + + '.' 
=> { + if let Some(s) = window_start.take() { + tokens.push((s, Token::Ident(&source[s..i]))); + } + tokens.push((i, Token::Dot)); + } + + '[' => { + if let Some(s) = window_start.take() { + tokens.push((s, Token::Ident(&source[s..i]))); + } + tokens.push((i, Token::SqbOpen)); + } + + ']' => { + if let Some(s) = window_start.take() { + tokens.push((s, Token::Ident(&source[s..i]))); + } + tokens.push((i, Token::SqbClose)); + } + + x if x.is_ascii() => match window_start { + None => window_start = Some(i), + Some(_) => continue, + }, + + char => return Err(PathParseError::NonAsciiChar { position: i, char }), + } + } + + if let Some(s) = window_start.take() { + tokens.push((s, Token::Ident(&source[s..]))); + } + + return Ok(tokens); + } +} + +#[expect(clippy::expect_used)] +#[cfg(test)] +mod tests { + use super::*; + + fn tokenize(source: &str) -> Result)>, PathParseError> { + Tokenizer::new().tokenize(source) + } + + fn tok_ok(source: &str) -> Vec<(usize, Token<'_>)> { + tokenize(source).expect("expected tokenization to succeed") + } + + #[test] + fn empty() { + assert_eq!(tok_ok(""), vec![]); + } + + #[test] + fn root_only() { + assert_eq!(tok_ok("$"), vec![(0, Token::Root)]); + } + + #[test] + fn dot_only() { + assert_eq!(tok_ok("."), vec![(0, Token::Dot)]); + } + + #[test] + fn sqb_open_only() { + assert_eq!(tok_ok("["), vec![(0, Token::SqbOpen)]); + } + + #[test] + fn sqb_close_only() { + assert_eq!(tok_ok("]"), vec![(0, Token::SqbClose)]); + } + + #[test] + fn ident_only() { + assert_eq!(tok_ok("foo"), vec![(0, Token::Ident("foo"))]); + } + + #[test] + fn ident_with_digits() { + assert_eq!(tok_ok("abc123"), vec![(0, Token::Ident("abc123"))]); + } + + #[test] + fn root_dot_ident() { + assert_eq!( + tok_ok("$.foo"), + vec![(0, Token::Root), (1, Token::Dot), (2, Token::Ident("foo"))] + ); + } + + #[test] + fn ident_flushed_before_delimiter() { + assert_eq!( + tok_ok("foo.bar"), + vec![ + (0, Token::Ident("foo")), + (3, Token::Dot), + (4, Token::Ident("bar")) + ] + 
); + } + + #[test] + fn root_after_ident_flushes() { + // ident window should flush before Root token + assert_eq!( + tok_ok("foo$"), + vec![(0, Token::Ident("foo")), (3, Token::Root)] + ); + } + + #[test] + fn full_path() { + assert_eq!( + tok_ok("$.foo[0]"), + vec![ + (0, Token::Root), + (1, Token::Dot), + (2, Token::Ident("foo")), + (5, Token::SqbOpen), + (6, Token::Ident("0")), + (7, Token::SqbClose), + ] + ); + } + + #[test] + fn complex_nested() { + assert_eq!( + tok_ok("$.a[1].b"), + vec![ + (0, Token::Root), + (1, Token::Dot), + (2, Token::Ident("a")), + (3, Token::SqbOpen), + (4, Token::Ident("1")), + (5, Token::SqbClose), + (6, Token::Dot), + (7, Token::Ident("b")), + ] + ); + } + + #[test] + fn negative_number_ident() { + // '-' is ASCII, so "-1" is a single ident + assert_eq!( + tok_ok("[-1]"), + vec![ + (0, Token::SqbOpen), + (1, Token::Ident("-1")), + (3, Token::SqbClose) + ] + ); + } + + #[test] + fn root_immediately_followed_by_ident() { + // "$foo" with no dot — produces Root then Ident + assert_eq!( + tok_ok("$foo"), + vec![(0, Token::Root), (1, Token::Ident("foo"))] + ); + } + + #[test] + fn consecutive_delimiters() { + assert_eq!(tok_ok(".."), vec![(0, Token::Dot), (1, Token::Dot)]); + } + + #[test] + fn non_ascii_error() { + assert_eq!( + tokenize("$.fé"), + Err(PathParseError::NonAsciiChar { + position: 3, + char: 'é' + }) + ); + } + + #[test] + fn non_ascii_at_start() { + assert_eq!( + tokenize("é"), + Err(PathParseError::NonAsciiChar { + position: 0, + char: 'é' + }) + ); + } + + #[test] + fn non_ascii_flushes_pending_ident_not_reached() { + // "ab é" — the ident "ab" is not yet flushed when error occurs, + // but we still get an error for the non-ascii char + assert_eq!( + tokenize("abé"), + Err(PathParseError::NonAsciiChar { + position: 2, + char: 'é' + }) + ); + } +} diff --git a/crates/pile-dataset/src/index/index_fts.rs b/crates/pile-dataset/src/index/index_fts.rs index b166dbc..95aa23d 100644 --- 
a/crates/pile-dataset/src/index/index_fts.rs +++ b/crates/pile-dataset/src/index/index_fts.rs @@ -130,12 +130,7 @@ impl DbFtsIndex { // Try paths in order, using the first value we find 'outer: for path in field.path.as_slice() { - let segments = path - .split('.') - .map(|x| Label::new(x).unwrap_or_else(|| panic!("wtf {x}"))) - .collect::>(); - - let val = match extractor.query(&segments)? { + let val = match extractor.query(path)? { Some(x) => x, None => return Ok(None), }; @@ -145,7 +140,7 @@ impl DbFtsIndex { trace!( message = "Skipping field, is null", field = field_name.to_string(), - path, + ?path, // value = ?val ); continue; @@ -170,7 +165,7 @@ impl DbFtsIndex { debug!( message = "Skipping field, is array with more than one element", field = field_name.to_string(), - path, + ?path, //value = ?val ); continue 'outer; @@ -178,7 +173,7 @@ impl DbFtsIndex { debug!( message = "Skipping field, is empty array", field = field_name.to_string(), - path, + ?path, //value = ?val ); continue 'outer; @@ -188,7 +183,7 @@ impl DbFtsIndex { trace!( message = "Skipping field, is null", field = field_name.to_string(), - path, + ?path, //value = ?val ); continue 'outer; @@ -197,7 +192,7 @@ impl DbFtsIndex { trace!( message = "Skipping field, is object", field = field_name.to_string(), - path, + ?path, //value = ?val ); continue 'outer; diff --git a/crates/pile-dataset/src/misc.rs b/crates/pile-dataset/src/misc.rs index c86aa61..0b85edb 100644 --- a/crates/pile-dataset/src/misc.rs +++ b/crates/pile-dataset/src/misc.rs @@ -71,12 +71,13 @@ fn find_latest_modified(dir: &Path) -> Result>, std::io::Er }); } } else if metadata.is_dir() - && let Some(dir_latest) = find_latest_modified(&path)? { - latest = Some(match latest { - Some(prev) if prev > dir_latest => prev, - _ => dir_latest, - }); - } + && let Some(dir_latest) = find_latest_modified(&path)? 
+ { + latest = Some(match latest { + Some(prev) if prev > dir_latest => prev, + _ => dir_latest, + }); + } } return Ok(latest); @@ -107,12 +108,13 @@ fn find_earliest_modified(dir: &Path) -> Result>, std::io:: }); } } else if metadata.is_dir() - && let Some(dir_earliest) = find_earliest_modified(&path)? { - earliest = Some(match earliest { - Some(prev) if prev < dir_earliest => prev, - _ => dir_earliest, - }); - } + && let Some(dir_earliest) = find_earliest_modified(&path)? + { + earliest = Some(match earliest { + Some(prev) if prev < dir_earliest => prev, + _ => dir_earliest, + }); + } } return Ok(earliest); diff --git a/crates/pile-dataset/src/value.rs b/crates/pile-dataset/src/value.rs index e236871..2a1e75b 100644 --- a/crates/pile-dataset/src/value.rs +++ b/crates/pile-dataset/src/value.rs @@ -1,6 +1,6 @@ use std::rc::Rc; -use pile_config::Label; +use pile_config::objectpath::{ObjectPath, PathSegment}; use serde_json::{Map, Value}; use smartstring::{LazyCompact, SmartString}; @@ -32,17 +32,40 @@ impl Clone for PileValue<'_, I> { } impl<'a, I: Item> PileValue<'a, I> { - pub fn query(&'a self, query: &[Label]) -> Result, std::io::Error> { + pub fn query(&'a self, query: &ObjectPath) -> Result, std::io::Error> { let mut out = Some(self); - for q in query { - out = match &out { - None => return Ok(None), - Some(Self::Null) => None, - Some(Self::Array(_)) => None, - Some(Self::String(_)) => None, - Some(Self::Extractor(e)) => e.field(q)?, - }; + for s in &query.segments { + match s { + PathSegment::Root => out = Some(self), + PathSegment::Field(field) => { + out = match &out { + None => return Ok(None), + Some(Self::Null) => None, + Some(Self::Array(_)) => None, + Some(Self::String(_)) => None, + Some(Self::Extractor(e)) => e.field(field)?, + } + } + + PathSegment::Index(idx) => { + out = match &out { + None => return Ok(None), + Some(Self::Null) => None, + Some(Self::Array(v)) => { + let idx = if *idx >= 0 { + usize::try_from(*idx).ok() + } else { + 
usize::try_from(v.len() as i64 + idx).ok() + }; + + idx.and_then(|idx| v.get(idx)) + } + Some(Self::String(_)) => None, + Some(Self::Extractor(_)) => None, + } + } + } } return Ok(out); diff --git a/crates/pile/src/command/lookup.rs b/crates/pile/src/command/lookup.rs index b548a4f..2e9f3ea 100644 --- a/crates/pile/src/command/lookup.rs +++ b/crates/pile/src/command/lookup.rs @@ -42,18 +42,17 @@ impl CliCmd for LookupCommand { let ds = Dataset::open(&self.config) .with_context(|| format!("while opening dataset for {}", self.config.display()))?; - if self.refresh - && ds.needs_fts().context("while checking dataset fts")? { - info!("FTS index is missing or out-of-date, regenerating"); - ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| { - x.map_err(|x| { - anyhow::Error::from(x).context(format!( - "while refreshing fts for {}", - self.config.display() - )) - }) - })?; - } + if self.refresh && ds.needs_fts().context("while checking dataset fts")? { + info!("FTS index is missing or out-of-date, regenerating"); + ds.fts_refresh(self.jobs, Some(flag)).map_err(|x| { + x.map_err(|x| { + anyhow::Error::from(x).context(format!( + "while refreshing fts for {}", + self.config.display() + )) + }) + })?; + } let results = ds .fts_lookup(&self.query, self.topn)