From f3bb1a265ee4663d0e459858328b18f8e534a07d Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Wed, 11 Mar 2026 12:54:02 -0700 Subject: [PATCH] Improve arg parsing --- crates/pile-config/src/objectpath/mod.rs | 7 +- crates/pile-config/src/objectpath/parser.rs | 204 +++++++++++++++--- .../src/extract/item/epub/epub_meta.rs | 10 +- .../src/extract/item/epub/epub_text.rs | 10 +- .../pile-value/src/extract/item/epub/mod.rs | 12 +- crates/pile-value/src/extract/item/exif.rs | 10 +- crates/pile-value/src/extract/item/flac.rs | 10 +- crates/pile-value/src/extract/item/fs.rs | 10 +- crates/pile-value/src/extract/item/id3.rs | 10 +- crates/pile-value/src/extract/item/mod.rs | 8 +- crates/pile-value/src/extract/item/pdf/mod.rs | 14 +- .../src/extract/item/pdf/pdf_meta.rs | 9 +- .../src/extract/item/pdf/pdf_text.rs | 10 +- crates/pile-value/src/extract/item/sidecar.rs | 8 +- crates/pile-value/src/extract/item/toml.rs | 10 +- crates/pile-value/src/extract/misc/map.rs | 10 +- crates/pile-value/src/extract/string.rs | 62 +++--- crates/pile-value/src/extract/traits.rs | 3 +- crates/pile-value/src/value/value.rs | 8 +- 19 files changed, 327 insertions(+), 98 deletions(-) diff --git a/crates/pile-config/src/objectpath/mod.rs b/crates/pile-config/src/objectpath/mod.rs index d3598b9..668518a 100644 --- a/crates/pile-config/src/objectpath/mod.rs +++ b/crates/pile-config/src/objectpath/mod.rs @@ -41,8 +41,11 @@ pub enum PathSegment { /// Go to root node (`$` identifier) Root, - /// Go to a child of the current object - Field(Label), + /// Go to a child of the current object. + Field { + name: Label, + args: Option>, + }, /// Go to an element of the current list Index(i64), diff --git a/crates/pile-config/src/objectpath/parser.rs b/crates/pile-config/src/objectpath/parser.rs index ea96acf..7b42fa2 100644 --- a/crates/pile-config/src/objectpath/parser.rs +++ b/crates/pile-config/src/objectpath/parser.rs @@ -1,10 +1,80 @@ use std::str::FromStr; +use smartstring::{LazyCompact, SmartString}; + use crate::{ Label, objectpath::{PathParseError, PathSegment, tokenizer::Token}, }; +/// Parse an ident token into a `PathSegment::Field`, handling optional args of +/// the form `name(args)`. Parens inside args may be nested; `\(` and `\)` are +/// escaped and do not affect depth counting. +fn parse_field(ident: &str, position: usize) -> Result { + let bytes = ident.as_bytes(); + let mut i = 0; + + // Find the first unescaped '(' — everything before it is the name. + let open_paren: Option = loop { + if i >= bytes.len() { + break None; + } + match bytes[i] { + b'\\' => i += 2, // skip escaped character + b'(' => break Some(i), + _ => i += 1, + } + }; + + let name_str = &ident[..open_paren.unwrap_or(bytes.len())]; + let name = Label::new(name_str).ok_or_else(|| PathParseError::InvalidField { + position, + str: name_str.into(), + })?; + + let Some(open_pos) = open_paren else { + return Ok(PathSegment::Field { name, args: None }); + }; + + // Scan args, tracking paren depth. + let args_start = open_pos + 1; + let mut depth: usize = 1; + let mut j = args_start; + + while j < bytes.len() { + match bytes[j] { + b'\\' => j += 2, // skip escaped character + b'(' => { + depth += 1; + j += 1; + } + b')' => { + depth -= 1; + if depth == 0 { + // Closing paren must be the last character. + if j + 1 != bytes.len() { + return Err(PathParseError::Syntax { + position: position + j + 1, + }); + } + let args: SmartString = ident[args_start..j].into(); + return Ok(PathSegment::Field { + name, + args: Some(args), + }); + } + j += 1; + } + _ => j += 1, + } + } + + // Reached end of ident without finding the matching ')'. + Err(PathParseError::Syntax { + position: position + ident.len(), + }) +} + enum State { Start, @@ -72,14 +142,7 @@ impl Parser { // MARK: dot // (State::Dot, (p, Token::Ident(ident))) => { - self.segments - .push(PathSegment::Field(Label::new(*ident).ok_or_else(|| { - PathParseError::InvalidField { - position: *p, - str: (*ident).into(), - } - })?)); - + self.segments.push(parse_field(ident, *p)?); self.state = State::Selected; } @@ -161,27 +224,30 @@ mod tests { parse_test("$", Ok(&[PathSegment::Root])); } + fn field(name: &str) -> PathSegment { + PathSegment::Field { + name: Label::new(name).unwrap(), + args: None, + } + } + + fn field_args(name: &str, args: &str) -> PathSegment { + PathSegment::Field { + name: Label::new(name).unwrap(), + args: Some(args.into()), + } + } + #[test] fn single_field() { - parse_test( - "$.foo", - Ok(&[ - PathSegment::Root, - PathSegment::Field(Label::new("foo").unwrap()), - ]), - ); + parse_test("$.foo", Ok(&[PathSegment::Root, field("foo")])); } #[test] fn nested_fields() { parse_test( "$.foo.bar.baz", - Ok(&[ - PathSegment::Root, - PathSegment::Field(Label::new("foo").unwrap()), - PathSegment::Field(Label::new("bar").unwrap()), - PathSegment::Field(Label::new("baz").unwrap()), - ]), + Ok(&[PathSegment::Root, field("foo"), field("bar"), field("baz")]), ); } @@ -189,11 +255,7 @@ mod tests { fn array_index() { parse_test( "$.items[0]", - Ok(&[ - PathSegment::Root, - PathSegment::Field(Label::new("items").unwrap()), - PathSegment::Index(0), - ]), + Ok(&[PathSegment::Root, field("items"), PathSegment::Index(0)]), ); } @@ -203,7 +265,7 @@ mod tests { "$.a[1][2]", Ok(&[ PathSegment::Root, - PathSegment::Field(Label::new("a").unwrap()), + field("a"), PathSegment::Index(1), PathSegment::Index(2), ]), @@ -216,9 +278,9 @@ mod tests { "$.a[0].b", Ok(&[ PathSegment::Root, - PathSegment::Field(Label::new("a").unwrap()), + field("a"), PathSegment::Index(0), - PathSegment::Field(Label::new("b").unwrap()), + field("b"), ]), ); } @@ -227,14 +289,94 @@ mod tests { fn negative_index() { parse_test( "$.a[-1]", + Ok(&[PathSegment::Root, field("a"), PathSegment::Index(-1)]), + ); + } + + // MARK: args + + #[test] + fn field_with_simple_args() { + parse_test( + "$.foo(bar)", + Ok(&[PathSegment::Root, field_args("foo", "bar")]), + ); + } + + #[test] + fn field_with_empty_args() { + parse_test("$.foo()", Ok(&[PathSegment::Root, field_args("foo", "")])); + } + + #[test] + fn field_with_nested_parens_in_args() { + parse_test( + "$.foo(a(b)c)", + Ok(&[PathSegment::Root, field_args("foo", "a(b)c")]), + ); + } + + #[test] + fn field_with_deeply_nested_parens_in_args() { + parse_test( + "$.foo(a(b(c))d)", + Ok(&[PathSegment::Root, field_args("foo", "a(b(c))d")]), + ); + } + + #[test] + fn field_with_escaped_open_paren_in_args() { + // "$.foo(a\(b)" — '\(' is escaped, so depth never rises above 1; ')' closes it + parse_test( + r"$.foo(a\(b)", + Ok(&[PathSegment::Root, field_args("foo", r"a\(b")]), + ); + } + + #[test] + fn field_with_escaped_close_paren_in_args() { + // "$.foo(a\)b)" — '\)' is escaped, the second ')' closes at depth 0 + parse_test( + r"$.foo(a\)b)", + Ok(&[PathSegment::Root, field_args("foo", r"a\)b")]), + ); + } + + #[test] + fn field_with_both_escaped_parens_in_args() { + parse_test( + r"$.foo(a\(b\)c)", + Ok(&[PathSegment::Root, field_args("foo", r"a\(b\)c")]), + ); + } + + #[test] + fn field_args_with_multiple_segments() { + parse_test( + "$.foo(x).bar(y)", Ok(&[ PathSegment::Root, - PathSegment::Field(Label::new("a").unwrap()), - PathSegment::Index(-1), + field_args("foo", "x"), + field_args("bar", "y"), ]), ); } + #[test] + fn field_args_unclosed_paren_error() { + // Missing closing ')' → Syntax error at end of source + parse_test("$.foo(bar", Err(PathParseError::Syntax { position: 9 })); + } + + #[test] + fn field_args_trailing_chars_after_close_error() { + // Closing ')' is not the last char → Syntax error at the trailing char + parse_test( + "$.foo(bar)baz", + Err(PathParseError::Syntax { position: 10 }), + ); + } + #[test] fn non_ascii_error() { parse_test( diff --git a/crates/pile-value/src/extract/item/epub/epub_meta.rs b/crates/pile-value/src/extract/item/epub/epub_meta.rs index 31d2d47..d4e7622 100644 --- a/crates/pile-value/src/extract/item/epub/epub_meta.rs +++ b/crates/pile-value/src/extract/item/epub/epub_meta.rs @@ -85,7 +85,15 @@ impl EpubMetaExtractor { #[async_trait::async_trait] impl ObjectExtractor for EpubMetaExtractor { - async fn field(&self, name: &Label) -> Result, std::io::Error> { + async fn field( + &self, + name: &Label, + args: Option<&str>, + ) -> Result, std::io::Error> { + if args.is_some() { + return Ok(None); + } + Ok(self.get_inner().await?.get(name).cloned()) } diff --git a/crates/pile-value/src/extract/item/epub/epub_text.rs b/crates/pile-value/src/extract/item/epub/epub_text.rs index e04b83c..2d8f645 100644 --- a/crates/pile-value/src/extract/item/epub/epub_text.rs +++ b/crates/pile-value/src/extract/item/epub/epub_text.rs @@ -95,7 +95,15 @@ fn strip_html(html: &str) -> String { #[async_trait::async_trait] impl ObjectExtractor for EpubTextExtractor { - async fn field(&self, name: &Label) -> Result, std::io::Error> { + async fn field( + &self, + name: &Label, + args: Option<&str>, + ) -> Result, std::io::Error> { + if args.is_some() { + return Ok(None); + } + Ok(self.get_inner().await?.get(name).cloned()) } diff --git a/crates/pile-value/src/extract/item/epub/mod.rs b/crates/pile-value/src/extract/item/epub/mod.rs index f742dc0..199deda 100644 --- a/crates/pile-value/src/extract/item/epub/mod.rs +++ b/crates/pile-value/src/extract/item/epub/mod.rs @@ -28,10 +28,14 @@ impl EpubExtractor { #[async_trait::async_trait] impl ObjectExtractor for EpubExtractor { - async fn field(&self, name: &pile_config::Label) -> Result, std::io::Error> { - match name.as_str() { - "text" => self.text.field(name).await, - "meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))), + async fn field( + &self, + name: &pile_config::Label, + args: Option<&str>, + ) -> Result, std::io::Error> { + match (name.as_str(), args) { + ("text", args) => self.text.field(name, args).await, + ("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))), _ => Ok(None), } } diff --git a/crates/pile-value/src/extract/item/exif.rs b/crates/pile-value/src/extract/item/exif.rs index 56a5f8d..abf7be2 100644 --- a/crates/pile-value/src/extract/item/exif.rs +++ b/crates/pile-value/src/extract/item/exif.rs @@ -86,7 +86,15 @@ fn tag_to_label(tag: &str) -> Option