Improve arg parsing
Some checks failed
CI / Typos (push) Successful in 20s
CI / Build and test (push) Successful in 2m28s
CI / Clippy (push) Failing after 2m50s
CI / Build and test (all features) (push) Successful in 7m27s

This commit is contained in:
2026-03-11 12:54:02 -07:00
parent 8a9388020c
commit f3bb1a265e
19 changed files with 327 additions and 98 deletions

View File

@@ -41,8 +41,11 @@ pub enum PathSegment {
/// Go to root node (`$` identifier)
Root,
/// Go to a child of the current object
Field(Label),
/// Go to a child of the current object.
Field {
name: Label,
args: Option<SmartString<LazyCompact>>,
},
/// Go to an element of the current list
Index(i64),

View File

@@ -1,10 +1,80 @@
use std::str::FromStr;
use smartstring::{LazyCompact, SmartString};
use crate::{
Label,
objectpath::{PathParseError, PathSegment, tokenizer::Token},
};
/// Turn one identifier token into a [`PathSegment::Field`].
///
/// The token is either a bare `name` or `name(args)`. The name is everything
/// before the first unescaped `(`. The args are the text between that `(` and
/// its matching `)`, stored verbatim (backslashes included). Parentheses may
/// nest inside the args; a backslash makes the scanner skip the byte that
/// follows it, so `\(` and `\)` never change the nesting depth.
///
/// `position` is the offset the caller associates with `ident`; it is only
/// used as the base for the positions reported in errors.
///
/// # Errors
///
/// * [`PathParseError::InvalidField`] if `Label::new` rejects the name part.
/// * [`PathParseError::Syntax`] pointing just past the matching `)` if
///   anything follows it.
/// * [`PathParseError::Syntax`] pointing at the end of `ident` if the opening
///   `(` is never closed.
fn parse_field(ident: &str, position: usize) -> Result<PathSegment, PathParseError> {
    let raw = ident.as_bytes();
    let total = raw.len();

    // Phase 1: walk forward until the first '(' that is not preceded by a
    // backslash. A backslash consumes itself plus the following byte.
    let mut cursor = 0;
    let mut paren_at: Option<usize> = None;
    while let Some(&byte) = raw.get(cursor) {
        if byte == b'(' {
            paren_at = Some(cursor);
            break;
        }
        cursor += if byte == b'\\' { 2 } else { 1 };
    }

    // Without a '(' the whole token is the name.
    let label_text = match paren_at {
        Some(idx) => &ident[..idx],
        None => ident,
    };
    let name = match Label::new(label_text) {
        Some(label) => label,
        None => {
            return Err(PathParseError::InvalidField {
                position,
                str: label_text.into(),
            });
        }
    };

    let first_arg_byte = match paren_at {
        Some(idx) => idx + 1,
        None => return Ok(PathSegment::Field { name, args: None }),
    };

    // Phase 2: find the ')' that balances the opening paren.
    let mut open_count: usize = 1;
    let mut cursor = first_arg_byte;
    while let Some(&byte) = raw.get(cursor) {
        match byte {
            b'\\' => {
                // Escaped byte: jump over both and leave the depth alone.
                cursor += 2;
                continue;
            }
            b'(' => open_count += 1,
            b')' => {
                open_count -= 1;
                if open_count == 0 {
                    // The balancing ')' has to be the final byte of the token.
                    return if cursor + 1 == total {
                        let args: SmartString<LazyCompact> = ident[first_arg_byte..cursor].into();
                        Ok(PathSegment::Field {
                            name,
                            args: Some(args),
                        })
                    } else {
                        Err(PathParseError::Syntax {
                            position: position + cursor + 1,
                        })
                    };
                }
            }
            _ => {}
        }
        cursor += 1;
    }

    // Ran off the end of the token while still inside the parentheses.
    Err(PathParseError::Syntax {
        position: position + total,
    })
}
enum State {
Start,
@@ -72,14 +142,7 @@ impl Parser {
// MARK: dot
//
(State::Dot, (p, Token::Ident(ident))) => {
self.segments
.push(PathSegment::Field(Label::new(*ident).ok_or_else(|| {
PathParseError::InvalidField {
position: *p,
str: (*ident).into(),
}
})?));
self.segments.push(parse_field(ident, *p)?);
self.state = State::Selected;
}
@@ -161,27 +224,30 @@ mod tests {
parse_test("$", Ok(&[PathSegment::Root]));
}
/// Test helper: a `PathSegment::Field` named `name` with no args.
///
/// Panics if `name` is not accepted by `Label::new`.
fn field(name: &str) -> PathSegment {
    let name = Label::new(name).unwrap();
    PathSegment::Field { name, args: None }
}
/// Test helper: a `PathSegment::Field` named `name` whose args are `args`.
///
/// Panics if `name` is not accepted by `Label::new`.
fn field_args(name: &str, args: &str) -> PathSegment {
    let name = Label::new(name).unwrap();
    PathSegment::Field {
        name,
        args: Some(args.into()),
    }
}
#[test]
fn single_field() {
parse_test(
"$.foo",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
]),
);
parse_test("$.foo", Ok(&[PathSegment::Root, field("foo")]));
}
#[test]
fn nested_fields() {
parse_test(
"$.foo.bar.baz",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
PathSegment::Field(Label::new("bar").unwrap()),
PathSegment::Field(Label::new("baz").unwrap()),
]),
Ok(&[PathSegment::Root, field("foo"), field("bar"), field("baz")]),
);
}
@@ -189,11 +255,7 @@ mod tests {
fn array_index() {
parse_test(
"$.items[0]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("items").unwrap()),
PathSegment::Index(0),
]),
Ok(&[PathSegment::Root, field("items"), PathSegment::Index(0)]),
);
}
@@ -203,7 +265,7 @@ mod tests {
"$.a[1][2]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
field("a"),
PathSegment::Index(1),
PathSegment::Index(2),
]),
@@ -216,9 +278,9 @@ mod tests {
"$.a[0].b",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
field("a"),
PathSegment::Index(0),
PathSegment::Field(Label::new("b").unwrap()),
field("b"),
]),
);
}
@@ -227,14 +289,94 @@ mod tests {
fn negative_index() {
parse_test(
"$.a[-1]",
Ok(&[PathSegment::Root, field("a"), PathSegment::Index(-1)]),
);
}
// MARK: args
// A parenthesised suffix on a field is split off: `foo(bar)` yields the field
// `foo` with args `bar`.
#[test]
fn field_with_simple_args() {
parse_test(
"$.foo(bar)",
Ok(&[PathSegment::Root, field_args("foo", "bar")]),
);
}
// Empty parentheses produce `Some("")` args, which is distinct from the
// `None` a bare field name gets.
#[test]
fn field_with_empty_args() {
parse_test("$.foo()", Ok(&[PathSegment::Root, field_args("foo", "")]));
}
// A balanced pair of parentheses inside the args is kept as part of the args
// text; only the outermost `)` ends the field.
#[test]
fn field_with_nested_parens_in_args() {
parse_test(
"$.foo(a(b)c)",
Ok(&[PathSegment::Root, field_args("foo", "a(b)c")]),
);
}
// Same as the nested case, but with two levels of inner parentheses.
#[test]
fn field_with_deeply_nested_parens_in_args() {
parse_test(
"$.foo(a(b(c))d)",
Ok(&[PathSegment::Root, field_args("foo", "a(b(c))d")]),
);
}
// An escaped `(` inside the args does not open a new nesting level, and the
// backslash is kept verbatim in the resulting args string.
#[test]
fn field_with_escaped_open_paren_in_args() {
// "$.foo(a\(b)" — '\(' is escaped, so depth never rises above 1; ')' closes it
parse_test(
r"$.foo(a\(b)",
Ok(&[PathSegment::Root, field_args("foo", r"a\(b")]),
);
}
// An escaped `)` inside the args does not close the args, and the backslash
// is kept verbatim in the resulting args string.
#[test]
fn field_with_escaped_close_paren_in_args() {
// "$.foo(a\)b)" — '\)' is escaped and skipped; the second ')' brings depth to 0 and ends the args
parse_test(
r"$.foo(a\)b)",
Ok(&[PathSegment::Root, field_args("foo", r"a\)b")]),
);
}
// Escaped `(` and `)` together: neither affects nesting, and both backslashes
// remain in the args string.
#[test]
fn field_with_both_escaped_parens_in_args() {
parse_test(
r"$.foo(a\(b\)c)",
Ok(&[PathSegment::Root, field_args("foo", r"a\(b\)c")]),
);
}
#[test]
fn field_args_with_multiple_segments() {
parse_test(
"$.foo(x).bar(y)",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
PathSegment::Index(-1),
field_args("foo", "x"),
field_args("bar", "y"),
]),
);
}
// An opening `(` with no matching `)` is a syntax error. The reported
// position (9) equals the length of the input string.
#[test]
fn field_args_unclosed_paren_error() {
// Missing closing ')' → Syntax error at end of source
parse_test("$.foo(bar", Err(PathParseError::Syntax { position: 9 }));
}
// Text after the closing `)` is a syntax error. The reported position (10)
// is the index of the first trailing character, the `b` of `baz`.
#[test]
fn field_args_trailing_chars_after_close_error() {
// Closing ')' is not the last char → Syntax error at the trailing char
parse_test(
"$.foo(bar)baz",
Err(PathParseError::Syntax { position: 10 }),
);
}
#[test]
fn non_ascii_error() {
parse_test(

View File

@@ -85,7 +85,15 @@ impl EpubMetaExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for EpubMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -95,7 +95,15 @@ fn strip_html(html: &str) -> String {
#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -28,10 +28,14 @@ impl EpubExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
match name.as_str() {
"text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
async fn field(
&self,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
match (name.as_str(), args) {
("text", args) => self.text.field(name, args).await,
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
_ => Ok(None),
}
}

View File

@@ -86,7 +86,15 @@ fn tag_to_label(tag: &str) -> Option<Label> {
#[async_trait::async_trait]
impl ObjectExtractor for ExifExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -142,7 +142,15 @@ impl FlacExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if name.as_str() == "images"
&& let Some(ref images) = self.images
{

View File

@@ -70,7 +70,15 @@ impl FsExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for FsExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner()?.get(name).cloned())
}

View File

@@ -123,7 +123,15 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
#[async_trait::async_trait]
impl ObjectExtractor for Id3Extractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -80,8 +80,12 @@ impl ItemExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name).await
async fn field(
&self,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name, args).await
}
#[expect(clippy::unwrap_used)]

View File

@@ -37,12 +37,16 @@ impl PdfExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for PdfExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
match name.as_str() {
"text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
async fn field(
&self,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
match (name.as_str(), args) {
("text", args) => self.text.field(name, args).await,
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
#[cfg(feature = "pdfium")]
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
("pages", None) => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
_ => Ok(None),
}
}

View File

@@ -122,7 +122,14 @@ fn format_date(d: &Date) -> String {
#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -102,7 +102,15 @@ impl PdfTextExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for PdfTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -23,12 +23,16 @@ impl SidecarExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for SidecarExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name).await?),
Some(x) => Ok(x.field(name, args).await?),
None => Ok(Some(PileValue::Null)),
}
}

View File

@@ -68,7 +68,15 @@ impl TomlExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for TomlExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -10,7 +10,15 @@ pub struct MapExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for MapExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
Ok(self.inner.get(name).cloned())
}

View File

@@ -4,20 +4,6 @@ use std::sync::Arc;
use crate::{extract::traits::ObjectExtractor, value::PileValue};
/// Split a `name(args)` string into its name and optional args.
///
/// The name is everything before the first `(`. Args are returned only when
/// the remainder ends with `)`, and that final `)` is removed from them. If
/// there is no `(`, the whole input is the name. If the `(` is never closed
/// by a trailing `)`, the name is still returned but the args are `None`.
fn parse_name(s: &str) -> (&str, Option<&str>) {
    let Some((name, rest)) = s.split_once('(') else {
        return (s, None);
    };
    (name, rest.strip_suffix(')'))
}
pub struct StringExtractor {
item: Arc<SmartString<LazyCompact>>,
}
@@ -30,9 +16,12 @@ impl StringExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for StringExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
let (name, args) = parse_name(name.as_str());
Ok(match (name, args) {
async fn field(
&self,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
Ok(match (name.as_str(), args) {
("trim", None) => Some(PileValue::String(Arc::new(
self.item.as_str().trim().into(),
))),
@@ -98,8 +87,8 @@ mod tests {
}
#[expect(clippy::unwrap_used)]
async fn field(ext: &StringExtractor, name: &str) -> Option<PileValue> {
ext.field(&Label::new(name).unwrap()).await.unwrap()
async fn field(ext: &StringExtractor, name: &str, args: Option<&str>) -> Option<PileValue> {
ext.field(&Label::new(name).unwrap(), args).await.unwrap()
}
fn string(v: Option<PileValue>) -> Option<String> {
@@ -125,20 +114,20 @@ mod tests {
#[tokio::test]
async fn trim() {
assert_eq!(
string(field(&extractor(" hi "), "trim").await),
string(field(&extractor(" hi "), "trim", None).await),
Some("hi".into())
);
}
#[tokio::test]
async fn trim_no_args() {
assert!(field(&extractor("x"), "trim(foo)").await.is_none());
assert!(field(&extractor("x"), "trim", Some("foo")).await.is_none());
}
#[tokio::test]
async fn nonempty_with_content() {
assert!(matches!(
field(&extractor("hello"), "nonempty").await,
field(&extractor("hello"), "nonempty", None).await,
Some(PileValue::String(_))
));
}
@@ -146,7 +135,7 @@ mod tests {
#[tokio::test]
async fn nonempty_empty_string() {
assert!(matches!(
field(&extractor(""), "nonempty").await,
field(&extractor(""), "nonempty", None).await,
Some(PileValue::Null)
));
}
@@ -154,7 +143,7 @@ mod tests {
#[tokio::test]
async fn trimprefix_present() {
assert_eq!(
string(field(&extractor("foobar"), "trimprefix(foo)").await),
string(field(&extractor("foobar"), "trimprefix", Some("foo")).await),
Some("bar".into())
);
}
@@ -162,20 +151,24 @@ mod tests {
#[tokio::test]
async fn trimprefix_absent() {
assert_eq!(
string(field(&extractor("foobar"), "trimprefix(baz)").await),
string(field(&extractor("foobar"), "trimprefix", Some("baz")).await),
Some("foobar".into())
);
}
#[tokio::test]
async fn trimprefix_no_args() {
assert!(field(&extractor("foobar"), "trimprefix").await.is_none());
assert!(
field(&extractor("foobar"), "trimprefix", None)
.await
.is_none()
);
}
#[tokio::test]
async fn trimsuffix_present() {
assert_eq!(
string(field(&extractor("foobar"), "trimsuffix(bar)").await),
string(field(&extractor("foobar"), "trimsuffix", Some("bar")).await),
Some("foo".into())
);
}
@@ -183,7 +176,7 @@ mod tests {
#[tokio::test]
async fn trimsuffix_absent() {
assert_eq!(
string(field(&extractor("foobar"), "trimsuffix(baz)").await),
string(field(&extractor("foobar"), "trimsuffix", Some("baz")).await),
Some("foobar".into())
);
}
@@ -191,7 +184,7 @@ mod tests {
#[tokio::test]
async fn split_basic() {
assert_eq!(
array(field(&extractor("a,b,c"), "split(,)").await),
array(field(&extractor("a,b,c"), "split", Some(",")).await),
vec!["a", "b", "c"]
);
}
@@ -199,23 +192,18 @@ mod tests {
#[tokio::test]
async fn split_no_match() {
assert_eq!(
array(field(&extractor("abc"), "split(,)").await),
array(field(&extractor("abc"), "split", Some(",")).await),
vec!["abc"]
);
}
#[tokio::test]
async fn split_no_args() {
assert!(field(&extractor("abc"), "split").await.is_none());
}
#[tokio::test]
async fn split_unclosed_paren() {
assert!(field(&extractor("abc"), "split(,").await.is_none());
assert!(field(&extractor("abc"), "split", None).await.is_none());
}
#[tokio::test]
async fn unknown_field() {
assert!(field(&extractor("abc"), "bogus").await.is_none());
assert!(field(&extractor("abc"), "bogus", None).await.is_none());
}
}

View File

@@ -10,6 +10,7 @@ pub trait ObjectExtractor: Send + Sync {
async fn field(
&self,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
/// Return all fields in this extractor.
@@ -22,7 +23,7 @@ pub trait ObjectExtractor: Send + Sync {
let keys = self.fields().await?;
let mut map = serde_json::Map::new();
for k in &keys {
let v = match self.field(k).await? {
let v = match self.field(k, None).await? {
Some(x) => x,
None => continue,
};

View File

@@ -97,7 +97,7 @@ impl PileValue {
for s in &query.segments {
match s {
PathSegment::Root => out = Some(self.clone()),
PathSegment::Field(field) => {
PathSegment::Field { name, args } => {
let e = match out.map(|x| x.object_extractor()) {
Some(e) => e,
None => {
@@ -106,7 +106,7 @@ impl PileValue {
}
};
out = e.field(field).await?;
out = e.field(name, args.as_deref()).await?;
}
PathSegment::Index(idx) => {
@@ -163,7 +163,7 @@ impl PileValue {
let keys = e.fields().await?;
let mut map = Map::new();
for k in &keys {
let v = match e.field(k).await? {
let v = match e.field(k, None).await? {
Some(x) => x,
None => continue,
};
@@ -216,7 +216,7 @@ impl PileValue {
let keys = e.fields().await?;
let mut map = Map::new();
for k in &keys {
let v = match e.field(k).await? {
let v = match e.field(k, None).await? {
Some(x) => x,
None => continue,
};