diff --git a/Cargo.lock b/Cargo.lock index 909351a..28a5767 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2574,6 +2574,7 @@ dependencies = [ "pdfium-render", "pile-config", "pile-flac", + "regex", "serde_json", "smartstring", "tokio", diff --git a/Cargo.toml b/Cargo.toml index bf33874..70ba53b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,6 +127,7 @@ mime = "0.3.17" mime_guess = "2.0.5" paste = "1.0.15" smartstring = "1.0.1" +regex = "1" chrono = "0.4.43" parking_lot = "0.12.5" rayon = "1.11.0" diff --git a/crates/pile-config/src/objectpath/tokenizer.rs b/crates/pile-config/src/objectpath/tokenizer.rs index efbf1f7..8f76dac 100644 --- a/crates/pile-config/src/objectpath/tokenizer.rs +++ b/crates/pile-config/src/objectpath/tokenizer.rs @@ -21,7 +21,52 @@ impl Tokenizer { let mut tokens = Vec::new(); let mut window_start = None; + // Paren depth: while > 0, `.` / `[` / `]` / `$` are part of the ident. + let mut paren_depth: usize = 0; + // When true, the current char is escaped by a preceding `\` and is + // treated as a plain ident character with no special meaning. + let mut skip_next = false; + for (i, c) in source.char_indices() { + if skip_next { + skip_next = false; + // Escaped char: just extend the ident window (already opened by `\`). + continue; + } + + if c == '\\' { + if window_start.is_none() { + window_start = Some(i); + } + skip_next = true; + continue; + } + + if paren_depth > 0 { + // Inside parens: only track depth changes, everything else is ident. + match c { + '(' => { + if window_start.is_none() { + window_start = Some(i); + } + paren_depth += 1; + } + ')' => { + if window_start.is_none() { + window_start = Some(i); + } + paren_depth -= 1; + } + x if x.is_ascii() => { + if window_start.is_none() { + window_start = Some(i); + } + } + char => return Err(PathParseError::NonAsciiChar { position: i, char }), + } + continue; + } + match c { '$' => { if let Some(s) = window_start.take() { @@ -51,10 +96,26 @@ impl Tokenizer { tokens.push((i, Token::SqbClose)); } - x if x.is_ascii() => match window_start { - None => window_start = Some(i), - Some(_) => continue, - }, + '(' => { + if window_start.is_none() { + window_start = Some(i); + } + paren_depth += 1; + } + + ')' => { + if window_start.is_none() { + window_start = Some(i); + } + // paren_depth is 0 here — stray `)` is an ident char and + // parse_field will surface the error later. + } + + x if x.is_ascii() => { + if window_start.is_none() { + window_start = Some(i); + } + } char => return Err(PathParseError::NonAsciiChar { position: i, char }), } diff --git a/crates/pile-config/src/pattern/parser.rs b/crates/pile-config/src/pattern/parser.rs index eefddf7..f77bfb2 100644 --- a/crates/pile-config/src/pattern/parser.rs +++ b/crates/pile-config/src/pattern/parser.rs @@ -122,6 +122,14 @@ mod tests { GroupSegment::Literal(s.into()) } + #[test] + fn regex() { + assert_eq!( + parse("{$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]}").unwrap(), + vec![(0, path("$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]"))] + ); + } + #[test] fn single_path() { assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]); diff --git a/crates/pile-value/Cargo.toml b/crates/pile-value/Cargo.toml index c079484..6f96cd7 100644 --- a/crates/pile-value/Cargo.toml +++ b/crates/pile-value/Cargo.toml @@ -18,6 +18,7 @@ tracing = { workspace = true } chrono = { workspace = true } toml = { workspace = true } smartstring = { workspace = true } +regex = { workspace = true } blake3 = { workspace = true } epub = { workspace = true } kamadak-exif = { workspace = true } diff --git a/crates/pile-value/src/extract/mod.rs b/crates/pile-value/src/extract/mod.rs index 2e91ea1..be4e103 100644 --- a/crates/pile-value/src/extract/mod.rs +++ b/crates/pile-value/src/extract/mod.rs @@ -1,4 +1,5 @@ pub mod item; pub mod misc; +pub mod regex; pub mod string; pub mod traits; diff --git a/crates/pile-value/src/extract/regex.rs b/crates/pile-value/src/extract/regex.rs new file mode 100644 index 0000000..7037b9b --- /dev/null +++ b/crates/pile-value/src/extract/regex.rs @@ -0,0 +1,104 @@ +use std::sync::Arc; + +use pile_config::Label; +use regex::Regex; +use smartstring::{LazyCompact, SmartString}; + +use crate::{ + extract::traits::{ExtractState, ListExtractor, ObjectExtractor}, + value::PileValue, +}; + +struct RegexData { + regex: Arc, + /// Captured substrings indexed by group index (0 = whole match). + captures: Vec>>>, +} + +impl RegexData { + fn new(regex: Arc, input: &str) -> Option { + let caps = regex.captures(input)?; + let captures = caps + .iter() + .map(|m| m.map(|m| Arc::new(m.as_str().into()))) + .collect(); + Some(Self { regex, captures }) + } +} + +/// Exposes named capture groups as object fields. +pub struct RegexExtractor(Arc); + +impl RegexExtractor { + /// Run `regex` against `input`. Returns `None` if there is no match. + pub fn new(regex: Arc, input: &str) -> Option { + Some(Self(Arc::new(RegexData::new(regex, input)?))) + } +} + +#[async_trait::async_trait] +impl ObjectExtractor for RegexExtractor { + async fn field( + &self, + _state: &ExtractState, + name: &Label, + args: Option<&str>, + ) -> Result, std::io::Error> { + if args.is_some() { + return Ok(None); + } + + let Some(idx) = self + .0 + .regex + .capture_names() + .position(|n| n == Some(name.as_str())) + else { + return Ok(None); + }; + + Ok(Some( + match self.0.captures.get(idx).and_then(|v| v.as_ref()) { + Some(s) => PileValue::String(s.clone()), + None => PileValue::Null, + }, + )) + } + + async fn fields(&self) -> Result, std::io::Error> { + #[expect(clippy::unwrap_used)] + Ok(self + .0 + .regex + .capture_names() + .flatten() + .map(|n| Label::new(n).unwrap()) + .collect()) + } + + fn as_list(&self) -> Option> { + Some(Arc::new(RegexExtractor(self.0.clone()))) + } +} + +#[async_trait::async_trait] +impl ListExtractor for RegexExtractor { + async fn get( + &self, + _state: &ExtractState, + idx: usize, + ) -> Result, std::io::Error> { + let raw_idx = idx + 1; + let Some(slot) = self.0.captures.get(raw_idx) else { + return Ok(None); + }; + Ok(Some(match slot { + Some(s) => PileValue::String(s.clone()), + None => PileValue::Null, + })) + } + + async fn len(&self, _state: &ExtractState) -> Result { + Ok(self.0.captures.len().saturating_sub(1)) + } +} diff --git a/crates/pile-value/src/extract/string.rs b/crates/pile-value/src/extract/string.rs index 63fd0fa..136b25b 100644 --- a/crates/pile-value/src/extract/string.rs +++ b/crates/pile-value/src/extract/string.rs @@ -1,9 +1,13 @@ use pile_config::Label; +use regex::Regex; use smartstring::{LazyCompact, SmartString}; use std::sync::Arc; use crate::{ - extract::traits::{ExtractState, ObjectExtractor}, + extract::{ + regex::RegexExtractor, + traits::{ExtractState, ObjectExtractor}, + }, value::PileValue, }; @@ -67,6 +71,18 @@ impl ObjectExtractor for StringExtractor { .collect(), ))), + ("regex", Some(pattern)) => { + let Ok(re) = Regex::new(pattern) else { + return Ok(None); + }; + Some( + match RegexExtractor::new(Arc::new(re), self.item.as_str()) { + Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)), + None => PileValue::Null, + }, + ) + } + _ => None, }) } @@ -78,6 +94,9 @@ impl ObjectExtractor for StringExtractor { Label::new("upper").unwrap(), Label::new("lower").unwrap(), Label::new("nonempty").unwrap(), + Label::new("trimprefix").unwrap(), + Label::new("trimsuffix").unwrap(), + Label::new("split").unwrap(), ]); } } diff --git a/crates/pile-value/src/extract/traits.rs b/crates/pile-value/src/extract/traits.rs index fdb232a..1dbd803 100644 --- a/crates/pile-value/src/extract/traits.rs +++ b/crates/pile-value/src/extract/traits.rs @@ -35,6 +35,11 @@ pub trait ObjectExtractor: Send + Sync { /// and [None] for all others. async fn fields(&self) -> Result, std::io::Error>; + /// Return a list view of this extractor, if supported. + fn as_list(&self) -> Option> { + None + } + /// Convert this to a JSON value. async fn to_json(&self, state: &ExtractState) -> Result { let keys = self.fields().await?; diff --git a/crates/pile-value/src/value/value.rs b/crates/pile-value/src/value/value.rs index d66d3a1..a0d5efa 100644 --- a/crates/pile-value/src/value/value.rs +++ b/crates/pile-value/src/value/value.rs @@ -86,7 +86,9 @@ impl PileValue { Self::String(_) => Arc::new(VecExtractor::default()), Self::Blob { .. } => Arc::new(VecExtractor::default()), Self::ListExtractor(e) => e.clone(), - Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()), + Self::ObjectExtractor(e) => e + .as_list() + .unwrap_or_else(|| Arc::new(VecExtractor::default())), Self::Item(_) => Arc::new(VecExtractor::default()), } }