Add regex extractor

This commit is contained in:
2026-03-21 09:27:12 -07:00
parent ed169b3ab4
commit b6cb5870b4
10 changed files with 209 additions and 6 deletions

1
Cargo.lock generated
View File

@@ -2574,6 +2574,7 @@ dependencies = [
"pdfium-render", "pdfium-render",
"pile-config", "pile-config",
"pile-flac", "pile-flac",
"regex",
"serde_json", "serde_json",
"smartstring", "smartstring",
"tokio", "tokio",

View File

@@ -127,6 +127,7 @@ mime = "0.3.17"
mime_guess = "2.0.5" mime_guess = "2.0.5"
paste = "1.0.15" paste = "1.0.15"
smartstring = "1.0.1" smartstring = "1.0.1"
regex = "1"
chrono = "0.4.43" chrono = "0.4.43"
parking_lot = "0.12.5" parking_lot = "0.12.5"
rayon = "1.11.0" rayon = "1.11.0"

View File

@@ -21,7 +21,52 @@ impl Tokenizer {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
let mut window_start = None; let mut window_start = None;
// Paren depth: while > 0, `.` / `[` / `]` / `$` are part of the ident.
let mut paren_depth: usize = 0;
// When true, the current char is escaped by a preceding `\` and is
// treated as a plain ident character with no special meaning.
let mut skip_next = false;
for (i, c) in source.char_indices() { for (i, c) in source.char_indices() {
if skip_next {
skip_next = false;
// Escaped char: just extend the ident window (already opened by `\`).
continue;
}
if c == '\\' {
if window_start.is_none() {
window_start = Some(i);
}
skip_next = true;
continue;
}
if paren_depth > 0 {
// Inside parens: only track depth changes, everything else is ident.
match c {
'(' => {
if window_start.is_none() {
window_start = Some(i);
}
paren_depth += 1;
}
')' => {
if window_start.is_none() {
window_start = Some(i);
}
paren_depth -= 1;
}
x if x.is_ascii() => {
if window_start.is_none() {
window_start = Some(i);
}
}
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
}
continue;
}
match c { match c {
'$' => { '$' => {
if let Some(s) = window_start.take() { if let Some(s) = window_start.take() {
@@ -51,10 +96,26 @@ impl Tokenizer {
tokens.push((i, Token::SqbClose)); tokens.push((i, Token::SqbClose));
} }
x if x.is_ascii() => match window_start { '(' => {
None => window_start = Some(i), if window_start.is_none() {
Some(_) => continue, window_start = Some(i);
}, }
paren_depth += 1;
}
')' => {
if window_start.is_none() {
window_start = Some(i);
}
// paren_depth is 0 here — stray `)` is an ident char and
// parse_field will surface the error later.
}
x if x.is_ascii() => {
if window_start.is_none() {
window_start = Some(i);
}
}
char => return Err(PathParseError::NonAsciiChar { position: i, char }), char => return Err(PathParseError::NonAsciiChar { position: i, char }),
} }

View File

@@ -122,6 +122,14 @@ mod tests {
GroupSegment::Literal(s.into()) GroupSegment::Literal(s.into())
} }
/// A `regex(...)` call whose argument contains nested and backslash-escaped
/// parentheses must be kept intact as a single path inside the group.
#[test]
fn regex() {
    let source = "{$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]}";
    let expected = vec![(0, path("$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]"))];

    assert_eq!(parse(source).unwrap(), expected);
}
#[test] #[test]
fn single_path() { fn single_path() {
assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]); assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]);

View File

@@ -18,6 +18,7 @@ tracing = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
toml = { workspace = true } toml = { workspace = true }
smartstring = { workspace = true } smartstring = { workspace = true }
regex = { workspace = true }
blake3 = { workspace = true } blake3 = { workspace = true }
epub = { workspace = true } epub = { workspace = true }
kamadak-exif = { workspace = true } kamadak-exif = { workspace = true }

View File

@@ -1,4 +1,5 @@
pub mod item; pub mod item;
pub mod misc; pub mod misc;
pub mod regex;
pub mod string; pub mod string;
pub mod traits; pub mod traits;

View File

@@ -0,0 +1,104 @@
use std::sync::Arc;
use pile_config::Label;
use regex::Regex;
use smartstring::{LazyCompact, SmartString};
use crate::{
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::PileValue,
};
/// The outcome of one successful regex match: the compiled pattern together
/// with the text captured by each of its groups.
struct RegexData {
/// Compiled pattern; kept so capture-group names can be looked up later.
regex: Arc<Regex>,
/// Captured substrings indexed by group index (0 = whole match).
/// An entry is `None` when that group did not take part in the match.
captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
}
impl RegexData {
fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
let caps = regex.captures(input)?;
let captures = caps
.iter()
.map(|m| m.map(|m| Arc::new(m.as_str().into())))
.collect();
Some(Self { regex, captures })
}
}
/// Exposes the capture groups of a single regex match.
///
/// Named groups are available as object fields. Numbered groups (excluding
/// the whole match) are available through the list view.
pub struct RegexExtractor(Arc<RegexData>);
impl RegexExtractor {
    /// Match `regex` against `input` and wrap the captured groups.
    ///
    /// Returns `None` if there is no match.
    pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
        RegexData::new(regex, input).map(|data| Self(Arc::new(data)))
    }
}
/// Object view: each *named* capture group is a field.
#[async_trait::async_trait]
impl ObjectExtractor for RegexExtractor {
    /// Look up the named capture group `name`.
    ///
    /// Returns `Ok(None)` when arguments are supplied or when the pattern has
    /// no group called `name`. Returns [PileValue::Null] when the group exists
    /// but did not participate in the match.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // Capture-group fields take no arguments.
        if args.is_some() {
            return Ok(None);
        }

        // `capture_names` yields one entry per group in index order (`None`
        // for unnamed groups), so the position is also the capture index.
        let Some(idx) = self
            .0
            .regex
            .capture_names()
            .position(|n| n == Some(name.as_str()))
        else {
            return Ok(None);
        };

        Ok(Some(
            match self.0.captures.get(idx).and_then(|v| v.as_ref()) {
                Some(s) => PileValue::String(s.clone()),
                None => PileValue::Null,
            },
        ))
    }

    /// List the names of all named capture groups in the pattern.
    ///
    /// Group names come from a user-supplied pattern, so a name is not
    /// guaranteed to be a valid [Label]. Such names are skipped instead of
    /// panicking: they could never be requested through `field` (which takes
    /// a [Label]) and remain reachable by index through the list view.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let mut labels = Vec::new();
        for group_name in self.0.regex.capture_names().flatten() {
            // `extend` pushes the label when `Label::new` succeeds and adds
            // nothing when it fails.
            labels.extend(Label::new(group_name));
        }
        Ok(labels)
    }

    /// The same match, viewed as a list of numbered capture groups.
    fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
        Some(Arc::new(RegexExtractor(self.0.clone())))
    }
}
/// List view: the numbered capture groups, with the whole match (group 0)
/// left out. List index `i` therefore refers to capture group `i + 1`.
#[async_trait::async_trait]
impl ListExtractor for RegexExtractor {
    /// Return capture group `idx + 1`.
    ///
    /// Returns `Ok(None)` when `idx` is out of range and [PileValue::Null]
    /// when the group exists but did not participate in the match.
    async fn get(
        &self,
        _state: &ExtractState,
        idx: usize,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // `checked_add` keeps `usize::MAX` out of range. A plain `idx + 1`
        // would panic in debug builds and wrap to the whole-match slot in
        // release builds.
        let Some(slot) = idx
            .checked_add(1)
            .and_then(|raw_idx| self.0.captures.get(raw_idx))
        else {
            return Ok(None);
        };

        Ok(Some(match slot {
            Some(s) => PileValue::String(s.clone()),
            None => PileValue::Null,
        }))
    }

    /// Number of capture groups, not counting the whole match.
    async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
        Ok(self.0.captures.len().saturating_sub(1))
    }
}

View File

@@ -1,9 +1,13 @@
use pile_config::Label; use pile_config::Label;
use regex::Regex;
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use std::sync::Arc; use std::sync::Arc;
use crate::{ use crate::{
extract::traits::{ExtractState, ObjectExtractor}, extract::{
regex::RegexExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::PileValue, value::PileValue,
}; };
@@ -67,6 +71,18 @@ impl ObjectExtractor for StringExtractor {
.collect(), .collect(),
))), ))),
("regex", Some(pattern)) => {
let Ok(re) = Regex::new(pattern) else {
return Ok(None);
};
Some(
match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
None => PileValue::Null,
},
)
}
_ => None, _ => None,
}) })
} }
@@ -78,6 +94,9 @@ impl ObjectExtractor for StringExtractor {
Label::new("upper").unwrap(), Label::new("upper").unwrap(),
Label::new("lower").unwrap(), Label::new("lower").unwrap(),
Label::new("nonempty").unwrap(), Label::new("nonempty").unwrap(),
Label::new("trimprefix").unwrap(),
Label::new("trimsuffix").unwrap(),
Label::new("split").unwrap(),
]); ]);
} }
} }

View File

@@ -35,6 +35,11 @@ pub trait ObjectExtractor: Send + Sync {
/// and [None] for all others. /// and [None] for all others.
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>; async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
/// Return a list view of this extractor, if supported.
///
/// The default implementation returns [None]. Extractors whose contents can
/// also be addressed by position override this.
fn as_list(&self) -> Option<std::sync::Arc<dyn ListExtractor>> {
None
}
/// Convert this to a JSON value. /// Convert this to a JSON value.
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> { async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
let keys = self.fields().await?; let keys = self.fields().await?;

View File

@@ -86,7 +86,9 @@ impl PileValue {
Self::String(_) => Arc::new(VecExtractor::default()), Self::String(_) => Arc::new(VecExtractor::default()),
Self::Blob { .. } => Arc::new(VecExtractor::default()), Self::Blob { .. } => Arc::new(VecExtractor::default()),
Self::ListExtractor(e) => e.clone(), Self::ListExtractor(e) => e.clone(),
Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()), Self::ObjectExtractor(e) => e
.as_list()
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
Self::Item(_) => Arc::new(VecExtractor::default()), Self::Item(_) => Arc::new(VecExtractor::default()),
} }
} }