Add regex extractor
This commit is contained in:
@@ -21,7 +21,52 @@ impl Tokenizer {
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
let mut window_start = None;
|
||||
// Paren depth: while > 0, `.` / `[` / `]` / `$` are part of the ident.
|
||||
let mut paren_depth: usize = 0;
|
||||
// When true, the current char is escaped by a preceding `\` and is
|
||||
// treated as a plain ident character with no special meaning.
|
||||
let mut skip_next = false;
|
||||
|
||||
for (i, c) in source.char_indices() {
|
||||
if skip_next {
|
||||
skip_next = false;
|
||||
// Escaped char: just extend the ident window (already opened by `\`).
|
||||
continue;
|
||||
}
|
||||
|
||||
if c == '\\' {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
skip_next = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if paren_depth > 0 {
|
||||
// Inside parens: only track depth changes, everything else is ident.
|
||||
match c {
|
||||
'(' => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
paren_depth += 1;
|
||||
}
|
||||
')' => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
paren_depth -= 1;
|
||||
}
|
||||
x if x.is_ascii() => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
}
|
||||
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match c {
|
||||
'$' => {
|
||||
if let Some(s) = window_start.take() {
|
||||
@@ -51,10 +96,26 @@ impl Tokenizer {
|
||||
tokens.push((i, Token::SqbClose));
|
||||
}
|
||||
|
||||
x if x.is_ascii() => match window_start {
|
||||
None => window_start = Some(i),
|
||||
Some(_) => continue,
|
||||
},
|
||||
'(' => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
paren_depth += 1;
|
||||
}
|
||||
|
||||
')' => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
// paren_depth is 0 here — stray `)` is an ident char and
|
||||
// parse_field will surface the error later.
|
||||
}
|
||||
|
||||
x if x.is_ascii() => {
|
||||
if window_start.is_none() {
|
||||
window_start = Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
||||
}
|
||||
|
||||
@@ -122,6 +122,14 @@ mod tests {
|
||||
GroupSegment::Literal(s.into())
|
||||
}
|
||||
|
||||
/// A `regex(...)` argument containing nested and backslash-escaped
/// parentheses must come back from `parse` as one `(0, path(..))` entry.
#[test]
fn regex() {
    let source = "{$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]}";
    let expected = vec![(0, path("$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]"))];

    let parsed = parse(source).unwrap();

    assert_eq!(parsed, expected);
}
|
||||
|
||||
#[test]
|
||||
fn single_path() {
|
||||
assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]);
|
||||
|
||||
@@ -18,6 +18,7 @@ tracing = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
toml = { workspace = true }
|
||||
smartstring = { workspace = true }
|
||||
regex = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
epub = { workspace = true }
|
||||
kamadak-exif = { workspace = true }
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
pub mod item;
|
||||
pub mod misc;
|
||||
pub mod regex;
|
||||
pub mod string;
|
||||
pub mod traits;
|
||||
|
||||
104
crates/pile-value/src/extract/regex.rs
Normal file
104
crates/pile-value/src/extract/regex.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pile_config::Label;
|
||||
use regex::Regex;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
struct RegexData {
|
||||
regex: Arc<Regex>,
|
||||
/// Captured substrings indexed by group index (0 = whole match).
|
||||
captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
|
||||
}
|
||||
|
||||
impl RegexData {
|
||||
fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
||||
let caps = regex.captures(input)?;
|
||||
let captures = caps
|
||||
.iter()
|
||||
.map(|m| m.map(|m| Arc::new(m.as_str().into())))
|
||||
.collect();
|
||||
Some(Self { regex, captures })
|
||||
}
|
||||
}
|
||||
|
||||
/// Exposes named capture groups as object fields.
|
||||
pub struct RegexExtractor(Arc<RegexData>);
|
||||
|
||||
impl RegexExtractor {
|
||||
/// Run `regex` against `input`. Returns `None` if there is no match.
|
||||
pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
||||
Some(Self(Arc::new(RegexData::new(regex, input)?)))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for RegexExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
_state: &ExtractState,
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if args.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let Some(idx) = self
|
||||
.0
|
||||
.regex
|
||||
.capture_names()
|
||||
.position(|n| n == Some(name.as_str()))
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
Ok(Some(
|
||||
match self.0.captures.get(idx).and_then(|v| v.as_ref()) {
|
||||
Some(s) => PileValue::String(s.clone()),
|
||||
None => PileValue::Null,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
Ok(self
|
||||
.0
|
||||
.regex
|
||||
.capture_names()
|
||||
.flatten()
|
||||
.map(|n| Label::new(n).unwrap())
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
|
||||
Some(Arc::new(RegexExtractor(self.0.clone())))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for RegexExtractor {
|
||||
async fn get(
|
||||
&self,
|
||||
_state: &ExtractState,
|
||||
idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
let raw_idx = idx + 1;
|
||||
let Some(slot) = self.0.captures.get(raw_idx) else {
|
||||
return Ok(None);
|
||||
};
|
||||
Ok(Some(match slot {
|
||||
Some(s) => PileValue::String(s.clone()),
|
||||
None => PileValue::Null,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
|
||||
Ok(self.0.captures.len().saturating_sub(1))
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,13 @@
|
||||
use pile_config::Label;
|
||||
use regex::Regex;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
extract::{
|
||||
regex::RegexExtractor,
|
||||
traits::{ExtractState, ObjectExtractor},
|
||||
},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
@@ -67,6 +71,18 @@ impl ObjectExtractor for StringExtractor {
|
||||
.collect(),
|
||||
))),
|
||||
|
||||
("regex", Some(pattern)) => {
|
||||
let Ok(re) = Regex::new(pattern) else {
|
||||
return Ok(None);
|
||||
};
|
||||
Some(
|
||||
match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
|
||||
Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
|
||||
None => PileValue::Null,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
@@ -78,6 +94,9 @@ impl ObjectExtractor for StringExtractor {
|
||||
Label::new("upper").unwrap(),
|
||||
Label::new("lower").unwrap(),
|
||||
Label::new("nonempty").unwrap(),
|
||||
Label::new("trimprefix").unwrap(),
|
||||
Label::new("trimsuffix").unwrap(),
|
||||
Label::new("split").unwrap(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,11 @@ pub trait ObjectExtractor: Send + Sync {
|
||||
/// and [None] for all others.
|
||||
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
||||
|
||||
/// Return a list view of this extractor, if supported.
///
/// The default implementation returns [None], i.e. no list view.
|
||||
fn as_list(&self) -> Option<std::sync::Arc<dyn ListExtractor>> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Convert this to a JSON value.
|
||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
||||
let keys = self.fields().await?;
|
||||
|
||||
@@ -86,7 +86,9 @@ impl PileValue {
|
||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
||||
Self::ListExtractor(e) => e.clone(),
|
||||
Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()),
|
||||
Self::ObjectExtractor(e) => e
|
||||
.as_list()
|
||||
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user