lazily-evaluated extractors
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
use jsonpath_rust::JsonPath;
|
||||
use pile_config::{ConfigToml, DatasetFts, Label};
|
||||
use serde_json::Value;
|
||||
use std::{path::PathBuf, sync::LazyLock};
|
||||
use itertools::Itertools;
|
||||
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
|
||||
use std::{path::PathBuf, rc::Rc, sync::LazyLock};
|
||||
use tantivy::{
|
||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||
collector::Collector,
|
||||
@@ -10,7 +9,7 @@ use tantivy::{
|
||||
};
|
||||
use tracing::{debug, trace, warn};
|
||||
|
||||
use crate::{Item, Key};
|
||||
use crate::{Item, Key, PileValue, extract::MetaExtractor};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FtsLookupResult {
|
||||
@@ -84,10 +83,17 @@ impl DbFtsIndex {
|
||||
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
|
||||
doc.add_text(self.schema.get_field("_meta_key")?, key);
|
||||
|
||||
let json = item.json()?;
|
||||
let item = match item.as_file() {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let extractor = MetaExtractor::new(item);
|
||||
let extractor = PileValue::Extractor(Rc::new(extractor));
|
||||
|
||||
let mut empty = true;
|
||||
for name in self.fts_cfg().fields.keys() {
|
||||
let x = self.get_field(&json, name)?;
|
||||
let x = self.get_field(&extractor, name)?;
|
||||
|
||||
let val = match x {
|
||||
Some(x) => x,
|
||||
@@ -109,9 +115,9 @@ impl DbFtsIndex {
|
||||
// MARK: read
|
||||
//
|
||||
|
||||
pub fn get_field(
|
||||
pub fn get_field<I: Item>(
|
||||
&self,
|
||||
json: &Value,
|
||||
extractor: &PileValue<'_, I>,
|
||||
field_name: &Label,
|
||||
) -> Result<Option<String>, std::io::Error> {
|
||||
let field = match self.cfg.schema.get(field_name) {
|
||||
@@ -124,41 +130,23 @@ impl DbFtsIndex {
|
||||
|
||||
// Try paths in order, using the first value we find
|
||||
'outer: for path in field.path.as_slice() {
|
||||
let val = match json.query(path) {
|
||||
Ok(mut x) => {
|
||||
if x.len() > 1 {
|
||||
warn!(
|
||||
message = "Path returned more than one value, this is not supported. Skipping.",
|
||||
?path,
|
||||
field = field_name.to_string()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let segments = path
|
||||
.split('.')
|
||||
.map(|x| Label::new(x).unwrap_or_else(|| panic!("wtf {x}")))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
match x.pop() {
|
||||
Some(x) => x,
|
||||
None => continue,
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
warn!(
|
||||
message = "Invalid path, skipping",
|
||||
?path,
|
||||
field = field_name.to_string(),
|
||||
?error
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let val = match extractor.query(&segments)? {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let mut val = match val {
|
||||
Value::Null => {
|
||||
PileValue::Null => {
|
||||
trace!(
|
||||
message = "Skipping field, is null",
|
||||
field = field_name.to_string(),
|
||||
path,
|
||||
value = ?val
|
||||
// value = ?val
|
||||
);
|
||||
continue;
|
||||
}
|
||||
@@ -166,7 +154,7 @@ impl DbFtsIndex {
|
||||
};
|
||||
|
||||
for post in &field.post {
|
||||
val = match post.apply(&val) {
|
||||
val = match apply(post, &val) {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
@@ -175,7 +163,7 @@ impl DbFtsIndex {
|
||||
loop {
|
||||
val = match val {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
Value::Array(ref mut x) => {
|
||||
PileValue::Array(ref mut x) => {
|
||||
if x.len() == 1 {
|
||||
x.pop().unwrap()
|
||||
} else if x.len() > 1 {
|
||||
@@ -183,7 +171,7 @@ impl DbFtsIndex {
|
||||
message = "Skipping field, is array with more than one element",
|
||||
field = field_name.to_string(),
|
||||
path,
|
||||
value = ?val
|
||||
//value = ?val
|
||||
);
|
||||
continue 'outer;
|
||||
} else {
|
||||
@@ -191,32 +179,30 @@ impl DbFtsIndex {
|
||||
message = "Skipping field, is empty array",
|
||||
field = field_name.to_string(),
|
||||
path,
|
||||
value = ?val
|
||||
//value = ?val
|
||||
);
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
Value::Null => {
|
||||
PileValue::Null => {
|
||||
trace!(
|
||||
message = "Skipping field, is null",
|
||||
field = field_name.to_string(),
|
||||
path,
|
||||
value = ?val
|
||||
//value = ?val
|
||||
);
|
||||
continue 'outer;
|
||||
}
|
||||
Value::Object(_) => {
|
||||
PileValue::Extractor(_) => {
|
||||
trace!(
|
||||
message = "Skipping field, is object",
|
||||
field = field_name.to_string(),
|
||||
path,
|
||||
value = ?val
|
||||
//value = ?val
|
||||
);
|
||||
continue 'outer;
|
||||
}
|
||||
Value::Bool(x) => return Ok(Some(x.to_string())),
|
||||
Value::Number(x) => return Ok(Some(x.to_string())),
|
||||
Value::String(x) => return Ok(Some(x)),
|
||||
PileValue::String(x) => return Ok(Some(x.to_string())),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -310,3 +296,80 @@ impl DbFtsIndex {
|
||||
return Ok(out);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply<'a, I: Item>(
|
||||
post: &FieldSpecPost,
|
||||
val: &PileValue<'a, I>,
|
||||
) -> Option<PileValue<'a, I>> {
|
||||
Some(match post {
|
||||
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
|
||||
FieldSpecPost::NotEmpty { notempty: true } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::String(x) if x.is_empty() => return None,
|
||||
PileValue::Array(x) if x.is_empty() => return None,
|
||||
x => x.clone(),
|
||||
},
|
||||
|
||||
FieldSpecPost::SetCase { case: Case::Lower } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::Extractor(_) => return None,
|
||||
PileValue::String(x) => PileValue::String(x.to_lowercase().into()),
|
||||
|
||||
PileValue::Array(x) => {
|
||||
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
|
||||
}
|
||||
},
|
||||
|
||||
FieldSpecPost::SetCase { case: Case::Upper } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::Extractor(_) => return None,
|
||||
PileValue::String(x) => PileValue::String(x.to_uppercase().into()),
|
||||
|
||||
PileValue::Array(x) => {
|
||||
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
|
||||
}
|
||||
},
|
||||
|
||||
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::Extractor(_) => return None,
|
||||
|
||||
PileValue::String(x) => {
|
||||
PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into())
|
||||
}
|
||||
|
||||
PileValue::Array(x) => {
|
||||
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
|
||||
}
|
||||
},
|
||||
|
||||
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::Extractor(_) => return None,
|
||||
|
||||
PileValue::String(x) => {
|
||||
PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into())
|
||||
}
|
||||
|
||||
PileValue::Array(x) => {
|
||||
PileValue::Array(x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?)
|
||||
}
|
||||
},
|
||||
|
||||
FieldSpecPost::Join { join } => match val {
|
||||
PileValue::Null => return None,
|
||||
PileValue::Extractor(_) => return None,
|
||||
|
||||
PileValue::String(x) => PileValue::String(x.clone()),
|
||||
PileValue::Array(x) => PileValue::String(
|
||||
x.iter()
|
||||
.map(|x| apply(post, x))
|
||||
.map(|x| x.and_then(|x| x.as_str().map(|x| x.to_owned())))
|
||||
.collect::<Option<Vec<_>>>()?
|
||||
.into_iter()
|
||||
.join(join)
|
||||
.into(),
|
||||
),
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user