Flatten arrays for FTS index
Some checks failed
CI / Typos (push) Successful in 31s
CI / Clippy (push) Failing after 1m13s
CI / Build and test (all features) (push) Successful in 4m22s
CI / Build and test (push) Successful in 6m2s

This commit is contained in:
2026-03-16 09:55:50 -07:00
parent 2a2d5af36c
commit 583a1aa6b1
2 changed files with 62 additions and 112 deletions

View File

@@ -1,4 +1,4 @@
use pile_config::{ConfigToml, DatasetFts, Label}; use pile_config::{ConfigToml, DatasetFts, Label, objectpath::ObjectPath};
use pile_value::{ use pile_value::{
extract::traits::ExtractState, extract::traits::ExtractState,
value::{Item, PileValue}, value::{Item, PileValue},
@@ -10,7 +10,7 @@ use tantivy::{
query::QueryParser, query::QueryParser,
schema::{self, Schema, Value as TantivyValue}, schema::{self, Schema, Value as TantivyValue},
}; };
use tracing::{debug, trace, warn}; use tracing::warn;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct FtsLookupResult { pub struct FtsLookupResult {
@@ -79,18 +79,12 @@ impl DbFtsIndex {
let mut empty = true; let mut empty = true;
for name in self.fts_cfg().fields.keys() { for name in self.fts_cfg().fields.keys() {
let x = self.get_field(state, &item, name).await?; let vals = self.get_field(state, &item, name).await?;
let field = self.schema.get_field(name)?;
let val = match x {
Some(x) => x,
None => continue,
};
for v in vals {
empty = false; empty = false;
doc.add_text(field, v);
let field = self.schema.get_field(name);
if let Ok(field) = field {
doc.add_text(field, val);
} }
} }
@@ -106,110 +100,29 @@ impl DbFtsIndex {
state: &ExtractState, state: &ExtractState,
extractor: &PileValue, extractor: &PileValue,
field_name: &Label, field_name: &Label,
) -> Result<Option<String>, std::io::Error> { ) -> Result<Vec<String>, std::io::Error> {
let field = match self.cfg.schema.get(field_name) { let field = match self.cfg.schema.get(field_name) {
Some(x) => x, Some(x) => x,
None => { None => {
warn!("Unknown field {field_name:?}"); warn!("Unknown field {field_name:?}");
return Ok(None); return Ok(Vec::new());
} }
}; };
// Try paths in order, using the first value we find // Try paths in order, using the first value we find
'outer: for path in field.path.as_slice() { for path in field.path.as_slice() {
let val = match extractor.query(state, path).await? { let val = match extractor.query(state, path).await? {
Some(x) => x, Some(x) => x,
None => return Ok(None), None => continue,
}; };
let mut val = match val { let val = val_to_string(state, &val, path, field_name).await?;
PileValue::Null => { if !val.is_empty() {
trace!( return Ok(val);
message = "Skipping field, is null",
field = field_name.to_string(),
?path,
// value = ?val
);
continue;
}
x => x.clone(),
};
loop {
val = match val {
PileValue::String(x) => return Ok(Some(x.to_string())),
PileValue::U64(x) => return Ok(Some(x.to_string())),
PileValue::I64(x) => return Ok(Some(x.to_string())),
PileValue::Array(x) => {
if x.len() == 1 {
x[0].clone()
} else if x.len() > 1 {
debug!(
message = "Skipping field, is array with more than one element",
field = field_name.to_string(),
?path,
);
continue 'outer;
} else {
debug!(
message = "Skipping field, is empty array",
field = field_name.to_string(),
?path,
);
continue 'outer;
} }
} }
PileValue::Null => { return Ok(Vec::new());
trace!(
message = "Skipping field, is null",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::ObjectExtractor(_) => {
trace!(
message = "Skipping field, is object",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::Item(_) => {
trace!(
message = "Skipping field, is item",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::ListExtractor(_) => {
trace!(
message = "Skipping field, is ListExtractor",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::Blob { .. } => {
trace!(
message = "Skipping field, is blob",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
}
}
}
return Ok(None);
} }
/// Run the given query on this table's FTS index. /// Run the given query on this table's FTS index.
@@ -298,3 +211,42 @@ impl DbFtsIndex {
return Ok(out); return Ok(out);
} }
} }
/// Flatten `val` into the list of strings it contributes to a text field.
///
/// - `String`, `U64` and `I64` values yield exactly one string.
/// - `Array` and `ListExtractor` values are walked element by element, recursively;
///   the strings produced by each element are appended in order.
/// - `Null`, `ObjectExtractor`, `Item` and `Blob` values yield nothing.
///
/// `path` and `field_name` identify where `val` came from. They are forwarded
/// through the recursion and used to describe the offending value in errors.
///
/// # Errors
///
/// Propagates any error returned by a list extractor's `len` or `get`.
/// Returns [`std::io::ErrorKind::InvalidData`] if a list extractor reports a
/// length but has no element at an index below that length.
async fn val_to_string(
    state: &ExtractState,
    val: &PileValue,
    path: &ObjectPath,
    field_name: &str,
) -> Result<Vec<String>, std::io::Error> {
    match val {
        PileValue::String(x) => return Ok(vec![x.to_string()]),
        PileValue::U64(x) => return Ok(vec![x.to_string()]),
        PileValue::I64(x) => return Ok(vec![x.to_string()]),

        PileValue::Array(x) => {
            let mut out = Vec::new();
            for x in x.iter() {
                // A recursive async call needs indirection, hence `Box::pin`.
                out.extend(Box::pin(val_to_string(state, x, path, field_name)).await?);
            }
            return Ok(out);
        }

        PileValue::ListExtractor(x) => {
            let mut out = Vec::new();
            let len = x.len(state).await?;
            for i in 0..len {
                // An index below `len` should always resolve. Report a broken
                // extractor as an error instead of panicking mid-index.
                let Some(v) = x.get(state, i).await? else {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        format!(
                            "list at {path:?} (field {field_name:?}) reports length {len} \
                             but has no element at index {i}"
                        ),
                    ));
                };
                out.extend(Box::pin(val_to_string(state, &v, path, field_name)).await?);
            }
            return Ok(out);
        }

        // These variants have no text representation and contribute nothing.
        PileValue::Null => {}
        PileValue::ObjectExtractor(_) => {}
        PileValue::Item(_) => {}
        PileValue::Blob { .. } => {}
    }

    return Ok(Vec::new());
}

View File

@@ -77,18 +77,16 @@ impl CliCmd for AnnotateCommand {
}; };
let item = PileValue::Item(item.clone()); let item = PileValue::Item(item.clone());
let Some(value) = index let vals =
index
.get_field(&state, &item, &field) .get_field(&state, &item, &field)
.await .await
.with_context(|| { .with_context(|| {
format!("while extracting field from {}", path.display()) format!("while extracting field from {}", path.display())
})? })?;
else {
continue;
};
// TODO: implement sidecar writing // TODO: implement sidecar writing
let _ = (&dest_path, &value); let _ = (&dest_path, &vals);
todo!("write_sidecar not yet implemented"); todo!("write_sidecar not yet implemented");
#[expect(unreachable_code)] #[expect(unreachable_code)]