Extractor refactor, S3 support
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
use itertools::Itertools;
|
||||
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
|
||||
use std::{path::PathBuf, rc::Rc, sync::LazyLock};
|
||||
use std::{
|
||||
path::PathBuf,
|
||||
sync::{Arc, LazyLock},
|
||||
};
|
||||
use tantivy::{
|
||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||
collector::Collector,
|
||||
@@ -9,7 +12,7 @@ use tantivy::{
|
||||
};
|
||||
use tracing::{debug, trace, warn};
|
||||
|
||||
use crate::{Item, Key, PileValue, extract::MetaExtractor};
|
||||
use crate::{Item, PileValue, extract::MetaExtractor};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FtsLookupResult {
|
||||
@@ -63,37 +66,21 @@ impl DbFtsIndex {
|
||||
//
|
||||
|
||||
/// Turn an entry into a tantivy document
|
||||
pub fn entry_to_document<K: Key, I: Item<Key = K>>(
|
||||
pub async fn entry_to_document(
|
||||
&self,
|
||||
item: &I,
|
||||
item: &Item,
|
||||
) -> Result<Option<TantivyDocument>, TantivyError> {
|
||||
let mut doc = TantivyDocument::default();
|
||||
|
||||
let key = match item.key().to_string() {
|
||||
Some(x) => x,
|
||||
None => {
|
||||
warn!(
|
||||
message = "Item key cannot be converted to a string, skipping",
|
||||
key = ?item.key(),
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
let key = item.key();
|
||||
|
||||
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
|
||||
doc.add_text(self.schema.get_field("_meta_key")?, key);
|
||||
|
||||
let item = match item.as_file() {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let extractor = MetaExtractor::new(item);
|
||||
let extractor = PileValue::Extractor(Rc::new(extractor));
|
||||
let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item)));
|
||||
|
||||
let mut empty = true;
|
||||
for name in self.fts_cfg().fields.keys() {
|
||||
let x = self.get_field(&extractor, name)?;
|
||||
let x = self.get_field(&extractor, name).await?;
|
||||
|
||||
let val = match x {
|
||||
Some(x) => x,
|
||||
@@ -115,9 +102,9 @@ impl DbFtsIndex {
|
||||
// MARK: read
|
||||
//
|
||||
|
||||
pub fn get_field<I: Item>(
|
||||
pub async fn get_field(
|
||||
&self,
|
||||
extractor: &PileValue<'_, I>,
|
||||
extractor: &PileValue<'_>,
|
||||
field_name: &Label,
|
||||
) -> Result<Option<String>, std::io::Error> {
|
||||
let field = match self.cfg.schema.get(field_name) {
|
||||
@@ -130,7 +117,7 @@ impl DbFtsIndex {
|
||||
|
||||
// Try paths in order, using the first value we find
|
||||
'outer: for path in field.path.as_slice() {
|
||||
let val = match extractor.query(path)? {
|
||||
let val = match extractor.query(path).await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(None),
|
||||
};
|
||||
@@ -292,10 +279,7 @@ impl DbFtsIndex {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply<'a, I: Item>(
|
||||
post: &FieldSpecPost,
|
||||
val: &PileValue<'a, I>,
|
||||
) -> Option<PileValue<'a, I>> {
|
||||
pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<'a>> {
|
||||
Some(match post {
|
||||
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
|
||||
FieldSpecPost::NotEmpty { notempty: true } => match val {
|
||||
|
||||
Reference in New Issue
Block a user