Refactor sidecars

2026-03-16 22:24:30 -07:00
parent f2f5726d7b
commit 053459f340
25 changed files with 674 additions and 530 deletions

@@ -11,7 +11,6 @@ use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::I
 use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
 use thiserror::Error;
 use tokio::task::JoinSet;
-use tokio_stream::{StreamExt, wrappers::ReceiverStream};
 use tracing::{debug, info, trace, warn};
 use crate::index::{DbFtsIndex, FtsLookupResult};
@@ -46,10 +45,10 @@ impl Dataset {
         }
     }
 
-    pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
+    pub fn iter(&self) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
         match self {
-            Self::Dir(ds) => ds.iter(),
-            Self::S3(ds) => ds.iter(),
+            Self::Dir(ds) => Box::new(ds.iter()),
+            Self::S3(ds) => Box::new(ds.iter()),
         }
     }
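
The signature change above swaps an owned async stream of fallible items for a borrowed, boxed synchronous iterator. A minimal self-contained analogue of the new shape; `Item`, `DirDataSource`, and their fields are stand-ins for illustration, not the crate's real definitions:

struct Item {
    key: String,
}

struct DirDataSource {
    items: Vec<Item>,
}

enum Dataset {
    Dir(DirDataSource),
}

impl Dataset {
    // Each backend exposes its own concrete iterator; the enum erases it
    // behind a boxed trait object. The '_ lifetime ties the iterator to
    // &self, and the Send bound lets it cross thread boundaries.
    fn iter(&self) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
        match self {
            Self::Dir(ds) => Box::new(ds.items.iter()),
        }
    }
}

fn main() {
    let ds = Dataset::Dir(DirDataSource {
        items: vec![Item { key: "a".into() }, Item { key: "b".into() }],
    });
    for item in ds.iter() {
        println!("{}", item.key);
    }
}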
@@ -76,7 +75,7 @@ pub struct Datasets {
 }
 
 impl Datasets {
-    pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
+    pub async fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
         let path_config = config.into();
         let path_parent = path_config
             .parent()
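
With `open` now async, construction can do I/O (per the later hunks, scanning each source) before returning, and call sites must await it. A sketch of the new call shape, using a stub in place of the real type; the config filename is illustrative:

use std::path::PathBuf;

// Stand-in for the crate's Datasets type, only to show the call shape.
struct Datasets;

impl Datasets {
    async fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
        let _path: PathBuf = config.into();
        Ok(Self)
    }
}

#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    // The constructor is awaited, so source scanning happens eagerly here
    // rather than lazily during iteration.
    let _datasets = Datasets::open("datasets.toml").await?;
    Ok(())
}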
@@ -118,7 +117,7 @@ impl Datasets {
                 Source::Filesystem {
                     enabled,
                     path,
-                    sidecars,
+                    pattern,
                 } => {
                     if !enabled {
                         continue;
@@ -126,11 +125,10 @@
                     sources.insert(
                         label.clone(),
-                        Dataset::Dir(Arc::new(DirDataSource::new(
-                            label,
-                            path_parent.join(path),
-                            *sidecars,
-                        ))),
+                        Dataset::Dir(
+                            DirDataSource::new(label, path_parent.join(path), pattern.clone())
+                                .await?,
+                        ),
                     );
                 }
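
`DirDataSource::new` is now awaited and fallible, which fits an eager directory scan at open time, and it takes the cloned `pattern` where the old code passed a copied `sidecars` flag. A hedged sketch of what such a constructor could look like; the signature, the `Option<String>` pattern type, and the suffix-matching filter are all assumptions for illustration:

use std::path::PathBuf;

struct Item {
    path: PathBuf,
}

struct DirDataSource {
    label: String,
    items: Vec<Item>,
}

impl DirDataSource {
    // Async and fallible: the directory is read up front, so configuration
    // errors surface at open time instead of mid-iteration.
    async fn new(
        label: &str,
        root: PathBuf,
        pattern: Option<String>,
    ) -> Result<Self, std::io::Error> {
        let mut items = Vec::new();
        let mut dir = tokio::fs::read_dir(&root).await?;
        while let Some(entry) = dir.next_entry().await? {
            let path = entry.path();
            // Stand-in filter: treat the pattern as a filename suffix. The
            // real crate presumably applies a proper glob or regex.
            let keep = pattern
                .as_deref()
                .map_or(true, |p| path.to_string_lossy().ends_with(p));
            if keep {
                items.push(Item { path });
            }
        }
        Ok(Self {
            label: label.to_string(),
            items,
        })
    }
}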
@@ -141,7 +139,7 @@
                     endpoint,
                     region,
                     credentials,
-                    sidecars,
+                    pattern,
                 } => {
                     if !enabled {
                         continue;
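
Both match arms drop `sidecars` (dereferenced in the old code, so likely a `bool`) in favor of `pattern` (cloned, so an owned value such as a `String`). The config enum after this commit plausibly looks like the following; only the fields visible in the diff are shown, and every type here is a guess:

use std::path::PathBuf;

// Hypothetical credentials shape, purely for illustration.
struct Credentials {
    access_key: String,
    secret_key: String,
}

// Hypothetical reconstruction of the updated Source enum: the per-source
// `sidecars: bool` toggle is replaced by an optional filename pattern.
enum Source {
    Filesystem {
        enabled: bool,
        path: PathBuf,
        pattern: Option<String>,
    },
    S3 {
        enabled: bool,
        endpoint: String,
        region: String,
        credentials: Credentials,
        pattern: Option<String>,
    },
}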
@@ -154,10 +152,12 @@
                         endpoint.clone(),
                         region.clone(),
                         credentials,
-                        *sidecars,
-                    ) {
+                        pattern.clone(),
+                    )
+                    .await
+                    {
                         Ok(ds) => {
-                            sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
+                            sources.insert(label.clone(), Dataset::S3(ds));
                         }
                         Err(err) => {
                             warn!("Could not open S3 source {label}: {err}");
@@ -258,17 +258,17 @@ impl Datasets {
         for (name, dataset) in &self.sources {
             info!("Loading source {name}");
-            let mut stream = dataset.iter();
-            while let Some(item_result) = stream.next().await {
+            let stream = dataset.iter();
+            for item in stream {
                 if let Some(flag) = &flag
                     && flag.is_cancelled()
                 {
                     return Err(CancelableTaskError::Cancelled);
                 }
-                let item = item_result.map_err(DatasetError::from)?;
                 let db = Arc::clone(&db_index);
                 let state = state.clone();
+                let item = item.clone();
                 join_set.spawn(async move {
                     let key = item.key();
                     let result = db.entry_to_document(&state, &item).await;
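
The rewritten loop consumes borrowed `&Item`s, so each item is cloned before moving into the spawned task, which must be `'static`. A self-contained analogue of the pattern; the types and the per-item work are stand-ins:

use std::sync::Arc;
use tokio::task::JoinSet;

#[derive(Clone)]
struct Item {
    key: String,
}

#[tokio::main]
async fn main() {
    let dataset = Arc::new(vec![
        Item { key: "a".into() },
        Item { key: "b".into() },
    ]);

    let mut join_set = JoinSet::new();
    for item in dataset.iter() {
        // iter() yields &Item borrowed from the dataset; a spawned task
        // needs owned 'static data, hence the clone.
        let item = item.clone();
        join_set.spawn(async move {
            println!("indexing {}", item.key);
        });
    }
    while let Some(res) = join_set.join_next().await {
        res.expect("task panicked");
    }
}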