Files
pile/crates/pile-dataset/src/dataset.rs
2026-03-11 10:12:36 -07:00

377 lines
9.1 KiB
Rust

use chrono::{DateTime, Utc};
use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{
source::{DataSource, DirDataSource, S3DataSource, misc::path_ts_earliest},
value::{Item, PileValue},
};
use serde_json::Value;
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
use thiserror::Error;
use tokio::task::JoinSet;
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
use tracing::{debug, info, trace, warn};
use crate::index::{DbFtsIndex, FtsLookupResult};
/// Errors produced by dataset operations (opening sources, building or
/// querying the fts index).
#[derive(Debug, Error)]
pub enum DatasetError {
    /// Wrapped filesystem / network I/O failure.
    #[error("{0}")]
    IoError(#[from] std::io::Error),
    /// Error bubbled up from the tantivy full-text-search engine.
    #[error("{0}")]
    TantivyError(#[from] TantivyError),
    /// A lookup was attempted but no fts index directory exists yet.
    #[error("this dataset does not have an fts index")]
    NoFtsIndex,
}
//
// MARK: Dataset enum
//
/// An opened data source — either a local filesystem directory or an S3 bucket.
pub enum Dataset {
    /// Local filesystem directory source.
    Dir(Arc<DirDataSource>),
    /// S3 (or S3-compatible) bucket source.
    S3(Arc<S3DataSource>),
}
impl Dataset {
pub async fn get(&self, key: &str) -> Option<Item> {
match self {
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
Self::S3(ds) => ds.get(key).await.ok().flatten(),
}
}
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
match self {
Self::Dir(ds) => ds.iter(),
Self::S3(ds) => ds.iter(),
}
}
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
match self {
Self::Dir(ds) => ds.latest_change().await,
Self::S3(ds) => ds.latest_change().await,
}
}
}
//
// MARK: Datasets collection
//
/// An opened dataset: config, working directory, and all opened sources.
pub struct Datasets {
    // Path of the TOML config file this dataset was opened from.
    pub path_config: PathBuf,
    // Directory containing the config file; relative source paths resolve here.
    pub path_parent: PathBuf,
    // Per-dataset working directory (holds the fts index, etc.).
    pub path_workdir: PathBuf,
    // Parsed configuration.
    pub config: ConfigToml,
    // Opened sources, keyed by their config label.
    pub sources: HashMap<Label, Dataset>,
}
impl Datasets {
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
let path_config = config.into();
let path_parent = path_config
.parent()
.ok_or(std::io::Error::new(
ErrorKind::NotADirectory,
format!("Config file {} has no parent", path_config.display()),
))?
.to_owned();
let config = {
let config = std::fs::read_to_string(&path_config)?;
let config: Result<ConfigToml, _> = toml::from_str(&config);
match config {
Ok(config) => {
trace!(message = "Loaded config", ?config);
config
}
Err(error) => {
return Err(std::io::Error::new(
ErrorKind::InvalidData,
format!("{} is invalid:\n{error}", path_config.display()),
));
}
}
};
let path_workdir = config
.dataset
.working_dir
.clone()
.unwrap_or(path_parent.join(".pile"))
.join(config.dataset.name.as_str());
let mut sources = HashMap::new();
for (label, source) in &config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
sources.insert(
label.clone(),
Dataset::Dir(Arc::new(DirDataSource::new(
label,
path_parent.join(path),
*sidecars,
))),
);
}
Source::S3 {
bucket,
prefix,
endpoint,
region,
credentials,
sidecars,
} => {
match S3DataSource::new(
label,
bucket.clone(),
prefix.clone(),
endpoint.clone(),
region.clone(),
credentials,
*sidecars,
) {
Ok(ds) => {
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
}
Err(err) => {
warn!("Could not open S3 source {label}: {err}");
}
}
}
}
}
return Ok(Self {
path_config,
path_parent,
path_workdir,
config,
sources,
});
}
//
// MARK: get
//
/// Look up `key` in the source labelled `source`.
///
/// Returns `None` when the label is unknown or the item cannot be fetched.
pub async fn get(&self, source: &Label, key: &str) -> Option<Item> {
    match self.sources.get(source) {
        Some(dataset) => dataset.get(key).await,
        None => None,
    }
}
/// Extract a field from an item by object path.
/// Returns `None` if the item or field is not found.
pub async fn get_field(
    &self,
    source: &Label,
    key: &str,
    path: &ObjectPath,
) -> Result<Option<Value>, std::io::Error> {
    // Missing item short-circuits to Ok(None); only query errors propagate.
    let item = match self.get(source, key).await {
        Some(item) => PileValue::Item(item),
        None => return Ok(None),
    };
    match item.query(path).await? {
        Some(value) => Ok(Some(value.to_json().await?)),
        None => Ok(None),
    }
}
//
// MARK: fts
//
/// Refresh this dataset's fts index.
///
/// The index is built in a temporary directory (`.tmp-fts`) and only
/// renamed over the live `fts` directory after a successful commit, so a
/// cancelled or failed build never clobbers an existing index.
///
/// # Errors
/// Returns `Cancelled` if `flag` fires, otherwise any I/O or tantivy
/// error encountered while building or committing.
pub async fn fts_refresh(
    &self,
    _threads: usize,
    flag: Option<CancelFlag>,
) -> Result<(), CancelableTaskError<DatasetError>> {
    let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
    let fts_dir = self.path_workdir.join("fts");
    // A leftover tmp dir means a previous build was interrupted; discard it.
    if fts_tmp_dir.is_dir() {
        // Fixed: previously logged `fts_dir` (the live index path) while
        // actually removing the temporary directory.
        warn!("Removing temporary index in {}", fts_tmp_dir.display());
        std::fs::remove_dir_all(&fts_tmp_dir).map_err(DatasetError::from)?;
    }
    std::fs::create_dir_all(&fts_tmp_dir).map_err(DatasetError::from)?;
    let db_index = Arc::new(DbFtsIndex::new(&fts_tmp_dir, &self.config));
    let mut index = Index::create_in_dir(&fts_tmp_dir, db_index.schema.clone())
        .map_err(DatasetError::from)?;
    // NOTE(review): `_threads` is ignored and the executor is hard-coded to
    // 10 threads — confirm whether it should honor the parameter.
    index.set_executor(Executor::multi_thread(10, "build-fts").map_err(DatasetError::from)?);
    // 50 MiB writer heap.
    let mut index_writer: IndexWriter =
        index.writer(50 * 1024 * 1024).map_err(DatasetError::from)?;
    let mut total = 0u64;
    let mut logged_at = Instant::now();
    // Cap on concurrently running document-conversion tasks.
    const CONCURRENCY: usize = 32;
    let mut join_set = JoinSet::new();
    // Shared handler for completed conversion tasks: add the document,
    // count it, and log progress at most once every 5 seconds.
    let mut process = |key, result: Result<_, _>| -> Result<(), DatasetError> {
        match result {
            Ok(Some(doc)) => {
                index_writer.add_document(doc).map_err(DatasetError::from)?;
                total += 1;
                if logged_at.elapsed().as_secs() >= 5 {
                    debug!("Indexed {total} documents so far");
                    logged_at = Instant::now();
                }
            }
            Ok(None) => warn!("Skipping {key:?}, document is empty"),
            // Unreadable entries are skipped rather than aborting the build.
            Err(err) => warn!("Could not read {key:?}, skipping. {err}"),
        }
        Ok(())
    };
    for (name, dataset) in &self.sources {
        info!("Loading source {name}");
        let mut stream = dataset.iter();
        while let Some(item_result) = stream.next().await {
            // Check for cancellation between items.
            if let Some(flag) = &flag
                && flag.is_cancelled()
            {
                return Err(CancelableTaskError::Cancelled);
            }
            let item = item_result.map_err(DatasetError::from)?;
            let db = Arc::clone(&db_index);
            join_set.spawn(async move {
                let key = item.key();
                let result = db.entry_to_document(&item).await;
                (key, result)
            });
            // Backpressure: drain completed tasks while the pool is full.
            while join_set.len() >= CONCURRENCY {
                match join_set.join_next().await {
                    Some(Ok((key, result))) => process(key, result)?,
                    Some(Err(e)) => warn!("Indexing task panicked: {e}"),
                    None => break,
                }
            }
        }
    }
    // Drain the remaining in-flight tasks.
    while let Some(join_result) = join_set.join_next().await {
        match join_result {
            Ok((key, result)) => process(key, result)?,
            Err(e) => warn!("Indexing task panicked: {e}"),
        }
    }
    // Last cancellation check before the expensive commit.
    if let Some(flag) = flag.as_ref()
        && flag.is_cancelled()
    {
        return Err(CancelableTaskError::Cancelled);
    }
    info!("Committing {total} documents");
    index_writer.commit().map_err(DatasetError::from)?;
    if fts_dir.is_dir() {
        warn!("Removing existing index in {}", fts_dir.display());
        std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
    }
    // Promote the freshly-built index into place.
    std::fs::rename(&fts_tmp_dir, &fts_dir).map_err(DatasetError::from)?;
    Ok(())
}
/// Query the on-disk fts index, returning up to `top_n` results.
///
/// # Errors
/// [`DatasetError::NoFtsIndex`] when the index has never been built, an
/// I/O error when the index path exists but is not a directory, and any
/// error the underlying lookup produces.
pub fn fts_lookup(
    &self,
    query: &str,
    top_n: usize,
) -> Result<Vec<FtsLookupResult>, DatasetError> {
    let fts_dir = self.path_workdir.join("fts");
    if !fts_dir.exists() {
        return Err(DatasetError::NoFtsIndex);
    }
    if !fts_dir.is_dir() {
        return Err(std::io::Error::new(
            ErrorKind::NotADirectory,
            format!("fts index {} is not a directory", fts_dir.display()),
        )
        .into());
    }
    let db_index = DbFtsIndex::new(&fts_dir, &self.config);
    Ok(db_index.lookup(query, Arc::new(TopDocs::with_limit(top_n)))?)
}
/// Time at which fts was created, as reported by `path_ts_earliest` over
/// the index directory; `None` when no index exists.
///
/// # Errors
/// Fails when the fts path exists but is not a directory, or on I/O
/// errors while reading timestamps.
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
    let fts_dir = self.path_workdir.join("fts");
    if !fts_dir.exists() {
        return Ok(None);
    }
    if !fts_dir.is_dir() {
        return Err(std::io::Error::new(
            ErrorKind::NotADirectory,
            format!("fts index {} is not a directory", fts_dir.display()),
        ));
    }
    path_ts_earliest(&fts_dir)
}
/// Time at which data was last modified, taken as the maximum across all
/// sources; `None` when no source reports a timestamp.
pub async fn ts_data(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
    let mut ts: Option<DateTime<Utc>> = None;
    for dataset in self.sources.values() {
        // `Option: Ord` with `None < Some(_)`, so `max` keeps the latest
        // known timestamp and ignores sources that report none — same
        // result as the previous three-arm match, with less code.
        ts = ts.max(dataset.latest_change().await?);
    }
    Ok(ts)
}
/// Returns true when the fts index is missing (and data exists) or is
/// older than the data.
///
/// When the data age cannot be determined, this returns false and warns
/// — even if the fts index is also missing. (The previous doc comment
/// claimed "true if we do not have an fts index", which did not match
/// the `(None, None)` behavior.)
pub async fn needs_fts(&self) -> Result<bool, std::io::Error> {
    let start = Instant::now();
    let ts_fts = self.ts_fts()?;
    let ts_data = self.ts_data().await?;
    let result = match (ts_fts, ts_data) {
        // No index yet, and data demonstrably exists: rebuild.
        (None, Some(_)) => true,
        // Unknown data age: do not trigger a rebuild.
        (None, None) | (Some(_), None) => {
            warn!("Could not determine data age");
            false
        }
        // Rebuild only when the data is strictly newer than the index.
        (Some(ts_fts), Some(ts_data)) => ts_data > ts_fts,
    };
    debug!(
        message = "Ran needs_fts",
        time_ms = start.elapsed().as_millis(),
        ?result
    );
    Ok(result)
}
}