diff --git a/Cargo.toml b/Cargo.toml index 880021f..855ca0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ unimplemented = "deny" unwrap_used = "warn" expect_used = "warn" type_complexity = "allow" +len_without_is_empty = "allow" # # MARK: dependencies diff --git a/crates/pile-dataset/src/dataset.rs b/crates/pile-dataset/src/dataset.rs index 0fd9756..7d3311d 100644 --- a/crates/pile-dataset/src/dataset.rs +++ b/crates/pile-dataset/src/dataset.rs @@ -460,6 +460,43 @@ impl Datasets { return Ok(results); } + /// Fuzzy full-text search over this dataset's fts index, returning at most `top_n` results. + /// Returns an empty list (after logging a warning) when no workdir is configured. + /// + /// # Errors + /// Returns [`DatasetError::NoFtsIndex`] if the `fts` directory under the workdir does not exist, an I/O error (`ErrorKind::NotADirectory`) if it exists but is not a directory, and any error raised by [`DbFtsIndex::lookup_fuzzy`]. + pub fn fts_lookup_fuzzy( + &self, + query: &str, + top_n: usize, + ) -> Result, DatasetError> { + let workdir = match self.path_workdir.as_ref() { + Some(x) => x, + None => { + warn!("Skipping fts_lookup_fuzzy, no workdir"); + return Ok(Vec::new()); + } + }; + + let fts_dir = workdir.join("fts"); + + if !fts_dir.exists() { + return Err(DatasetError::NoFtsIndex); + } + if !fts_dir.is_dir() { + return Err(std::io::Error::new( + ErrorKind::NotADirectory, + format!("fts index {} is not a directory", fts_dir.display()), + ) + .into()); + } + + let db_index = DbFtsIndex::new(&fts_dir, &self.config); + // Tantivy's `FuzzyTermQuery` only accepts edit distances 0..=2; anything larger fails at search time. + let results = db_index.lookup_fuzzy(query, Arc::new(TopDocs::with_limit(top_n)), 2)?; + return Ok(results); + } + /// Time at which fts was created pub fn ts_fts(&self) -> Result>, std::io::Error> { let workdir = match self.path_workdir.as_ref() { diff --git a/crates/pile-dataset/src/index/index_fts.rs b/crates/pile-dataset/src/index/index_fts.rs index 0bf0369..87547b1 100644 --- a/crates/pile-dataset/src/index/index_fts.rs +++ b/crates/pile-dataset/src/index/index_fts.rs @@ -7,8 +7,8 @@ use std::{path::PathBuf, sync::LazyLock}; use tantivy::{ DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError, collector::Collector, - query::QueryParser, - schema::{self, Schema, Value as TantivyValue}, + query::{BooleanQuery, FuzzyTermQuery, Occur, QueryParser}, + schema::{self, Schema, Term, Value as TantivyValue}, }; use tracing::warn; @@ -168,6 +168,74 @@ impl 
DbFtsIndex { let query = query_parser.parse_query(&query)?; let res = searcher.search(&query, collector.as_ref())?; + return Self::collect_results(&schema, &searcher, res); + } + + /// Run a fuzzy query on this table's fts index. + /// Each whitespace-separated term of `query` is matched against every field whose name does not start with `_meta_`, within an edit distance of `distance`; the per-term, per-field clauses are combined with `Occur::Should`. + /// Returns an empty list when the index path is missing or not a directory, or (after the index is opened) when `query` contains no terms. NOTE(review): `distance` is passed to `FuzzyTermQuery` unvalidated; confirm which values tantivy accepts. + /// See [`Self::lookup`] for caveats about concurrent writes. + pub fn lookup_fuzzy( + &self, + query: impl Into, + collector: impl AsRef + Send + 'static, + distance: u8, + ) -> Result, TantivyError> + where + C: Collector, + C::Fruit: IntoIterator, + { + if !self.path.exists() { + return Ok(Vec::new()); + } + + if !self.path.is_dir() { + warn!("fts index at {} is not a directory?!", self.path.display()); + return Ok(Vec::new()); + } + + let index = Index::open_in_dir(&self.path)?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommitWithDelay) + .try_into()?; + + let schema = index.schema(); + + let search_fields: Vec<_> = self + .schema + .fields() + .filter(|(_, entry)| !entry.name().starts_with("_meta_")) + .map(|(field, _)| field) + .collect(); + + let query: String = query.into(); + let mut clauses: Vec<(Occur, Box)> = Vec::new(); + for term_str in query.split_whitespace() { + for &field in &search_fields { + let term = Term::from_field_text(field, term_str); + clauses.push(( + Occur::Should, + Box::new(FuzzyTermQuery::new(term, distance, true)), + )); + } + } + + if clauses.is_empty() { + return Ok(Vec::new()); + } + + let searcher = reader.searcher(); + let res = searcher.search(&BooleanQuery::new(clauses), collector.as_ref())?; + + return Self::collect_results(&schema, &searcher, res); + } + + fn collect_results( + schema: &Schema, + searcher: &tantivy::Searcher, + res: impl IntoIterator, + ) -> Result, TantivyError> { let mut out = Vec::new(); for (score, doc) in res { let retrieved_doc: TantivyDocument = searcher.doc(doc)?;