Compare commits

...

1 Commits

Author SHA1 Message Date
6b4fbf0ae9 Fuzzy search
All checks were successful
CI / Typos (push) Successful in 20s
CI / Clippy (push) Successful in 1m30s
CI / Build and test (all features) (push) Successful in 5m8s
CI / Build and test (push) Successful in 5m55s
2026-03-23 14:48:52 -07:00
3 changed files with 102 additions and 2 deletions

View File

@@ -57,6 +57,7 @@ unimplemented = "deny"
unwrap_used = "warn" unwrap_used = "warn"
expect_used = "warn" expect_used = "warn"
type_complexity = "allow" type_complexity = "allow"
len_without_is_empty = "allow"
# #
# MARK: dependencies # MARK: dependencies

View File

@@ -460,6 +460,37 @@ impl Datasets {
return Ok(results); return Ok(results);
} }
/// Run a fuzzy full-text query against this dataset's fts index.
///
/// The index is expected at `<workdir>/fts`. At most `top_n` results are
/// collected. If no workdir is configured, a warning is logged and an
/// empty list is returned.
///
/// # Errors
///
/// - [`DatasetError::NoFtsIndex`] if `<workdir>/fts` does not exist.
/// - An I/O error of kind `NotADirectory` if that path exists but is not
///   a directory.
/// - Any error produced by the underlying fuzzy lookup.
pub fn fts_lookup_fuzzy(
    &self,
    query: &str,
    top_n: usize,
) -> Result<Vec<FtsLookupResult>, DatasetError> {
    // tantivy's `FuzzyTermQuery` only builds Levenshtein automata for
    // distances 0..=2; a larger value makes every search fail with an
    // invalid-argument error. Use the largest supported distance.
    const FUZZY_DISTANCE: u8 = 2;

    let Some(workdir) = self.path_workdir.as_ref() else {
        warn!("Skipping fts_lookup_fuzzy, no workdir");
        return Ok(Vec::new());
    };

    let fts_dir = workdir.join("fts");

    if !fts_dir.exists() {
        return Err(DatasetError::NoFtsIndex);
    }

    if !fts_dir.is_dir() {
        return Err(std::io::Error::new(
            ErrorKind::NotADirectory,
            format!("fts index {} is not a directory", fts_dir.display()),
        )
        .into());
    }

    let db_index = DbFtsIndex::new(&fts_dir, &self.config);
    let results = db_index.lookup_fuzzy(
        query,
        Arc::new(TopDocs::with_limit(top_n)),
        FUZZY_DISTANCE,
    )?;

    return Ok(results);
}
/// Time at which fts was created /// Time at which fts was created
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> { pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let workdir = match self.path_workdir.as_ref() { let workdir = match self.path_workdir.as_ref() {

View File

@@ -7,8 +7,8 @@ use std::{path::PathBuf, sync::LazyLock};
use tantivy::{ use tantivy::{
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError, DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
collector::Collector, collector::Collector,
query::QueryParser, query::{BooleanQuery, FuzzyTermQuery, Occur, QueryParser},
schema::{self, Schema, Value as TantivyValue}, schema::{self, Schema, Term, Value as TantivyValue},
}; };
use tracing::warn; use tracing::warn;
@@ -168,6 +168,74 @@ impl DbFtsIndex {
let query = query_parser.parse_query(&query)?; let query = query_parser.parse_query(&query)?;
let res = searcher.search(&query, collector.as_ref())?; let res = searcher.search(&query, collector.as_ref())?;
return Self::collect_results(&schema, &searcher, res);
}
/// Run a fuzzy query on this table's fts index.
///
/// `query` is split on whitespace. Each resulting term is searched in every
/// schema field whose name does not start with `_meta_`, allowing up to
/// `distance` edits (a transposition counts as a single edit). All clauses
/// are combined with `Occur::Should`, so a document matching any term in
/// any searched field is a candidate.
///
/// Returns an empty list when the index path does not exist, when it is not
/// a directory (a warning is logged), or when `query` yields no clauses.
///
/// NOTE(review): tantivy's `FuzzyTermQuery` is understood to reject
/// distances above 2 at search time; confirm against the pinned version
/// and keep callers within 0..=2.
///
/// NOTE(review): terms are used verbatim and are not passed through the
/// fields' tokenizers, so case handling depends on how the fields were
/// indexed; confirm this is intended.
///
/// See [`Self::lookup`] for caveats about concurrent writes.
pub fn lookup_fuzzy<C>(
    &self,
    query: impl Into<String>,
    collector: impl AsRef<C> + Send + 'static,
    distance: u8,
) -> Result<Vec<FtsLookupResult>, TantivyError>
where
    C: Collector,
    C::Fruit: IntoIterator<Item = (f32, DocAddress)>,
{
    if !self.path.exists() {
        return Ok(Vec::new());
    }

    if !self.path.is_dir() {
        warn!("fts index at {} is not a directory?!", self.path.display());
        return Ok(Vec::new());
    }

    let index = Index::open_in_dir(&self.path)?;
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()?;

    let schema = index.schema();

    // Search every field except internal `_meta_*` bookkeeping fields.
    let search_fields: Vec<_> = self
        .schema
        .fields()
        .filter(|(_, entry)| !entry.name().starts_with("_meta_"))
        .map(|(field, _)| field)
        .collect();

    let query: String = query.into();

    // One optional clause per (term, field) pair, terms outermost.
    let clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = query
        .split_whitespace()
        .flat_map(|term_str| {
            search_fields.iter().map(move |&field| {
                let term = Term::from_field_text(field, term_str);
                let fuzzy: Box<dyn tantivy::query::Query> =
                    Box::new(FuzzyTermQuery::new(term, distance, true));
                (Occur::Should, fuzzy)
            })
        })
        .collect();

    // Blank query, or no searchable fields: nothing to search for.
    if clauses.is_empty() {
        return Ok(Vec::new());
    }

    let searcher = reader.searcher();
    let res = searcher.search(&BooleanQuery::new(clauses), collector.as_ref())?;
    return Self::collect_results(&schema, &searcher, res);
}
fn collect_results(
schema: &Schema,
searcher: &tantivy::Searcher,
res: impl IntoIterator<Item = (f32, DocAddress)>,
) -> Result<Vec<FtsLookupResult>, TantivyError> {
let mut out = Vec::new(); let mut out = Vec::new();
for (score, doc) in res { for (score, doc) in res {
let retrieved_doc: TantivyDocument = searcher.doc(doc)?; let retrieved_doc: TantivyDocument = searcher.doc(doc)?;