Compare commits
2 Commits
4010d3cc1c
...
6b4fbf0ae9
| Author | SHA1 | Date | |
|---|---|---|---|
| 6b4fbf0ae9 | |||
| 9008a248c1 |
@@ -57,6 +57,7 @@ unimplemented = "deny"
|
||||
unwrap_used = "warn"
|
||||
expect_used = "warn"
|
||||
type_complexity = "allow"
|
||||
len_without_is_empty = "allow"
|
||||
|
||||
#
|
||||
# MARK: dependencies
|
||||
|
||||
@@ -460,6 +460,37 @@ impl Datasets {
|
||||
return Ok(results);
|
||||
}
|
||||
|
||||
pub fn fts_lookup_fuzzy(
|
||||
&self,
|
||||
query: &str,
|
||||
top_n: usize,
|
||||
) -> Result<Vec<FtsLookupResult>, DatasetError> {
|
||||
let workdir = match self.path_workdir.as_ref() {
|
||||
Some(x) => x,
|
||||
None => {
|
||||
warn!("Skipping fts_lookup_fuzzy, no workdir");
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
};
|
||||
|
||||
let fts_dir = workdir.join("fts");
|
||||
|
||||
if !fts_dir.exists() {
|
||||
return Err(DatasetError::NoFtsIndex);
|
||||
}
|
||||
if !fts_dir.is_dir() {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::NotADirectory,
|
||||
format!("fts index {} is not a directory", fts_dir.display()),
|
||||
)
|
||||
.into());
|
||||
}
|
||||
|
||||
let db_index = DbFtsIndex::new(&fts_dir, &self.config);
|
||||
let results = db_index.lookup_fuzzy(query, Arc::new(TopDocs::with_limit(top_n)), 3)?;
|
||||
return Ok(results);
|
||||
}
|
||||
|
||||
/// Time at which fts was created
|
||||
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
let workdir = match self.path_workdir.as_ref() {
|
||||
|
||||
@@ -7,8 +7,8 @@ use std::{path::PathBuf, sync::LazyLock};
|
||||
use tantivy::{
|
||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||
collector::Collector,
|
||||
query::QueryParser,
|
||||
schema::{self, Schema, Value as TantivyValue},
|
||||
query::{BooleanQuery, FuzzyTermQuery, Occur, QueryParser},
|
||||
schema::{self, Schema, Term, Value as TantivyValue},
|
||||
};
|
||||
use tracing::warn;
|
||||
|
||||
@@ -168,6 +168,74 @@ impl DbFtsIndex {
|
||||
let query = query_parser.parse_query(&query)?;
|
||||
let res = searcher.search(&query, collector.as_ref())?;
|
||||
|
||||
return Self::collect_results(&schema, &searcher, res);
|
||||
}
|
||||
|
||||
/// Run a fuzzy query on this table's fts index.
|
||||
/// Each whitespace-separated term is matched with edit distance 1.
|
||||
///
|
||||
/// See [`Self::lookup`] for caveats about concurrent writes.
|
||||
pub fn lookup_fuzzy<C>(
|
||||
&self,
|
||||
query: impl Into<String>,
|
||||
collector: impl AsRef<C> + Send + 'static,
|
||||
distance: u8,
|
||||
) -> Result<Vec<FtsLookupResult>, TantivyError>
|
||||
where
|
||||
C: Collector,
|
||||
C::Fruit: IntoIterator<Item = (f32, DocAddress)>,
|
||||
{
|
||||
if !self.path.exists() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
if !self.path.is_dir() {
|
||||
warn!("fts index at {} is not a directory?!", self.path.display());
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let index = Index::open_in_dir(&self.path)?;
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||
.try_into()?;
|
||||
|
||||
let schema = index.schema();
|
||||
|
||||
let search_fields: Vec<_> = self
|
||||
.schema
|
||||
.fields()
|
||||
.filter(|(_, entry)| !entry.name().starts_with("_meta_"))
|
||||
.map(|(field, _)| field)
|
||||
.collect();
|
||||
|
||||
let query: String = query.into();
|
||||
let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
|
||||
for term_str in query.split_whitespace() {
|
||||
for &field in &search_fields {
|
||||
let term = Term::from_field_text(field, term_str);
|
||||
clauses.push((
|
||||
Occur::Should,
|
||||
Box::new(FuzzyTermQuery::new(term, distance, true)),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if clauses.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let res = searcher.search(&BooleanQuery::new(clauses), collector.as_ref())?;
|
||||
|
||||
return Self::collect_results(&schema, &searcher, res);
|
||||
}
|
||||
|
||||
fn collect_results(
|
||||
schema: &Schema,
|
||||
searcher: &tantivy::Searcher,
|
||||
res: impl IntoIterator<Item = (f32, DocAddress)>,
|
||||
) -> Result<Vec<FtsLookupResult>, TantivyError> {
|
||||
let mut out = Vec::new();
|
||||
for (score, doc) in res {
|
||||
let retrieved_doc: TantivyDocument = searcher.doc(doc)?;
|
||||
|
||||
@@ -112,6 +112,7 @@ impl DirDataSource {
|
||||
}
|
||||
|
||||
impl DataSource for Arc<DirDataSource> {
|
||||
#[expect(clippy::expect_used)]
|
||||
fn len(&self) -> usize {
|
||||
self.index.get().expect("index should be initialized").len()
|
||||
}
|
||||
|
||||
@@ -163,6 +163,7 @@ impl S3DataSource {
|
||||
}
|
||||
|
||||
impl DataSource for Arc<S3DataSource> {
|
||||
#[expect(clippy::expect_used)]
|
||||
fn len(&self) -> usize {
|
||||
self.index.get().expect("index should be initialized").len()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user