Compare commits
1 Commits
2a22086992
...
6b4fbf0ae9
| Author | SHA1 | Date | |
|---|---|---|---|
| 6b4fbf0ae9 |
@@ -57,6 +57,7 @@ unimplemented = "deny"
|
|||||||
unwrap_used = "warn"
|
unwrap_used = "warn"
|
||||||
expect_used = "warn"
|
expect_used = "warn"
|
||||||
type_complexity = "allow"
|
type_complexity = "allow"
|
||||||
|
len_without_is_empty = "allow"
|
||||||
|
|
||||||
#
|
#
|
||||||
# MARK: dependencies
|
# MARK: dependencies
|
||||||
|
|||||||
@@ -460,6 +460,37 @@ impl Datasets {
|
|||||||
return Ok(results);
|
return Ok(results);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn fts_lookup_fuzzy(
|
||||||
|
&self,
|
||||||
|
query: &str,
|
||||||
|
top_n: usize,
|
||||||
|
) -> Result<Vec<FtsLookupResult>, DatasetError> {
|
||||||
|
let workdir = match self.path_workdir.as_ref() {
|
||||||
|
Some(x) => x,
|
||||||
|
None => {
|
||||||
|
warn!("Skipping fts_lookup_fuzzy, no workdir");
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let fts_dir = workdir.join("fts");
|
||||||
|
|
||||||
|
if !fts_dir.exists() {
|
||||||
|
return Err(DatasetError::NoFtsIndex);
|
||||||
|
}
|
||||||
|
if !fts_dir.is_dir() {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
ErrorKind::NotADirectory,
|
||||||
|
format!("fts index {} is not a directory", fts_dir.display()),
|
||||||
|
)
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let db_index = DbFtsIndex::new(&fts_dir, &self.config);
|
||||||
|
let results = db_index.lookup_fuzzy(query, Arc::new(TopDocs::with_limit(top_n)), 3)?;
|
||||||
|
return Ok(results);
|
||||||
|
}
|
||||||
|
|
||||||
/// Time at which fts was created
|
/// Time at which fts was created
|
||||||
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||||
let workdir = match self.path_workdir.as_ref() {
|
let workdir = match self.path_workdir.as_ref() {
|
||||||
|
|||||||
@@ -7,8 +7,8 @@ use std::{path::PathBuf, sync::LazyLock};
|
|||||||
use tantivy::{
|
use tantivy::{
|
||||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||||
collector::Collector,
|
collector::Collector,
|
||||||
query::QueryParser,
|
query::{BooleanQuery, FuzzyTermQuery, Occur, QueryParser},
|
||||||
schema::{self, Schema, Value as TantivyValue},
|
schema::{self, Schema, Term, Value as TantivyValue},
|
||||||
};
|
};
|
||||||
use tracing::warn;
|
use tracing::warn;
|
||||||
|
|
||||||
@@ -168,6 +168,74 @@ impl DbFtsIndex {
|
|||||||
let query = query_parser.parse_query(&query)?;
|
let query = query_parser.parse_query(&query)?;
|
||||||
let res = searcher.search(&query, collector.as_ref())?;
|
let res = searcher.search(&query, collector.as_ref())?;
|
||||||
|
|
||||||
|
return Self::collect_results(&schema, &searcher, res);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a fuzzy query on this table's fts index.
|
||||||
|
/// Each whitespace-separated term is matched with edit distance 1.
|
||||||
|
///
|
||||||
|
/// See [`Self::lookup`] for caveats about concurrent writes.
|
||||||
|
pub fn lookup_fuzzy<C>(
|
||||||
|
&self,
|
||||||
|
query: impl Into<String>,
|
||||||
|
collector: impl AsRef<C> + Send + 'static,
|
||||||
|
distance: u8,
|
||||||
|
) -> Result<Vec<FtsLookupResult>, TantivyError>
|
||||||
|
where
|
||||||
|
C: Collector,
|
||||||
|
C::Fruit: IntoIterator<Item = (f32, DocAddress)>,
|
||||||
|
{
|
||||||
|
if !self.path.exists() {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.path.is_dir() {
|
||||||
|
warn!("fts index at {} is not a directory?!", self.path.display());
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let index = Index::open_in_dir(&self.path)?;
|
||||||
|
let reader = index
|
||||||
|
.reader_builder()
|
||||||
|
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||||
|
.try_into()?;
|
||||||
|
|
||||||
|
let schema = index.schema();
|
||||||
|
|
||||||
|
let search_fields: Vec<_> = self
|
||||||
|
.schema
|
||||||
|
.fields()
|
||||||
|
.filter(|(_, entry)| !entry.name().starts_with("_meta_"))
|
||||||
|
.map(|(field, _)| field)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let query: String = query.into();
|
||||||
|
let mut clauses: Vec<(Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
|
||||||
|
for term_str in query.split_whitespace() {
|
||||||
|
for &field in &search_fields {
|
||||||
|
let term = Term::from_field_text(field, term_str);
|
||||||
|
clauses.push((
|
||||||
|
Occur::Should,
|
||||||
|
Box::new(FuzzyTermQuery::new(term, distance, true)),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if clauses.is_empty() {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let res = searcher.search(&BooleanQuery::new(clauses), collector.as_ref())?;
|
||||||
|
|
||||||
|
return Self::collect_results(&schema, &searcher, res);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_results(
|
||||||
|
schema: &Schema,
|
||||||
|
searcher: &tantivy::Searcher,
|
||||||
|
res: impl IntoIterator<Item = (f32, DocAddress)>,
|
||||||
|
) -> Result<Vec<FtsLookupResult>, TantivyError> {
|
||||||
let mut out = Vec::new();
|
let mut out = Vec::new();
|
||||||
for (score, doc) in res {
|
for (score, doc) in res {
|
||||||
let retrieved_doc: TantivyDocument = searcher.doc(doc)?;
|
let retrieved_doc: TantivyDocument = searcher.doc(doc)?;
|
||||||
|
|||||||
Reference in New Issue
Block a user