lazily-evaluated extractors

This commit is contained in:
2026-02-22 09:23:57 -08:00
parent d16d16be26
commit 751ff787e2
19 changed files with 525 additions and 391 deletions

View File

@@ -8,7 +8,11 @@ use rayon::{
use std::{
io::ErrorKind,
path::PathBuf,
sync::{Arc, mpsc::Receiver},
sync::{
Arc,
atomic::{AtomicU64, Ordering},
mpsc::Receiver,
},
thread::JoinHandle,
time::Instant,
};
@@ -144,15 +148,14 @@ impl Dataset {
let mut total = 0u64;
while let Ok(batch) = read_rx.recv() {
let batch = batch.map_err(DatasetError::from)?;
let len = batch.len() as u64;
let batch = batch?;
if let Some(flag) = &flag
&& flag.is_cancelled()
{
return Err(CancelableTaskError::Cancelled);
}
let this = AtomicU64::new(0);
let start = Instant::now();
write_pool
.install(|| {
@@ -170,6 +173,7 @@ impl Dataset {
}
})
.map(|(key, doc)| {
this.fetch_add(1, Ordering::Relaxed);
index_writer
.add_document(doc)
.map_err(|err| (key, err))
@@ -180,9 +184,10 @@ impl Dataset {
})
.map_err(|(_key, err)| DatasetError::from(err))?;
total += len;
let this = this.load(Ordering::Relaxed);
total += this;
let time_ms = start.elapsed().as_millis();
debug!("Added a batch of {len} in {time_ms} ms ({total} total)");
debug!("Added a batch of {this} in {time_ms} ms ({total} total)");
}
if let Some(flag) = flag.as_ref()
@@ -334,6 +339,13 @@ fn start_read_task(
}
}
}
if !batch.is_empty() {
match read_tx.send(Ok(batch)) {
Ok(()) => {}
Err(_) => return,
};
}
});
return (read_task, read_rx);