diff --git a/Cargo.lock b/Cargo.lock index a8e9d46..e64ccd2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -91,6 +91,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" @@ -135,6 +141,20 @@ dependencies = [ "crunchy", ] +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + [[package]] name = "block-buffer" version = "0.11.0" @@ -289,6 +309,12 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -448,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -858,7 +884,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -968,6 +994,7 @@ dependencies = [ name = "pile-dataset" version = "0.0.1" dependencies = [ + "blake3", "chrono", "itertools", "pile-config", @@ -979,6 +1006,7 @@ dependencies = [ "tantivy", "thiserror", "toml", + "toml_edit", "tracing", "walkdir", ] @@ -1196,7 +1224,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1569,10 +1597,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.1", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1687,6 +1715,19 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_edit" +version = "0.25.4+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" +dependencies = [ + "indexmap", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow", +] + [[package]] name = "toml_parser" version = "1.0.9+spec-1.1.0" @@ -2026,7 +2067,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2255,6 +2296,9 @@ name = "winnow" version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + "memchr", +] [[package]] name = "wit-bindgen" diff --git a/Cargo.toml b/Cargo.toml index cd47aa3..f137f73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,7 +87,9 @@ serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" base64 = "0.22.1" toml = "1.0.3" +toml_edit = "0.25.4" sha2 = "0.11.0-rc.5" +blake3 = "1.8.3" # Misc helpers thiserror = "2.0.18" diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml index d60c1c9..b65142a 100644 --- a/crates/pile-dataset/Cargo.toml +++ b/crates/pile-dataset/Cargo.toml @@ -22,3 +22,5 @@ toml = { workspace = true } thiserror = { workspace = true } rayon = { workspace = true } smartstring = { workspace = true } +blake3 = { workspace = true } +toml_edit = { workspace = true } diff --git a/crates/pile-dataset/src/dataset.rs b/crates/pile-dataset/src/dataset.rs index 27f9d2c..c20823a 100644 --- a/crates/pile-dataset/src/dataset.rs +++ b/crates/pile-dataset/src/dataset.rs @@ -21,7 +21,7 @@ use thiserror::Error; use tracing::{debug, info, trace, warn}; use crate::{ - DataSource, Item, + DataSource, FileItem, index::{DbFtsIndex, FtsLookupResult}, path_ts_earliest, source::DirDataSource, @@ -96,11 +96,7 @@ impl Dataset { // MARK: get // - pub fn get( - &self, - source: &Label, - key: &PathBuf, - ) -> Option + 'static>> { + pub fn get(&self, source: &Label, key: &PathBuf) -> Option { let s = self.config.dataset.source.get(source)?; let s = match s { Source::Filesystem { path, sidecars } => { @@ -115,7 +111,7 @@ impl Dataset { // MARK: fts // - /// Refresh this dataset's fts index + /// Refresh this dataset's fts index. pub fn fts_refresh( &self, threads: usize, @@ -163,7 +159,7 @@ impl Dataset { .install(|| { batch .into_par_iter() - .filter_map(|(key, item)| match db_index.entry_to_document(&*item) { + .filter_map(|(key, item)| match db_index.entry_to_document(&item) { Ok(Some(doc)) => Some((key, doc)), Ok(None) => { warn!("Skipping {key:?}, document is empty"); @@ -306,7 +302,7 @@ fn start_read_task( batch_size: usize, ) -> ( JoinHandle<()>, - Receiver>)>, DatasetError>>, + Receiver, DatasetError>>, ) { let config = config.clone(); let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2); diff --git a/crates/pile-dataset/src/extract/sidecar.rs b/crates/pile-dataset/src/extract/sidecar.rs index b282877..77f09ce 100644 --- a/crates/pile-dataset/src/extract/sidecar.rs +++ b/crates/pile-dataset/src/extract/sidecar.rs @@ -39,9 +39,11 @@ impl<'a> SidecarExtractor<'a> { return Ok(self.output.get_or_init(HashMap::new)); } - let sidecar = std::fs::read_to_string(&sidecar_file)?; - let sidecar: toml::Value = toml::from_str(&sidecar) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + let sidecar = std::fs::read(&sidecar_file)?; + let sidecar: toml::Value = match toml::from_slice(&sidecar) { + Ok(x) => x, + Err(_) => return Ok(self.output.get_or_init(HashMap::new)), + }; let output: HashMap> = match sidecar { toml::Value::Table(t) => t diff --git a/crates/pile-dataset/src/index/index_fts.rs b/crates/pile-dataset/src/index/index_fts.rs index 95aa23d..8937fe3 100644 --- a/crates/pile-dataset/src/index/index_fts.rs +++ b/crates/pile-dataset/src/index/index_fts.rs @@ -63,9 +63,9 @@ impl DbFtsIndex { // /// Turn an entry into a tantivy document - pub fn entry_to_document( + pub fn entry_to_document>( &self, - item: &dyn Item, + item: &I, ) -> Result, TantivyError> { let mut doc = TantivyDocument::default(); diff --git a/crates/pile-dataset/src/item.rs b/crates/pile-dataset/src/item.rs index a274547..1513679 100644 --- a/crates/pile-dataset/src/item.rs +++ b/crates/pile-dataset/src/item.rs @@ -1,5 +1,10 @@ use pile_config::Label; -use std::{fmt::Debug, path::PathBuf}; +use std::{fmt::Debug, path::PathBuf, rc::Rc}; + +use crate::{ + PileValue, + extract::{Extractor, SidecarExtractor}, +}; // // MARK: key @@ -28,12 +33,27 @@ impl Key for PathBuf { // /// A pointer to raw data -pub trait Item: Debug + Send + Sync + 'static { +pub trait Item: Debug + Send + Sync + 'static + Sized { type Key: Key; fn source_name(&self) -> &str; fn key(&self) -> &Self::Key; + /// Get this item's sidecar metadata + fn sidecar(&self) -> Result + '_>>, std::io::Error>; + + /// Set this file's sidecar metadata, + /// overwriting any existing file. + fn write_sidecar( + &self, + path: Vec