Write sidecar fields

This commit is contained in:
2026-03-06 15:16:35 -08:00
parent 22724eee3f
commit d51b8b51bf
11 changed files with 311 additions and 49 deletions

56
Cargo.lock generated
View File

@@ -91,6 +91,12 @@ dependencies = [
"rustversion", "rustversion",
] ]
[[package]]
name = "arrayref"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]] [[package]]
name = "arrayvec" name = "arrayvec"
version = "0.7.6" version = "0.7.6"
@@ -135,6 +141,20 @@ dependencies = [
"crunchy", "crunchy",
] ]
[[package]]
name = "blake3"
version = "1.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"constant_time_eq",
"cpufeatures",
]
[[package]] [[package]]
name = "block-buffer" name = "block-buffer"
version = "0.11.0" version = "0.11.0"
@@ -289,6 +309,12 @@ version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
[[package]]
name = "constant_time_eq"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
[[package]] [[package]]
name = "core-foundation-sys" name = "core-foundation-sys"
version = "0.8.7" version = "0.8.7"
@@ -448,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [ dependencies = [
"libc", "libc",
"windows-sys 0.59.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@@ -858,7 +884,7 @@ version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@@ -968,6 +994,7 @@ dependencies = [
name = "pile-dataset" name = "pile-dataset"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"blake3",
"chrono", "chrono",
"itertools", "itertools",
"pile-config", "pile-config",
@@ -979,6 +1006,7 @@ dependencies = [
"tantivy", "tantivy",
"thiserror", "thiserror",
"toml", "toml",
"toml_edit",
"tracing", "tracing",
"walkdir", "walkdir",
] ]
@@ -1196,7 +1224,7 @@ dependencies = [
"errno", "errno",
"libc", "libc",
"linux-raw-sys", "linux-raw-sys",
"windows-sys 0.59.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@@ -1569,10 +1597,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
dependencies = [ dependencies = [
"fastrand", "fastrand",
"getrandom 0.3.4", "getrandom 0.4.1",
"once_cell", "once_cell",
"rustix", "rustix",
"windows-sys 0.59.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@@ -1687,6 +1715,19 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "toml_edit"
version = "0.25.4+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2"
dependencies = [
"indexmap",
"toml_datetime",
"toml_parser",
"toml_writer",
"winnow",
]
[[package]] [[package]]
name = "toml_parser" name = "toml_parser"
version = "1.0.9+spec-1.1.0" version = "1.0.9+spec-1.1.0"
@@ -2026,7 +2067,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@@ -2255,6 +2296,9 @@ name = "winnow"
version = "0.7.14" version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "wit-bindgen" name = "wit-bindgen"

View File

@@ -87,7 +87,9 @@ serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.149" serde_json = "1.0.149"
base64 = "0.22.1" base64 = "0.22.1"
toml = "1.0.3" toml = "1.0.3"
toml_edit = "0.25.4"
sha2 = "0.11.0-rc.5" sha2 = "0.11.0-rc.5"
blake3 = "1.8.3"
# Misc helpers # Misc helpers
thiserror = "2.0.18" thiserror = "2.0.18"

View File

@@ -22,3 +22,5 @@ toml = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
rayon = { workspace = true } rayon = { workspace = true }
smartstring = { workspace = true } smartstring = { workspace = true }
blake3 = { workspace = true }
toml_edit = { workspace = true }

View File

@@ -21,7 +21,7 @@ use thiserror::Error;
use tracing::{debug, info, trace, warn}; use tracing::{debug, info, trace, warn};
use crate::{ use crate::{
DataSource, Item, DataSource, FileItem,
index::{DbFtsIndex, FtsLookupResult}, index::{DbFtsIndex, FtsLookupResult},
path_ts_earliest, path_ts_earliest,
source::DirDataSource, source::DirDataSource,
@@ -96,11 +96,7 @@ impl Dataset {
// MARK: get // MARK: get
// //
pub fn get( pub fn get(&self, source: &Label, key: &PathBuf) -> Option<FileItem> {
&self,
source: &Label,
key: &PathBuf,
) -> Option<Box<dyn Item<Key = PathBuf> + 'static>> {
let s = self.config.dataset.source.get(source)?; let s = self.config.dataset.source.get(source)?;
let s = match s { let s = match s {
Source::Filesystem { path, sidecars } => { Source::Filesystem { path, sidecars } => {
@@ -115,7 +111,7 @@ impl Dataset {
// MARK: fts // MARK: fts
// //
/// Refresh this dataset's fts index /// Refresh this dataset's fts index.
pub fn fts_refresh( pub fn fts_refresh(
&self, &self,
threads: usize, threads: usize,
@@ -163,7 +159,7 @@ impl Dataset {
.install(|| { .install(|| {
batch batch
.into_par_iter() .into_par_iter()
.filter_map(|(key, item)| match db_index.entry_to_document(&*item) { .filter_map(|(key, item)| match db_index.entry_to_document(&item) {
Ok(Some(doc)) => Some((key, doc)), Ok(Some(doc)) => Some((key, doc)),
Ok(None) => { Ok(None) => {
warn!("Skipping {key:?}, document is empty"); warn!("Skipping {key:?}, document is empty");
@@ -306,7 +302,7 @@ fn start_read_task(
batch_size: usize, batch_size: usize,
) -> ( ) -> (
JoinHandle<()>, JoinHandle<()>,
Receiver<Result<Vec<(PathBuf, Box<dyn Item<Key = PathBuf>>)>, DatasetError>>, Receiver<Result<Vec<(PathBuf, FileItem)>, DatasetError>>,
) { ) {
let config = config.clone(); let config = config.clone();
let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2); let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2);

View File

@@ -39,9 +39,11 @@ impl<'a> SidecarExtractor<'a> {
return Ok(self.output.get_or_init(HashMap::new)); return Ok(self.output.get_or_init(HashMap::new));
} }
let sidecar = std::fs::read_to_string(&sidecar_file)?; let sidecar = std::fs::read(&sidecar_file)?;
let sidecar: toml::Value = toml::from_str(&sidecar) let sidecar: toml::Value = match toml::from_slice(&sidecar) {
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar { let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar {
toml::Value::Table(t) => t toml::Value::Table(t) => t

View File

@@ -63,9 +63,9 @@ impl DbFtsIndex {
// //
/// Turn an entry into a tantivy document /// Turn an entry into a tantivy document
pub fn entry_to_document<K: Key>( pub fn entry_to_document<K: Key, I: Item<Key = K>>(
&self, &self,
item: &dyn Item<Key = K>, item: &I,
) -> Result<Option<TantivyDocument>, TantivyError> { ) -> Result<Option<TantivyDocument>, TantivyError> {
let mut doc = TantivyDocument::default(); let mut doc = TantivyDocument::default();

View File

@@ -1,5 +1,10 @@
use pile_config::Label; use pile_config::Label;
use std::{fmt::Debug, path::PathBuf}; use std::{fmt::Debug, path::PathBuf, rc::Rc};
use crate::{
PileValue,
extract::{Extractor, SidecarExtractor},
};
// //
// MARK: key // MARK: key
@@ -28,12 +33,27 @@ impl Key for PathBuf {
// //
/// A pointer to raw data /// A pointer to raw data
pub trait Item: Debug + Send + Sync + 'static { pub trait Item: Debug + Send + Sync + 'static + Sized {
type Key: Key; type Key: Key;
fn source_name(&self) -> &str; fn source_name(&self) -> &str;
fn key(&self) -> &Self::Key; fn key(&self) -> &Self::Key;
/// Get this item's sidecar metadata
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error>;
/// Set the value at `path` in this item's sidecar metadata,
/// replacing any value already stored at that path.
fn write_sidecar(
&self,
path: Vec<Label>,
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error>;
fn hash(&self) -> Result<blake3::Hash, std::io::Error>;
/// Item conversion, downcast to specific type.
/// Returns `None` if this is not a [FileItem]
fn as_file(&self) -> Option<&FileItem>; fn as_file(&self) -> Option<&FileItem>;
} }
@@ -62,4 +82,97 @@ impl Item for FileItem {
fn as_file(&self) -> Option<&FileItem> { fn as_file(&self) -> Option<&FileItem> {
Some(self) Some(self)
} }
/// Compute the BLAKE3 hash of the file at `self.path`.
///
/// # Errors
/// Returns any I/O error raised while opening or reading the file.
fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
    let file = std::fs::File::open(&self.path)?;
    let mut hasher = blake3::Hasher::new();
    // `update_reader` is blake3's recommended way to hash a reader: unlike
    // `std::io::copy`, it reads through a buffer large enough for the
    // hasher's SIMD code paths. The resulting hash is identical.
    hasher.update_reader(file)?;
    Ok(hasher.finalize())
}
/// Build the extractor that exposes this item's sidecar metadata.
///
/// Yields `Ok(None)` when sidecars are disabled for this item; otherwise
/// wraps a [SidecarExtractor] over `self`.
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error> {
    if self.sidecar {
        // TODO: use a generic tomlextractor instead?
        // you'll need a fake _ref_ to the toml file, though.
        let extractor = Rc::new(SidecarExtractor::new(self));
        Ok(Some(extractor))
    } else {
        Ok(None)
    }
}
fn write_sidecar(
&self,
path: Vec<Label>,
value: PileValue<'_, Self>,
) -> Result<(), std::io::Error> {
if !self.sidecar {
return Ok(());
}
let sidecar_path = self.path.with_extension("toml");
let mut doc: toml_edit::DocumentMut = if sidecar_path.is_file() {
let content = std::fs::read_to_string(&sidecar_path)?;
content.parse().unwrap_or_default()
} else {
toml_edit::DocumentMut::new()
};
fn to_edit_item(v: toml::Value) -> toml_edit::Item {
match v {
toml::Value::String(s) => toml_edit::value(s),
toml::Value::Integer(i) => toml_edit::value(i),
toml::Value::Float(f) => toml_edit::value(f),
toml::Value::Boolean(b) => toml_edit::value(b),
toml::Value::Datetime(d) => toml_edit::value(d.to_string()),
toml::Value::Array(arr) => {
let mut array = toml_edit::Array::new();
for item in arr {
if let toml_edit::Item::Value(v) = to_edit_item(item) {
array.push_formatted(v);
}
}
toml_edit::Item::Value(toml_edit::Value::Array(array))
}
toml::Value::Table(t) => {
let mut table = toml_edit::Table::new();
for (k, v) in t {
table.insert(&k, to_edit_item(v));
}
toml_edit::Item::Table(table)
}
}
}
let json_value = value.to_json()?;
let toml_value: toml::Value = serde_json::from_value(json_value)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
let item = to_edit_item(toml_value);
let Some((path_last, path_init)) = path.split_last() else {
return Ok(());
};
let mut table = doc.as_table_mut();
for label in path_init {
let key = label.as_str();
if !table.contains_key(key) {
table.insert(key, toml_edit::Item::Table(toml_edit::Table::new()));
}
table = table
.get_mut(key)
.and_then(|item| item.as_table_mut())
.ok_or_else(|| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
"path element is not a table",
)
})?;
}
table.insert(path_last.as_str(), item);
std::fs::write(&sidecar_path, doc.to_string())?;
Ok(())
}
} }

View File

@@ -4,7 +4,7 @@ use pile_config::Label;
use std::path::PathBuf; use std::path::PathBuf;
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::{DataSource, Item, item::FileItem, path_ts_latest}; use crate::{DataSource, item::FileItem, path_ts_latest};
#[derive(Debug)] #[derive(Debug)]
pub struct DirDataSource { pub struct DirDataSource {
@@ -26,26 +26,27 @@ impl DirDataSource {
impl DataSource for DirDataSource { impl DataSource for DirDataSource {
type Key = PathBuf; type Key = PathBuf;
type Item = FileItem;
type Error = std::io::Error; type Error = std::io::Error;
fn get( fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
&self,
key: &Self::Key,
) -> Result<Option<Box<dyn Item<Key = Self::Key> + 'static>>, Self::Error> {
if !key.is_file() { if !key.is_file() {
return Ok(None); return Ok(None);
} }
return Ok(Some(Box::new(FileItem { // Ignore toml files if sidecars are enabled
if self.sidecars && key.extension().and_then(|x| x.to_str()) == Some("toml") {
return Ok(None);
}
return Ok(Some(FileItem {
source_name: self.name.clone(), source_name: self.name.clone(),
path: key.to_owned(), path: key.to_owned(),
sidecar: self.sidecars, sidecar: self.sidecars,
}))); }));
} }
fn iter( fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
&self,
) -> impl Iterator<Item = Result<(Self::Key, Box<dyn Item<Key = Self::Key>>), Self::Error>> {
return self return self
.dirs .dirs
.iter() .iter()
@@ -62,15 +63,17 @@ impl DataSource for DirDataSource {
Ok((_, entry)) => { Ok((_, entry)) => {
let path = entry.into_path(); let path = entry.into_path();
let item: Box<dyn Item<Key = Self::Key>> = let item = match path.extension().and_then(|x| x.to_str()) {
match path.extension().and_then(|x| x.to_str()) {
None => return None, None => return None,
Some("flac") => Box::new(FileItem {
// Ignore toml if sidecars are enabled
Some("toml") if self.sidecars => return None,
Some(_) => FileItem {
source_name: self.name.clone(), source_name: self.name.clone(),
path: path.clone(), path: path.clone(),
sidecar: self.sidecars, sidecar: self.sidecars,
}), },
Some(_) => return None,
}; };
Some(Ok((path, item))) Some(Ok((path, item)))

View File

@@ -8,19 +8,15 @@ pub trait DataSource {
/// The type used to retrieve items from this source /// The type used to retrieve items from this source
/// (e.g, a PathBuf or a primary key) /// (e.g, a PathBuf or a primary key)
type Key: Key; type Key: Key;
type Item: Item<Key = Self::Key>;
type Error: Error + Sync + Send; type Error: Error + Sync + Send;
/// Get an item from this datasource /// Get an item from this datasource
fn get( fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
&self,
key: &Self::Key,
) -> Result<Option<Box<dyn Item<Key = Self::Key> + 'static>>, Self::Error>;
/// Iterate over all items in this source in an arbitrary order /// Iterate over all items in this source in an arbitrary order
fn iter( fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
&self,
) -> impl Iterator<Item = Result<(Self::Key, Box<dyn Item<Key = Self::Key> + 'static>), Self::Error>>;
/// Return the time of the latest change to the data in this source /// Return the time of the latest change to the data in this source
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>; fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>;

View File

@@ -0,0 +1,96 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_config::{Label, Source};
use pile_dataset::index::DbFtsIndex;
use pile_dataset::source::DirDataSource;
use pile_dataset::{DataSource, Dataset, FileItem, Item, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{path::PathBuf, rc::Rc};
use tracing::{info, warn};
use crate::{CliCmd, GlobalContext};
// Command-line arguments for the `annotate` subcommand.
//
// For every item in each sidecar-enabled filesystem source, the command
// extracts the schema field `field` and writes it to the item's sidecar at
// the dotted path `dest`.
//
// NOTE: the `///` comments on the fields below are picked up by clap as
// help text, so editing them changes what the CLI prints.
#[derive(Debug, Args)]
pub struct AnnotateCommand {
/// The schema field to read (must be defined in pile.toml)
// Validated against `ds.config.schema` before any item is processed.
field: String,
/// Sidecar path to write to (e.g. meta.title)
// Split on '.' into one label per path element by `parse_dest`.
dest: String,
/// Path to dataset config
#[arg(long, short = 'c', default_value = "./pile.toml")]
config: PathBuf,
}
impl AnnotateCommand {
    /// Split a dotted destination (e.g. `meta.title`) into its labels.
    ///
    /// Each `.`-separated segment must be accepted by `Label::new`; the
    /// first segment that is rejected aborts parsing with an error naming it.
    fn parse_dest(dest: &str) -> Result<Vec<Label>> {
        let mut labels = Vec::new();
        for s in dest.split('.') {
            match Label::new(s) {
                Some(label) => labels.push(label),
                None => return Err(anyhow::anyhow!("invalid label {s:?} in dest path")),
            }
        }
        Ok(labels)
    }
}
impl CliCmd for AnnotateCommand {
    /// Run the `annotate` subcommand.
    ///
    /// Extracts the schema field `self.field` from every item of every
    /// filesystem source that has sidecars enabled, and writes it to the
    /// item's sidecar at the path given by `self.dest`. Sources without
    /// sidecars are skipped with a warning; items for which the field yields
    /// no value are skipped silently.
    ///
    /// Returns exit code `0` on success. Fails if `field` or `dest` is not a
    /// valid label path, if the dataset cannot be opened, if `field` is not
    /// in the schema, or if reading, extracting or writing any item fails.
    ///
    /// NOTE(review): `_flag` is never polled, so the loop presumably cannot
    /// be interrupted by a cancel request — confirm this is intended.
    async fn run(
        self,
        _ctx: GlobalContext,
        _flag: CancelFlag,
    ) -> Result<i32, CancelableTaskError<anyhow::Error>> {
        // Validate both CLI arguments before touching the dataset.
        let Some(field) = Label::new(&self.field) else {
            return Err(anyhow::anyhow!("invalid field name {:?}", self.field).into());
        };
        let dest_path = Self::parse_dest(&self.dest)?;

        let ds = Dataset::open(&self.config)
            .with_context(|| format!("while opening dataset for {}", self.config.display()))?;

        if !ds.config.schema.contains_key(&field) {
            return Err(anyhow::anyhow!("field {:?} is not defined in schema", self.field).into());
        }

        let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
        let mut count = 0u64;

        for (name, source) in &ds.config.dataset.source {
            match source {
                Source::Filesystem { path, sidecars } => {
                    // Without sidecars there is nowhere to write to.
                    if !*sidecars {
                        warn!("Source {name} does not have sidecars enabled, skipping");
                        continue;
                    }

                    let dir_source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
                    for entry in dir_source.iter() {
                        let (_key, item) =
                            entry.with_context(|| format!("while reading source {name}"))?;

                        let extractor =
                            PileValue::<FileItem>::Extractor(Rc::new(MetaExtractor::new(&item)));

                        let extracted =
                            index.get_field(&extractor, &field).with_context(|| {
                                format!("while extracting field from {}", item.path.display())
                            })?;

                        // Items that do not provide the field are left alone.
                        let value = match extracted {
                            Some(value) => value,
                            None => continue,
                        };

                        item.write_sidecar(dest_path.clone(), PileValue::String(value.into()))
                            .with_context(|| {
                                format!("while writing sidecar for {}", item.path.display())
                            })?;
                        count += 1;
                    }
                }
            }
        }

        info!("Annotated {count} items");
        Ok(0)
    }
}

View File

@@ -2,6 +2,7 @@ use anyhow::Result;
use clap::Subcommand; use clap::Subcommand;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTask, CancelableTaskError}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTask, CancelableTaskError};
mod annotate;
mod check; mod check;
mod index; mod index;
mod init; mod init;
@@ -12,6 +13,12 @@ use crate::GlobalContext;
#[derive(Debug, Subcommand)] #[derive(Debug, Subcommand)]
pub enum SubCommand { pub enum SubCommand {
/// Annotate all items with a field, writing it to a sidecar path
Annotate {
#[command(flatten)]
cmd: annotate::AnnotateCommand,
},
/// Create an empty dataset /// Create an empty dataset
Init { Init {
#[command(flatten)] #[command(flatten)]
@@ -46,6 +53,7 @@ pub enum SubCommand {
impl CliCmdDispatch for SubCommand { impl CliCmdDispatch for SubCommand {
fn start(self, ctx: GlobalContext) -> Result<CancelableTask<Result<i32>>> { fn start(self, ctx: GlobalContext) -> Result<CancelableTask<Result<i32>>> {
match self { match self {
Self::Annotate { cmd } => cmd.start(ctx),
Self::Init { cmd } => cmd.start(ctx), Self::Init { cmd } => cmd.start(ctx),
Self::Check { cmd } => cmd.start(ctx), Self::Check { cmd } => cmd.start(ctx),
Self::Index { cmd } => cmd.start(ctx), Self::Index { cmd } => cmd.start(ctx),