Write sidecar fields
This commit is contained in:
@@ -22,3 +22,5 @@ toml = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
rayon = { workspace = true }
|
||||
smartstring = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
toml_edit = { workspace = true }
|
||||
|
||||
@@ -21,7 +21,7 @@ use thiserror::Error;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
DataSource, Item,
|
||||
DataSource, FileItem,
|
||||
index::{DbFtsIndex, FtsLookupResult},
|
||||
path_ts_earliest,
|
||||
source::DirDataSource,
|
||||
@@ -96,11 +96,7 @@ impl Dataset {
|
||||
// MARK: get
|
||||
//
|
||||
|
||||
pub fn get(
|
||||
&self,
|
||||
source: &Label,
|
||||
key: &PathBuf,
|
||||
) -> Option<Box<dyn Item<Key = PathBuf> + 'static>> {
|
||||
pub fn get(&self, source: &Label, key: &PathBuf) -> Option<FileItem> {
|
||||
let s = self.config.dataset.source.get(source)?;
|
||||
let s = match s {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
@@ -115,7 +111,7 @@ impl Dataset {
|
||||
// MARK: fts
|
||||
//
|
||||
|
||||
/// Refresh this dataset's fts index
|
||||
/// Refresh this dataset's fts index.
|
||||
pub fn fts_refresh(
|
||||
&self,
|
||||
threads: usize,
|
||||
@@ -163,7 +159,7 @@ impl Dataset {
|
||||
.install(|| {
|
||||
batch
|
||||
.into_par_iter()
|
||||
.filter_map(|(key, item)| match db_index.entry_to_document(&*item) {
|
||||
.filter_map(|(key, item)| match db_index.entry_to_document(&item) {
|
||||
Ok(Some(doc)) => Some((key, doc)),
|
||||
Ok(None) => {
|
||||
warn!("Skipping {key:?}, document is empty");
|
||||
@@ -306,7 +302,7 @@ fn start_read_task(
|
||||
batch_size: usize,
|
||||
) -> (
|
||||
JoinHandle<()>,
|
||||
Receiver<Result<Vec<(PathBuf, Box<dyn Item<Key = PathBuf>>)>, DatasetError>>,
|
||||
Receiver<Result<Vec<(PathBuf, FileItem)>, DatasetError>>,
|
||||
) {
|
||||
let config = config.clone();
|
||||
let (read_tx, read_rx) = std::sync::mpsc::sync_channel(2);
|
||||
|
||||
@@ -39,9 +39,11 @@ impl<'a> SidecarExtractor<'a> {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let sidecar = std::fs::read_to_string(&sidecar_file)?;
|
||||
let sidecar: toml::Value = toml::from_str(&sidecar)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
let sidecar = std::fs::read(&sidecar_file)?;
|
||||
let sidecar: toml::Value = match toml::from_slice(&sidecar) {
|
||||
Ok(x) => x,
|
||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let output: HashMap<Label, PileValue<'_, FileItem>> = match sidecar {
|
||||
toml::Value::Table(t) => t
|
||||
|
||||
@@ -63,9 +63,9 @@ impl DbFtsIndex {
|
||||
//
|
||||
|
||||
/// Turn an entry into a tantivy document
|
||||
pub fn entry_to_document<K: Key>(
|
||||
pub fn entry_to_document<K: Key, I: Item<Key = K>>(
|
||||
&self,
|
||||
item: &dyn Item<Key = K>,
|
||||
item: &I,
|
||||
) -> Result<Option<TantivyDocument>, TantivyError> {
|
||||
let mut doc = TantivyDocument::default();
|
||||
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
use pile_config::Label;
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
use std::{fmt::Debug, path::PathBuf, rc::Rc};
|
||||
|
||||
use crate::{
|
||||
PileValue,
|
||||
extract::{Extractor, SidecarExtractor},
|
||||
};
|
||||
|
||||
//
|
||||
// MARK: key
|
||||
@@ -28,12 +33,27 @@ impl Key for PathBuf {
|
||||
//
|
||||
|
||||
/// A pointer to raw data
|
||||
pub trait Item: Debug + Send + Sync + 'static {
|
||||
pub trait Item: Debug + Send + Sync + 'static + Sized {
|
||||
type Key: Key;
|
||||
|
||||
fn source_name(&self) -> &str;
|
||||
fn key(&self) -> &Self::Key;
|
||||
|
||||
/// Get this item's sidecar metadata
|
||||
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error>;
|
||||
|
||||
/// Set this file's sidecar metadata,
|
||||
/// overwriting any existing file.
|
||||
fn write_sidecar(
|
||||
&self,
|
||||
path: Vec<Label>,
|
||||
value: PileValue<'_, Self>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
|
||||
fn hash(&self) -> Result<blake3::Hash, std::io::Error>;
|
||||
|
||||
/// Item conversion, downcast to specific type.
|
||||
/// Returns `None` if this is not a [FileItem]
|
||||
fn as_file(&self) -> Option<&FileItem>;
|
||||
}
|
||||
|
||||
@@ -62,4 +82,97 @@ impl Item for FileItem {
|
||||
fn as_file(&self) -> Option<&FileItem> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
/// Compute the BLAKE3 digest of the file at `self.path`.
///
/// The file contents are fed to the hasher through `std::io::copy`.
///
/// # Errors
/// Returns the I/O error if the file cannot be opened or read.
fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
    let mut source = std::fs::File::open(&self.path)?;
    let mut digest = blake3::Hasher::new();
    std::io::copy(&mut source, &mut digest)?;
    return Ok(digest.finalize());
}
|
||||
|
||||
fn sidecar(&self) -> Result<Option<Rc<dyn Extractor<Self> + '_>>, std::io::Error> {
|
||||
if !self.sidecar {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// TODO: use a generic tomlextractor instead?
|
||||
// you'll need a fake _ref_ to the toml file, though.
|
||||
return Ok(Some(Rc::new(SidecarExtractor::new(self))));
|
||||
}
|
||||
|
||||
fn write_sidecar(
|
||||
&self,
|
||||
path: Vec<Label>,
|
||||
value: PileValue<'_, Self>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if !self.sidecar {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let sidecar_path = self.path.with_extension("toml");
|
||||
|
||||
let mut doc: toml_edit::DocumentMut = if sidecar_path.is_file() {
|
||||
let content = std::fs::read_to_string(&sidecar_path)?;
|
||||
content.parse().unwrap_or_default()
|
||||
} else {
|
||||
toml_edit::DocumentMut::new()
|
||||
};
|
||||
|
||||
fn to_edit_item(v: toml::Value) -> toml_edit::Item {
|
||||
match v {
|
||||
toml::Value::String(s) => toml_edit::value(s),
|
||||
toml::Value::Integer(i) => toml_edit::value(i),
|
||||
toml::Value::Float(f) => toml_edit::value(f),
|
||||
toml::Value::Boolean(b) => toml_edit::value(b),
|
||||
toml::Value::Datetime(d) => toml_edit::value(d.to_string()),
|
||||
toml::Value::Array(arr) => {
|
||||
let mut array = toml_edit::Array::new();
|
||||
for item in arr {
|
||||
if let toml_edit::Item::Value(v) = to_edit_item(item) {
|
||||
array.push_formatted(v);
|
||||
}
|
||||
}
|
||||
toml_edit::Item::Value(toml_edit::Value::Array(array))
|
||||
}
|
||||
toml::Value::Table(t) => {
|
||||
let mut table = toml_edit::Table::new();
|
||||
for (k, v) in t {
|
||||
table.insert(&k, to_edit_item(v));
|
||||
}
|
||||
toml_edit::Item::Table(table)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let json_value = value.to_json()?;
|
||||
let toml_value: toml::Value = serde_json::from_value(json_value)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
|
||||
let item = to_edit_item(toml_value);
|
||||
|
||||
let Some((path_last, path_init)) = path.split_last() else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let mut table = doc.as_table_mut();
|
||||
for label in path_init {
|
||||
let key = label.as_str();
|
||||
if !table.contains_key(key) {
|
||||
table.insert(key, toml_edit::Item::Table(toml_edit::Table::new()));
|
||||
}
|
||||
table = table
|
||||
.get_mut(key)
|
||||
.and_then(|item| item.as_table_mut())
|
||||
.ok_or_else(|| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"path element is not a table",
|
||||
)
|
||||
})?;
|
||||
}
|
||||
table.insert(path_last.as_str(), item);
|
||||
|
||||
std::fs::write(&sidecar_path, doc.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use pile_config::Label;
|
||||
use std::path::PathBuf;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::{DataSource, Item, item::FileItem, path_ts_latest};
|
||||
use crate::{DataSource, item::FileItem, path_ts_latest};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DirDataSource {
|
||||
@@ -26,26 +26,27 @@ impl DirDataSource {
|
||||
|
||||
impl DataSource for DirDataSource {
|
||||
type Key = PathBuf;
|
||||
type Item = FileItem;
|
||||
type Error = std::io::Error;
|
||||
|
||||
fn get(
|
||||
&self,
|
||||
key: &Self::Key,
|
||||
) -> Result<Option<Box<dyn Item<Key = Self::Key> + 'static>>, Self::Error> {
|
||||
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if !key.is_file() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
return Ok(Some(Box::new(FileItem {
|
||||
// Ignore toml files if sidecars are enabled
|
||||
if self.sidecars && key.extension().and_then(|x| x.to_str()) == Some("toml") {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
return Ok(Some(FileItem {
|
||||
source_name: self.name.clone(),
|
||||
path: key.to_owned(),
|
||||
sidecar: self.sidecars,
|
||||
})));
|
||||
}));
|
||||
}
|
||||
|
||||
fn iter(
|
||||
&self,
|
||||
) -> impl Iterator<Item = Result<(Self::Key, Box<dyn Item<Key = Self::Key>>), Self::Error>> {
|
||||
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
|
||||
return self
|
||||
.dirs
|
||||
.iter()
|
||||
@@ -62,16 +63,18 @@ impl DataSource for DirDataSource {
|
||||
Ok((_, entry)) => {
|
||||
let path = entry.into_path();
|
||||
|
||||
let item: Box<dyn Item<Key = Self::Key>> =
|
||||
match path.extension().and_then(|x| x.to_str()) {
|
||||
None => return None,
|
||||
Some("flac") => Box::new(FileItem {
|
||||
source_name: self.name.clone(),
|
||||
path: path.clone(),
|
||||
sidecar: self.sidecars,
|
||||
}),
|
||||
Some(_) => return None,
|
||||
};
|
||||
let item = match path.extension().and_then(|x| x.to_str()) {
|
||||
None => return None,
|
||||
|
||||
// Ignore toml if sidecars are enabled
|
||||
Some("toml") if self.sidecars => return None,
|
||||
|
||||
Some(_) => FileItem {
|
||||
source_name: self.name.clone(),
|
||||
path: path.clone(),
|
||||
sidecar: self.sidecars,
|
||||
},
|
||||
};
|
||||
|
||||
Some(Ok((path, item)))
|
||||
}
|
||||
|
||||
@@ -8,19 +8,15 @@ pub trait DataSource {
|
||||
/// The type used to retrieve items from this source
|
||||
/// (e.g, a PathBuf or a primary key)
|
||||
type Key: Key;
|
||||
type Item: Item<Key = Self::Key>;
|
||||
|
||||
type Error: Error + Sync + Send;
|
||||
|
||||
/// Get an item from this datasource
|
||||
fn get(
|
||||
&self,
|
||||
key: &Self::Key,
|
||||
) -> Result<Option<Box<dyn Item<Key = Self::Key> + 'static>>, Self::Error>;
|
||||
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
|
||||
|
||||
/// Iterate over all items in this source in an arbitrary order
|
||||
fn iter(
|
||||
&self,
|
||||
) -> impl Iterator<Item = Result<(Self::Key, Box<dyn Item<Key = Self::Key> + 'static>), Self::Error>>;
|
||||
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
|
||||
|
||||
/// Return the time of the latest change to the data in this source
|
||||
fn latest_change(&self) -> Result<Option<DateTime<Utc>>, Self::Error>;
|
||||
|
||||
96
crates/pile/src/command/annotate.rs
Normal file
96
crates/pile/src/command/annotate.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_config::{Label, Source};
|
||||
use pile_dataset::index::DbFtsIndex;
|
||||
use pile_dataset::source::DirDataSource;
|
||||
use pile_dataset::{DataSource, Dataset, FileItem, Item, PileValue, extract::MetaExtractor};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use std::{path::PathBuf, rc::Rc};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::{CliCmd, GlobalContext};
|
||||
|
||||
// Arguments for the `annotate` subcommand: for every item in each
// sidecar-enabled source, read the schema field `field` and write its
// value into the item's sidecar at the dotted path `dest`.
//
// NOTE: clap's derive uses the `///` comments on the fields below as
// their help text, so rewording them changes the CLI output.
#[derive(Debug, Args)]
|
||||
pub struct AnnotateCommand {
|
||||
/// The schema field to read (must be defined in pile.toml)
|
||||
field: String,
|
||||
|
||||
/// Sidecar path to write to (e.g. meta.title)
|
||||
dest: String,
|
||||
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
}
|
||||
|
||||
impl AnnotateCommand {
|
||||
fn parse_dest(dest: &str) -> Result<Vec<Label>> {
|
||||
dest.split('.')
|
||||
.map(|s| {
|
||||
Label::new(s).ok_or_else(|| anyhow::anyhow!("invalid label {s:?} in dest path"))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl CliCmd for AnnotateCommand {
|
||||
async fn run(
|
||||
self,
|
||||
_ctx: GlobalContext,
|
||||
_flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let field = Label::new(&self.field)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
|
||||
let dest_path = Self::parse_dest(&self.dest)?;
|
||||
|
||||
let ds = Dataset::open(&self.config)
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
if !ds.config.schema.contains_key(&field) {
|
||||
return Err(anyhow::anyhow!("field {:?} is not defined in schema", self.field).into());
|
||||
}
|
||||
|
||||
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
|
||||
let mut count = 0u64;
|
||||
|
||||
for (name, source) in &ds.config.dataset.source {
|
||||
match source {
|
||||
Source::Filesystem { path, sidecars } => {
|
||||
if !sidecars {
|
||||
warn!("Source {name} does not have sidecars enabled, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
let source = DirDataSource::new(name, path.clone().to_vec(), *sidecars);
|
||||
|
||||
for res in source.iter() {
|
||||
let (_key, item) =
|
||||
res.with_context(|| format!("while reading source {name}"))?;
|
||||
|
||||
let meta = MetaExtractor::new(&item);
|
||||
let extractor = PileValue::<FileItem>::Extractor(Rc::new(meta));
|
||||
|
||||
let Some(value) =
|
||||
index.get_field(&extractor, &field).with_context(|| {
|
||||
format!("while extracting field from {}", item.path.display())
|
||||
})?
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
|
||||
item.write_sidecar(dest_path.clone(), PileValue::String(value.into()))
|
||||
.with_context(|| {
|
||||
format!("while writing sidecar for {}", item.path.display())
|
||||
})?;
|
||||
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Annotated {count} items");
|
||||
|
||||
return Ok(0);
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ use anyhow::Result;
|
||||
use clap::Subcommand;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTask, CancelableTaskError};
|
||||
|
||||
mod annotate;
|
||||
mod check;
|
||||
mod index;
|
||||
mod init;
|
||||
@@ -12,6 +13,12 @@ use crate::GlobalContext;
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
pub enum SubCommand {
|
||||
/// Annotate all items with a field, writing it to a sidecar path
|
||||
Annotate {
|
||||
#[command(flatten)]
|
||||
cmd: annotate::AnnotateCommand,
|
||||
},
|
||||
|
||||
/// Create an empty dataset
|
||||
Init {
|
||||
#[command(flatten)]
|
||||
@@ -46,6 +53,7 @@ pub enum SubCommand {
|
||||
impl CliCmdDispatch for SubCommand {
|
||||
fn start(self, ctx: GlobalContext) -> Result<CancelableTask<Result<i32>>> {
|
||||
match self {
|
||||
Self::Annotate { cmd } => cmd.start(ctx),
|
||||
Self::Init { cmd } => cmd.start(ctx),
|
||||
Self::Check { cmd } => cmd.start(ctx),
|
||||
Self::Index { cmd } => cmd.start(ctx),
|
||||
|
||||
Reference in New Issue
Block a user