From 3846553b8a43ef3125728b8fd35ddd9ddc254acc Mon Sep 17 00:00:00 2001
From: rm-dr <96270320+rm-dr@users.noreply.github.com>
Date: Tue, 6 Jan 2026 23:05:33 -0800
Subject: [PATCH] Initial `pile-config`

---
Reviewer note: contents were edited during review, so the commit id above and
the blob ids on the `index` lines below still refer to the original revision.

 crates/pile-config/Cargo.toml      |  16 ++++
 crates/pile-config/src/config.toml |  46 +++++++++++
 crates/pile-config/src/lib.rs      | 122 ++++++++++++++++++++++++++++
 crates/pile-config/src/post.rs     | 127 +++++++++++++++++++++++++++++
 4 files changed, 311 insertions(+)
 create mode 100644 crates/pile-config/Cargo.toml
 create mode 100644 crates/pile-config/src/config.toml
 create mode 100644 crates/pile-config/src/lib.rs
 create mode 100644 crates/pile-config/src/post.rs

diff --git a/crates/pile-config/Cargo.toml b/crates/pile-config/Cargo.toml
new file mode 100644
index 0000000..9b637d1
--- /dev/null
+++ b/crates/pile-config/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "pile-config"
+version = { workspace = true }
+rust-version = { workspace = true }
+edition = { workspace = true }
+
+[lints]
+workspace = true
+
+[dependencies]
+serde = { workspace = true }
+itertools = { workspace = true }
+serde_json = { workspace = true }
+
+[dev-dependencies]
+toml = { workspace = true }
diff --git a/crates/pile-config/src/config.toml b/crates/pile-config/src/config.toml
new file mode 100644
index 0000000..505321f
--- /dev/null
+++ b/crates/pile-config/src/config.toml
@@ -0,0 +1,46 @@
+[dataset]
+# This dataset's name. Must be unique.
+name = "dataset"
+
+# root directory for fts indices, relative to the parent dir of this config file.
+# Files are written to {working_dir}/{dataset.name}/*.
+# Default is ".pile"
+#
+# working_dir = ".pile"
+
+# Data sources available in this dataset
+source."music" = { type = "flac", path = ["music", "music-2"] }
+
+
+# This dataset's schema.
+# Defines normalized fields that are extracted from source entries on-demand.
+#
+# Format is as follows:
+#
+# "field-name" = {
+#     # The type of data this field contains.
+#     # only text is supported in this version.
+#     type = "text",
+#
+#     # An array of jsonpaths (rfc9535) used to extract this field from each source entry.
+#     # These are evaluated in order, the first non-null value is used.
+#     # A single string is equivalent to an array with one element.
+#     path = "$.json.path"
+# }
+[schema]
+album = { type = "text", path = "$.Album" }
+isrc = { type = "text", path = "$.Isrc" }
+artist = { type = "text", path = ["$.Artist", "$.TrackArtist"] }
+lyrics = { type = "text", path = "$.Lyrics" }
+genre = { type = "text", path = "$.Genre" }
+title = { type = "text", path = ["$.Title", "$.TrackTitle"] }
+
+# Fts configuration.
+# Determines which fields (defined in `schema`) are included in the fts index.
+[fts]
+field.album = { tokenize = true }
+field.isrc = { tokenize = true }
+field.artist = { tokenize = true }
+field.lyrics = { tokenize = true }
+field.genre = { tokenize = true }
+field.title = { tokenize = true }
diff --git a/crates/pile-config/src/lib.rs b/crates/pile-config/src/lib.rs
new file mode 100644
index 0000000..cd671a5
--- /dev/null
+++ b/crates/pile-config/src/lib.rs
@@ -0,0 +1,122 @@
+use serde::Deserialize;
+use std::{collections::HashMap, fmt::Debug, path::PathBuf, slice};
+
+/// Contents of the bundled `config.toml`, embedded at compile time.
+pub static INIT_DB_TOML: &str = include_str!("./config.toml");
+
+mod post;
+pub use post::*;
+
+/// The bundled config must always deserialize into [`ConfigToml`].
+#[test]
+fn init_db_toml_valid() {
+    toml::from_str::<ConfigToml>(INIT_DB_TOML).unwrap();
+}
+
+/// Either a single value or a list of values.
+///
+/// Deserialized untagged, so a bare `T` and an array of `T` are both accepted.
+#[derive(Debug, Clone, Deserialize)]
+#[serde(untagged)]
+pub enum OneOrMany<T> {
+    One(T),
+    Many(Vec<T>),
+}
+
+impl<T> OneOrMany<T> {
+    /// Consume `self` and return its contents as a `Vec`.
+    pub fn to_vec(self) -> Vec<T> {
+        match self {
+            Self::One(x) => vec![x],
+            Self::Many(x) => x,
+        }
+    }
+
+    /// Borrow the contents as a slice; `One` yields a one-element slice.
+    pub fn as_slice(&self) -> &[T] {
+        match self {
+            Self::One(x) => slice::from_ref(x),
+            Self::Many(x) => x.as_slice(),
+        }
+    }
+}
+
+/// Top-level layout of a dataset config file.
+#[derive(Debug, Clone, Deserialize)]
+pub struct ConfigToml {
+    pub dataset: DatasetConfig,
+    pub schema: HashMap<String, FieldSpec>,
+    pub fts: Option<DatasetFts>,
+}
+
+/// The `[dataset]` table.
+#[derive(Debug, Clone, Deserialize)]
+pub struct DatasetConfig {
+    /// Must be unique
+    pub name: String,
+
+    /// Root dir for indices
+    pub working_dir: Option<PathBuf>,
+
+    /// Data sources available in this dataset, keyed by source name
+    pub source: HashMap<String, Source>,
+
+    /// Post-processing steps.
+    ///
+    /// NOTE(review): the previous doc comment here was copied from
+    /// [`FieldSpec::post`]; confirm what these steps apply to at dataset level.
+    #[serde(default)]
+    pub post: Vec<FieldSpecPost>,
+}
+
+/// A data source, selected by its `type` key.
+#[derive(Debug, Clone, Deserialize)]
+#[serde(tag = "type")]
+#[serde(rename_all = "lowercase")]
+pub enum Source {
+    /// A directory of FLAC files
+    Flac { path: OneOrMany<PathBuf> },
+}
+
+//
+// MARK: schema
+//
+
+/// One entry of the `[schema]` table.
+#[derive(Debug, Clone, Deserialize)]
+pub struct FieldSpec {
+    /// The type of this field
+    pub r#type: FieldType,
+
+    /// How to find this field in a data entry
+    pub path: OneOrMany<String>,
+
+    /// How to post-process this field
+    #[serde(default)]
+    pub post: Vec<FieldSpecPost>,
+}
+
+/// The kind of data a schema field holds.
+#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum FieldType {
+    Text,
+}
+
+//
+// MARK: fts
+//
+
+/// The `[fts]` table.
+#[derive(Debug, Clone, Deserialize, Default)]
+pub struct DatasetFts {
+    /// Per-field index settings; the key `field` is accepted as an alias.
+    #[serde(alias = "field")]
+    pub fields: HashMap<String, FtsIndexField>,
+}
+
+/// Index settings for a single field.
+#[derive(Debug, Clone, Deserialize)]
+pub struct FtsIndexField {
+    pub tokenize: bool,
+}
diff --git a/crates/pile-config/src/post.rs b/crates/pile-config/src/post.rs
new file mode 100644
index 0000000..5b43116
--- /dev/null
+++ b/crates/pile-config/src/post.rs
@@ -0,0 +1,127 @@
+use itertools::Itertools;
+use serde::Deserialize;
+use serde_json::Value;
+
+/// A single post-processing step applied to an extracted field value.
+///
+/// Deserialized untagged: the key that is present selects the variant.
+#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
+#[serde(untagged)]
+pub enum FieldSpecPost {
+    TrimSuffix { trim_suffix: String },
+    TrimPrefix { trim_prefix: String },
+    SetCase { case: Case },
+    Join { join: String },
+    NotEmpty { notempty: bool },
+}
+
+impl FieldSpecPost {
+    /// Apply this step to `val`, returning `None` when the value is rejected.
+    ///
+    /// `Null` is rejected by every step except `notempty = false`, which passes
+    /// any value through unchanged. The case, trim and join steps recurse into
+    /// arrays, and one rejected element rejects the whole array.
+    pub fn apply(&self, val: &Value) -> Option<Value> {
+        Some(match self {
+            Self::NotEmpty { notempty: false } => val.clone(),
+            Self::NotEmpty { notempty: true } => match val {
+                Value::Null => return None,
+                Value::String(x) if x.is_empty() => return None,
+                Value::Array(x) if x.is_empty() => return None,
+                x => x.clone(),
+            },
+
+            // Strings and object keys are re-cased recursively; bools and numbers pass through.
+            Self::SetCase { case } => match val {
+                Value::Null => return None,
+                Value::Bool(_) | Value::Number(_) => val.clone(),
+                Value::String(x) => Value::String(case.convert(x)),
+
+                Value::Array(x) => {
+                    Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
+                }
+
+                Value::Object(x) => Value::Object(
+                    x.iter()
+                        .map(|(k, v)| Some((case.convert(k), self.apply(v)?)))
+                        .collect::<Option<_>>()?,
+                ),
+            },
+
+            Self::TrimSuffix { trim_suffix } => match val {
+                Value::Null => return None,
+                Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
+
+                Value::String(x) => {
+                    Value::String(x.strip_suffix(trim_suffix).unwrap_or(x).to_owned())
+                }
+
+                Value::Array(x) => {
+                    Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
+                }
+
+                Value::Object(x) => Value::Object(
+                    x.iter()
+                        .map(|(k, v)| {
+                            let key = k.strip_suffix(trim_suffix).unwrap_or(k).to_owned();
+                            Some((key, self.apply(v)?))
+                        })
+                        .collect::<Option<_>>()?,
+                ),
+            },
+
+            Self::TrimPrefix { trim_prefix } => match val {
+                Value::Null => return None,
+                // NOTE(review): `TrimSuffix` recurses into objects but this step
+                // rejects them; confirm the asymmetry is intended.
+                Value::Object(_) => return None,
+                Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
+
+                Value::String(x) => {
+                    Value::String(x.strip_prefix(trim_prefix).unwrap_or(x).to_owned())
+                }
+
+                Value::Array(x) => {
+                    Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
+                }
+            },
+
+            Self::Join { join } => match val {
+                Value::Null => return None,
+                Value::Object(_) => return None,
+                Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
+                Value::String(x) => Value::String(x.clone()),
+                Value::Array(x) => Value::String(
+                    x.iter()
+                        // `Value`'s `Display` impl emits JSON, which would wrap each
+                        // string element in quotes; join the raw string contents.
+                        .map(|x| match self.apply(x)? {
+                            Value::String(s) => Some(s),
+                            other => Some(other.to_string()),
+                        })
+                        .collect::<Option<Vec<_>>>()?
+                        .into_iter()
+                        .join(join),
+                ),
+            },
+        })
+    }
+}
+
+/// Target letter case for [`FieldSpecPost::SetCase`].
+#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum Case {
+    Lower,
+    Upper,
+}
+
+impl Case {
+    /// Convert `s` to this case.
+    fn convert(self, s: &str) -> String {
+        match self {
+            Self::Lower => s.to_lowercase(),
+            Self::Upper => s.to_uppercase(),
+        }
+    }
+}