Initial pile-config

This commit is contained in:
2026-01-06 23:05:33 -08:00
parent 26ee97e98d
commit 3846553b8a
4 changed files with 287 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
[package]
name = "pile-config"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }
[lints]
workspace = true
[dependencies]
serde = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
[dev-dependencies]
toml = { workspace = true }

View File

@@ -0,0 +1,46 @@
[dataset]
# This dataset's name. Must be unique.
name = "dataset"
# root directory for fts indices, relative to the parent dir of this config file.
# Files are written to {working_dir}/{dataset.name}/*.
# Default is ".pile"
#
# working_dir = ".pile"
# Data sources available in this dataset
source."music" = { type = "flac", path = ["music", "music-2"] }
# This dataset's schema.
# Defines normalized fields that are extracted from source entries on-demand.
#
# Format is as follows:
#
# "field-name" = {
# # The type of data this field contains.
# # Only text is supported in this version.
# type = "text",
#
# # An array of jsonpaths (rfc9535) used to extract this field from each source entry.
# # These are evaluated in order, the first non-null value is used.
# # A single string is equivalent to an array with one element.
# path = "$.json.path"
# }
[schema]
album = { type = "text", path = "$.Album" }
isrc = { type = "text", path = "$.Isrc" }
artist = { type = "text", path = ["$.Artist", "$.TrackArtist"] }
lyrics = { type = "text", path = "$.Lyrics" }
genre = { type = "text", path = "$.Genre" }
title = { type = "text", path = ["$.Title", "$.TrackTitle"] }
# Fts configuration.
# Determines which fields (defined in `schema`) are included in the fts index.
[fts]
field.album = { tokenize = true }
field.isrc = { tokenize = true }
field.artist = { tokenize = true }
field.lyrics = { tokenize = true }
field.genre = { tokenize = true }
field.title = { tokenize = true }

View File

@@ -0,0 +1,104 @@
use serde::Deserialize;
use std::{collections::HashMap, fmt::Debug, path::PathBuf, slice};
/// Contents of the bundled `./config.toml`, embedded into the binary at compile time.
pub static INIT_DB_TOML: &str = include_str!("./config.toml");
mod post;
pub use post::*;
/// The bundled default config must deserialize into [`ConfigToml`].
#[test]
fn init_db_toml_valid() {
    let parsed: Result<ConfigToml, _> = toml::from_str(INIT_DB_TOML);
    parsed.unwrap();
}
/// A value that may be written either as a single item or as a list of items.
///
/// Deserialized with `#[serde(untagged)]`: the input carries no tag, and serde
/// tries the variants in declaration order (`One` first, then `Many`).
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
pub enum OneOrMany<T: Debug + Clone> {
/// Exactly one item.
One(T),
/// A list of items (possibly empty).
Many(Vec<T>),
}
impl<T: Debug + Clone> OneOrMany<T> {
    /// Consumes `self` and returns its contents as a `Vec`.
    ///
    /// `One(x)` becomes a one-element vector; `Many(v)` returns `v` as-is.
    pub fn to_vec(self) -> Vec<T> {
        match self {
            Self::One(item) => vec![item],
            Self::Many(items) => items,
        }
    }

    /// Borrows the contents as a slice without allocating.
    ///
    /// `One(x)` is viewed as a one-element slice; `Many(v)` as the whole vector.
    pub fn as_slice(&self) -> &[T] {
        match self {
            // `item` is already a `&T` here, so it is passed through directly
            // rather than re-borrowed.
            Self::One(item) => slice::from_ref(item),
            Self::Many(items) => items.as_slice(),
        }
    }
}
/// Top-level layout of a dataset config file.
#[derive(Debug, Clone, Deserialize)]
pub struct ConfigToml {
/// The `dataset` table: name, working directory and data sources.
pub dataset: DatasetConfig,
/// The `schema` table: field name -> how that field is extracted.
pub schema: HashMap<String, FieldSpec>,
/// The `fts` table. `None` when the table is absent from the config.
pub fts: Option<DatasetFts>,
}
/// Contents of the `dataset` table of the config file.
#[derive(Debug, Clone, Deserialize)]
pub struct DatasetConfig {
/// This dataset's name. Must be unique.
pub name: String,
/// Root dir for indices. `None` when the key is omitted; no default is
/// filled in by this struct.
pub working_dir: Option<PathBuf>,
/// Data sources that make up this dataset, keyed by source name.
pub source: HashMap<String, Source>,
/// Post-processing steps declared at the dataset level. Empty when omitted.
/// NOTE(review): how these relate to the per-field `FieldSpec::post` steps is
/// not visible in this file — confirm against the consumer.
#[serde(default)]
pub post: Vec<FieldSpecPost>,
}
/// A single data source of a dataset.
///
/// Internally tagged: the variant is selected by the `type` key, whose value is
/// the lowercased variant name (e.g. `type = "flac"`).
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type")]
#[serde(rename_all = "lowercase")]
pub enum Source {
/// A directory of FLAC files. `path` accepts one path or a list of paths.
Flac { path: OneOrMany<PathBuf> },
}
//
// MARK: schema
//
/// Definition of one schema field: its type, where to find it, and how to
/// post-process it.
#[derive(Debug, Clone, Deserialize)]
pub struct FieldSpec {
/// The type of this field
pub r#type: FieldType,
/// How to find this field in a data entry.
/// Accepts a single path string or a list of path strings.
pub path: OneOrMany<String>,
/// How to post-process this field. Empty when omitted.
#[serde(default)]
pub post: Vec<FieldSpecPost>,
}
/// The kind of data a schema field holds.
///
/// Written in config as the lowercased variant name (e.g. `"text"`).
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum FieldType {
/// Textual data.
Text,
}
//
// MARK: fts
//
/// Contents of the `fts` table of the config file.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct DatasetFts {
/// Per-field fts settings, keyed by field name.
/// May be spelled `fields` or `field` in the config.
#[serde(alias = "field")]
pub fields: HashMap<String, FtsIndexField>,
}
/// Fts settings for a single field.
#[derive(Debug, Clone, Deserialize)]
pub struct FtsIndexField {
/// Value of the `tokenize` key for this field.
/// NOTE(review): presumably controls whether the field's text is tokenized
/// when indexed — the consumer is not visible in this file; confirm.
pub tokenize: bool,
}

View File

@@ -0,0 +1,121 @@
use itertools::Itertools;
use serde::Deserialize;
use serde_json::Value;
/// One post-processing step applied to an extracted value.
///
/// Untagged: the variant is selected by which key is present in the input
/// (`trim_suffix`, `trim_prefix`, `case`, `join` or `notempty`), and serde tries
/// the variants in declaration order.
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
pub enum FieldSpecPost {
/// Strip `trim_suffix` from the end of strings, when present.
TrimSuffix { trim_suffix: String },
/// Strip `trim_prefix` from the start of strings, when present.
TrimPrefix { trim_prefix: String },
/// Convert strings to the given case.
SetCase { case: Case },
/// Collapse an array into one string, using `join` as the separator.
Join { join: String },
/// When `notempty` is `true`, reject null values, empty strings and empty arrays.
NotEmpty { notempty: bool },
}
impl FieldSpecPost {
pub fn apply(&self, val: &Value) -> Option<Value> {
Some(match self {
Self::NotEmpty { notempty: false } => val.clone(),
Self::NotEmpty { notempty: true } => match val {
Value::Null => return None,
Value::String(x) if x.is_empty() => return None,
Value::Array(x) if x.is_empty() => return None,
x => x.clone(),
},
Self::SetCase { case: Case::Lower } => match val {
Value::Null => return None,
Value::Bool(_) | Value::Number(_) => val.clone(),
Value::String(x) => Value::String(x.to_lowercase()),
Value::Array(x) => {
Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
}
Value::Object(x) => Value::Object(
x.iter()
.map(|x| (x.0.to_lowercase(), self.apply(x.1)))
.map(|x| x.1.map(|y| (x.0, y)))
.collect::<Option<_>>()?,
),
},
Self::SetCase { case: Case::Upper } => match val {
Value::Null => return None,
Value::Bool(_) | Value::Number(_) => val.clone(),
Value::String(x) => Value::String(x.to_uppercase()),
Value::Array(x) => {
Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
}
Value::Object(x) => Value::Object(
x.iter()
.map(|x| (x.0.to_uppercase(), self.apply(x.1)))
.map(|x| x.1.map(|y| (x.0, y)))
.collect::<Option<_>>()?,
),
},
Self::TrimSuffix { trim_suffix } => match val {
Value::Null => return None,
Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
Value::String(x) => {
Value::String(x.strip_suffix(trim_suffix).unwrap_or(&x).to_owned())
}
Value::Array(x) => {
Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
}
Value::Object(x) => Value::Object(
x.iter()
.map(|x| {
(
x.0.strip_suffix(trim_suffix).unwrap_or(&x.0).to_owned(),
self.apply(x.1),
)
})
.map(|x| x.1.map(|y| (x.0, y)))
.collect::<Option<_>>()?,
),
},
Self::TrimPrefix { trim_prefix } => match val {
Value::Null => return None,
Value::Object(_) => return None,
Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
Value::String(x) => {
Value::String(x.strip_prefix(trim_prefix).unwrap_or(&x).to_owned())
}
Value::Array(x) => {
Value::Array(x.iter().map(|x| self.apply(x)).collect::<Option<_>>()?)
}
},
Self::Join { join } => match val {
Value::Null => return None,
Value::Object(_) => return None,
Value::Bool(_) | Value::Number(_) => Value::String(val.to_string()),
Value::String(x) => Value::String(x.clone()),
Value::Array(x) => Value::String(
x.iter()
.map(|x| self.apply(x))
.collect::<Option<Vec<_>>>()?
.into_iter()
.join(join),
),
},
})
}
}
/// Target letter case for `FieldSpecPost::SetCase`.
///
/// Written in config as the lowercased variant name (`"lower"` / `"upper"`).
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum Case {
/// Convert text to lowercase.
Lower,
/// Convert text to uppercase.
Upper,
}