From 2ee559be7327cb0f7ce7076f6243bd682e6c28b8 Mon Sep 17 00:00:00 2001
From: rm-dr <96270320+rm-dr@users.noreply.github.com>
Date: Tue, 6 Jan 2026 23:05:58 -0800
Subject: [PATCH] Initial `pile-dataset`

---
 crates/pile-dataset/Cargo.toml        |  15 ++++
 crates/pile-dataset/src/item/flac.rs  | 100 +++++++++++++++++++++++++++
 crates/pile-dataset/src/item/mod.rs   |   2 +
 crates/pile-dataset/src/lib.rs        |   5 ++
 crates/pile-dataset/src/source/dir.rs |  73 +++++++++++++++++++
 crates/pile-dataset/src/source/mod.rs |   2 +
 crates/pile-dataset/src/traits.rs     |  30 ++++++++
 7 files changed, 227 insertions(+)
 create mode 100644 crates/pile-dataset/Cargo.toml
 create mode 100644 crates/pile-dataset/src/item/flac.rs
 create mode 100644 crates/pile-dataset/src/item/mod.rs
 create mode 100644 crates/pile-dataset/src/lib.rs
 create mode 100644 crates/pile-dataset/src/source/dir.rs
 create mode 100644 crates/pile-dataset/src/source/mod.rs
 create mode 100644 crates/pile-dataset/src/traits.rs

diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml
new file mode 100644
index 0000000..1edaa15
--- /dev/null
+++ b/crates/pile-dataset/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "pile-dataset"
+version = { workspace = true }
+rust-version = { workspace = true }
+edition = { workspace = true }
+
+[lints]
+workspace = true
+
+[dependencies]
+pile-audio = { workspace = true }
+
+serde_json = { workspace = true }
+itertools = { workspace = true }
+walkdir = { workspace = true }
diff --git a/crates/pile-dataset/src/item/flac.rs b/crates/pile-dataset/src/item/flac.rs
new file mode 100644
index 0000000..0852842
--- /dev/null
+++ b/crates/pile-dataset/src/item/flac.rs
@@ -0,0 +1,100 @@
+use std::{
+    fmt::Debug,
+    fs::File,
+    io::{ErrorKind, Read, Seek},
+    path::PathBuf,
+};
+
+use pile_audio::flac::blockread::{FlacBlock, FlacBlockReader, FlacBlockSelector};
+use serde_json::{Map, Value};
+
+use crate::Item;
+
+/// An open file that is read as a FLAC stream when its tags are requested.
+pub struct FlacItem {
+    /// Path the file was opened from (the only field shown by `Debug`)
+    pub(crate) path: PathBuf,
+
+    /// Open handle to the file; rewound before every read
+    pub(crate) file: File,
+}
+
+impl Debug for FlacItem {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FlacItem")
+            .field("path", &self.path)
+            .finish()
+    }
+}
+
+impl Item for FlacItem {
+    /// Read this file's Vorbis comments into a JSON object.
+    ///
+    /// Each tag name maps to an array holding every value of that tag,
+    /// in the order the block reader yields them.
+    ///
+    /// # Errors
+    /// Returns an error if the file cannot be read. Data the block reader
+    /// rejects, or a second comment block, is reported as
+    /// [`ErrorKind::InvalidData`] rather than a panic: a data source may
+    /// hand us files that are not valid FLAC.
+    fn json(&mut self) -> Result<serde_json::Value, std::io::Error> {
+        let mut block_reader = FlacBlockReader::new(FlacBlockSelector {
+            pick_vorbiscomment: true,
+            ..Default::default()
+        });
+
+        // TODO: do not read the whole file
+        self.file.rewind()?;
+        let mut data = Vec::new();
+        self.file.read_to_end(&mut data)?;
+
+        block_reader
+            .push_data(&data)
+            .map_err(|err| std::io::Error::new(ErrorKind::InvalidData, format!("{err:?}")))?;
+        block_reader
+            .finish()
+            .map_err(|err| std::io::Error::new(ErrorKind::InvalidData, format!("{err:?}")))?;
+
+        //
+        // Return tags
+        //
+
+        let mut output = Map::new();
+
+        while block_reader.has_block() {
+            // `has_block` just returned true, so there is a block to pop
+            let b = block_reader.pop_block().unwrap();
+            match b {
+                FlacBlock::VorbisComment(comment) => {
+                    for (k, v) in comment.comment.comments {
+                        let v = Value::String(v.into());
+
+                        // One lookup per tag: create the array on first
+                        // sight, then append. Only arrays are ever stored
+                        // in `output`, so the pattern always matches.
+                        let slot = output
+                            .entry(k.to_string())
+                            .or_insert_with(|| Value::Array(Vec::new()));
+                        if let Value::Array(values) = slot {
+                            values.push(v);
+                        }
+                    }
+                }
+
+                // `reader` filters blocks for us
+                _ => unreachable!(),
+            }
+
+            // We should only have one comment block
+            if block_reader.has_block() {
+                return Err(std::io::Error::new(
+                    ErrorKind::InvalidData,
+                    "more than one vorbis comment block",
+                ));
+            }
+        }
+
+        return Ok(serde_json::Value::Object(output));
+    }
+}
diff --git a/crates/pile-dataset/src/item/mod.rs b/crates/pile-dataset/src/item/mod.rs
new file mode 100644
index 0000000..db29339
--- /dev/null
+++ b/crates/pile-dataset/src/item/mod.rs
@@ -0,0 +1,2 @@
+mod flac;
+pub use flac::*;
diff --git a/crates/pile-dataset/src/lib.rs b/crates/pile-dataset/src/lib.rs
new file mode 100644
index 0000000..c4048fb
--- /dev/null
+++ b/crates/pile-dataset/src/lib.rs
@@ -0,0 +1,5 @@
+mod traits;
+pub use traits::*;
+
+pub mod item;
+pub mod source;
diff --git a/crates/pile-dataset/src/source/dir.rs b/crates/pile-dataset/src/source/dir.rs
new file mode 100644
index 0000000..bd06b7a
--- /dev/null
+++ b/crates/pile-dataset/src/source/dir.rs
@@ -0,0 +1,73 @@
+use itertools::Itertools;
+use std::{fs::File, io::ErrorKind, path::PathBuf};
+use walkdir::WalkDir;
+
+use crate::{DataSource, item::FlacItem};
+
+/// A data source that serves every regular file under one directory.
+#[derive(Debug)]
+pub struct DirDataSource {
+    /// Root directory; keys are paths relative to it
+    dir: PathBuf,
+}
+
+impl DirDataSource {
+    /// Create a source rooted at `dir`.
+    pub fn new(dir: impl Into<PathBuf>) -> Self {
+        Self { dir: dir.into() }
+    }
+}
+
+impl DataSource for DirDataSource {
+    type Key = PathBuf;
+    type Item = FlacItem;
+    type Error = std::io::Error;
+
+    /// Open the file at `key`, resolved against the root directory.
+    /// Returns `Ok(None)` if that path is not an existing file.
+    fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
+        let path = self.dir.join(key);
+        if !path.is_file() {
+            return Ok(None);
+        }
+
+        let file = File::open(&path)?;
+        return Ok(Some(FlacItem { path, file }));
+    }
+
+    /// Walk the root directory recursively, opening every regular file.
+    ///
+    /// Keys are relative to the root directory so that each one can be
+    /// passed back to [`DataSource::get`], which joins it onto the root.
+    fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
+        let root = self.dir.clone();
+        let walkdir = WalkDir::new(&self.dir);
+        return walkdir
+            .into_iter()
+            .filter_ok(|x| x.file_type().is_file())
+            .map(move |x| match x {
+                Err(err) => {
+                    let msg = format!("other walkdir error: {err:?}");
+                    Err(err
+                        .into_io_error()
+                        .unwrap_or(std::io::Error::new(ErrorKind::Other, msg)))
+                }
+                Ok(x) => {
+                    let path = x.into_path();
+                    let file = File::open(&path)?;
+
+                    // walkdir yields paths that start with the root.
+                    // Returning those as keys breaks `get` for a relative
+                    // root, because `get` would join the root a second time.
+                    // If the root is itself a file there is nothing to
+                    // strip, so keep the full path.
+                    let key = match path.strip_prefix(&root) {
+                        Ok(rel) if !rel.as_os_str().is_empty() => rel.to_path_buf(),
+                        _ => path.clone(),
+                    };
+
+                    Ok((key, FlacItem { path, file }))
+                }
+            });
+    }
+}
diff --git a/crates/pile-dataset/src/source/mod.rs b/crates/pile-dataset/src/source/mod.rs
new file mode 100644
index 0000000..fa148b9
--- /dev/null
+++ b/crates/pile-dataset/src/source/mod.rs
@@ -0,0 +1,2 @@
+mod dir;
+pub use dir::*;
diff --git a/crates/pile-dataset/src/traits.rs b/crates/pile-dataset/src/traits.rs
new file mode 100644
index 0000000..3a1470e
--- /dev/null
+++ b/crates/pile-dataset/src/traits.rs
@@ -0,0 +1,30 @@
+use std::{error::Error, fmt::Debug};
+
+/// A collection of items that can be looked up by key or enumerated.
+pub trait DataSource {
+    /// The type used to retrieve items from this source
+    /// (e.g., a PathBuf or a primary key)
+    type Key: Debug + Send + Sync + 'static + Clone;
+
+    /// The item this datasource produces
+    type Item: Item;
+
+    /// The error returned when looking up or listing items fails
+    type Error: Error;
+
+    /// Get an item from this datasource.
+    /// Returns `Ok(None)` if there is no item for `key`.
+    fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
+
+    /// Iterate over all items in this source in an arbitrary order.
+    /// Every key yielded here should be accepted by [`DataSource::get`].
+    fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
+}
+
+/// A single entry produced by a [`DataSource`].
+pub trait Item: Debug + Send + Sync + 'static {
+    /// Get this item's unstructured schema
+    ///
+    /// TODO: don't use json, use a lazily-evaluated type that supports binary
+    fn json(&mut self) -> Result<serde_json::Value, std::io::Error>;
+}