Initial pile-dataset
Some checks failed
CI / Typos (push) Failing after 18s
CI / Build and test (push) Failing after 35s
CI / Clippy (push) Failing after 51s

This commit is contained in:
2026-01-06 23:05:58 -08:00
parent 3d9f0bd990
commit 2ee559be73
7 changed files with 183 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
[package]
name = "pile-dataset"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }
[lints]
workspace = true
[dependencies]
pile-audio = { workspace = true }
serde_json = { workspace = true }
itertools = { workspace = true }
walkdir = { workspace = true }

View File

@@ -0,0 +1,77 @@
use std::{
fmt::Debug,
fs::File,
io::{Read, Seek},
path::PathBuf,
};
use pile_audio::flac::blockread::{FlacBlock, FlacBlockReader, FlacBlockSelector};
use serde_json::{Map, Value};
use crate::Item;
/// A FLAC file on disk, kept open so its metadata can be read on demand.
pub struct FlacItem {
    /// Location of the file this item was opened from.
    pub(crate) path: PathBuf,
    /// Open handle to the file at `path`.
    pub(crate) file: File,
}

impl Debug for FlacItem {
    /// Formats the item as `FlacItem { path: .. }`.
    ///
    /// Only the path is printed; the file handle is left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("FlacItem");
        out.field("path", &self.path);
        out.finish()
    }
}
impl Item for FlacItem {
    /// Read this file's Vorbis comment tags and return them as a JSON object.
    ///
    /// Each tag name maps to an array of strings, so a tag that appears
    /// several times keeps all of its values in the order they were read.
    /// A file without a comment block produces an empty object.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be rewound or read,
    /// and an [`std::io::ErrorKind::InvalidData`] error if the contents cannot
    /// be parsed as FLAC or contain more than one comment block.
    fn json(&mut self) -> Result<serde_json::Value, std::io::Error> {
        // A file that does not parse is bad input, not a bug in this crate:
        // report it to the caller instead of panicking.
        fn invalid_data(what: &str, err: impl std::fmt::Debug) -> std::io::Error {
            std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("{what}: {err:?}"),
            )
        }

        let mut block_reader = FlacBlockReader::new(FlacBlockSelector {
            pick_vorbiscomment: true,
            ..Default::default()
        });

        // TODO: do not read the whole file
        // Rewind first so repeated calls always start from the beginning.
        self.file.rewind()?;
        let mut data = Vec::new();
        self.file.read_to_end(&mut data)?;

        block_reader
            .push_data(&data)
            .map_err(|e| invalid_data("could not parse flac data", e))?;
        block_reader
            .finish()
            .map_err(|e| invalid_data("incomplete flac data", e))?;

        //
        // Return tags
        //

        let mut output = Map::new();
        let mut seen_comment_block = false;
        while block_reader.has_block() {
            // We should only have one comment block
            if seen_comment_block {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    "flac file has more than one vorbis comment block",
                ));
            }
            seen_comment_block = true;

            // Cannot fail: `has_block()` was just checked.
            let block = block_reader.pop_block().unwrap();
            match block {
                FlacBlock::VorbisComment(comment) => {
                    for (k, v) in comment.comment.comments {
                        // One lookup per tag: create the array on first sight,
                        // then append to it.
                        let slot = output
                            .entry(k.to_string())
                            .or_insert_with(|| Value::Array(Vec::new()));
                        // Every value stored in `output` is an array (see above).
                        if let Some(values) = slot.as_array_mut() {
                            values.push(Value::String(v.into()));
                        }
                    }
                }

                // `reader` filters blocks for us
                _ => unreachable!(),
            }
        }

        return Ok(serde_json::Value::Object(output));
    }
}

View File

@@ -0,0 +1,2 @@
mod flac;
pub use flac::*;

View File

@@ -0,0 +1,5 @@
mod traits;
pub use traits::*;
pub mod item;
pub mod source;

View File

@@ -0,0 +1,57 @@
use itertools::Itertools;
use std::{fs::File, io::ErrorKind, path::PathBuf};
use walkdir::WalkDir;
use crate::{DataSource, item::FlacItem};
/// A data source backed by the files found under a directory.
#[derive(Debug)]
pub struct DirDataSource {
    /// Root directory this source reads from.
    dir: PathBuf,
}

impl DirDataSource {
    /// Create a source rooted at `dir`.
    ///
    /// The path is only stored here; nothing is read from disk yet.
    pub fn new(dir: impl Into<PathBuf>) -> Self {
        let dir = dir.into();
        Self { dir }
    }
}
impl DataSource for DirDataSource {
    type Key = PathBuf;
    type Item = FlacItem;
    type Error = std::io::Error;

    /// Open the file identified by `key`.
    ///
    /// `key` is first resolved relative to this source's directory. If that
    /// does not name a file, a key that already lies under the directory is
    /// tried as-is. `iter` yields keys of that second form, so this keeps
    /// its keys usable here even when the directory is a relative path.
    ///
    /// Returns `Ok(None)` if neither candidate is a regular file.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file exists but cannot be opened.
    fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error> {
        let joined = self.dir.join(key);
        let path = if joined.is_file() {
            joined
        } else if key.starts_with(&self.dir) && key.is_file() {
            // Keys from `iter` are walk paths that already include `dir`;
            // joining them onto a relative `dir` would duplicate the prefix.
            key.clone()
        } else {
            return Ok(None);
        };

        let file = File::open(&path)?;
        return Ok(Some(FlacItem { path, file }));
    }

    /// Walk the directory and yield every regular file as a `(path, item)` pair.
    ///
    /// Files are opened as they are yielded. Errors from walking the tree or
    /// from opening a file are yielded as `Err` entries.
    fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>> {
        let walkdir = WalkDir::new(&self.dir);

        return walkdir
            .into_iter()
            // Skip entries that are not files; walk errors pass through untouched.
            .filter_ok(|x| x.file_type().is_file())
            .map(|x| match x {
                Err(err) => {
                    // `into_io_error` consumes `err`, so render the message first.
                    let msg = format!("other walkdir error: {err:?}");
                    Err(err
                        .into_io_error()
                        .unwrap_or_else(|| std::io::Error::new(ErrorKind::Other, msg)))
                }

                Ok(x) => {
                    let path = x.into_path();
                    let file = File::open(&path)?;
                    let item = FlacItem {
                        path: path.clone(),
                        file,
                    };
                    Ok((path, item))
                }
            });
    }
}

View File

@@ -0,0 +1,2 @@
mod dir;
pub use dir::*;

View File

@@ -0,0 +1,25 @@
use std::{error::Error, fmt::Debug};
/// A collection of items that can be fetched by key or enumerated.
pub trait DataSource {
/// The type used to retrieve items from this source
/// (e.g., a `PathBuf` or a primary key)
type Key: Debug + Send + Sync + 'static + Clone;
/// The item this datasource produces
type Item: Item;
/// The error returned when `get` or `iter` fails
type Error: Error;
/// Get an item from this datasource.
///
/// NOTE(review): `Ok(None)` presumably means no item exists for `key`;
/// confirm this is the contract every implementation should follow.
fn get(&self, key: &Self::Key) -> Result<Option<Self::Item>, Self::Error>;
/// Iterate over all items in this source in an arbitrary order.
///
/// Each element is either a `(key, item)` pair or an error.
fn iter(&self) -> impl Iterator<Item = Result<(Self::Key, Self::Item), Self::Error>>;
}
/// A single entry produced by a `DataSource`.
pub trait Item: Debug + Send + Sync + 'static {
/// Get this item's unstructured schema
///
/// Takes `&mut self` so an implementation may change its own state while
/// producing the value (e.g. `FlacItem` rewinds and reads its file handle).
///
/// TODO: don't use json, use a lazily-evaluated type that supports binary
fn json(&mut self) -> Result<serde_json::Value, std::io::Error>;
}