Extractor refactor, S3 support
Some checks failed
CI / Typos (push) Successful in 1m5s
CI / Clippy (push) Failing after 1m50s
CI / Build and test (push) Successful in 3m1s

This commit is contained in:
2026-03-06 17:49:12 -08:00
parent 77b3125af4
commit aecc84233b
31 changed files with 2676 additions and 675 deletions

View File

@@ -9,8 +9,7 @@ name = "dataset"
# working_dir = ".pile"
# Data sources available in this dataset
source."music" = { type = "flac", path = ["music", "music-2"] }
source."music" = { type = "filesystem", path = "music" }
# This dataset's schema.
# Defines normalized fields that are extracted from source entries on-demand.

View File

@@ -46,16 +46,21 @@ pub struct DatasetConfig {
pub post: Vec<FieldSpecPost>,
}
/// Static access-key pair used by the `S3` variant of [`Source`].
///
/// `Debug` is implemented by hand rather than derived so that the secret key
/// is never written to logs or error messages when this struct (or a type
/// containing it) is formatted with `{:?}`.
#[derive(Clone, Deserialize)]
pub struct S3Credentials {
    /// Access key ID of the key pair.
    pub access_key_id: String,

    /// Secret access key of the key pair. Redacted in `Debug` output.
    pub secret_access_key: String,
}

impl std::fmt::Debug for S3Credentials {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Credentials")
            .field("access_key_id", &self.access_key_id)
            // Never print the real secret; show a fixed placeholder instead.
            .field("secret_access_key", &"<redacted>")
            .finish()
    }
}
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type")]
#[serde(rename_all = "lowercase")]
pub enum Source {
/// A directory files
/// A directory of files
Filesystem {
/// The directories to scan.
/// Must be relative.
#[serde(alias = "paths")]
path: OneOrMany<PathBuf>,
path: PathBuf,
/// If true, all toml files are ignored.
/// Metadata can be added to any file using a {filename}.toml.
@@ -65,6 +70,23 @@ pub enum Source {
#[serde(default = "default_true")]
sidecars: bool,
},
/// An S3-compatible object store bucket
S3 {
bucket: String,
prefix: Option<String>,
/// Custom endpoint URL (for MinIO, etc.)
endpoint: Option<String>,
region: String,
credentials: S3Credentials,
/// If true, all .toml objects are treated as sidecar metadata files.
#[serde(default = "default_true")]
sidecars: bool,
},
}
//