Extractor refactor, S3 support
This commit is contained in:
@@ -9,8 +9,7 @@ name = "dataset"
|
||||
# working_dir = ".pile"
|
||||
|
||||
# Data sources available in this dataset
|
||||
source."music" = { type = "flac", path = ["music", "music-2"] }
|
||||
|
||||
source."music" = { type = "filesystem", path = "music" }
|
||||
|
||||
# This dataset's schema.
|
||||
# Defines normalized fields that are extracted from source entries on-demand.
|
||||
|
||||
@@ -46,16 +46,21 @@ pub struct DatasetConfig {
|
||||
pub post: Vec<FieldSpecPost>,
|
||||
}
|
||||
|
||||
/// Access-key credentials for an S3-compatible object store.
///
/// Deserialized as the `credentials` field of the `Source::S3` variant.
///
/// NOTE(review): `Debug` is derived, so formatting this struct with `{:?}`
/// prints `secret_access_key` verbatim. Confirm it is never logged, or
/// replace the derive with a manual impl that redacts the secret.
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct S3Credentials {
|
||||
/// Identifier of the access key (presumably the non-secret half of the
/// key pair; verify against the S3 client that consumes it).
pub access_key_id: String,
|
||||
/// Secret paired with `access_key_id`; presumably sensitive, so avoid
/// printing or persisting it.
pub secret_access_key: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Source {
|
||||
/// A directory files
|
||||
/// A directory of files
|
||||
Filesystem {
|
||||
/// The directories to scan.
|
||||
/// Must be relative.
|
||||
#[serde(alias = "paths")]
|
||||
path: OneOrMany<PathBuf>,
|
||||
path: PathBuf,
|
||||
|
||||
/// If true, all toml files are ignored.
|
||||
/// Metadata can be added to any file using a {filename}.toml.
|
||||
@@ -65,6 +70,23 @@ pub enum Source {
|
||||
#[serde(default = "default_true")]
|
||||
sidecars: bool,
|
||||
},
|
||||
|
||||
/// An S3-compatible object store bucket
|
||||
S3 {
|
||||
bucket: String,
|
||||
prefix: Option<String>,
|
||||
|
||||
/// Custom endpoint URL (for MinIO, etc.)
|
||||
endpoint: Option<String>,
|
||||
|
||||
region: String,
|
||||
|
||||
credentials: S3Credentials,
|
||||
|
||||
/// If true, all .toml objects are treated as sidecar metadata files.
|
||||
#[serde(default = "default_true")]
|
||||
sidecars: bool,
|
||||
},
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Reference in New Issue
Block a user