Extractor refactor, S3 support

2026-03-06 17:49:12 -08:00
parent 77b3125af4
commit aecc84233b
31 changed files with 2676 additions and 675 deletions
--- a/crates/pile-config/src/config.toml
+++ b/crates/pile-config/src/config.toml
@@ -9,8 +9,7 @@ name = "dataset"
 # working_dir = ".pile"

 # Data sources available in this dataset
-source."music" = { type = "flac", path = ["music", "music-2"] }
-
+source."music" = { type = "filesystem", path = "music" }

 # This dataset's schema.
 # Defines normalized fields that are extracted from source entries on-demand.
--- a/crates/pile-config/src/lib.rs
+++ b/crates/pile-config/src/lib.rs
@@ -46,16 +46,21 @@ pub struct DatasetConfig {
 	pub post: Vec<FieldSpecPost>,
 }

+#[derive(Debug, Clone, Deserialize)] // NOTE(review): derived Debug prints `secret_access_key` in plain text; consider a redacting impl
+pub struct S3Credentials { // Access key pair deserialized from config; required by the `credentials` field of `Source::S3`
+	pub access_key_id: String, // presumably the S3 access key ID; confirm against the client setup
+	pub secret_access_key: String, // secret half of the key pair; avoid writing it to logs
+}
+
 #[derive(Debug, Clone, Deserialize)]
 #[serde(tag = "type")]
 #[serde(rename_all = "lowercase")]
 pub enum Source {
-	/// A directory files
+	/// A directory of files
 	Filesystem {
 		/// The directories to scan.
 		/// Must be relative.
-		#[serde(alias = "paths")]
-		path: OneOrMany<PathBuf>,
+		path: PathBuf,

 		/// If true, all toml files are ignored.
 		/// Metadata can be added to any file using a {filename}.toml.
@@ -65,6 +70,23 @@ pub enum Source {
 		#[serde(default = "default_true")]
 		sidecars: bool,
 	},
+
+	/// An S3-compatible object store bucket
+	S3 {
+		bucket: String,
+		prefix: Option<String>,
+
+		/// Custom endpoint URL (for MinIO, etc.)
+		endpoint: Option<String>,
+
+		region: String,
+
+		credentials: S3Credentials,
+
+		/// If true, all .toml objects are treated as sidecar metadata files.
+		#[serde(default = "default_true")]
+		sidecars: bool,
+	},
 }

 //