diff --git a/Cargo.lock b/Cargo.lock index 94a3817..c84775d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1866,6 +1866,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kamadak-exif" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1130d80c7374efad55a117d715a3af9368f0fa7a2c54573afc15a188cd984837" +dependencies = [ + "mutate_once", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -2059,6 +2068,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "mutate_once" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af" + [[package]] name = "nom" version = "7.1.3" @@ -2270,6 +2285,7 @@ dependencies = [ "epub", "id3", "itertools 0.14.0", + "kamadak-exif", "pdf", "pile-config", "pile-flac", diff --git a/Cargo.toml b/Cargo.toml index b3e2251..00b2e0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,9 +94,12 @@ toml = "1.0.3" toml_edit = "0.25.4" sha2 = "0.11.0-rc.5" blake3 = "1.8.3" + +# Extractors pdf = "0.10.0" id3 = "1.16.4" epub = "1.2.2" +kamadak-exif = "0.6.1" # Misc helpers thiserror = "2.0.18" diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml index 2439e8e..f52fcde 100644 --- a/crates/pile-dataset/Cargo.toml +++ b/crates/pile-dataset/Cargo.toml @@ -23,6 +23,7 @@ thiserror = { workspace = true } smartstring = { workspace = true } blake3 = { workspace = true } epub = { workspace = true } +kamadak-exif = { workspace = true } pdf = { workspace = true } id3 = { workspace = true } tokio = { workspace = true } diff --git a/crates/pile-dataset/src/extract/exif.rs b/crates/pile-dataset/src/extract/exif.rs new file mode 100644 index 0000000..d879b78 --- /dev/null +++ b/crates/pile-dataset/src/extract/exif.rs @@ -0,0 +1,74 @@ +use pile_config::Label; +use std::{collections::HashMap, io::Cursor, sync::OnceLock}; +use tracing::debug; + +use crate::{Item, PileValue, extract::Extractor}; + +pub struct ExifExtractor<'a> { + item: &'a Item, + output: OnceLock>>, +} + +impl<'a> ExifExtractor<'a> { + pub fn new(item: &'a Item) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + let bytes = self.item.read().await?.read_to_end().await?; + let mut cursor = Cursor::new(bytes); + + let exif = match exif::Reader::new().read_from_container(&mut cursor) { + Ok(x) => x, + Err(error) => { + debug!(message = "Could not process exif", ?error, key = ?self.item.key()); + return Ok(self.output.get_or_init(HashMap::new)); + } + }; + + let mut output: HashMap> = HashMap::new(); + + for field in exif.fields() { + let Some(label) = tag_to_label(&field.tag) else { + continue; + }; + // First occurrence wins (PRIMARY IFD comes before THUMBNAIL) + output.entry(label).or_insert_with(|| { + PileValue::String(field.display_value().with_unit(&exif).to_string().into()) + }); + } + + return Ok(self.output.get_or_init(|| output)); + } +} + +fn tag_to_label(tag: &exif::Tag) -> Option