Files
pile/crates/pile-dataset/src/extract/exif.rs
2026-03-10 20:25:21 -07:00

93 lines
2.3 KiB
Rust

use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::debug;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
/// Extractor that exposes an [`Item`]'s EXIF metadata as labelled values.
///
/// The item is read and parsed lazily: nothing happens until a field is
/// requested, and the parsed result is then cached for the lifetime of the
/// extractor.
pub struct ExifExtractor<'a> {
/// The item whose contents are read when EXIF data is first requested.
item: &'a Item,
/// Cache of parsed EXIF fields keyed by label; unset until the first
/// lookup, and set to an empty map when the item's EXIF data cannot be
/// parsed.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> ExifExtractor<'a> {
pub fn new(item: &'a Item) -> Self {
Self {
item,
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_fields = tokio::task::spawn_blocking(move || {
let mut br = BufReader::new(reader);
let exif = exif::Reader::new()
.read_from_container(&mut br)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let fields: Vec<(String, String)> = exif
.fields()
.map(|f| {
(
f.tag.to_string(),
f.display_value().with_unit(&exif).to_string(),
)
})
.collect();
Ok::<_, std::io::Error>(fields)
})
.await
.map_err(std::io::Error::other)?;
let raw_fields = match raw_fields {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process exif", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
for (tag_name, value) in raw_fields {
let Some(label) = tag_to_label(&tag_name) else {
continue;
};
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
output
.entry(label)
.or_insert_with(|| PileValue::String(value.into()));
}
return Ok(self.output.get_or_init(|| output));
}
}
/// Converts an EXIF tag name into a [`Label`].
///
/// Spaces are replaced with underscores, then every character that is not in
/// `Label::VALID_CHARS` is dropped. Returns whatever `Label::new` yields for
/// the sanitized string, i.e. `None` when it is rejected.
fn tag_to_label(tag: &str) -> Option<Label> {
    let mut sanitized = String::new();
    for raw in tag.chars() {
        // Substitute first so that a space survives as '_' when '_' is valid.
        let ch = if raw == ' ' { '_' } else { raw };
        if Label::VALID_CHARS.contains(ch) {
            sanitized.push(ch);
        }
    }
    Label::new(sanitized)
}
#[async_trait::async_trait]
impl ObjectExtractor for ExifExtractor<'_> {
    /// Looks up the EXIF value stored under `name`.
    ///
    /// Returns `Ok(None)` when no such field was extracted, and propagates
    /// any error from loading the EXIF data.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let value = extracted.get(name);
        Ok(value)
    }

    /// Lists the labels of every extracted EXIF field, in the map's
    /// (unspecified) iteration order.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let labels: Vec<Label> = extracted.keys().cloned().collect();
        Ok(labels)
    }
}