Add exif extractor
This commit is contained in:
16
Cargo.lock
generated
16
Cargo.lock
generated
@@ -1866,6 +1866,15 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "kamadak-exif"
|
||||||
|
version = "0.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1130d80c7374efad55a117d715a3af9368f0fa7a2c54573afc15a188cd984837"
|
||||||
|
dependencies = [
|
||||||
|
"mutate_once",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lazy_static"
|
name = "lazy_static"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
@@ -2059,6 +2068,12 @@ version = "0.3.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
|
checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mutate_once"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nom"
|
name = "nom"
|
||||||
version = "7.1.3"
|
version = "7.1.3"
|
||||||
@@ -2270,6 +2285,7 @@ dependencies = [
|
|||||||
"epub",
|
"epub",
|
||||||
"id3",
|
"id3",
|
||||||
"itertools 0.14.0",
|
"itertools 0.14.0",
|
||||||
|
"kamadak-exif",
|
||||||
"pdf",
|
"pdf",
|
||||||
"pile-config",
|
"pile-config",
|
||||||
"pile-flac",
|
"pile-flac",
|
||||||
|
|||||||
@@ -94,9 +94,12 @@ toml = "1.0.3"
|
|||||||
toml_edit = "0.25.4"
|
toml_edit = "0.25.4"
|
||||||
sha2 = "0.11.0-rc.5"
|
sha2 = "0.11.0-rc.5"
|
||||||
blake3 = "1.8.3"
|
blake3 = "1.8.3"
|
||||||
|
|
||||||
|
# Extractors
|
||||||
pdf = "0.10.0"
|
pdf = "0.10.0"
|
||||||
id3 = "1.16.4"
|
id3 = "1.16.4"
|
||||||
epub = "1.2.2"
|
epub = "1.2.2"
|
||||||
|
kamadak-exif = "0.6.1"
|
||||||
|
|
||||||
# Misc helpers
|
# Misc helpers
|
||||||
thiserror = "2.0.18"
|
thiserror = "2.0.18"
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ thiserror = { workspace = true }
|
|||||||
smartstring = { workspace = true }
|
smartstring = { workspace = true }
|
||||||
blake3 = { workspace = true }
|
blake3 = { workspace = true }
|
||||||
epub = { workspace = true }
|
epub = { workspace = true }
|
||||||
|
kamadak-exif = { workspace = true }
|
||||||
pdf = { workspace = true }
|
pdf = { workspace = true }
|
||||||
id3 = { workspace = true }
|
id3 = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
|||||||
74
crates/pile-dataset/src/extract/exif.rs
Normal file
74
crates/pile-dataset/src/extract/exif.rs
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use crate::{Item, PileValue, extract::Extractor};
|
||||||
|
|
||||||
|
pub struct ExifExtractor<'a> {
|
||||||
|
item: &'a Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ExifExtractor<'a> {
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item,
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = self.item.read().await?.read_to_end().await?;
|
||||||
|
let mut cursor = Cursor::new(bytes);
|
||||||
|
|
||||||
|
let exif = match exif::Reader::new().read_from_container(&mut cursor) {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(error) => {
|
||||||
|
debug!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||||
|
|
||||||
|
for field in exif.fields() {
|
||||||
|
let Some(label) = tag_to_label(&field.tag) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
|
||||||
|
output.entry(label).or_insert_with(|| {
|
||||||
|
PileValue::String(field.display_value().with_unit(&exif).to_string().into())
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(self.output.get_or_init(|| output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn tag_to_label(tag: &exif::Tag) -> Option<Label> {
|
||||||
|
let sanitized: String = tag
|
||||||
|
.to_string()
|
||||||
|
.chars()
|
||||||
|
.map(|c| if c == ' ' { '_' } else { c })
|
||||||
|
.filter(|c| Label::VALID_CHARS.contains(*c))
|
||||||
|
.collect();
|
||||||
|
Label::new(sanitized)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Extractor for ExifExtractor<'_> {
|
||||||
|
async fn field<'a>(
|
||||||
|
&'a self,
|
||||||
|
name: &Label,
|
||||||
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.get(name))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -13,6 +13,9 @@ pub use fs::*;
|
|||||||
mod epub;
|
mod epub;
|
||||||
pub use epub::*;
|
pub use epub::*;
|
||||||
|
|
||||||
|
mod exif;
|
||||||
|
pub use exif::*;
|
||||||
|
|
||||||
mod pdf;
|
mod pdf;
|
||||||
pub use pdf::*;
|
pub use pdf::*;
|
||||||
|
|
||||||
@@ -76,6 +79,10 @@ impl<'a> MetaExtractor<'a> {
|
|||||||
Label::new("epub").unwrap(),
|
Label::new("epub").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))),
|
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))),
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
Label::new("exif").unwrap(),
|
||||||
|
crate::PileValue::Extractor(Arc::new(ExifExtractor::new(item))),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
||||||
@@ -111,6 +118,7 @@ impl Extractor for MetaExtractor<'_> {
|
|||||||
Label::new("id3").unwrap(),
|
Label::new("id3").unwrap(),
|
||||||
Label::new("fs").unwrap(),
|
Label::new("fs").unwrap(),
|
||||||
Label::new("epub").unwrap(),
|
Label::new("epub").unwrap(),
|
||||||
|
Label::new("exif").unwrap(),
|
||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
Label::new("sidecar").unwrap(),
|
Label::new("sidecar").unwrap(),
|
||||||
]);
|
]);
|
||||||
|
|||||||
Reference in New Issue
Block a user