diff --git a/crates/pile-dataset/src/extract/epub/epub_meta.rs b/crates/pile-dataset/src/extract/epub/epub_meta.rs index 4488406..49b218b 100644 --- a/crates/pile-dataset/src/extract/epub/epub_meta.rs +++ b/crates/pile-dataset/src/extract/epub/epub_meta.rs @@ -1,24 +1,27 @@ use epub::doc::EpubDoc; use pile_config::Label; -use std::{collections::HashMap, sync::OnceLock}; +use std::{ + collections::HashMap, + sync::{Arc, OnceLock}, +}; use tracing::trace; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct EpubMetaExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct EpubMetaExtractor { + item: Item, + output: OnceLock>, } -impl<'a> EpubMetaExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl EpubMetaExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -61,13 +64,13 @@ impl<'a> EpubMetaExtractor<'a> { } }; - let mut output: HashMap> = HashMap::new(); + let mut output: HashMap = HashMap::new(); #[expect(clippy::unwrap_used)] for (key, val) in raw_meta { let label = Label::new(key).unwrap(); let value = match val { - Some(s) => PileValue::String(s.into()), + Some(s) => PileValue::String(Arc::new(s.into())), None => PileValue::Null, }; output.insert(label, value); @@ -78,12 +81,9 @@ impl<'a> EpubMetaExtractor<'a> { } #[async_trait::async_trait] -impl ObjectExtractor for EpubMetaExtractor<'_> { - async fn field<'a>( - &'a self, - name: &Label, - ) -> Result>, std::io::Error> { - Ok(self.get_inner().await?.get(name)) +impl ObjectExtractor for EpubMetaExtractor { + async fn field(&self, name: &Label) -> Result, std::io::Error> { + Ok(self.get_inner().await?.get(name).cloned()) } async fn fields(&self) -> Result, std::io::Error> { diff --git a/crates/pile-dataset/src/extract/epub/epub_text.rs b/crates/pile-dataset/src/extract/epub/epub_text.rs index 7997b90..5c47b15 100644 --- a/crates/pile-dataset/src/extract/epub/epub_text.rs +++ b/crates/pile-dataset/src/extract/epub/epub_text.rs @@ -1,24 +1,27 @@ use epub::doc::EpubDoc; use pile_config::Label; -use std::{collections::HashMap, sync::OnceLock}; +use std::{ + collections::HashMap, + sync::{Arc, OnceLock}, +}; use tracing::debug; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct EpubTextExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct EpubTextExtractor { + item: Item, + output: OnceLock>, } -impl<'a> EpubTextExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl EpubTextExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -61,7 +64,7 @@ impl<'a> EpubTextExtractor<'a> { #[expect(clippy::unwrap_used)] let output = HashMap::from([( Label::new("text").unwrap(), - PileValue::String(raw_text.into()), + PileValue::String(Arc::new(raw_text.into())), )]); let _ = self.output.set(output); @@ -88,12 +91,9 @@ fn strip_html(html: &str) -> String { } #[async_trait::async_trait] -impl ObjectExtractor for EpubTextExtractor<'_> { - async fn field<'a>( - &'a self, - name: &Label, - ) -> Result>, std::io::Error> { - Ok(self.get_inner().await?.get(name)) +impl ObjectExtractor for EpubTextExtractor { + async fn field(&self, name: &Label) -> Result, std::io::Error> { + Ok(self.get_inner().await?.get(name).cloned()) } async fn fields(&self) -> Result, std::io::Error> { diff --git a/crates/pile-dataset/src/extract/epub/mod.rs b/crates/pile-dataset/src/extract/epub/mod.rs index 599f93e..eb814d7 100644 --- a/crates/pile-dataset/src/extract/epub/mod.rs +++ b/crates/pile-dataset/src/extract/epub/mod.rs @@ -12,13 +12,13 @@ use crate::{ extract::{MapExtractor, ObjectExtractor}, }; -pub struct EpubExtractor<'a> { - inner: MapExtractor<'a>, +pub struct EpubExtractor { + inner: MapExtractor, } -impl<'a> EpubExtractor<'a> { +impl EpubExtractor { #[expect(clippy::unwrap_used)] - pub fn new(item: &'a Item) -> Self { + pub fn new(item: &Item) -> Self { let inner = MapExtractor { inner: HashMap::from([ ( @@ -37,19 +37,8 @@ impl<'a> EpubExtractor<'a> { } #[async_trait::async_trait] -impl ObjectExtractor for EpubExtractor<'_> { - async fn field<'a>( - &'a self, - name: &pile_config::Label, - ) -> Result>, std::io::Error> { - #[expect(clippy::unwrap_used)] - if name.as_str() == "text" { - match self.inner.inner.get(name).unwrap() { - PileValue::ObjectExtractor(x) => return x.field(name).await, - _ => unreachable!(), - }; - } - +impl ObjectExtractor for EpubExtractor { + async fn field(&self, name: &pile_config::Label) -> Result, std::io::Error> { self.inner.field(name).await } diff --git a/crates/pile-dataset/src/extract/exif.rs b/crates/pile-dataset/src/extract/exif.rs index 2e14468..fef16c1 100644 --- a/crates/pile-dataset/src/extract/exif.rs +++ b/crates/pile-dataset/src/extract/exif.rs @@ -1,23 +1,27 @@ use pile_config::Label; -use std::{collections::HashMap, io::BufReader, sync::OnceLock}; +use std::{ + collections::HashMap, + io::BufReader, + sync::{Arc, OnceLock}, +}; use tracing::debug; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct ExifExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct ExifExtractor { + item: Item, + output: OnceLock>, } -impl<'a> ExifExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl ExifExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -52,7 +56,7 @@ impl<'a> ExifExtractor<'a> { } }; - let mut output: HashMap> = HashMap::new(); + let mut output: HashMap = HashMap::new(); for (tag_name, value) in raw_fields { let Some(label) = tag_to_label(&tag_name) else { @@ -61,7 +65,7 @@ impl<'a> ExifExtractor<'a> { // First occurrence wins (PRIMARY IFD comes before THUMBNAIL) output .entry(label) - .or_insert_with(|| PileValue::String(value.into())); + .or_insert_with(|| PileValue::String(Arc::new(value.into()))); } return Ok(self.output.get_or_init(|| output)); @@ -78,12 +82,9 @@ fn tag_to_label(tag: &str) -> Option