From bfa67994bf12747f530c6a038776ae4845fbe6be Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Tue, 10 Mar 2026 21:05:51 -0700 Subject: [PATCH] Owned items, static values --- .../src/extract/epub/epub_meta.rs | 32 +++---- .../src/extract/epub/epub_text.rs | 30 +++--- crates/pile-dataset/src/extract/epub/mod.rs | 23 ++--- crates/pile-dataset/src/extract/exif.rs | 33 +++---- crates/pile-dataset/src/extract/flac.rs | 73 ++++++-------- crates/pile-dataset/src/extract/fs.rs | 39 ++++---- crates/pile-dataset/src/extract/id3.rs | 36 +++---- crates/pile-dataset/src/extract/map.rs | 13 +-- crates/pile-dataset/src/extract/mod.rs | 27 ++---- crates/pile-dataset/src/extract/pdf/mod.rs | 32 ++----- .../pile-dataset/src/extract/pdf/pdf_cover.rs | 23 ++--- .../pile-dataset/src/extract/pdf/pdf_meta.rs | 33 +++---- .../pile-dataset/src/extract/pdf/pdf_pages.rs | 96 +++++++------------ .../pile-dataset/src/extract/pdf/pdf_text.rs | 31 +++--- crates/pile-dataset/src/extract/sidecar.rs | 21 ++-- crates/pile-dataset/src/extract/toml.rs | 46 ++++----- crates/pile-dataset/src/index/index_fts.rs | 60 ++++++------ crates/pile-dataset/src/item.rs | 1 + crates/pile-dataset/src/serve/field.rs | 2 +- crates/pile-dataset/src/value.rs | 31 +++--- 20 files changed, 304 insertions(+), 378 deletions(-) diff --git a/crates/pile-dataset/src/extract/epub/epub_meta.rs b/crates/pile-dataset/src/extract/epub/epub_meta.rs index 4488406..49b218b 100644 --- a/crates/pile-dataset/src/extract/epub/epub_meta.rs +++ b/crates/pile-dataset/src/extract/epub/epub_meta.rs @@ -1,24 +1,27 @@ use epub::doc::EpubDoc; use pile_config::Label; -use std::{collections::HashMap, sync::OnceLock}; +use std::{ + collections::HashMap, + sync::{Arc, OnceLock}, +}; use tracing::trace; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct EpubMetaExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct EpubMetaExtractor { + item: Item, + output: OnceLock>, } -impl<'a> EpubMetaExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl EpubMetaExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -61,13 +64,13 @@ impl<'a> EpubMetaExtractor<'a> { } }; - let mut output: HashMap> = HashMap::new(); + let mut output: HashMap = HashMap::new(); #[expect(clippy::unwrap_used)] for (key, val) in raw_meta { let label = Label::new(key).unwrap(); let value = match val { - Some(s) => PileValue::String(s.into()), + Some(s) => PileValue::String(Arc::new(s.into())), None => PileValue::Null, }; output.insert(label, value); @@ -78,12 +81,9 @@ impl<'a> EpubMetaExtractor<'a> { } #[async_trait::async_trait] -impl ObjectExtractor for EpubMetaExtractor<'_> { - async fn field<'a>( - &'a self, - name: &Label, - ) -> Result>, std::io::Error> { - Ok(self.get_inner().await?.get(name)) +impl ObjectExtractor for EpubMetaExtractor { + async fn field(&self, name: &Label) -> Result, std::io::Error> { + Ok(self.get_inner().await?.get(name).cloned()) } async fn fields(&self) -> Result, std::io::Error> { diff --git a/crates/pile-dataset/src/extract/epub/epub_text.rs b/crates/pile-dataset/src/extract/epub/epub_text.rs index 7997b90..5c47b15 100644 --- a/crates/pile-dataset/src/extract/epub/epub_text.rs +++ b/crates/pile-dataset/src/extract/epub/epub_text.rs @@ -1,24 +1,27 @@ use epub::doc::EpubDoc; use pile_config::Label; -use std::{collections::HashMap, sync::OnceLock}; +use std::{ + collections::HashMap, + sync::{Arc, OnceLock}, +}; use tracing::debug; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct EpubTextExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct EpubTextExtractor { + item: Item, + output: OnceLock>, } -impl<'a> EpubTextExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl EpubTextExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -61,7 +64,7 @@ impl<'a> EpubTextExtractor<'a> { #[expect(clippy::unwrap_used)] let output = HashMap::from([( Label::new("text").unwrap(), - PileValue::String(raw_text.into()), + PileValue::String(Arc::new(raw_text.into())), )]); let _ = self.output.set(output); @@ -88,12 +91,9 @@ fn strip_html(html: &str) -> String { } #[async_trait::async_trait] -impl ObjectExtractor for EpubTextExtractor<'_> { - async fn field<'a>( - &'a self, - name: &Label, - ) -> Result>, std::io::Error> { - Ok(self.get_inner().await?.get(name)) +impl ObjectExtractor for EpubTextExtractor { + async fn field(&self, name: &Label) -> Result, std::io::Error> { + Ok(self.get_inner().await?.get(name).cloned()) } async fn fields(&self) -> Result, std::io::Error> { diff --git a/crates/pile-dataset/src/extract/epub/mod.rs b/crates/pile-dataset/src/extract/epub/mod.rs index 599f93e..eb814d7 100644 --- a/crates/pile-dataset/src/extract/epub/mod.rs +++ b/crates/pile-dataset/src/extract/epub/mod.rs @@ -12,13 +12,13 @@ use crate::{ extract::{MapExtractor, ObjectExtractor}, }; -pub struct EpubExtractor<'a> { - inner: MapExtractor<'a>, +pub struct EpubExtractor { + inner: MapExtractor, } -impl<'a> EpubExtractor<'a> { +impl EpubExtractor { #[expect(clippy::unwrap_used)] - pub fn new(item: &'a Item) -> Self { + pub fn new(item: &Item) -> Self { let inner = MapExtractor { inner: HashMap::from([ ( @@ -37,19 +37,8 @@ impl<'a> EpubExtractor<'a> { } #[async_trait::async_trait] -impl ObjectExtractor for EpubExtractor<'_> { - async fn field<'a>( - &'a self, - name: &pile_config::Label, - ) -> Result>, std::io::Error> { - #[expect(clippy::unwrap_used)] - if name.as_str() == "text" { - match self.inner.inner.get(name).unwrap() { - PileValue::ObjectExtractor(x) => return x.field(name).await, - _ => unreachable!(), - }; - } - +impl ObjectExtractor for EpubExtractor { + async fn field(&self, name: &pile_config::Label) -> Result, std::io::Error> { self.inner.field(name).await } diff --git a/crates/pile-dataset/src/extract/exif.rs b/crates/pile-dataset/src/extract/exif.rs index 2e14468..fef16c1 100644 --- a/crates/pile-dataset/src/extract/exif.rs +++ b/crates/pile-dataset/src/extract/exif.rs @@ -1,23 +1,27 @@ use pile_config::Label; -use std::{collections::HashMap, io::BufReader, sync::OnceLock}; +use std::{ + collections::HashMap, + io::BufReader, + sync::{Arc, OnceLock}, +}; use tracing::debug; use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; -pub struct ExifExtractor<'a> { - item: &'a Item, - output: OnceLock>>, +pub struct ExifExtractor { + item: Item, + output: OnceLock>, } -impl<'a> ExifExtractor<'a> { - pub fn new(item: &'a Item) -> Self { +impl ExifExtractor { + pub fn new(item: &Item) -> Self { Self { - item, + item: item.clone(), output: OnceLock::new(), } } - async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } @@ -52,7 +56,7 @@ impl<'a> ExifExtractor<'a> { } }; - let mut output: HashMap> = HashMap::new(); + let mut output: HashMap = HashMap::new(); for (tag_name, value) in raw_fields { let Some(label) = tag_to_label(&tag_name) else { @@ -61,7 +65,7 @@ impl<'a> ExifExtractor<'a> { // First occurrence wins (PRIMARY IFD comes before THUMBNAIL) output .entry(label) - .or_insert_with(|| PileValue::String(value.into())); + .or_insert_with(|| PileValue::String(Arc::new(value.into()))); } return Ok(self.output.get_or_init(|| output)); @@ -78,12 +82,9 @@ fn tag_to_label(tag: &str) -> Option