From 48ac93c78e0e96b9d1db9e607689571cf9704747 Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Tue, 10 Mar 2026 20:24:56 -0700 Subject: [PATCH] Add `ListExtractor` --- crates/pile-dataset/src/dataset.rs | 2 +- .../src/extract/epub/epub_meta.rs | 4 +- .../src/extract/epub/epub_text.rs | 4 +- crates/pile-dataset/src/extract/epub/mod.rs | 10 +- crates/pile-dataset/src/extract/exif.rs | 4 +- crates/pile-dataset/src/extract/flac.rs | 119 ++++++++++++++---- crates/pile-dataset/src/extract/fs.rs | 4 +- crates/pile-dataset/src/extract/id3.rs | 4 +- crates/pile-dataset/src/extract/map.rs | 4 +- crates/pile-dataset/src/extract/mod.rs | 45 ++++--- crates/pile-dataset/src/extract/pdf/mod.rs | 26 ++-- .../pile-dataset/src/extract/pdf/pdf_cover.rs | 4 +- .../pile-dataset/src/extract/pdf/pdf_meta.rs | 17 ++- .../pile-dataset/src/extract/pdf/pdf_pages.rs | 119 ++++++++++++++++++ .../pile-dataset/src/extract/pdf/pdf_text.rs | 5 +- crates/pile-dataset/src/extract/sidecar.rs | 4 +- crates/pile-dataset/src/extract/toml.rs | 4 +- crates/pile-dataset/src/index/index_fts.rs | 41 ++++-- crates/pile-dataset/src/serve/field.rs | 2 +- crates/pile-dataset/src/value.rs | 53 +++++++- crates/pile/src/command/annotate.rs | 2 +- crates/pile/src/command/probe.rs | 2 +- 22 files changed, 386 insertions(+), 93 deletions(-) create mode 100644 crates/pile-dataset/src/extract/pdf/pdf_pages.rs diff --git a/crates/pile-dataset/src/dataset.rs b/crates/pile-dataset/src/dataset.rs index 322a84e..d8696f2 100644 --- a/crates/pile-dataset/src/dataset.rs +++ b/crates/pile-dataset/src/dataset.rs @@ -184,7 +184,7 @@ impl Datasets { return Ok(None); }; let extractor = MetaExtractor::new(&item); - let root = PileValue::Extractor(Arc::new(extractor)); + let root = PileValue::ObjectExtractor(Arc::new(extractor)); let Some(value) = root.query(path).await? else { return Ok(None); }; diff --git a/crates/pile-dataset/src/extract/epub/epub_meta.rs b/crates/pile-dataset/src/extract/epub/epub_meta.rs index d6edfa6..4488406 100644 --- a/crates/pile-dataset/src/extract/epub/epub_meta.rs +++ b/crates/pile-dataset/src/extract/epub/epub_meta.rs @@ -3,7 +3,7 @@ use pile_config::Label; use std::{collections::HashMap, sync::OnceLock}; use tracing::trace; -use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; +use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; pub struct EpubMetaExtractor<'a> { item: &'a Item, @@ -78,7 +78,7 @@ impl<'a> EpubMetaExtractor<'a> { } #[async_trait::async_trait] -impl Extractor for EpubMetaExtractor<'_> { +impl ObjectExtractor for EpubMetaExtractor<'_> { async fn field<'a>( &'a self, name: &Label, diff --git a/crates/pile-dataset/src/extract/epub/epub_text.rs b/crates/pile-dataset/src/extract/epub/epub_text.rs index 24dbf1a..7997b90 100644 --- a/crates/pile-dataset/src/extract/epub/epub_text.rs +++ b/crates/pile-dataset/src/extract/epub/epub_text.rs @@ -3,7 +3,7 @@ use pile_config::Label; use std::{collections::HashMap, sync::OnceLock}; use tracing::debug; -use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; +use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; pub struct EpubTextExtractor<'a> { item: &'a Item, @@ -88,7 +88,7 @@ fn strip_html(html: &str) -> String { } #[async_trait::async_trait] -impl Extractor for EpubTextExtractor<'_> { +impl ObjectExtractor for EpubTextExtractor<'_> { async fn field<'a>( &'a self, name: &Label, diff --git a/crates/pile-dataset/src/extract/epub/mod.rs b/crates/pile-dataset/src/extract/epub/mod.rs index b4d6e90..599f93e 100644 --- a/crates/pile-dataset/src/extract/epub/mod.rs +++ b/crates/pile-dataset/src/extract/epub/mod.rs @@ -9,7 +9,7 @@ pub use epub_text::*; use crate::{ Item, PileValue, - extract::{Extractor, MapExtractor}, + extract::{MapExtractor, ObjectExtractor}, }; pub struct EpubExtractor<'a> { @@ -23,11 +23,11 @@ impl<'a> EpubExtractor<'a> { inner: HashMap::from([ ( Label::new("text").unwrap(), - PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))), + PileValue::ObjectExtractor(Arc::new(EpubTextExtractor::new(item))), ), ( Label::new("meta").unwrap(), - PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))), + PileValue::ObjectExtractor(Arc::new(EpubMetaExtractor::new(item))), ), ]), }; @@ -37,7 +37,7 @@ impl<'a> EpubExtractor<'a> { } #[async_trait::async_trait] -impl Extractor for EpubExtractor<'_> { +impl ObjectExtractor for EpubExtractor<'_> { async fn field<'a>( &'a self, name: &pile_config::Label, @@ -45,7 +45,7 @@ impl Extractor for EpubExtractor<'_> { #[expect(clippy::unwrap_used)] if name.as_str() == "text" { match self.inner.inner.get(name).unwrap() { - PileValue::Extractor(x) => return x.field(name).await, + PileValue::ObjectExtractor(x) => return x.field(name).await, _ => unreachable!(), }; } diff --git a/crates/pile-dataset/src/extract/exif.rs b/crates/pile-dataset/src/extract/exif.rs index d2166f2..2e14468 100644 --- a/crates/pile-dataset/src/extract/exif.rs +++ b/crates/pile-dataset/src/extract/exif.rs @@ -2,7 +2,7 @@ use pile_config::Label; use std::{collections::HashMap, io::BufReader, sync::OnceLock}; use tracing::debug; -use crate::{Item, PileValue, SyncReadBridge, extract::Extractor}; +use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor}; pub struct ExifExtractor<'a> { item: &'a Item, @@ -78,7 +78,7 @@ fn tag_to_label(tag: &str) -> Option