From 979fbb9b0d4acb7bd0c30aa78a667d869d96bf63 Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Sun, 15 Mar 2026 10:20:15 -0700 Subject: [PATCH] Filter by mime --- crates/pile-dataset/src/dataset.rs | 10 +++-- crates/pile-dataset/src/index/index_fts.rs | 11 ++++-- crates/pile-dataset/src/serve/field.rs | 8 ++-- .../src/extract/item/epub/epub_meta.rs | 7 +++- .../src/extract/item/epub/epub_text.rs | 7 +++- .../pile-value/src/extract/item/epub/mod.rs | 5 ++- crates/pile-value/src/extract/item/exif.rs | 7 +++- crates/pile-value/src/extract/item/flac.rs | 39 +++++++++++++++++-- crates/pile-value/src/extract/item/fs.rs | 3 +- crates/pile-value/src/extract/item/id3.rs | 7 +++- crates/pile-value/src/extract/item/mod.rs | 8 +++- crates/pile-value/src/extract/item/pdf/mod.rs | 5 ++- .../src/extract/item/pdf/pdf_meta.rs | 8 +++- .../src/extract/item/pdf/pdf_pages.rs | 22 ++++++++--- .../src/extract/item/pdf/pdf_text.rs | 7 +++- crates/pile-value/src/extract/item/sidecar.rs | 5 ++- crates/pile-value/src/extract/item/toml.rs | 7 +++- crates/pile-value/src/extract/misc/list.rs | 13 +++++-- crates/pile-value/src/extract/misc/map.rs | 6 ++- crates/pile-value/src/extract/misc/vec.rs | 13 +++++-- crates/pile-value/src/extract/string.rs | 11 +++++- crates/pile-value/src/extract/traits.rs | 37 +++++++++++------- crates/pile-value/src/value/value.rs | 31 +++++++++------ crates/pile/src/command/annotate.rs | 8 +++- crates/pile/src/command/fields.rs | 6 ++- crates/pile/src/command/index.rs | 20 ++++++---- crates/pile/src/command/list.rs | 6 ++- crates/pile/src/command/lookup.rs | 21 ++++++---- crates/pile/src/command/probe.rs | 8 ++-- crates/pile/src/command/serve.rs | 5 ++- 30 files changed, 258 insertions(+), 93 deletions(-) diff --git a/crates/pile-dataset/src/dataset.rs b/crates/pile-dataset/src/dataset.rs index 240232e..616f775 100644 --- a/crates/pile-dataset/src/dataset.rs +++ b/crates/pile-dataset/src/dataset.rs @@ -2,6 +2,7 @@ use chrono::{DateTime, Utc}; use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath}; use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError}; use pile_value::{ + extract::traits::ExtractState, source::{DataSource, DirDataSource, S3DataSource, misc::path_ts_earliest}, value::{Item, PileValue}, }; @@ -187,6 +188,7 @@ impl Datasets { /// Returns `None` if the item or field is not found. pub async fn get_field( &self, + state: &ExtractState, source: &Label, key: &str, path: &ObjectPath, @@ -196,11 +198,11 @@ impl Datasets { }; let item = PileValue::Item(item); - let Some(value) = item.query(path).await? else { + let Some(value) = item.query(state, path).await? else { return Ok(None); }; - Ok(Some(value.to_json().await?)) + Ok(Some(value.to_json(state).await?)) } // @@ -210,6 +212,7 @@ impl Datasets { /// Refresh this dataset's fts index. pub async fn fts_refresh( &self, + state: &ExtractState, _threads: usize, flag: Option, ) -> Result<(), CancelableTaskError> { @@ -265,9 +268,10 @@ impl Datasets { let item = item_result.map_err(DatasetError::from)?; let db = Arc::clone(&db_index); + let state = state.clone(); join_set.spawn(async move { let key = item.key(); - let result = db.entry_to_document(&item).await; + let result = db.entry_to_document(&state, &item).await; (key, result) }); diff --git a/crates/pile-dataset/src/index/index_fts.rs b/crates/pile-dataset/src/index/index_fts.rs index bf70551..2b44652 100644 --- a/crates/pile-dataset/src/index/index_fts.rs +++ b/crates/pile-dataset/src/index/index_fts.rs @@ -1,5 +1,8 @@ use pile_config::{ConfigToml, DatasetFts, Label}; -use pile_value::value::{Item, PileValue}; +use pile_value::{ + extract::traits::ExtractState, + value::{Item, PileValue}, +}; use std::{path::PathBuf, sync::LazyLock}; use tantivy::{ DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError, @@ -63,6 +66,7 @@ impl DbFtsIndex { /// Turn an entry into a tantivy document pub async fn entry_to_document( &self, + state: &ExtractState, item: &Item, ) -> Result, TantivyError> { let mut doc = TantivyDocument::default(); @@ -75,7 +79,7 @@ impl DbFtsIndex { let mut empty = true; for name in self.fts_cfg().fields.keys() { - let x = self.get_field(&item, name).await?; + let x = self.get_field(state, &item, name).await?; let val = match x { Some(x) => x, @@ -99,6 +103,7 @@ impl DbFtsIndex { pub async fn get_field( &self, + state: &ExtractState, extractor: &PileValue, field_name: &Label, ) -> Result, std::io::Error> { @@ -112,7 +117,7 @@ impl DbFtsIndex { // Try paths in order, using the first value we find 'outer: for path in field.path.as_slice() { - let val = match extractor.query(path).await? { + let val = match extractor.query(state, path).await? { Some(x) => x, None => return Ok(None), }; diff --git a/crates/pile-dataset/src/serve/field.rs b/crates/pile-dataset/src/serve/field.rs index eab1dff..ef561f8 100644 --- a/crates/pile-dataset/src/serve/field.rs +++ b/crates/pile-dataset/src/serve/field.rs @@ -5,7 +5,7 @@ use axum::{ response::{IntoResponse, Response}, }; use pile_config::{Label, objectpath::ObjectPath}; -use pile_value::value::PileValue; +use pile_value::{extract::traits::ExtractState, value::PileValue}; use serde::Deserialize; use std::{sync::Arc, time::Instant}; use tracing::debug; @@ -62,8 +62,10 @@ pub async fn get_field( return StatusCode::NOT_FOUND.into_response(); }; + let state = ExtractState { ignore_mime: false }; + let item = PileValue::Item(item); - let value = match item.query(&path).await { + let value = match item.query(&state, &path).await { Ok(Some(v)) => v, Ok(None) => return StatusCode::NOT_FOUND.into_response(), Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(), @@ -90,7 +92,7 @@ pub async fn get_field( bytes.as_ref().clone(), ) .into_response(), - _ => match value.to_json().await { + _ => match value.to_json(&state).await { Ok(json) => (StatusCode::OK, Json(json)).into_response(), Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(), }, diff --git a/crates/pile-value/src/extract/item/epub/epub_meta.rs b/crates/pile-value/src/extract/item/epub/epub_meta.rs index 6956ab0..439fc4a 100644 --- a/crates/pile-value/src/extract/item/epub/epub_meta.rs +++ b/crates/pile-value/src/extract/item/epub/epub_meta.rs @@ -7,7 +7,7 @@ use std::{ use tracing::trace; use crate::{ - extract::traits::ObjectExtractor, + extract::traits::{ExtractState, ObjectExtractor}, value::{Item, PileValue, SyncReadBridge}, }; @@ -82,6 +82,7 @@ impl EpubMetaExtractor { impl ObjectExtractor for EpubMetaExtractor { async fn field( &self, + state: &ExtractState, name: &Label, args: Option<&str>, ) -> Result, std::io::Error> { @@ -89,6 +90,10 @@ impl ObjectExtractor for EpubMetaExtractor { return Ok(None); } + if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" { + return Ok(None); + } + Ok(self.get_inner().await?.get(name).cloned()) } diff --git a/crates/pile-value/src/extract/item/epub/epub_text.rs b/crates/pile-value/src/extract/item/epub/epub_text.rs index 9ff5afc..3e9c768 100644 --- a/crates/pile-value/src/extract/item/epub/epub_text.rs +++ b/crates/pile-value/src/extract/item/epub/epub_text.rs @@ -7,7 +7,7 @@ use std::{ use tracing::trace; use crate::{ - extract::traits::ObjectExtractor, + extract::traits::{ExtractState, ObjectExtractor}, value::{Item, PileValue, SyncReadBridge}, }; @@ -92,6 +92,7 @@ fn strip_html(html: &str) -> String { impl ObjectExtractor for EpubTextExtractor { async fn field( &self, + state: &ExtractState, name: &Label, args: Option<&str>, ) -> Result, std::io::Error> { @@ -99,6 +100,10 @@ impl ObjectExtractor for EpubTextExtractor { return Ok(None); } + if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" { + return Ok(None); + } + Ok(self.get_inner().await?.get(name).cloned()) } diff --git a/crates/pile-value/src/extract/item/epub/mod.rs b/crates/pile-value/src/extract/item/epub/mod.rs index 7766c19..ccf96a7 100644 --- a/crates/pile-value/src/extract/item/epub/mod.rs +++ b/crates/pile-value/src/extract/item/epub/mod.rs @@ -8,7 +8,7 @@ mod epub_text; pub use epub_text::*; use crate::{ - extract::traits::ObjectExtractor, + extract::traits::{ExtractState, ObjectExtractor}, value::{Item, PileValue}, }; @@ -30,13 +30,14 @@ impl EpubExtractor { impl ObjectExtractor for EpubExtractor { async fn field( &self, + state: &ExtractState, name: &pile_config::Label, args: Option<&str>, ) -> Result, std::io::Error> { match (name.as_str(), args) { ("text", args) => Ok(Some( self.text - .field(name, args) + .field(state, name, args) .await .map(|x| x.unwrap_or(PileValue::Null))?, )), diff --git a/crates/pile-value/src/extract/item/exif.rs b/crates/pile-value/src/extract/item/exif.rs index 2a9d5ea..78ef3ff 100644 --- a/crates/pile-value/src/extract/item/exif.rs +++ b/crates/pile-value/src/extract/item/exif.rs @@ -7,7 +7,7 @@ use std::{ use tracing::trace; use crate::{ - extract::traits::ObjectExtractor, + extract::traits::{ExtractState, ObjectExtractor}, value::{Item, PileValue, SyncReadBridge}, }; @@ -87,6 +87,7 @@ fn tag_to_label(tag: &str) -> Option