Filter by mime

This commit is contained in:
2026-03-15 10:20:15 -07:00
parent 8041fc7531
commit 979fbb9b0d
30 changed files with 258 additions and 93 deletions

View File

@@ -7,7 +7,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -82,6 +82,7 @@ impl EpubMetaExtractor {
impl ObjectExtractor for EpubMetaExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -89,6 +90,10 @@ impl ObjectExtractor for EpubMetaExtractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -7,7 +7,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -92,6 +92,7 @@ fn strip_html(html: &str) -> String {
impl ObjectExtractor for EpubTextExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -99,6 +100,10 @@ impl ObjectExtractor for EpubTextExtractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -8,7 +8,7 @@ mod epub_text;
pub use epub_text::*;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
@@ -30,13 +30,14 @@ impl EpubExtractor {
impl ObjectExtractor for EpubExtractor {
async fn field(
&self,
state: &ExtractState,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
match (name.as_str(), args) {
("text", args) => Ok(Some(
self.text
.field(name, args)
.field(state, name, args)
.await
.map(|x| x.unwrap_or(PileValue::Null))?,
)),

View File

@@ -7,7 +7,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -87,6 +87,7 @@ fn tag_to_label(tag: &str) -> Option<Label> {
impl ObjectExtractor for ExifExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -100,6 +101,10 @@ impl ObjectExtractor for ExifExtractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().type_() != mime::IMAGE {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -9,7 +9,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::{ListExtractor, ObjectExtractor},
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -46,16 +46,32 @@ impl FlacImagesExtractor {
return Ok(count);
}
/// Reports whether this extractor should operate on its item.
///
/// Always `true` when `state.ignore_mime` is set. Otherwise `true` only if the
/// item's MIME essence is `audio/flac` or `audio/x-flac`.
fn mime_ok(&self, state: &ExtractState) -> bool {
    // `||` short-circuits, so the item's MIME type is only inspected
    // when the caller has not asked to ignore it.
    state.ignore_mime
        || matches!(
            self.item.mime().essence_str(),
            "audio/flac" | "audio/x-flac"
        )
}
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor {
async fn get<'a>(&'a self, mut idx: usize) -> Result<Option<PileValue>, std::io::Error> {
async fn get(
&self,
state: &ExtractState,
mut idx: usize,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
key = self.item.key().as_str(),
"Getting index {idx} from FlacImagesExtractor",
);
if !self.mime_ok(state) {
return Ok(None);
}
let key = self.item.key();
let reader = SyncReadBridge::new_current(self.item.read().await?);
let image = tokio::task::spawn_blocking(move || {
@@ -98,7 +114,11 @@ impl ListExtractor for FlacImagesExtractor {
}))
}
async fn len(&self) -> Result<usize, std::io::Error> {
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
if !self.mime_ok(state) {
return Ok(0);
}
if let Some(x) = self.cached_count.get() {
return Ok(*x);
}
@@ -178,12 +198,21 @@ impl FlacExtractor {
return Ok(self.output.get_or_init(|| output));
}
/// Decides whether FLAC extraction applies to this item.
///
/// Returns `true` unconditionally when `state.ignore_mime` is set; otherwise
/// the item's MIME essence must be `audio/flac` or `audio/x-flac`.
fn mime_ok(&self, state: &ExtractState) -> bool {
    // Both spellings of the FLAC MIME essence are accepted.
    let is_flac = || {
        let mime_essence = self.item.mime().essence_str();
        ["audio/flac", "audio/x-flac"].contains(&mime_essence)
    };

    // The closure only runs when the MIME filter is active.
    state.ignore_mime || is_flac()
}
}
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -191,6 +220,10 @@ impl ObjectExtractor for FlacExtractor {
return Ok(None);
}
if !self.mime_ok(state) {
return Ok(None);
}
if name.as_str() == "images" {
return Ok(Some(self.images.clone()));
}

View File

@@ -1,5 +1,5 @@
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
use pile_config::Label;
@@ -88,6 +88,7 @@ impl FsExtractor {
impl ObjectExtractor for FsExtractor {
async fn field(
&self,
_state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {

View File

@@ -9,7 +9,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -125,6 +125,7 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
impl ObjectExtractor for Id3Extractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -132,6 +133,10 @@ impl ObjectExtractor for Id3Extractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "audio/mpeg" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -26,7 +26,10 @@ mod sidecar;
pub use sidecar::*;
use crate::{
extract::{misc::MapExtractor, traits::ObjectExtractor},
extract::{
misc::MapExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::{Item, PileValue},
};
@@ -82,10 +85,11 @@ impl ItemExtractor {
impl ObjectExtractor for ItemExtractor {
async fn field(
&self,
state: &ExtractState,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
self.inner.field(name, args).await
self.inner.field(state, name, args).await
}
#[expect(clippy::unwrap_used)]

View File

@@ -14,7 +14,7 @@ mod pdf_text;
pub use pdf_text::*;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
@@ -40,6 +40,7 @@ impl PdfExtractor {
impl ObjectExtractor for PdfExtractor {
async fn field(
&self,
state: &ExtractState,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -50,7 +51,7 @@ impl ObjectExtractor for PdfExtractor {
);
match (name.as_str(), args) {
("text", args) => self.text.field(name, args).await,
("text", args) => self.text.field(state, name, args).await,
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
#[cfg(feature = "pdfium")]
("pages", None) => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),

View File

@@ -9,7 +9,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -124,12 +124,18 @@ fn format_date(d: &Date) -> String {
impl ObjectExtractor for PdfMetaExtractor {
/// Looks up the field called `name` for this item.
///
/// Returns `Ok(None)` without reading anything when `args` is supplied, or
/// when `state.ignore_mime` is unset and the item's MIME essence is not
/// `application/pdf`. Otherwise the entry for `name` is cloned out of
/// whatever `get_inner()` yields; errors from `get_inner()` are propagated.
async fn field(
    &self,
    state: &ExtractState,
    name: &Label,
    args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
    // This extractor takes no field arguments.
    if args.is_some() {
        return Ok(None);
    }

    // Short-circuit keeps the MIME lookup from running when filtering is disabled.
    let accepted = state.ignore_mime || self.item.mime().essence_str() == "application/pdf";
    if !accepted {
        return Ok(None);
    }

    let fields = self.get_inner().await?;
    Ok(fields.get(name).cloned())
}

View File

@@ -7,7 +7,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ListExtractor,
extract::traits::{ExtractState, ListExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -34,12 +34,20 @@ impl PdfPagesExtractor {
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
async fn get(
&self,
state: &ExtractState,
idx: usize,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
key = self.item.key().as_str(),
"Getting index {idx} from PdfPagesExtractor",
);
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(None);
}
let bytes = self.get_bytes().await?;
let png = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
@@ -81,7 +89,11 @@ impl ListExtractor for PdfPagesExtractor {
Ok(Some(value))
}
async fn len(&self) -> Result<usize, std::io::Error> {
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(0);
}
let bytes = self.get_bytes().await?;
let count = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
@@ -103,10 +115,10 @@ impl ListExtractor for PdfPagesExtractor {
// Override, extracting all pages is very slow,
// and we can't display binary in json anyway
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
Ok(serde_json::Value::String(format!(
"<PdfPages ({} pages)>",
self.len().await?
self.len(state).await?
)))
}
}

View File

@@ -9,7 +9,7 @@ use std::{
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue, SyncReadBridge},
};
@@ -104,6 +104,7 @@ impl PdfTextExtractor {
impl ObjectExtractor for PdfTextExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -111,6 +112,10 @@ impl ObjectExtractor for PdfTextExtractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -4,7 +4,7 @@ use tracing::trace;
use super::TomlExtractor;
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{Item, PileValue},
};
@@ -26,6 +26,7 @@ impl SidecarExtractor {
impl ObjectExtractor for SidecarExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -39,7 +40,7 @@ impl ObjectExtractor for SidecarExtractor {
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
{
Some(x) => Ok(x.field(name, args).await?),
Some(x) => Ok(x.field(state, name, args).await?),
None => Ok(Some(PileValue::Null)),
}
}

View File

@@ -5,7 +5,7 @@ use std::{
};
use crate::{
extract::traits::ObjectExtractor,
extract::traits::{ExtractState, ObjectExtractor},
value::{AsyncReader, Item, PileValue},
};
@@ -64,6 +64,7 @@ impl TomlExtractor {
impl ObjectExtractor for TomlExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
@@ -71,6 +72,10 @@ impl ObjectExtractor for TomlExtractor {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().type_() != mime::TEXT {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}