Add discovery subcommands list and fields
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
|
||||
mod epub_meta;
|
||||
pub use epub_meta::*;
|
||||
@@ -7,39 +7,30 @@ pub use epub_meta::*;
|
||||
mod epub_text;
|
||||
pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
Item, PileValue,
|
||||
extract::{MapExtractor, ObjectExtractor},
|
||||
};
|
||||
use crate::{Item, PileValue, extract::ObjectExtractor};
|
||||
|
||||
pub struct EpubExtractor {
|
||||
inner: MapExtractor,
|
||||
text: Arc<EpubTextExtractor>,
|
||||
meta: Arc<EpubMetaExtractor>,
|
||||
}
|
||||
|
||||
impl EpubExtractor {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(EpubTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(EpubMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
Self {
|
||||
text: Arc::new(EpubTextExtractor::new(item)),
|
||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for EpubExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
self.inner.field(name).await
|
||||
match name.as_str() {
|
||||
"text" => self.text.field(name).await,
|
||||
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::{
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::debug;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||
|
||||
@@ -51,7 +51,7 @@ impl ExifExtractor {
|
||||
let raw_fields = match raw_fields {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
mod pdf_cover;
|
||||
@@ -17,40 +17,26 @@ pub use pdf_meta::*;
|
||||
mod pdf_text;
|
||||
pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
Item, PileValue,
|
||||
extract::{MapExtractor, ObjectExtractor},
|
||||
};
|
||||
use crate::{Item, PileValue, extract::ObjectExtractor};
|
||||
|
||||
pub struct PdfExtractor {
|
||||
inner: MapExtractor,
|
||||
text: Arc<PdfTextExtractor>,
|
||||
meta: Arc<PdfMetaExtractor>,
|
||||
#[cfg(feature = "pdfium")]
|
||||
cover: Arc<PdfCoverExtractor>,
|
||||
#[cfg(feature = "pdfium")]
|
||||
pages: Arc<PdfPagesExtractor>,
|
||||
}
|
||||
|
||||
impl PdfExtractor {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let mut inner_map = HashMap::new();
|
||||
inner_map.insert(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(PdfTextExtractor::new(item))),
|
||||
);
|
||||
inner_map.insert(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(PdfMetaExtractor::new(item))),
|
||||
);
|
||||
#[cfg(feature = "pdfium")]
|
||||
inner_map.insert(
|
||||
Label::new("cover").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(PdfCoverExtractor::new(item))),
|
||||
);
|
||||
#[cfg(feature = "pdfium")]
|
||||
inner_map.insert(
|
||||
Label::new("pages").unwrap(),
|
||||
PileValue::ListExtractor(Arc::new(PdfPagesExtractor::new(item))),
|
||||
);
|
||||
|
||||
Self {
|
||||
inner: MapExtractor { inner: inner_map },
|
||||
text: Arc::new(PdfTextExtractor::new(item)),
|
||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||
#[cfg(feature = "pdfium")]
|
||||
cover: Arc::new(PdfCoverExtractor::new(item)),
|
||||
#[cfg(feature = "pdfium")]
|
||||
pages: Arc::new(PdfPagesExtractor::new(item)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -58,7 +44,15 @@ impl PdfExtractor {
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
self.inner.field(name).await
|
||||
match name.as_str() {
|
||||
"text" => self.text.field(name).await,
|
||||
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
#[cfg(feature = "pdfium")]
|
||||
"cover" => self.cover.field(name).await,
|
||||
#[cfg(feature = "pdfium")]
|
||||
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
|
||||
@@ -38,7 +38,14 @@ impl TomlExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let mut reader = match self.item.read().await {
|
||||
Ok(r) => r,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
let bytes = reader.read_to_end().await?;
|
||||
let toml: toml::Value = match toml::from_slice(&bytes) {
|
||||
Ok(x) => x,
|
||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
|
||||
@@ -109,6 +109,45 @@ impl PileValue {
|
||||
return Ok(out.clone());
|
||||
}
|
||||
|
||||
/// Like `to_json`, but counts populated fields instead of collecting values.
|
||||
///
|
||||
/// - Leaf values (non-null scalars, blobs, non-empty arrays) contribute `Some(1)`.
|
||||
/// - `Null` contributes `None`.
|
||||
/// - `ObjectExtractor` is recursed into; returns `Some(Object(map))` with
|
||||
/// only the fields that had data, or `None` if all fields were absent.
|
||||
/// - `Array` / `ListExtractor` are treated as opaque leaf values (not descended into); an empty one contributes `None`.
|
||||
pub async fn count_fields(&self) -> Result<Option<Value>, std::io::Error> {
|
||||
Ok(match self {
|
||||
Self::Null => None,
|
||||
|
||||
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
|
||||
Some(Value::Number(1u64.into()))
|
||||
}
|
||||
|
||||
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
||||
Self::ListExtractor(x) => (!x.is_empty().await?).then(|| Value::Number(1u64.into())),
|
||||
|
||||
Self::ObjectExtractor(e) => {
|
||||
let keys = e.fields().await?;
|
||||
let mut map = Map::new();
|
||||
for k in &keys {
|
||||
let v = match e.field(k).await? {
|
||||
Some(x) => x,
|
||||
None => continue,
|
||||
};
|
||||
if let Some(counted) = Box::pin(v.count_fields()).await? {
|
||||
map.insert(k.to_string(), counted);
|
||||
}
|
||||
}
|
||||
if map.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Value::Object(map))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> Option<&str> {
|
||||
match self {
|
||||
Self::String(x) => Some(x),
|
||||
|
||||
Reference in New Issue
Block a user