Add ListExtractor

This commit is contained in:
2026-03-10 20:24:56 -07:00
parent 280bbcb83e
commit 48ac93c78e
22 changed files with 386 additions and 93 deletions

View File

@@ -4,7 +4,8 @@ use pile_config::Label;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
pub struct PdfMetaExtractor<'a> {
item: &'a Item,
@@ -40,6 +41,8 @@ impl<'a> PdfMetaExtractor<'a> {
}
};
let page_count = file.num_pages();
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
if let Some(info) = &file.trailer.info_dict {
@@ -64,12 +67,12 @@ impl<'a> PdfMetaExtractor<'a> {
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
}
Ok::<_, std::io::Error>(meta)
Ok::<_, std::io::Error>((page_count, meta))
})
.await
.map_err(std::io::Error::other)?;
let raw_meta = match raw_meta {
let (page_count, raw_meta) = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
@@ -79,6 +82,12 @@ impl<'a> PdfMetaExtractor<'a> {
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
#[expect(clippy::unwrap_used)]
output.insert(
Label::new("pages").unwrap(),
PileValue::U64(page_count as u64),
);
#[expect(clippy::unwrap_used)]
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
@@ -106,7 +115,7 @@ fn format_date(d: &Date) -> String {
}
#[async_trait::async_trait]
impl Extractor for PdfMetaExtractor<'_> {
impl ObjectExtractor for PdfMetaExtractor<'_> {
async fn field<'a>(
&'a self,
name: &Label,