130 lines
3.4 KiB
Rust
130 lines
3.4 KiB
Rust
use pdf::file::FileOptions;
|
|
use pdf::primitive::{Date, TimeRel};
|
|
use pile_config::Label;
|
|
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
|
use tracing::trace;
|
|
|
|
use crate::extract::ObjectExtractor;
|
|
use crate::{Item, PileValue, SyncReadBridge};
|
|
|
|
pub struct PdfMetaExtractor<'a> {
|
|
item: &'a Item,
|
|
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
|
}
|
|
|
|
impl<'a> PdfMetaExtractor<'a> {
|
|
pub fn new(item: &'a Item) -> Self {
|
|
Self {
|
|
item,
|
|
output: OnceLock::new(),
|
|
}
|
|
}
|
|
|
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
|
if let Some(x) = self.output.get() {
|
|
return Ok(x);
|
|
}
|
|
|
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
let raw_meta = tokio::task::spawn_blocking(move || {
|
|
let mut bytes = Vec::new();
|
|
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
|
|
|
let file = match FileOptions::cached().load(bytes) {
|
|
Ok(x) => x,
|
|
Err(pdf::PdfError::Io { source }) => return Err(source),
|
|
Err(error) => {
|
|
return Err(std::io::Error::new(
|
|
std::io::ErrorKind::InvalidData,
|
|
error.to_string(),
|
|
));
|
|
}
|
|
};
|
|
|
|
let page_count = file.num_pages();
|
|
|
|
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
|
|
|
|
if let Some(info) = &file.trailer.info_dict {
|
|
use pdf::primitive::PdfString;
|
|
let fields: &[(&'static str, Option<&PdfString>)] = &[
|
|
("title", info.title.as_ref()),
|
|
("author", info.author.as_ref()),
|
|
("subject", info.subject.as_ref()),
|
|
("keywords", info.keywords.as_ref()),
|
|
("creator", info.creator.as_ref()),
|
|
("producer", info.producer.as_ref()),
|
|
];
|
|
|
|
for (key, val) in fields {
|
|
meta.push((key, val.map(|s| s.to_string_lossy())));
|
|
}
|
|
|
|
meta.push((
|
|
"creation_date",
|
|
info.creation_date.as_ref().map(format_date),
|
|
));
|
|
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
|
|
}
|
|
|
|
Ok::<_, std::io::Error>((page_count, meta))
|
|
})
|
|
.await
|
|
.map_err(std::io::Error::other)?;
|
|
|
|
let (page_count, raw_meta) = match raw_meta {
|
|
Ok(x) => x,
|
|
Err(error) => {
|
|
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
|
return Ok(self.output.get_or_init(HashMap::new));
|
|
}
|
|
};
|
|
|
|
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
|
|
|
#[expect(clippy::unwrap_used)]
|
|
output.insert(
|
|
Label::new("pages").unwrap(),
|
|
PileValue::U64(page_count as u64),
|
|
);
|
|
|
|
#[expect(clippy::unwrap_used)]
|
|
for (key, val) in raw_meta {
|
|
let label = Label::new(key).unwrap();
|
|
let value = match val {
|
|
Some(s) => PileValue::String(s.into()),
|
|
None => PileValue::Null,
|
|
};
|
|
output.insert(label, value);
|
|
}
|
|
|
|
return Ok(self.output.get_or_init(|| output));
|
|
}
|
|
}
|
|
|
|
fn format_date(d: &Date) -> String {
|
|
let tz = match d.rel {
|
|
TimeRel::Universal => "Z".to_owned(),
|
|
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
|
|
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
|
|
};
|
|
format!(
|
|
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
|
|
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
|
|
)
|
|
}
|
|
|
|
#[async_trait::async_trait]
|
|
impl ObjectExtractor for PdfMetaExtractor<'_> {
|
|
async fn field<'a>(
|
|
&'a self,
|
|
name: &Label,
|
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
|
Ok(self.get_inner().await?.get(name))
|
|
}
|
|
|
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
|
}
|
|
}
|