Read files incrementally

This commit is contained in:
2026-03-10 09:18:26 -07:00
parent c02e92dd72
commit 20dc30ea18
9 changed files with 352 additions and 214 deletions

View File

@@ -1,10 +1,10 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, PdfString, TimeRel};
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
use tracing::trace;
use crate::{Item, PileValue, extract::Extractor};
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
pub struct PdfMetaExtractor<'a> {
item: &'a Item,
@@ -24,56 +24,69 @@ impl<'a> PdfMetaExtractor<'a> {
return Ok(x);
}
let bytes = self.item.read().await?.read_to_end().await?;
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let mut bytes = Vec::new();
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
let file = match FileOptions::cached().load(bytes) {
let file = match FileOptions::cached().load(bytes) {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
error.to_string(),
));
}
};
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
if let Some(info) = &file.trailer.info_dict {
use pdf::primitive::PdfString;
let fields: &[(&'static str, Option<&PdfString>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
("keywords", info.keywords.as_ref()),
("creator", info.creator.as_ref()),
("producer", info.producer.as_ref()),
];
for (key, val) in fields {
meta.push((key, val.map(|s| s.to_string_lossy())));
}
meta.push((
"creation_date",
info.creation_date.as_ref().map(format_date),
));
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
}
Ok::<_, std::io::Error>(meta)
})
.await
.map_err(std::io::Error::other)?;
let raw_meta = match raw_meta {
Ok(x) => x,
Err(pdf::PdfError::Io { source }) => return Err(source),
Err(error) => {
debug!(message = "Could not process pdf", ?error, key = ?self.item.key());
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
if let Some(info) = &file.trailer.info_dict {
let fields: &[(&str, Option<&PdfString>)] = &[
("title", info.title.as_ref()),
("author", info.author.as_ref()),
("subject", info.subject.as_ref()),
("keywords", info.keywords.as_ref()),
("creator", info.creator.as_ref()),
("producer", info.producer.as_ref()),
];
#[expect(clippy::unwrap_used)]
for (key, val) in fields {
let label = Label::new(*key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.to_string_lossy().into()),
None => PileValue::Null,
};
output.insert(label, value);
}
#[expect(clippy::unwrap_used)]
{
output.insert(
Label::new("creation_date").unwrap(),
info.creation_date
.as_ref()
.map(|d| PileValue::String(format_date(d).into()))
.unwrap_or(PileValue::Null),
);
output.insert(
Label::new("mod_date").unwrap(),
info.mod_date
.as_ref()
.map(|d| PileValue::String(format_date(d).into()))
.unwrap_or(PileValue::Null),
);
}
#[expect(clippy::unwrap_used)]
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.into()),
None => PileValue::Null,
};
output.insert(label, value);
}
return Ok(self.output.get_or_init(|| output));