Read files incrementally

This commit is contained in:
2026-03-10 09:18:26 -07:00
parent c02e92dd72
commit 20dc30ea18
9 changed files with 352 additions and 214 deletions

View File

@@ -1,9 +1,9 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
use tracing::debug;
use std::{collections::HashMap, sync::OnceLock};
use tracing::trace;
use crate::{Item, PileValue, extract::Extractor};
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
pub struct EpubMetaExtractor<'a> {
item: &'a Item,
@@ -23,34 +23,50 @@ impl<'a> EpubMetaExtractor<'a> {
return Ok(x);
}
let bytes = self.item.read().await?.read_to_end().await?;
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let cursor = Cursor::new(bytes);
let doc = match EpubDoc::from_reader(cursor) {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let fields: &[&'static str] = &[
"title",
"creator",
"description",
"language",
"publisher",
"date",
"subject",
"identifier",
];
let meta: Vec<(&'static str, Option<String>)> =
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
Ok::<_, std::io::Error>(meta)
})
.await
.map_err(std::io::Error::other)?;
let raw_meta = match raw_meta {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
let fields = &[
"title",
"creator",
"description",
"language",
"publisher",
"date",
"subject",
"identifier",
];
#[expect(clippy::unwrap_used)]
for key in fields {
let label = Label::new(*key).unwrap();
let value = match doc.mdata(key) {
for (key, val) in raw_meta {
let label = Label::new(key).unwrap();
let value = match val {
Some(s) => PileValue::String(s.into()),
None => PileValue::Null,
};

View File

@@ -1,9 +1,9 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
use std::{collections::HashMap, sync::OnceLock};
use tracing::debug;
use crate::{Item, PileValue, extract::Extractor};
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
pub struct EpubTextExtractor<'a> {
item: &'a Item,
@@ -23,10 +23,34 @@ impl<'a> EpubTextExtractor<'a> {
return Ok(x);
}
let bytes = self.item.read().await?.read_to_end().await?;
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let cursor = Cursor::new(bytes);
let mut doc = match EpubDoc::from_reader(cursor) {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_text = tokio::task::spawn_blocking(move || {
let mut doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut text_parts: Vec<String> = Vec::new();
loop {
if let Ok(content) = doc.get_current_str() {
text_parts.push(strip_html(&content));
}
if doc.go_next().is_err() {
break;
}
}
Ok::<_, std::io::Error>(text_parts.join(" "))
})
.await
.map_err(std::io::Error::other)?;
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
@@ -34,21 +58,11 @@ impl<'a> EpubTextExtractor<'a> {
}
};
let mut text_parts: Vec<String> = Vec::new();
loop {
if let Ok(content) = doc.get_current_str() {
text_parts.push(strip_html(&content));
}
if doc.go_next().is_err() {
break;
}
}
let text = text_parts.join(" ");
#[expect(clippy::unwrap_used)]
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
let output = HashMap::from([(
Label::new("text").unwrap(),
PileValue::String(raw_text.into()),
)]);
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]