Read files incrementally
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
||||
|
||||
pub struct EpubMetaExtractor<'a> {
|
||||
item: &'a Item,
|
||||
@@ -23,34 +23,50 @@ impl<'a> EpubMetaExtractor<'a> {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let cursor = Cursor::new(bytes);
|
||||
let doc = match EpubDoc::from_reader(cursor) {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||
let doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let fields: &[&'static str] = &[
|
||||
"title",
|
||||
"creator",
|
||||
"description",
|
||||
"language",
|
||||
"publisher",
|
||||
"date",
|
||||
"subject",
|
||||
"identifier",
|
||||
];
|
||||
|
||||
let meta: Vec<(&'static str, Option<String>)> =
|
||||
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
|
||||
|
||||
Ok::<_, std::io::Error>(meta)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_meta = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||
|
||||
let fields = &[
|
||||
"title",
|
||||
"creator",
|
||||
"description",
|
||||
"language",
|
||||
"publisher",
|
||||
"date",
|
||||
"subject",
|
||||
"identifier",
|
||||
];
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for key in fields {
|
||||
let label = Label::new(*key).unwrap();
|
||||
let value = match doc.mdata(key) {
|
||||
for (key, val) in raw_meta {
|
||||
let label = Label::new(key).unwrap();
|
||||
let value = match val {
|
||||
Some(s) => PileValue::String(s.into()),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
||||
|
||||
pub struct EpubTextExtractor<'a> {
|
||||
item: &'a Item,
|
||||
@@ -23,10 +23,34 @@ impl<'a> EpubTextExtractor<'a> {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let cursor = Cursor::new(bytes);
|
||||
let mut doc = match EpubDoc::from_reader(cursor) {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
loop {
|
||||
if let Ok(content) = doc.get_current_str() {
|
||||
text_parts.push(strip_html(&content));
|
||||
}
|
||||
if doc.go_next().is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
@@ -34,21 +58,11 @@ impl<'a> EpubTextExtractor<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
loop {
|
||||
if let Ok(content) = doc.get_current_str() {
|
||||
text_parts.push(strip_html(&content));
|
||||
}
|
||||
if doc.go_next().is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let text = text_parts.join(" ");
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
let output = HashMap::from([(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::String(raw_text.into()),
|
||||
)]);
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
|
||||
Reference in New Issue
Block a user