Extractor rewrite

This commit is contained in:
2026-03-11 10:12:36 -07:00
parent b789255ea9
commit 4546a85bd3
51 changed files with 669 additions and 688 deletions

View File

@@ -0,0 +1,95 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
/// Lazily extracts a fixed set of metadata fields (title, creator, …) from an
/// item whose key ends in `.epub`.
///
/// The result is computed at most once per extractor and cached in `output`.
pub struct EpubMetaExtractor {
/// The item whose contents are read when metadata is first requested.
item: Item,
/// Cached extraction result; stays unset until the first successful lookup
/// (an empty map is cached for non-epub keys and for unparsable documents).
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubMetaExtractor {
    /// Builds an extractor over a clone of `item`.
    ///
    /// No I/O happens here; the item is only read the first time a field is
    /// requested.
    pub fn new(item: &Item) -> Self {
        let item = item.clone();
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Returns the metadata map, computing and caching it on first use.
    ///
    /// * Keys whose last `.`-separated segment is not exactly `epub` yield an
    ///   empty (cached) map without reading the item.
    /// * If the document cannot be parsed, the failure is logged at trace
    ///   level and an empty map is cached.
    /// * Otherwise every known field is present in the map; fields the
    ///   document does not define are stored as `PileValue::Null`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `Item::read`, and converts a failure to join
    /// the blocking task into an `std::io::Error`. Neither case is cached.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        // Cheap extension check so non-epub items are never opened.
        let item_key = self.item.key();
        if item_key.as_str().rsplit('.').next() != Some("epub") {
            return Ok(self.output.get_or_init(HashMap::new));
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);

        // Parsing uses a synchronous reader, so it runs on the blocking pool.
        let parsed = tokio::task::spawn_blocking(move || {
            const FIELDS: [&str; 8] = [
                "title",
                "creator",
                "description",
                "language",
                "publisher",
                "date",
                "subject",
                "identifier",
            ];

            let doc = match EpubDoc::from_reader(reader) {
                Ok(doc) => doc,
                Err(e) => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        e.to_string(),
                    ));
                }
            };

            let mut meta: Vec<(&'static str, Option<String>)> = Vec::with_capacity(FIELDS.len());
            for name in FIELDS.iter().copied() {
                meta.push((name, doc.mdata(name)));
            }
            Ok(meta)
        })
        .await
        .map_err(std::io::Error::other)?;

        let raw_meta = match parsed {
            Err(error) => {
                trace!(message = "Could not process epub", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
            Ok(meta) => meta,
        };

        // The field names above are fixed literals, so `Label::new` is expected
        // to accept all of them.
        #[expect(clippy::unwrap_used)]
        let output: HashMap<Label, PileValue> = raw_meta
            .into_iter()
            .map(|(key, val)| {
                let label = Label::new(key).unwrap();
                let value = val.map_or(PileValue::Null, |s| PileValue::String(Arc::new(s.into())));
                (label, value)
            })
            .collect();

        Ok(self.output.get_or_init(move || output))
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubMetaExtractor {
    /// Looks up a single metadata field by label.
    ///
    /// Returns `Ok(None)` when the label is not one of the extracted fields
    /// (including when the item is not an epub or could not be parsed).
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let value = extracted.get(name).cloned();
        Ok(value)
    }

    /// Lists the labels of every extracted field.
    ///
    /// The labels come from a `HashMap`, so their order is unspecified.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let labels = extracted.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -0,0 +1,105 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::debug;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue, SyncReadBridge},
};
/// Lazily extracts the text of an item whose key ends in `.epub`, exposed as
/// a single `text` field.
///
/// The result is computed at most once per extractor and cached in `output`.
pub struct EpubTextExtractor {
/// The item whose contents are read when the text is first requested.
item: Item,
/// Cached extraction result; stays unset until the first successful lookup
/// (an empty map is cached for non-epub keys and for unparsable documents).
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubTextExtractor {
    /// Builds an extractor over a clone of `item`.
    ///
    /// No I/O happens here; the item is only read the first time a field is
    /// requested.
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Returns the extracted-text map, computing and caching it on first use.
    ///
    /// * Keys whose last `.`-separated segment is not exactly `epub` yield an
    ///   empty (cached) map without reading the item.
    /// * If the document cannot be opened, the failure is logged at debug
    ///   level and an empty map is cached.
    /// * Otherwise the map holds one entry, `text`: the tag-stripped content
    ///   of every readable document section, joined with single spaces.
    ///
    /// # Errors
    ///
    /// Propagates the error from `Item::read`, and converts a failure to join
    /// the blocking task into an `std::io::Error`. Neither case is cached.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        // Cheap extension check so non-epub items are never opened.
        let key = self.item.key();
        let ext = key.as_str().rsplit('.').next();
        if !matches!(ext, Some("epub")) {
            return Ok(self.output.get_or_init(HashMap::new));
        }

        let reader = SyncReadBridge::new_current(self.item.read().await?);

        // Parsing uses a synchronous reader, so it runs on the blocking pool.
        let raw_text = tokio::task::spawn_blocking(move || {
            let mut doc = EpubDoc::from_reader(reader)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

            let mut text_parts: Vec<String> = Vec::new();
            loop {
                // Sections that cannot be read as a string are skipped rather
                // than failing the whole extraction.
                if let Ok(content) = doc.get_current_str() {
                    text_parts.push(strip_html(&content));
                }
                // `go_next` failing is treated as having reached the last section.
                if doc.go_next().is_err() {
                    break;
                }
            }

            Ok::<_, std::io::Error>(text_parts.join(" "))
        })
        .await
        .map_err(std::io::Error::other)?;

        let raw_text = match raw_text {
            Ok(x) => x,
            Err(error) => {
                debug!(message = "Could not process epub", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        // "text" is a fixed literal, so `Label::new` is expected to accept it.
        #[expect(clippy::unwrap_used)]
        let label = Label::new("text").unwrap();
        let output = HashMap::from([(label, PileValue::String(Arc::new(raw_text.into())))]);

        // `get_or_init` stores the map, or returns the already-stored one if a
        // concurrent caller got there first, without a fallible `get()`.
        Ok(self.output.get_or_init(move || output))
    }
}
/// Removes everything between `<` and `>` (inclusive) from `html` and returns
/// the remaining characters unchanged.
///
/// This is a plain character scan, not an HTML parser: character entities
/// such as `&amp;` are left as-is, a stray `>` outside a tag is dropped, and
/// an unterminated `<` discards the rest of the input.
fn strip_html(html: &str) -> String {
    let mut inside_tag = false;
    let mut text = String::with_capacity(html.len());

    for ch in html.chars() {
        if ch == '<' {
            inside_tag = true;
        } else if ch == '>' {
            inside_tag = false;
        } else if !inside_tag {
            text.push(ch);
        }
    }

    text
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor {
    /// Looks up a single field by label.
    ///
    /// Returns `Ok(None)` when the label is not present in the extracted map
    /// (including when the item is not an epub or could not be parsed).
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let value = extracted.get(name).cloned();
        Ok(value)
    }

    /// Lists the labels of every extracted field.
    ///
    /// The labels come from a `HashMap`, so their order is unspecified.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let labels = extracted.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -0,0 +1,46 @@
use pile_config::Label;
use std::sync::Arc;
mod epub_meta;
pub use epub_meta::*;
mod epub_text;
pub use epub_text::*;
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
/// Top-level epub extractor exposing two fields: `text` (the document text)
/// and `meta` (a nested extractor over the document metadata).
pub struct EpubExtractor {
/// Extractor backing the `text` field.
text: Arc<EpubTextExtractor>,
/// Extractor returned as the value of the `meta` field; held in an `Arc`
/// so a handle to it can be cloned into the returned `PileValue`.
meta: Arc<EpubMetaExtractor>,
}
impl EpubExtractor {
    /// Creates the text and metadata sub-extractors for `item`.
    ///
    /// Each sub-extractor is constructed from the same `item` reference.
    pub fn new(item: &Item) -> Self {
        let text = Arc::new(EpubTextExtractor::new(item));
        let meta = Arc::new(EpubMetaExtractor::new(item));
        Self { text, meta }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor {
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
match name.as_str() {
"text" => self.text.field(name).await,
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
_ => Ok(None),
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
])
}
}