Add epub extractor

This commit is contained in:
2026-03-09 22:34:39 -07:00
parent aecc84233b
commit ad41a8abbd
7 changed files with 269 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
use tracing::debug;
use crate::{Item, PileValue, extract::Extractor};
/// Lazily extracts metadata fields (title, creator, ...) from an epub [`Item`].
///
/// Nothing is read when the extractor is constructed; the item is parsed the
/// first time a field is requested and the result is cached in `output`.
pub struct EpubMetaExtractor<'a> {
/// The item whose bytes are parsed as an epub document.
item: &'a Item,
/// Cache of the extracted fields; left empty if the item is not a readable epub.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> EpubMetaExtractor<'a> {
    /// Build an extractor for `item`. The item is not read until a field is requested.
    pub fn new(item: &'a Item) -> Self {
        let output = OnceLock::new();
        Self { item, output }
    }

    /// Return the cached metadata map, computing it on first use.
    ///
    /// The whole item is read into memory and parsed as an epub. If parsing
    /// fails, the failure is logged at debug level and an empty map is cached
    /// (this is not reported as an error). Otherwise every name in `FIELDS`
    /// gets an entry: a string when the document provides that metadata and
    /// `PileValue::Null` when it does not.
    ///
    /// # Errors
    /// Propagates I/O errors raised while reading the item.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        /// Metadata names looked up in the document.
        const FIELDS: [&str; 8] = [
            "title",
            "creator",
            "description",
            "language",
            "publisher",
            "date",
            "subject",
            "identifier",
        ];

        // Fast path: already extracted.
        if let Some(cached) = self.output.get() {
            return Ok(cached);
        }

        let raw = self.item.read().await?.read_to_end().await?;
        let parsed = EpubDoc::from_reader(Cursor::new(raw));
        let doc = match parsed {
            Ok(doc) => doc,
            Err(error) => {
                debug!(message = "Could not process epub", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        // Every known field is present in the map; absent metadata maps to Null.
        #[expect(clippy::unwrap_used)]
        let metadata: HashMap<Label, PileValue<'a>> = FIELDS
            .iter()
            .map(|name| {
                let value = match doc.mdata(name) {
                    None => PileValue::Null,
                    Some(s) => PileValue::String(s.into()),
                };
                (Label::new(*name).unwrap(), value)
            })
            .collect();

        Ok(self.output.get_or_init(|| metadata))
    }
}
#[async_trait::async_trait]
impl Extractor for EpubMetaExtractor<'_> {
    /// Look up a single metadata field by label.
    ///
    /// Returns `Ok(None)` when the label is not one of the extracted fields.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let metadata = self.get_inner().await?;
        Ok(metadata.get(name))
    }

    /// List the labels of all extracted metadata fields.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let metadata = self.get_inner().await?;
        let labels: Vec<Label> = metadata.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -0,0 +1,88 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
use tracing::debug;
use crate::{Item, PileValue, extract::Extractor};
/// Lazily extracts the plain text of an epub [`Item`].
///
/// Nothing is read when the extractor is constructed; the item is parsed the
/// first time a field is requested and the result is cached in `output`.
pub struct EpubTextExtractor<'a> {
/// The item whose bytes are parsed as an epub document.
item: &'a Item,
/// Cache of the extracted fields; left empty if the item is not a readable epub.
output: OnceLock<HashMap<Label, PileValue<'a>>>,
}
impl<'a> EpubTextExtractor<'a> {
    /// Build an extractor for `item`. The item is not read until a field is requested.
    pub fn new(item: &'a Item) -> Self {
        Self {
            item,
            output: OnceLock::new(),
        }
    }

    /// Return the cached field map, computing it on first use.
    ///
    /// The whole item is read into memory and parsed as an epub. If parsing
    /// fails, the failure is logged at debug level and an empty map is cached
    /// (this is not reported as an error). Otherwise the map holds a single
    /// `text` entry: the tag-stripped content of every readable section,
    /// joined with single spaces. Sections whose content cannot be read are
    /// skipped.
    ///
    /// # Errors
    /// Propagates I/O errors raised while reading the item.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
        // Fast path: already extracted.
        if let Some(x) = self.output.get() {
            return Ok(x);
        }

        let bytes = self.item.read().await?.read_to_end().await?;
        let cursor = Cursor::new(bytes);
        let mut doc = match EpubDoc::from_reader(cursor) {
            Ok(x) => x,
            Err(error) => {
                debug!(message = "Could not process epub", ?error, key = ?self.item.key());
                return Ok(self.output.get_or_init(HashMap::new));
            }
        };

        // Walk the document from its current section until `go_next` fails.
        let mut text_parts: Vec<String> = Vec::new();
        loop {
            if let Ok(content) = doc.get_current_str() {
                text_parts.push(strip_html(&content));
            }
            if doc.go_next().is_err() {
                break;
            }
        }
        let text = text_parts.join(" ");

        #[expect(clippy::unwrap_used)]
        let label = Label::new("text").unwrap();
        let output = HashMap::from([(label, PileValue::String(text.into()))]);

        // `get_or_init` stores our value, or hands back one stored by a
        // concurrent caller, without a fallible `get()` afterwards.
        Ok(self.output.get_or_init(|| output))
    }
}
/// Strip HTML/XHTML tags from a string, leaving only text nodes.
///
/// Everything between `<` and `>` is dropped. Character references in the
/// remaining text are decoded: the predefined XML entities (`&amp;`, `&lt;`,
/// `&gt;`, `&quot;`, `&apos;`), `&nbsp;`, and numeric references such as
/// `&#8217;` or `&#x2019;`. Anything that is not a recognised reference is
/// kept verbatim. Decoding happens after tag removal, so `&lt;b&gt;` yields
/// the literal text `<b>` rather than being treated as a tag.
fn strip_html(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut rest = html;

    while let Some(c) = rest.chars().next() {
        // Bytes to advance by; a decoded reference consumes more than one char.
        let mut consumed = c.len_utf8();
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            '&' => match decode_char_ref(rest) {
                Some((decoded, len)) => {
                    result.push(decoded);
                    consumed = len;
                }
                // Not a reference we understand: keep the ampersand as-is.
                None => result.push('&'),
            },
            _ => result.push(c),
        }
        rest = &rest[consumed..];
    }

    result
}

/// Decode the character reference at the start of `s` (which must begin with `&`).
///
/// Returns the decoded character and the number of bytes the reference
/// occupies (including `&` and `;`), or `None` if `s` does not start with a
/// supported, well-formed reference.
fn decode_char_ref(s: &str) -> Option<(char, usize)> {
    // Longest supported reference (`&#x10FFFF;`) is 10 bytes; capping the
    // search keeps a stray `&` from scanning the rest of the document.
    const MAX_REF_LEN: usize = 12;

    let end = s.bytes().take(MAX_REF_LEN).position(|b| b == b';')?;
    let name = s.get(1..end)?;

    let decoded = match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => '\u{a0}',
        _ => {
            let digits = name.strip_prefix('#')?;
            let (digits, radix) = match digits.strip_prefix(['x', 'X']) {
                Some(hex) => (hex, 16),
                None => (digits, 10),
            };
            // Reject empty input and signs, which `from_str_radix` would accept.
            if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
                return None;
            }
            let code = u32::from_str_radix(digits, radix).ok()?;
            char::from_u32(code).filter(|c| *c != '\0')?
        }
    };

    Some((decoded, end + 1))
}
#[async_trait::async_trait]
impl Extractor for EpubTextExtractor<'_> {
    /// Look up a single extracted field by label.
    ///
    /// Returns `Ok(None)` when the label is not among the extracted fields.
    async fn field<'a>(
        &'a self,
        name: &Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        let extracted = self.get_inner().await?;
        Ok(extracted.get(name))
    }

    /// List the labels of all extracted fields.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let extracted = self.get_inner().await?;
        let labels: Vec<Label> = extracted.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -0,0 +1,63 @@
use pile_config::Label;
use std::{collections::HashMap, sync::Arc};
mod epub_meta;
pub use epub_meta::*;
mod epub_text;
pub use epub_text::*;
use crate::{
Item, PileValue,
extract::{Extractor, MapExtractor},
};
/// Top-level epub extractor for an [`Item`].
///
/// Wraps a [`MapExtractor`] holding two sub-extractors: `text`
/// ([`EpubTextExtractor`]) and `meta` ([`EpubMetaExtractor`]).
pub struct EpubExtractor<'a> {
/// Map from the `text` / `meta` labels to their sub-extractors.
inner: MapExtractor<'a>,
}
impl<'a> EpubExtractor<'a> {
    /// Build the epub extractor for `item`, registering the `text` and `meta`
    /// sub-extractors. Neither sub-extractor reads the item at construction.
    #[expect(clippy::unwrap_used)]
    pub fn new(item: &'a Item) -> Self {
        let text = PileValue::Extractor(Arc::new(EpubTextExtractor::new(item)));
        let meta = PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item)));

        let mut children = HashMap::new();
        children.insert(Label::new("text").unwrap(), text);
        children.insert(Label::new("meta").unwrap(), meta);

        Self {
            inner: MapExtractor { inner: children },
        }
    }
}
#[async_trait::async_trait]
impl Extractor for EpubExtractor<'_> {
    /// Resolve a field of the epub extractor.
    ///
    /// `text` is flattened: the request is forwarded to the text
    /// sub-extractor's own `text` field, so callers receive that field's
    /// value directly instead of the nested extractor. Every other label is
    /// resolved by the inner map.
    async fn field<'a>(
        &'a self,
        name: &pile_config::Label,
    ) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
        if name.as_str() != "text" {
            return self.inner.field(name).await;
        }

        // `new` always registers `text` as an extractor, so both the lookup
        // and the variant match are invariants of this type.
        #[expect(clippy::unwrap_used)]
        let text_entry = self.inner.inner.get(name).unwrap();
        match text_entry {
            PileValue::Extractor(x) => x.field(name).await,
            _ => unreachable!(),
        }
    }

    /// List the labels this extractor provides: `text` and `meta`.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let text = Label::new("text").unwrap();
        let meta = Label::new("meta").unwrap();
        Ok(vec![text, meta])
    }
}

View File

@@ -10,6 +10,9 @@ pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod pdf;
pub use pdf::*;
@@ -69,6 +72,10 @@ impl<'a> MetaExtractor<'a> {
Label::new("fs").unwrap(),
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("epub").unwrap(),
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
@@ -103,6 +110,7 @@ impl Extractor for MetaExtractor<'_> {
Label::new("flac").unwrap(),
Label::new("id3").unwrap(),
Label::new("fs").unwrap(),
Label::new("epub").unwrap(),
Label::new("pdf").unwrap(),
Label::new("sidecar").unwrap(),
]);