Add epub extractor
This commit is contained in:
76
crates/pile-dataset/src/extract/epub/epub_meta.rs
Normal file
76
crates/pile-dataset/src/extract/epub/epub_meta.rs
Normal file
@@ -0,0 +1,76 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct EpubMetaExtractor<'a> {
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> EpubMetaExtractor<'a> {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let cursor = Cursor::new(bytes);
|
||||
let doc = match EpubDoc::from_reader(cursor) {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||
|
||||
let fields = &[
|
||||
"title",
|
||||
"creator",
|
||||
"description",
|
||||
"language",
|
||||
"publisher",
|
||||
"date",
|
||||
"subject",
|
||||
"identifier",
|
||||
];
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for key in fields {
|
||||
let label = Label::new(*key).unwrap();
|
||||
let value = match doc.mdata(key) {
|
||||
Some(s) => PileValue::String(s.into()),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
output.insert(label, value);
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for EpubMetaExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
88
crates/pile-dataset/src/extract/epub/epub_text.rs
Normal file
88
crates/pile-dataset/src/extract/epub/epub_text.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct EpubTextExtractor<'a> {
|
||||
item: &'a Item,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||
}
|
||||
|
||||
impl<'a> EpubTextExtractor<'a> {
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let cursor = Cursor::new(bytes);
|
||||
let mut doc = match EpubDoc::from_reader(cursor) {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
loop {
|
||||
if let Ok(content) = doc.get_current_str() {
|
||||
text_parts.push(strip_html(&content));
|
||||
}
|
||||
if doc.go_next().is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let text = text_parts.join(" ");
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
/// Strip HTML/XHTML tags from a string, leaving only text nodes.
|
||||
fn strip_html(html: &str) -> String {
|
||||
let mut result = String::with_capacity(html.len());
|
||||
let mut in_tag = false;
|
||||
|
||||
for c in html.chars() {
|
||||
match c {
|
||||
'<' => in_tag = true,
|
||||
'>' => in_tag = false,
|
||||
_ if !in_tag => result.push(c),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for EpubTextExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
mod epub_meta;
|
||||
pub use epub_meta::*;
|
||||
|
||||
mod epub_text;
|
||||
pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
Item, PileValue,
|
||||
extract::{Extractor, MapExtractor},
|
||||
};
|
||||
|
||||
pub struct EpubExtractor<'a> {
|
||||
inner: MapExtractor<'a>,
|
||||
}
|
||||
|
||||
impl<'a> EpubExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for EpubExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
if name.as_str() == "text" {
|
||||
match self.inner.inner.get(name).unwrap() {
|
||||
PileValue::Extractor(x) => return x.field(name).await,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user