Add epub extractor
This commit is contained in:
32
Cargo.lock
generated
32
Cargo.lock
generated
@@ -1086,6 +1086,19 @@ version = "1.0.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "epub"
|
||||||
|
version = "1.2.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "83c5ac32621967f51e8b82def1a8a86bf4f4e4ab21b6e22f3486d42121fa6581"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"percent-encoding",
|
||||||
|
"regex",
|
||||||
|
"xml-rs",
|
||||||
|
"zip",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "equivalent"
|
name = "equivalent"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
@@ -2254,6 +2267,7 @@ dependencies = [
|
|||||||
"aws-sdk-s3",
|
"aws-sdk-s3",
|
||||||
"blake3",
|
"blake3",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"epub",
|
||||||
"id3",
|
"id3",
|
||||||
"itertools 0.14.0",
|
"itertools 0.14.0",
|
||||||
"pdf",
|
"pdf",
|
||||||
@@ -4152,6 +4166,12 @@ version = "0.6.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xml-rs"
|
||||||
|
version = "0.8.28"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xmlparser"
|
name = "xmlparser"
|
||||||
version = "0.13.6"
|
version = "0.13.6"
|
||||||
@@ -4261,6 +4281,18 @@ dependencies = [
|
|||||||
"syn 2.0.117",
|
"syn 2.0.117",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "0.6.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"crc32fast",
|
||||||
|
"crossbeam-utils",
|
||||||
|
"flate2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zmij"
|
name = "zmij"
|
||||||
version = "1.0.21"
|
version = "1.0.21"
|
||||||
|
|||||||
@@ -96,6 +96,7 @@ sha2 = "0.11.0-rc.5"
|
|||||||
blake3 = "1.8.3"
|
blake3 = "1.8.3"
|
||||||
pdf = "0.10.0"
|
pdf = "0.10.0"
|
||||||
id3 = "1.16.4"
|
id3 = "1.16.4"
|
||||||
|
epub = "1.2.2"
|
||||||
|
|
||||||
# Misc helpers
|
# Misc helpers
|
||||||
thiserror = "2.0.18"
|
thiserror = "2.0.18"
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ toml = { workspace = true }
|
|||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
smartstring = { workspace = true }
|
smartstring = { workspace = true }
|
||||||
blake3 = { workspace = true }
|
blake3 = { workspace = true }
|
||||||
|
epub = { workspace = true }
|
||||||
pdf = { workspace = true }
|
pdf = { workspace = true }
|
||||||
id3 = { workspace = true }
|
id3 = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
|||||||
76
crates/pile-dataset/src/extract/epub/epub_meta.rs
Normal file
76
crates/pile-dataset/src/extract/epub/epub_meta.rs
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
use epub::doc::EpubDoc;
|
||||||
|
use pile_config::Label;
|
||||||
|
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use crate::{Item, PileValue, extract::Extractor};
|
||||||
|
|
||||||
|
pub struct EpubMetaExtractor<'a> {
|
||||||
|
item: &'a Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> EpubMetaExtractor<'a> {
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item,
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = self.item.read().await?.read_to_end().await?;
|
||||||
|
|
||||||
|
let cursor = Cursor::new(bytes);
|
||||||
|
let doc = match EpubDoc::from_reader(cursor) {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(error) => {
|
||||||
|
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||||
|
|
||||||
|
let fields = &[
|
||||||
|
"title",
|
||||||
|
"creator",
|
||||||
|
"description",
|
||||||
|
"language",
|
||||||
|
"publisher",
|
||||||
|
"date",
|
||||||
|
"subject",
|
||||||
|
"identifier",
|
||||||
|
];
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
for key in fields {
|
||||||
|
let label = Label::new(*key).unwrap();
|
||||||
|
let value = match doc.mdata(key) {
|
||||||
|
Some(s) => PileValue::String(s.into()),
|
||||||
|
None => PileValue::Null,
|
||||||
|
};
|
||||||
|
output.insert(label, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(self.output.get_or_init(|| output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Extractor for EpubMetaExtractor<'_> {
|
||||||
|
async fn field<'a>(
|
||||||
|
&'a self,
|
||||||
|
name: &Label,
|
||||||
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.get(name))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
88
crates/pile-dataset/src/extract/epub/epub_text.rs
Normal file
88
crates/pile-dataset/src/extract/epub/epub_text.rs
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
use epub::doc::EpubDoc;
|
||||||
|
use pile_config::Label;
|
||||||
|
use std::{collections::HashMap, io::Cursor, sync::OnceLock};
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use crate::{Item, PileValue, extract::Extractor};
|
||||||
|
|
||||||
|
pub struct EpubTextExtractor<'a> {
|
||||||
|
item: &'a Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> EpubTextExtractor<'a> {
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item,
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = self.item.read().await?.read_to_end().await?;
|
||||||
|
|
||||||
|
let cursor = Cursor::new(bytes);
|
||||||
|
let mut doc = match EpubDoc::from_reader(cursor) {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(error) => {
|
||||||
|
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut text_parts: Vec<String> = Vec::new();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Ok(content) = doc.get_current_str() {
|
||||||
|
text_parts.push(strip_html(&content));
|
||||||
|
}
|
||||||
|
if doc.go_next().is_err() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let text = text_parts.join(" ");
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||||
|
|
||||||
|
let _ = self.output.set(output);
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
return Ok(self.output.get().unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Strip HTML/XHTML tags from a string, leaving only text nodes.
///
/// Everything from a `<` up to the next `>` is treated as a tag and dropped.
/// A `>` that appears while no tag is open is ordinary text and is kept.
///
/// This is a plain character scan, not a parser: character references such
/// as `&amp;` are passed through unchanged, and a `>` inside a comment or an
/// attribute value still ends the tag.
fn strip_html(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;

    for c in html.chars() {
        match (c, in_tag) {
            ('<', _) => in_tag = true,
            // Only a `>` that actually closes an open tag is markup.
            ('>', true) => in_tag = false,
            (_, true) => {}
            (_, false) => result.push(c),
        }
    }

    result
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Extractor for EpubTextExtractor<'_> {
|
||||||
|
async fn field<'a>(
|
||||||
|
&'a self,
|
||||||
|
name: &Label,
|
||||||
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.get(name))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
|
mod epub_meta;
|
||||||
|
pub use epub_meta::*;
|
||||||
|
|
||||||
|
mod epub_text;
|
||||||
|
pub use epub_text::*;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
Item, PileValue,
|
||||||
|
extract::{Extractor, MapExtractor},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct EpubExtractor<'a> {
|
||||||
|
inner: MapExtractor<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> EpubExtractor<'a> {
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
let inner = MapExtractor {
|
||||||
|
inner: HashMap::from([
|
||||||
|
(
|
||||||
|
Label::new("text").unwrap(),
|
||||||
|
PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Label::new("meta").unwrap(),
|
||||||
|
PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))),
|
||||||
|
),
|
||||||
|
]),
|
||||||
|
};
|
||||||
|
|
||||||
|
Self { inner }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Extractor for EpubExtractor<'_> {
|
||||||
|
async fn field<'a>(
|
||||||
|
&'a self,
|
||||||
|
name: &pile_config::Label,
|
||||||
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
if name.as_str() == "text" {
|
||||||
|
match self.inner.inner.get(name).unwrap() {
|
||||||
|
PileValue::Extractor(x) => return x.field(name).await,
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
self.inner.field(name).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(vec![
|
||||||
|
Label::new("text").unwrap(),
|
||||||
|
Label::new("meta").unwrap(),
|
||||||
|
])
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -10,6 +10,9 @@ pub use id3::*;
|
|||||||
mod fs;
|
mod fs;
|
||||||
pub use fs::*;
|
pub use fs::*;
|
||||||
|
|
||||||
|
mod epub;
|
||||||
|
pub use epub::*;
|
||||||
|
|
||||||
mod pdf;
|
mod pdf;
|
||||||
pub use pdf::*;
|
pub use pdf::*;
|
||||||
|
|
||||||
@@ -69,6 +72,10 @@ impl<'a> MetaExtractor<'a> {
|
|||||||
Label::new("fs").unwrap(),
|
Label::new("fs").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
|
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
Label::new("epub").unwrap(),
|
||||||
|
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
||||||
@@ -103,6 +110,7 @@ impl Extractor for MetaExtractor<'_> {
|
|||||||
Label::new("flac").unwrap(),
|
Label::new("flac").unwrap(),
|
||||||
Label::new("id3").unwrap(),
|
Label::new("id3").unwrap(),
|
||||||
Label::new("fs").unwrap(),
|
Label::new("fs").unwrap(),
|
||||||
|
Label::new("epub").unwrap(),
|
||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
Label::new("sidecar").unwrap(),
|
Label::new("sidecar").unwrap(),
|
||||||
]);
|
]);
|
||||||
|
|||||||
Reference in New Issue
Block a user