diff --git a/Cargo.lock b/Cargo.lock index 2623661..94a3817 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1086,6 +1086,19 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "epub" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83c5ac32621967f51e8b82def1a8a86bf4f4e4ab21b6e22f3486d42121fa6581" +dependencies = [ + "anyhow", + "percent-encoding", + "regex", + "xml-rs", + "zip", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -2254,6 +2267,7 @@ dependencies = [ "aws-sdk-s3", "blake3", "chrono", + "epub", "id3", "itertools 0.14.0", "pdf", @@ -4152,6 +4166,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "xml-rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" + [[package]] name = "xmlparser" version = "0.13.6" @@ -4261,6 +4281,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index d8b8d74..b3e2251 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,6 +96,7 @@ sha2 = "0.11.0-rc.5" blake3 = "1.8.3" pdf = "0.10.0" id3 = "1.16.4" +epub = "1.2.2" # Misc helpers thiserror = "2.0.18" diff --git a/crates/pile-dataset/Cargo.toml b/crates/pile-dataset/Cargo.toml index f04c52d..2439e8e 100644 --- a/crates/pile-dataset/Cargo.toml +++ b/crates/pile-dataset/Cargo.toml @@ -22,6 +22,7 @@ toml = { workspace 
= true } thiserror = { workspace = true } smartstring = { workspace = true } blake3 = { workspace = true } +epub = { workspace = true } pdf = { workspace = true } id3 = { workspace = true } tokio = { workspace = true } diff --git a/crates/pile-dataset/src/extract/epub/epub_meta.rs b/crates/pile-dataset/src/extract/epub/epub_meta.rs new file mode 100644 index 0000000..4681a67 --- /dev/null +++ b/crates/pile-dataset/src/extract/epub/epub_meta.rs @@ -0,0 +1,76 @@ +use epub::doc::EpubDoc; +use pile_config::Label; +use std::{collections::HashMap, io::Cursor, sync::OnceLock}; +use tracing::debug; + +use crate::{Item, PileValue, extract::Extractor}; + +pub struct EpubMetaExtractor<'a> { + item: &'a Item, + output: OnceLock>>, +} + +impl<'a> EpubMetaExtractor<'a> { + pub fn new(item: &'a Item) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + let bytes = self.item.read().await?.read_to_end().await?; + + let cursor = Cursor::new(bytes); + let doc = match EpubDoc::from_reader(cursor) { + Ok(x) => x, + Err(error) => { + debug!(message = "Could not process epub", ?error, key = ?self.item.key()); + return Ok(self.output.get_or_init(HashMap::new)); + } + }; + + let mut output: HashMap> = HashMap::new(); + + let fields = &[ + "title", + "creator", + "description", + "language", + "publisher", + "date", + "subject", + "identifier", + ]; + + #[expect(clippy::unwrap_used)] + for key in fields { + let label = Label::new(*key).unwrap(); + let value = match doc.mdata(key) { + Some(s) => PileValue::String(s.into()), + None => PileValue::Null, + }; + output.insert(label, value); + } + + return Ok(self.output.get_or_init(|| output)); + } +} + +#[async_trait::async_trait] +impl Extractor for EpubMetaExtractor<'_> { + async fn field<'a>( + &'a self, + name: &Label, + ) -> Result>, std::io::Error> { + Ok(self.get_inner().await?.get(name)) + } + + 
async fn fields(&self) -> Result, std::io::Error> { + Ok(self.get_inner().await?.keys().cloned().collect()) + } +} diff --git a/crates/pile-dataset/src/extract/epub/epub_text.rs b/crates/pile-dataset/src/extract/epub/epub_text.rs new file mode 100644 index 0000000..8d91c8b --- /dev/null +++ b/crates/pile-dataset/src/extract/epub/epub_text.rs @@ -0,0 +1,88 @@ +use epub::doc::EpubDoc; +use pile_config::Label; +use std::{collections::HashMap, io::Cursor, sync::OnceLock}; +use tracing::debug; + +use crate::{Item, PileValue, extract::Extractor}; + +pub struct EpubTextExtractor<'a> { + item: &'a Item, + output: OnceLock>>, +} + +impl<'a> EpubTextExtractor<'a> { + pub fn new(item: &'a Item) -> Self { + Self { + item, + output: OnceLock::new(), + } + } + + async fn get_inner(&self) -> Result<&HashMap>, std::io::Error> { + if let Some(x) = self.output.get() { + return Ok(x); + } + + let bytes = self.item.read().await?.read_to_end().await?; + + let cursor = Cursor::new(bytes); + let mut doc = match EpubDoc::from_reader(cursor) { + Ok(x) => x, + Err(error) => { + debug!(message = "Could not process epub", ?error, key = ?self.item.key()); + return Ok(self.output.get_or_init(HashMap::new)); + } + }; + + let mut text_parts: Vec = Vec::new(); + + loop { + if let Ok(content) = doc.get_current_str() { + text_parts.push(strip_html(&content)); + } + if doc.go_next().is_err() { + break; + } + } + + let text = text_parts.join(" "); + + #[expect(clippy::unwrap_used)] + let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]); + + let _ = self.output.set(output); + #[expect(clippy::unwrap_used)] + return Ok(self.output.get().unwrap()); + } +} + +/// Strip HTML/XHTML tags from a string, leaving only text nodes. 
/// Convert an HTML/XHTML fragment into plain text.
///
/// Markup is removed and only text nodes are kept. Compared to a naive
/// "drop everything between `<` and `>`" scan, this:
///
/// * keeps a literal `<` or `>` that is not part of a tag (`1 < 2 > 0`),
/// * ignores `>` inside quoted attribute values and inside `<!-- -->` comments,
/// * emits the content of `<![CDATA[ ]]>` sections verbatim,
/// * drops the content of `<script>` and `<style>` elements,
/// * decodes numeric and the most common named character references,
/// * separates text of adjacent block-level elements with a single space so
///   `<p>one</p><p>two</p>` yields `one two` rather than `onetwo`.
///
/// An unterminated tag at the end of the input is discarded together with
/// everything after it. Whitespace inside text nodes is preserved as-is.
fn strip_html(html: &str) -> String {
    let mut sink = TextSink {
        buf: String::with_capacity(html.len()),
        break_pending: false,
    };
    let mut rest = html;

    while let Some(lt) = rest.find('<') {
        sink.push_text(&rest[..lt]);
        let tail = &rest[lt..];

        // Comments may legally contain '>' and quotes, so they are skipped as a unit.
        if let Some(body) = tail.strip_prefix("<!--") {
            rest = match body.find("-->") {
                Some(end) => &body[end + 3..],
                None => "",
            };
            continue;
        }

        // CDATA content is literal character data: no tags, no entity decoding.
        if let Some(body) = tail.strip_prefix("<![CDATA[") {
            let (literal, after) = match body.find("]]>") {
                Some(end) => (&body[..end], &body[end + 3..]),
                None => (body, ""),
            };
            sink.push_raw(literal);
            rest = after;
            continue;
        }

        // A '<' only opens markup when followed by a name, '/', '!' or '?'.
        // Anything else (e.g. "a < b") is ordinary text.
        let opens_markup = tail[1..]
            .chars()
            .next()
            .is_some_and(|c| c.is_ascii_alphabetic() || matches!(c, '/' | '!' | '?'));
        if !opens_markup {
            sink.push_raw("<");
            rest = &tail[1..];
            continue;
        }

        let Some(len) = tag_len(tail) else {
            // Unterminated tag: nothing after it can be trusted to be text.
            rest = "";
            break;
        };
        let inner = &tail[1..len - 1];
        rest = &tail[len..];

        let is_closing = inner.starts_with('/');
        let name = inner
            .trim_start_matches('/')
            .split(|c: char| c.is_whitespace() || c == '/')
            .next()
            .unwrap_or("");

        if BLOCK_TAGS.iter().any(|&t| name.eq_ignore_ascii_case(t)) {
            sink.mark_break();
        }
        if !is_closing
            && !inner.ends_with('/')
            && RAW_TEXT_TAGS.iter().any(|&t| name.eq_ignore_ascii_case(t))
        {
            rest = skip_raw_text(rest, name);
        }
    }

    sink.push_text(rest);
    sink.buf
}

/// Elements whose start/end implies a word boundary in the rendered text.
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside", "blockquote", "body", "br", "caption", "dd", "div", "dl", "dt",
    "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr",
    "li", "main", "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot", "th",
    "thead", "title", "tr", "ul",
];

/// Elements whose content is code/styling rather than readable text.
const RAW_TEXT_TAGS: &[&str] = &["script", "style"];

/// Upper bound on the length of an entity name (between `&` and `;`), so a
/// stray `&` never triggers a long forward scan.
const MAX_ENTITY_LEN: usize = 32;

/// Output buffer that inserts a single separating space lazily, i.e. only
/// when more non-whitespace text actually follows a block boundary.
struct TextSink {
    buf: String,
    break_pending: bool,
}

impl TextSink {
    /// Record a block boundary; a space is emitted before the next text if
    /// the buffer does not already end in whitespace.
    fn mark_break(&mut self) {
        if self.buf.chars().next_back().is_some_and(|c| !c.is_whitespace()) {
            self.break_pending = true;
        }
    }

    /// Append `text` verbatim (no entity decoding).
    fn push_raw(&mut self, text: &str) {
        let Some(first) = text.chars().next() else {
            return;
        };
        if self.break_pending {
            if !first.is_whitespace() {
                self.buf.push(' ');
            }
            self.break_pending = false;
        }
        self.buf.push_str(text);
    }

    /// Append a text node, decoding character references. Unknown or
    /// malformed references are kept literally.
    fn push_text(&mut self, text: &str) {
        let mut rest = text;
        while let Some(amp) = rest.find('&') {
            self.push_raw(&rest[..amp]);
            let tail = &rest[amp + 1..];
            let decoded = tail
                .bytes()
                .take(MAX_ENTITY_LEN + 1)
                .position(|b| b == b';')
                .and_then(|end| decode_entity(&tail[..end]).map(|c| (c, end)));
            match decoded {
                Some((c, end)) => {
                    self.push_raw(c.encode_utf8(&mut [0; 4]));
                    rest = &tail[end + 1..];
                }
                None => {
                    self.push_raw("&");
                    rest = tail;
                }
            }
        }
        self.push_raw(rest);
    }
}

/// Byte length of the tag at the start of `tag` (which begins with `<`),
/// including the closing `>`. A `>` inside a quoted attribute value does not
/// terminate the tag. If quotes are unbalanced, falls back to the first `>`.
/// Returns `None` when the tag is never closed.
fn tag_len(tag: &str) -> Option<usize> {
    let mut quote: Option<u8> = None;
    // Scanning bytes is safe: all delimiters are ASCII and never occur inside
    // a multi-byte UTF-8 sequence.
    for (i, b) in tag.bytes().enumerate() {
        match (quote, b) {
            (Some(q), _) if b == q => quote = None,
            (Some(_), _) => {}
            (None, b'"' | b'\'') => quote = Some(b),
            (None, b'>') => return Some(i + 1),
            (None, _) => {}
        }
    }
    tag.find('>').map(|i| i + 1)
}

/// Skip the content of a raw-text element: returns the remainder of `rest`
/// starting at the matching `</name`, or `""` if the element is never closed.
fn skip_raw_text<'s>(rest: &'s str, name: &str) -> &'s str {
    let needle = format!("</{}", name.to_ascii_lowercase());
    // ASCII lowercasing preserves byte offsets, so the index is valid in `rest`.
    match rest.to_ascii_lowercase().find(&needle) {
        Some(idx) => &rest[idx..],
        None => "",
    }
}

/// Decode the name of a character reference (the part between `&` and `;`).
fn decode_entity(name: &str) -> Option<char> {
    if let Some(num) = name.strip_prefix('#') {
        let code = match num.strip_prefix(|c| c == 'x' || c == 'X') {
            Some(hex) => u32::from_str_radix(hex, 16).ok()?,
            None => num.parse::<u32>().ok()?,
        };
        return char::from_u32(code).filter(|c| *c != '\0');
    }

    Some(match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => '\u{a0}',
        "copy" => '\u{a9}',
        "ndash" => '\u{2013}',
        "mdash" => '\u{2014}',
        "lsquo" => '\u{2018}',
        "rsquo" => '\u{2019}',
        "ldquo" => '\u{201c}',
        "rdquo" => '\u{201d}',
        "hellip" => '\u{2026}',
        _ => return None,
    })
}
std::io::Error> { + Ok(vec![ + Label::new("text").unwrap(), + Label::new("meta").unwrap(), + ]) + } +} diff --git a/crates/pile-dataset/src/extract/mod.rs b/crates/pile-dataset/src/extract/mod.rs index 6feeaaa..c7b9606 100644 --- a/crates/pile-dataset/src/extract/mod.rs +++ b/crates/pile-dataset/src/extract/mod.rs @@ -10,6 +10,9 @@ pub use id3::*; mod fs; pub use fs::*; +mod epub; +pub use epub::*; + mod pdf; pub use pdf::*; @@ -69,6 +72,10 @@ impl<'a> MetaExtractor<'a> { Label::new("fs").unwrap(), crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))), ), + ( + Label::new("epub").unwrap(), + crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))), + ), ( Label::new("pdf").unwrap(), crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))), @@ -103,6 +110,7 @@ impl Extractor for MetaExtractor<'_> { Label::new("flac").unwrap(), Label::new("id3").unwrap(), Label::new("fs").unwrap(), + Label::new("epub").unwrap(), Label::new("pdf").unwrap(), Label::new("sidecar").unwrap(), ]);