use epub::doc::EpubDoc; use pile_config::Label; use std::{ collections::HashMap, sync::{Arc, OnceLock}, }; use tracing::debug; use crate::{ extract::traits::ObjectExtractor, value::{Item, PileValue, SyncReadBridge}, }; pub struct EpubTextExtractor { item: Item, output: OnceLock>, } impl EpubTextExtractor { pub fn new(item: &Item) -> Self { Self { item: item.clone(), output: OnceLock::new(), } } async fn get_inner(&self) -> Result<&HashMap, std::io::Error> { if let Some(x) = self.output.get() { return Ok(x); } let key = self.item.key(); let ext = key.as_str().rsplit('.').next(); if !matches!(ext, Some("epub")) { return Ok(self.output.get_or_init(HashMap::new)); } let reader = SyncReadBridge::new_current(self.item.read().await?); let raw_text = tokio::task::spawn_blocking(move || { let mut doc = EpubDoc::from_reader(reader) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; let mut text_parts: Vec = Vec::new(); loop { if let Ok(content) = doc.get_current_str() { text_parts.push(strip_html(&content)); } if doc.go_next().is_err() { break; } } Ok::<_, std::io::Error>(text_parts.join(" ")) }) .await .map_err(std::io::Error::other)?; let raw_text = match raw_text { Ok(x) => x, Err(error) => { debug!(message = "Could not process epub", ?error, key = ?self.item.key()); return Ok(self.output.get_or_init(HashMap::new)); } }; #[expect(clippy::unwrap_used)] let output = HashMap::from([( Label::new("text").unwrap(), PileValue::String(Arc::new(raw_text.into())), )]); let _ = self.output.set(output); #[expect(clippy::unwrap_used)] return Ok(self.output.get().unwrap()); } } /// Strip HTML/XHTML tags from a string, leaving only text nodes. fn strip_html(html: &str) -> String { let mut result = String::with_capacity(html.len()); let mut in_tag = false; for c in html.chars() { match c { '<' => in_tag = true, '>' => in_tag = false, _ if !in_tag => result.push(c), _ => {} } } result } #[async_trait::async_trait] impl ObjectExtractor for EpubTextExtractor { async fn field(&self, name: &Label) -> Result, std::io::Error> { Ok(self.get_inner().await?.get(name).cloned()) } async fn fields(&self) -> Result, std::io::Error> { Ok(self.get_inner().await?.keys().cloned().collect()) } }