pile/crates/pile-value/src/extract/item/epub/epub_text.rs

use epub::doc::EpubDoc;
use pile_config::Label;
use std::{
	collections::HashMap,
	sync::{Arc, OnceLock},
};
use tracing::debug;

use crate::{
	extract::traits::ObjectExtractor,
	value::{Item, PileValue, SyncReadBridge},
};

/// Extracts the plain text of an EPUB item and exposes it as a `text` field.
///
/// Extraction is lazy: the item is only read and parsed the first time the
/// fields are requested, and the resulting map is cached in `output`.
pub struct EpubTextExtractor {
	/// The item to extract text from (a clone of the one passed to `new`).
	item: Item,
	/// Cached extraction result; an empty map is stored when the item is not
	/// an EPUB or could not be parsed.
	output: OnceLock<HashMap<Label, PileValue>>,
}

impl EpubTextExtractor {
	pub fn new(item: &Item) -> Self {
		Self {
			item: item.clone(),
			output: OnceLock::new(),
		}
	}

	async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
		if let Some(x) = self.output.get() {
			return Ok(x);
		}

		let key = self.item.key();
		let ext = key.as_str().rsplit('.').next();
		if !matches!(ext, Some("epub")) {
			return Ok(self.output.get_or_init(HashMap::new));
		}

		let reader = SyncReadBridge::new_current(self.item.read().await?);
		let raw_text = tokio::task::spawn_blocking(move || {
			let mut doc = EpubDoc::from_reader(reader)
				.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

			let mut text_parts: Vec<String> = Vec::new();

			loop {
				if let Ok(content) = doc.get_current_str() {
					text_parts.push(strip_html(&content));
				}
				if doc.go_next().is_err() {
					break;
				}
			}

			Ok::<_, std::io::Error>(text_parts.join(" "))
		})
		.await
		.map_err(std::io::Error::other)?;

		let raw_text = match raw_text {
			Ok(x) => x,
			Err(error) => {
				debug!(message = "Could not process epub", ?error, key = ?self.item.key());
				return Ok(self.output.get_or_init(HashMap::new));
			}
		};

		#[expect(clippy::unwrap_used)]
		let output = HashMap::from([(
			Label::new("text").unwrap(),
			PileValue::String(Arc::new(raw_text.into())),
		)]);

		let _ = self.output.set(output);
		#[expect(clippy::unwrap_used)]
		return Ok(self.output.get().unwrap());
	}
}

/// Strip HTML/XHTML tags from a string, leaving only text nodes.
///
/// Everything from a `<` up to and including the next `>` is removed. A `>`
/// that appears outside of a tag is ordinary character data (raw `>` is legal
/// in XML text) and is kept as-is.
///
/// This is a lightweight scanner, not a parser: character entities such as
/// `&amp;` are left undecoded, and no whitespace is inserted where tags are
/// removed.
fn strip_html(html: &str) -> String {
	let mut result = String::with_capacity(html.len());
	let mut in_tag = false;

	for c in html.chars() {
		match c {
			'<' => in_tag = true,
			// Only a `>` that closes an open tag is markup; otherwise it is text.
			'>' if in_tag => in_tag = false,
			_ if !in_tag => result.push(c),
			_ => {}
		}
	}

	result
}

#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor {
	/// Looks up a single extracted field by label.
	///
	/// Returns `Ok(None)` when the label is not among the extracted fields.
	/// Errors from the underlying extraction are propagated.
	async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
		let extracted = self.get_inner().await?;
		let value = extracted.get(name).cloned();
		Ok(value)
	}

	/// Lists the labels of all extracted fields.
	///
	/// Errors from the underlying extraction are propagated.
	async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
		let extracted = self.get_inner().await?;
		let labels: Vec<Label> = extracted.keys().cloned().collect();
		Ok(labels)
	}
}