Add epub extractor
This commit is contained in:
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
63
crates/pile-dataset/src/extract/epub/mod.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
mod epub_meta;
|
||||
pub use epub_meta::*;
|
||||
|
||||
mod epub_text;
|
||||
pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
Item, PileValue,
|
||||
extract::{Extractor, MapExtractor},
|
||||
};
|
||||
|
||||
pub struct EpubExtractor<'a> {
|
||||
inner: MapExtractor<'a>,
|
||||
}
|
||||
|
||||
impl<'a> EpubExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Extractor for EpubExtractor<'_> {
|
||||
async fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
if name.as_str() == "text" {
|
||||
match self.inner.inner.get(name).unwrap() {
|
||||
PileValue::Extractor(x) => return x.field(name).await,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user