Add text extractor

This commit is contained in:
2026-03-18 20:47:29 -07:00
parent 915d10bd0e
commit 2f2eb323d5
2 changed files with 75 additions and 0 deletions

View File

@@ -28,6 +28,9 @@ pub use toml::*;
mod group;
pub use group::*;
mod text;
pub use text::*;
use crate::{
extract::{
misc::MapExtractor,
@@ -77,6 +80,10 @@ impl ItemExtractor {
Label::new("toml").unwrap(),
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("text").unwrap(),
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
),
(
Label::new("groups").unwrap(),
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
@@ -110,6 +117,7 @@ impl ObjectExtractor for ItemExtractor {
Label::new("pdf").unwrap(),
Label::new("json").unwrap(),
Label::new("toml").unwrap(),
Label::new("text").unwrap(),
Label::new("groups").unwrap(),
]);
}

View File

@@ -0,0 +1,67 @@
use pile_config::Label;
use std::sync::{Arc, OnceLock};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{AsyncReader, Item, PileValue},
};
/// Extractor that exposes an item's contents as a single `text` field.
///
/// The contents are read on the first successful `text` lookup and the
/// resulting value is kept in `output` for later lookups.
pub struct TextExtractor {
    /// The item whose contents are read when the `text` field is requested.
    item: Item,
    /// Cached value of the `text` field; empty until the first extraction.
    output: OnceLock<PileValue>,
}
impl TextExtractor {
    /// Creates an extractor for `item`.
    ///
    /// The item handle is cloned and the cache starts out empty; nothing is
    /// read from the item here.
    pub fn new(item: &Item) -> Self {
        let item = item.clone();
        let output = OnceLock::new();
        Self { item, output }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for TextExtractor {
    /// Looks up a field on this extractor.
    ///
    /// Returns `Ok(None)` when:
    /// - any `args` are supplied,
    /// - `state.ignore_mime` is false and the item's top-level MIME type is
    ///   neither `text` nor `application`, or
    /// - `name` is anything other than `text`.
    ///
    /// Otherwise the item is read to the end and decoded as UTF-8. Valid
    /// UTF-8 yields a `PileValue::String`; anything else yields
    /// `PileValue::Null`. The value is stored in `self.output` and served
    /// from there on later calls.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error raised while opening or reading the item.
    async fn field(
        &self,
        state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // This extractor does not accept arguments.
        if args.is_some() {
            return Ok(None);
        }

        // Unless told to ignore MIME types, only `text/*` and
        // `application/*` items are considered.
        if !state.ignore_mime {
            let media = self.item.mime();
            let is_candidate =
                media.type_() == mime::TEXT || media.type_() == mime::APPLICATION;
            if !is_candidate {
                return Ok(None);
            }
        }

        if name.as_str() != "text" {
            return Ok(None);
        }

        // Serve the cached value if an earlier call already produced one.
        if let Some(cached) = self.output.get() {
            return Ok(Some(cached.clone()));
        }

        let mut reader = self.item.read().await?;
        let bytes = reader.read_to_end().await?;

        // Content that is not valid UTF-8 is reported as `Null`, not as an error.
        let decoded = match String::from_utf8(bytes) {
            Ok(text) => PileValue::String(Arc::new(text.into())),
            Err(_) => PileValue::Null,
        };

        // If another caller filled the cache in the meantime, its value wins.
        Ok(Some(self.output.get_or_init(|| decoded).clone()))
    }

    /// Lists the fields this extractor provides: only `text`.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let text_label = Label::new("text").unwrap();
        Ok(vec![text_label])
    }
}