Files
pile/crates/pile-value/src/extract/traits.rs
2026-03-16 09:56:48 -07:00

87 lines
2.7 KiB
Rust

/// Options shared by every extractor call.
#[derive(Clone, Debug)]
pub struct ExtractState {
    /// When `true`, every field is extracted from every item
    /// and no mime-type pre-filtering is applied.
    ///
    /// Extra fields may be discovered this way, at the price of
    /// a much slower extraction.
    pub ignore_mime: bool,
}
/// An attachment that extracts metadata from an [Item].
///
/// The metadata is presented as an immutable `{label: value}` map,
/// similar to a JSON object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
    /// Look up the field called `name`.
    ///
    /// - `Ok(None)`: `name` is not a valid field of this extractor
    /// - `Ok(Some(Null))`: `name` is a valid field, but no value is available
    ///
    /// `args` is an optional string argument. The default
    /// [`Self::to_json`] always passes `None` for it.
    ///
    /// # Errors
    ///
    /// Extractors that parse binary data should fail only when the data
    /// itself could not be obtained (permission denied, etc).
    ///
    /// Data with an invalid format (e.g, a pdf extractor pointed at a
    /// non-pdf file) is not an error: return `Ok(Some(Null))` instead.
    async fn field(
        &self,
        state: &ExtractState,
        name: &pile_config::Label,
        args: Option<&str>,
    ) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// List every field this extractor provides.
    ///
    /// `Self::field` must return [Some] for each of these labels,
    /// and [None] for any other label.
    async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;

    /// Build a JSON object holding every field of this extractor.
    ///
    /// Each label returned by `Self::fields` is looked up with no `args`.
    /// Labels for which `Self::field` yields `None` are left out of the result.
    async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let mut object = serde_json::Map::new();

        for label in self.fields().await? {
            if let Some(value) = self.field(state, &label, None).await? {
                let key = label.to_string();
                // The nested conversion is awaited through a boxed, pinned future.
                let json = Box::pin(value.to_json(state)).await?;
                object.insert(key, json);
            }
        }

        Ok(serde_json::Value::Object(object))
    }
}
/// An attachment that extracts metadata from an [Item].
///
/// The metadata is presented as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
    /// Fetch the value stored at index `idx`.
    ///
    /// Indices begin at zero and must be consecutive.
    /// - `Ok(None)`: `idx` is out of range
    /// - `Ok(Some(Null))`: `idx` is in range, but holds no value
    async fn get(
        &self,
        state: &ExtractState,
        idx: usize,
    ) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// Number of items in this list.
    ///
    /// The default [`Self::to_json`] expects `Self::get` to return
    /// [Some] for every index below this length.
    async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error>;

    /// Build a JSON array holding every item of this list, in index order.
    ///
    /// # Panics
    ///
    /// Panics if `Self::get` returns `None` for an index smaller than
    /// the length reported by `Self::len`.
    async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let count = self.len(state).await?;
        let mut items = Vec::with_capacity(count);

        for idx in 0..count {
            let slot = self.get(state, idx).await?;

            // Every index below `count` must be populated, so a missing
            // value here is an implementation bug rather than an I/O error.
            #[expect(clippy::expect_used)]
            let value = slot.expect("value must be present according to length");

            // The nested conversion is awaited through a boxed, pinned future.
            items.push(Box::pin(value.to_json(state)).await?);
        }

        Ok(serde_json::Value::Array(items))
    }
}