87 lines
2.7 KiB
Rust
87 lines
2.7 KiB
Rust
/// Settings handed to extractor calls (the `state` argument of
/// `field`, `get`, `len` and `to_json`).
#[derive(Clone, Debug)]
pub struct ExtractState {
    /// When `true`, skip the mime-type pre-filter and attempt to extract
    /// every field from every item.
    ///
    /// This can surface fields that would otherwise go undetected,
    /// at the cost of a much slower extraction.
    pub ignore_mime: bool,
}
|
|
|
|
/// An attachment that extracts metadata from an [Item].
|
|
///
|
|
/// Metadata is exposed as an immutable map of {label: value},
|
|
/// much like a json object.
|
|
#[async_trait::async_trait]
|
|
pub trait ObjectExtractor: Send + Sync {
|
|
/// Get the field at `name` from `item`.
|
|
/// - returns `None` if `name` is not a valid field
|
|
/// - returns `Some(Null)` if `name` is not available
|
|
///
|
|
/// For extractors that parse binary, this fn should return
|
|
/// an error only if we failed to obtain the data we need (permission denied, etc).
|
|
///
|
|
/// If the underlying data has an invalid format (e.g, running a pdf extractor on a non-pdf file),
|
|
/// this fn should return `Ok(Some(None))`.
|
|
async fn field(
|
|
&self,
|
|
state: &ExtractState,
|
|
name: &pile_config::Label,
|
|
args: Option<&str>,
|
|
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
|
|
|
/// Return all fields in this extractor.
|
|
/// `Self::field` must return [Some] for all these keys
|
|
/// and [None] for all others.
|
|
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
|
|
|
/// Convert this to a JSON value.
|
|
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
let keys = self.fields().await?;
|
|
let mut map = serde_json::Map::new();
|
|
for k in &keys {
|
|
let v = match self.field(state, k, None).await? {
|
|
Some(x) => x,
|
|
None => continue,
|
|
};
|
|
map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
|
|
}
|
|
|
|
Ok(serde_json::Value::Object(map))
|
|
}
|
|
}
|
|
|
|
/// An attachment that extracts metadata from an [Item].
|
|
///
|
|
/// Metadata is exposed as an immutable list of values.
|
|
#[async_trait::async_trait]
|
|
pub trait ListExtractor: Send + Sync {
|
|
/// Get the item at index `idx`.
|
|
/// Indices start at zero, and must be consecutive.
|
|
/// - returns `None` if `idx` is out of range
|
|
/// - returns `Some(Null)` if `None` is at `idx`
|
|
async fn get(
|
|
&self,
|
|
state: &ExtractState,
|
|
idx: usize,
|
|
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
|
|
|
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error>;
|
|
|
|
/// Convert this list to a JSON value.
|
|
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
let len = self.len(state).await?;
|
|
let mut list = Vec::with_capacity(len);
|
|
for i in 0..len {
|
|
#[expect(clippy::expect_used)]
|
|
let v = self
|
|
.get(state, i)
|
|
.await?
|
|
.expect("value must be present according to length");
|
|
list.push(Box::pin(v.to_json(state)).await?);
|
|
}
|
|
|
|
Ok(serde_json::Value::Array(list))
|
|
}
|
|
}
|