#[derive(Debug, Clone)] pub struct ExtractState { /// If true, extract all fields from all items. /// Do not pre-filter using mime type. /// /// This may detect additional fields, but /// makes extraction take much longer pub ignore_mime: bool, } /// An attachment that extracts metadata from an [Item]. /// /// Metadata is exposed as an immutable map of {label: value}, /// much like a json object. #[async_trait::async_trait] pub trait ObjectExtractor: Send + Sync { /// Get the field at `name` from `item`. /// - returns `None` if `name` is not a valid field /// - returns `Some(Null)` if `name` is not available /// /// For extractors that parse binary, this fn should return /// an error only if we failed to obtain the data we need (permission denied, etc). /// /// If the underlying data has an invalid format (e.g, running a pdf extractor on a non-pdf file), /// this fn should return `Ok(Some(None))`. async fn field( &self, state: &ExtractState, name: &pile_config::Label, args: Option<&str>, ) -> Result, std::io::Error>; /// Return all fields in this extractor. /// `Self::field` must return [Some] for all these keys /// and [None] for all others. async fn fields(&self) -> Result, std::io::Error>; /// Convert this to a JSON value. async fn to_json(&self, state: &ExtractState) -> Result { let keys = self.fields().await?; let mut map = serde_json::Map::new(); for k in &keys { let v = match self.field(state, k, None).await? { Some(x) => x, None => continue, }; map.insert(k.to_string(), Box::pin(v.to_json(state)).await?); } Ok(serde_json::Value::Object(map)) } } /// An attachment that extracts metadata from an [Item]. /// /// Metadata is exposed as an immutable list of values. #[async_trait::async_trait] pub trait ListExtractor: Send + Sync { /// Get the item at index `idx`. /// Indices start at zero, and must be consecutive. /// - returns `None` if `idx` is out of range /// - returns `Some(Null)` if `None` is at `idx` async fn get( &self, state: &ExtractState, idx: usize, ) -> Result, std::io::Error>; async fn len(&self, state: &ExtractState) -> Result; /// Convert this list to a JSON value. async fn to_json(&self, state: &ExtractState) -> Result { let len = self.len(state).await?; let mut list = Vec::with_capacity(len); for i in 0..len { #[expect(clippy::expect_used)] let v = self .get(state, i) .await? .expect("value must be present according to length"); list.push(Box::pin(v.to_json(state)).await?); } Ok(serde_json::Value::Array(list)) } }