From b789255ea9a89584ce278ffcc688746f5bfda04c Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Tue, 10 Mar 2026 21:56:56 -0700 Subject: [PATCH] `to_json` tweak --- crates/pile-dataset/src/extract/mod.rs | 31 +++++++++++++++++++ .../pile-dataset/src/extract/pdf/pdf_pages.rs | 9 ++++++ crates/pile-dataset/src/value.rs | 14 +-------- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/crates/pile-dataset/src/extract/mod.rs b/crates/pile-dataset/src/extract/mod.rs index c494177..2482e7f 100644 --- a/crates/pile-dataset/src/extract/mod.rs +++ b/crates/pile-dataset/src/extract/mod.rs @@ -45,6 +45,21 @@ pub trait ObjectExtractor: Send + Sync { /// `Self::field` must return [Some] for all these keys /// and [None] for all others. async fn fields(&self) -> Result, std::io::Error>; + + /// Convert this to a JSON value. + async fn to_json(&self) -> Result { + let keys = self.fields().await?; + let mut map = serde_json::Map::new(); + for k in &keys { + let v = match self.field(k).await? { + Some(x) => x, + None => continue, + }; + map.insert(k.to_string(), Box::pin(v.to_json()).await?); + } + + Ok(serde_json::Value::Object(map)) + } } /// An attachment that extracts metadata from an [Item]. @@ -63,6 +78,22 @@ pub trait ListExtractor: Send + Sync { async fn is_empty(&self) -> Result { Ok(self.len().await? == 0) } + + /// Convert this list to a JSON value. + async fn to_json(&self) -> Result { + let len = self.len().await?; + let mut list = Vec::with_capacity(len); + for i in 0..len { + #[expect(clippy::expect_used)] + let v = self + .get(i) + .await? + .expect("value must be present according to length"); + list.push(Box::pin(v.to_json()).await?); + } + + Ok(serde_json::Value::Array(list)) + } } pub struct MetaExtractor { diff --git a/crates/pile-dataset/src/extract/pdf/pdf_pages.rs b/crates/pile-dataset/src/extract/pdf/pdf_pages.rs index 48f0bee..de38e0b 100644 --- a/crates/pile-dataset/src/extract/pdf/pdf_pages.rs +++ b/crates/pile-dataset/src/extract/pdf/pdf_pages.rs @@ -92,4 +92,13 @@ impl ListExtractor for PdfPagesExtractor { } } } + + // Override, extracting all pages is very slow, + // and we can't display binary in json anyway + async fn to_json(&self) -> Result { + Ok(serde_json::Value::String(format!( + "", + self.len().await? + ))) + } } diff --git a/crates/pile-dataset/src/value.rs b/crates/pile-dataset/src/value.rs index 9d99f22..b34c376 100644 --- a/crates/pile-dataset/src/value.rs +++ b/crates/pile-dataset/src/value.rs @@ -188,19 +188,7 @@ impl PileValue { Value::Object(map) } - Self::ListExtractor(e) => { - let len = e.len().await?; - let mut list = Vec::with_capacity(len); - for i in 0..len { - #[expect(clippy::expect_used)] - let v = e.get(i) - .await? - .expect("value must be present according to length"); - list.push(Box::pin(v.to_json()).await?); - } - - Value::Array(list) - } + Self::ListExtractor(e) => e.to_json().await?, }) } }