Refactor grouping
This commit is contained in:
@@ -6,16 +6,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ExtractState,
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubCoverExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<Option<(Mime, Vec<u8>)>>,
|
||||
}
|
||||
|
||||
impl EpubCoverExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -51,7 +51,7 @@ impl EpubCoverExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not extract epub cover", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
|
||||
None
|
||||
}
|
||||
},
|
||||
@@ -65,12 +65,11 @@ impl EpubCoverExtractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.get_inner()
|
||||
.await?
|
||||
.map(|(mime, bytes)| PileValue::Blob {
|
||||
Ok(self.get_inner().await?.map(|(mime, bytes)| {
|
||||
PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: mime.clone(),
|
||||
bytes: Arc::new(bytes.clone()),
|
||||
}))
|
||||
bytes: ArcBytes(Arc::new(bytes.clone())),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubMetaExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -57,7 +57,7 @@ impl EpubMetaExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubTextExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -54,7 +54,7 @@ impl EpubTextExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
@@ -12,7 +12,7 @@ pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubExtractor {
|
||||
@@ -22,7 +22,7 @@ pub struct EpubExtractor {
|
||||
}
|
||||
|
||||
impl EpubExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
text: Arc::new(EpubTextExtractor::new(item)),
|
||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||
@@ -87,9 +87,13 @@ impl ObjectExtractor for EpubExtractor {
|
||||
|
||||
if k.as_str() == "cover" {
|
||||
let summary = match &v {
|
||||
PileValue::Blob { mime, bytes } => {
|
||||
format!("<Blob ({}, {} bytes)>", mime, bytes.len())
|
||||
PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||
format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
|
||||
}
|
||||
PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
|
||||
format!("<File ({mime})>")
|
||||
}
|
||||
|
||||
PileValue::Null => "<null>".to_owned(),
|
||||
_ => "<cover>".to_owned(),
|
||||
};
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct ExifExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl ExifExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -53,7 +53,7 @@ impl ExifExtractor {
|
||||
Ok(x) => x,
|
||||
Err(exif::Error::Io(x)) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process exif", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -94,7 +94,7 @@ impl ObjectExtractor for ExifExtractor {
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting field {name:?} from ExifExtractor",
|
||||
);
|
||||
|
||||
@@ -11,16 +11,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct FlacImagesExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
cached_count: OnceLock<usize>,
|
||||
}
|
||||
|
||||
impl FlacImagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
cached_count: OnceLock::new(),
|
||||
@@ -65,7 +65,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
mut idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting index {idx} from FlacImagesExtractor",
|
||||
);
|
||||
|
||||
@@ -73,7 +73,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let image = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
@@ -93,11 +93,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC images",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse FLAC images", ?item, ?error);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
@@ -109,9 +105,11 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
|
||||
Ok(image.map(|(mime, data)| PileValue::Blob {
|
||||
mime,
|
||||
bytes: Arc::new(data),
|
||||
Ok(image.map(|(mime, data)| {
|
||||
PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(data)),
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -130,13 +128,13 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
}
|
||||
|
||||
pub struct FlacExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
images: PileValue,
|
||||
}
|
||||
|
||||
impl FlacExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -149,12 +147,9 @@ impl FlacExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
trace!(
|
||||
message = "Reading FLAC tags",
|
||||
key = self.item.key().as_str()
|
||||
);
|
||||
trace!(message = "Reading FLAC tags", item = ?self.item);
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let output = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
@@ -176,11 +171,7 @@ impl FlacExtractor {
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC metadata",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse FLAC metadata", ?item, ?error);
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::{Component, PathBuf},
|
||||
path::Component,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
pub struct FsExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl FsExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -27,7 +27,10 @@ impl FsExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let path = PathBuf::from(self.item.key().as_str());
|
||||
let path = match &self.item {
|
||||
BinaryPileValue::File { path, .. } => path,
|
||||
_ => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let mut root = false;
|
||||
let components = path
|
||||
@@ -11,16 +11,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct Id3Extractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl Id3Extractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -32,9 +32,9 @@ impl Id3Extractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
trace!(message = "Reading id3 tags", key = self.item.key().as_str());
|
||||
trace!(message = "Reading id3 tags", key = ?self.item);
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||
.await
|
||||
@@ -48,11 +48,7 @@ impl Id3Extractor {
|
||||
})) => return Err(e),
|
||||
|
||||
Ok(Err(error)) => {
|
||||
trace!(
|
||||
message = "Could not parse id3 tags",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -1,63 +1,25 @@
|
||||
mod transform;
|
||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
||||
|
||||
use image::ImageFormat;
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_io::AsyncReader;
|
||||
use std::{
|
||||
io::Cursor,
|
||||
str::FromStr,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use std::{io::Cursor, str::FromStr, sync::Arc};
|
||||
use tracing::trace;
|
||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
||||
|
||||
mod transform;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
enum ImageSource {
|
||||
Item(Item, OnceLock<Arc<Vec<u8>>>),
|
||||
Blob(Arc<Vec<u8>>, Mime),
|
||||
}
|
||||
|
||||
pub struct ImageExtractor {
|
||||
source: ImageSource,
|
||||
item: BinaryPileValue,
|
||||
}
|
||||
|
||||
impl ImageExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
source: ImageSource::Item(item.clone(), OnceLock::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_blob(bytes: Arc<Vec<u8>>, mime: Mime) -> Self {
|
||||
Self {
|
||||
source: ImageSource::Blob(bytes, mime),
|
||||
}
|
||||
}
|
||||
|
||||
fn mime(&self) -> &Mime {
|
||||
match &self.source {
|
||||
ImageSource::Item(item, _) => item.mime(),
|
||||
ImageSource::Blob(_, mime) => mime,
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_bytes(&self) -> Result<Arc<Vec<u8>>, std::io::Error> {
|
||||
match &self.source {
|
||||
ImageSource::Blob(bytes, _) => Ok(bytes.clone()),
|
||||
ImageSource::Item(item, cache) => {
|
||||
if let Some(x) = cache.get() {
|
||||
return Ok(x.clone());
|
||||
}
|
||||
let mut reader = item.read().await?;
|
||||
let bytes = reader.read_to_end().await?;
|
||||
Ok(cache.get_or_init(|| Arc::new(bytes)).clone())
|
||||
}
|
||||
}
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
async fn apply<T: ImageTransformer + Send + 'static>(
|
||||
@@ -69,11 +31,14 @@ impl ImageExtractor {
|
||||
Err(_) => return Ok(None),
|
||||
};
|
||||
|
||||
let mime = self.mime().clone();
|
||||
let bytes = self.read_bytes().await?;
|
||||
let mime = self.item.mime().clone();
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let Some(format) = ImageFormat::from_mime_type(&mime) else {
|
||||
return Ok(Some(PileValue::Blob { mime, bytes }));
|
||||
return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
})));
|
||||
};
|
||||
|
||||
let bytes_for_closure = bytes.clone();
|
||||
@@ -91,11 +56,15 @@ impl ImageExtractor {
|
||||
.await?;
|
||||
|
||||
match result {
|
||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Blob {
|
||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: out_mime,
|
||||
bytes: Arc::new(out_bytes),
|
||||
})),
|
||||
Err(_) => Ok(Some(PileValue::Blob { mime, bytes })),
|
||||
bytes: ArcBytes(Arc::new(out_bytes)),
|
||||
}))),
|
||||
|
||||
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
}))),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
fn json_to_pile(value: serde_json::Value) -> PileValue {
|
||||
@@ -24,12 +24,12 @@ fn json_to_pile(value: serde_json::Value) -> PileValue {
|
||||
}
|
||||
|
||||
pub struct JsonExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl JsonExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -25,9 +25,6 @@ mod toml;
|
||||
use pile_config::Label;
|
||||
pub use toml::*;
|
||||
|
||||
mod group;
|
||||
pub use group::*;
|
||||
|
||||
mod text;
|
||||
pub use text::*;
|
||||
|
||||
@@ -39,17 +36,17 @@ use crate::{
|
||||
misc::MapExtractor,
|
||||
traits::{ExtractState, ObjectExtractor},
|
||||
},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct ItemExtractor {
|
||||
pub struct BinaryExtractor {
|
||||
inner: MapExtractor,
|
||||
image: Arc<ImageExtractor>,
|
||||
}
|
||||
|
||||
impl ItemExtractor {
|
||||
impl BinaryExtractor {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
@@ -88,10 +85,6 @@ impl ItemExtractor {
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("groups").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
@@ -103,7 +96,7 @@ impl ItemExtractor {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ItemExtractor {
|
||||
impl ObjectExtractor for BinaryExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
state: &ExtractState,
|
||||
@@ -15,7 +15,7 @@ pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct PdfExtractor {
|
||||
@@ -26,7 +26,7 @@ pub struct PdfExtractor {
|
||||
}
|
||||
|
||||
impl PdfExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
text: Arc::new(PdfTextExtractor::new(item)),
|
||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||
@@ -46,7 +46,7 @@ impl ObjectExtractor for PdfExtractor {
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.text.item.key().as_str(),
|
||||
item = ?self.text.item,
|
||||
"Getting field {name:?} from PdfExtractor",
|
||||
);
|
||||
|
||||
@@ -9,18 +9,19 @@ use std::{
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::value::BinaryPileValue;
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
pub struct PdfMetaExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -82,7 +83,7 @@ impl PdfMetaExtractor {
|
||||
let (page_count, raw_meta) = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -9,15 +9,15 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ListExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct PdfPagesExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
}
|
||||
|
||||
impl PdfPagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting index {idx} from PdfPagesExtractor",
|
||||
);
|
||||
|
||||
@@ -78,12 +78,12 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
|
||||
let value = match png {
|
||||
Ok(None) => return Ok(None),
|
||||
Ok(Some(bytes)) => PileValue::Blob {
|
||||
Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: mime::IMAGE_PNG,
|
||||
bytes: Arc::new(bytes),
|
||||
},
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
}),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||
trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
|
||||
PileValue::Null
|
||||
}
|
||||
};
|
||||
@@ -108,7 +108,7 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
match count {
|
||||
Ok(n) => Ok(n),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
@@ -9,18 +9,19 @@ use std::{
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::value::BinaryPileValue;
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
pub struct PdfTextExtractor {
|
||||
pub(super) item: Item,
|
||||
pub(super) item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -86,7 +87,7 @@ impl PdfTextExtractor {
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -4,16 +4,16 @@ use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct TextExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<PileValue>,
|
||||
}
|
||||
|
||||
impl TextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||
@@ -25,12 +25,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||
}
|
||||
|
||||
pub struct TomlExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl TomlExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
58
crates/pile-value/src/extract/item.rs
Normal file
58
crates/pile-value/src/extract/item.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use pile_config::Label;
|
||||
|
||||
use crate::{
|
||||
extract::{
|
||||
misc::MapExtractor,
|
||||
traits::{ExtractState, ObjectExtractor},
|
||||
},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct ItemExtractor {
|
||||
inner: MapExtractor,
|
||||
}
|
||||
|
||||
impl ItemExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let files = {
|
||||
let Item::File { files, .. } = &item;
|
||||
let mut inner = HashMap::new();
|
||||
for f in files {
|
||||
inner.insert(f.0.clone(), f.1.clone());
|
||||
}
|
||||
PileValue::ObjectExtractor(Arc::new(MapExtractor { inner }))
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(Label::new("files").unwrap(), files),
|
||||
(
|
||||
Label::new("key").unwrap(),
|
||||
PileValue::String(Arc::new(item.key())),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ItemExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
state: &ExtractState,
|
||||
name: &pile_config::Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
self.inner.field(state, name, args).await
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
let fields = self.inner.fields().await?;
|
||||
Ok(fields)
|
||||
}
|
||||
}
|
||||
@@ -1,56 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pile_config::Label;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct GroupExtractor {
|
||||
item: Item,
|
||||
}
|
||||
|
||||
impl GroupExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for GroupExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
_state: &ExtractState,
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if args.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(self
|
||||
.item
|
||||
.group()
|
||||
.get(name)
|
||||
.map(|item| PileValue::ObjectExtractor(Arc::new(super::ItemExtractor::new(item)))))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.item.group().keys().cloned().collect())
|
||||
}
|
||||
|
||||
async fn to_json(&self, _state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
||||
Ok(serde_json::Value::Object(
|
||||
self.item
|
||||
.group()
|
||||
.iter()
|
||||
.map(|(k, v)| {
|
||||
(
|
||||
k.to_string(),
|
||||
serde_json::Value::String(format!("<GroupItem ({})>", v.key())),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
pub mod blob;
|
||||
pub mod item;
|
||||
pub mod misc;
|
||||
pub mod regex;
|
||||
|
||||
@@ -1,27 +1,25 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{
|
||||
Label,
|
||||
pattern::{GroupPattern, GroupSegment},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use regex::Regex;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
collections::{BTreeMap, HashMap},
|
||||
path::PathBuf,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ExtractState,
|
||||
source::{DataSource, misc::path_ts_latest},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, Item, PileValue},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DirDataSource {
|
||||
pub name: Label,
|
||||
pub dir: PathBuf,
|
||||
pub pattern: GroupPattern,
|
||||
pub base_pattern: Regex,
|
||||
pub files: HashMap<Label, String>,
|
||||
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
||||
}
|
||||
|
||||
@@ -29,21 +27,18 @@ impl DirDataSource {
|
||||
pub async fn new(
|
||||
name: &Label,
|
||||
dir: PathBuf,
|
||||
pattern: GroupPattern,
|
||||
base_pattern: Regex,
|
||||
files: HashMap<Label, String>,
|
||||
) -> Result<Arc<Self>, std::io::Error> {
|
||||
let source = Arc::new(Self {
|
||||
name: name.clone(),
|
||||
dir,
|
||||
pattern,
|
||||
base_pattern,
|
||||
files,
|
||||
index: OnceLock::new(),
|
||||
});
|
||||
|
||||
//
|
||||
// MARK: list paths
|
||||
//
|
||||
|
||||
let mut paths_items = HashSet::new();
|
||||
let mut paths_grouped_items = HashSet::new();
|
||||
let mut index = BTreeMap::new();
|
||||
'entry: for entry in WalkDir::new(&source.dir) {
|
||||
let entry = match entry {
|
||||
Err(e) => {
|
||||
@@ -59,51 +54,52 @@ impl DirDataSource {
|
||||
}
|
||||
|
||||
let path = entry.into_path();
|
||||
let path_str = match path.to_str() {
|
||||
let rel_path = match path.strip_prefix(&source.dir) {
|
||||
Ok(p) => p,
|
||||
Err(_) => continue 'entry,
|
||||
};
|
||||
let path_str = match rel_path.to_str() {
|
||||
Some(x) => x,
|
||||
None => continue 'entry,
|
||||
};
|
||||
|
||||
let groups = resolve_groups(&source.pattern, path_str).await;
|
||||
paths_grouped_items.extend(groups.into_values());
|
||||
paths_items.insert(path);
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: resolve groups
|
||||
//
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
'entry: for path in paths_items.difference(&paths_grouped_items) {
|
||||
let path_str = match path.to_str() {
|
||||
Some(x) => x,
|
||||
let captures = match source.base_pattern.captures(path_str) {
|
||||
Some(c) => c,
|
||||
None => continue 'entry,
|
||||
};
|
||||
let base = match captures.get(1) {
|
||||
Some(m) => m.as_str(),
|
||||
None => continue 'entry,
|
||||
};
|
||||
|
||||
let group = resolve_groups(&source.pattern, path_str).await;
|
||||
let group = group
|
||||
.into_iter()
|
||||
.map(|(k, group_path)| {
|
||||
(
|
||||
k,
|
||||
Box::new(Item::File {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(&group_path).first_or_octet_stream(),
|
||||
path: group_path.clone(),
|
||||
group: Arc::new(HashMap::new()),
|
||||
let key: SmartString<LazyCompact> = base.into();
|
||||
if index.contains_key(&key) {
|
||||
continue 'entry;
|
||||
}
|
||||
|
||||
let mut item_files = HashMap::new();
|
||||
for (label, template) in &source.files {
|
||||
let file_path = source.dir.join(template.replace("{base}", base));
|
||||
if file_path.exists() {
|
||||
let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
|
||||
item_files.insert(
|
||||
label.clone(),
|
||||
PileValue::Binary(BinaryPileValue::File {
|
||||
mime,
|
||||
path: file_path,
|
||||
}),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let item = Item::File {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(path).first_or_octet_stream(),
|
||||
path: path.into(),
|
||||
group: Arc::new(group),
|
||||
};
|
||||
|
||||
index.insert(item.key(), item);
|
||||
index.insert(
|
||||
key.clone(),
|
||||
Item::File {
|
||||
key,
|
||||
source: Arc::clone(&source),
|
||||
files: item_files,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
source.index.get_or_init(|| index);
|
||||
@@ -139,43 +135,3 @@ impl DataSource for Arc<DirDataSource> {
|
||||
path_ts_latest(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
async fn resolve_groups(pattern: &GroupPattern, path_str: &str) -> HashMap<Label, PathBuf> {
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
let mut group = HashMap::new();
|
||||
'pattern: for (l, pat) in &pattern.pattern {
|
||||
let item = PileValue::String(Arc::new(path_str.into()));
|
||||
let mut target = String::new();
|
||||
for p in pat {
|
||||
match p {
|
||||
GroupSegment::Literal(x) => target.push_str(x),
|
||||
GroupSegment::Path(op) => {
|
||||
let res = match item.query(&state, op).await {
|
||||
Ok(Some(x)) => x,
|
||||
_ => continue 'pattern,
|
||||
};
|
||||
|
||||
let res = match res.as_str() {
|
||||
Some(x) => x,
|
||||
None => continue 'pattern,
|
||||
};
|
||||
|
||||
target.push_str(res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let group_path: PathBuf = match target.parse() {
|
||||
Ok(x) => x,
|
||||
Err(_) => continue 'pattern,
|
||||
};
|
||||
|
||||
if !group_path.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
group.insert(l.clone(), group_path);
|
||||
}
|
||||
|
||||
return group;
|
||||
}
|
||||
|
||||
@@ -1,74 +1,45 @@
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_io::SyncReadBridge;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use crate::{source::DirDataSource, value::ItemReader};
|
||||
use crate::{source::DirDataSource, value::PileValue};
|
||||
|
||||
//
|
||||
// MARK: item
|
||||
//
|
||||
|
||||
/// A cheaply-cloneable pointer to an item in a dataset
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Clone)]
|
||||
pub enum Item {
|
||||
File {
|
||||
key: SmartString<LazyCompact>,
|
||||
source: Arc<DirDataSource>,
|
||||
mime: Mime,
|
||||
|
||||
path: PathBuf,
|
||||
group: Arc<HashMap<Label, Box<Item>>>,
|
||||
files: HashMap<Label, PileValue>,
|
||||
},
|
||||
}
|
||||
|
||||
impl Item {
|
||||
/// Open the item for reading.
|
||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||
Ok(match self {
|
||||
Self::File { path, .. } => ItemReader::File(File::open(path)?),
|
||||
})
|
||||
impl std::fmt::Debug for Item {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::File { key, files, .. } => f
|
||||
.debug_struct("Item::File")
|
||||
.field("key", key)
|
||||
.field("files", &files.keys().collect::<Vec<_>>())
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Item {
|
||||
pub fn source_name(&self) -> &pile_config::Label {
|
||||
match self {
|
||||
Self::File { source, .. } => &source.name,
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
pub fn key(&self) -> SmartString<LazyCompact> {
|
||||
match self {
|
||||
Self::File { source, path, .. } => path
|
||||
.strip_prefix(&source.dir)
|
||||
.expect("item must be inside source")
|
||||
.to_str()
|
||||
.expect("path is not utf-8")
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||
let read = self.read().await?;
|
||||
let mut read = SyncReadBridge::new_current(read);
|
||||
let out = tokio::task::spawn_blocking(move || {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
std::io::copy(&mut read, &mut hasher)?;
|
||||
return Ok::<_, std::io::Error>(hasher.finalize());
|
||||
})
|
||||
.await??;
|
||||
return Ok(out);
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
match self {
|
||||
Self::File { mime, .. } => mime,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
|
||||
match self {
|
||||
Self::File { group, .. } => group,
|
||||
Self::File { key, .. } => key.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
||||
use std::{fs::File, io::Seek};
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{Cursor, Seek},
|
||||
};
|
||||
|
||||
use crate::value::ArcBytes;
|
||||
|
||||
//
|
||||
// MARK: itemreader
|
||||
@@ -7,12 +12,14 @@ use std::{fs::File, io::Seek};
|
||||
|
||||
pub enum ItemReader {
|
||||
File(File),
|
||||
Vec(Cursor<ArcBytes>),
|
||||
}
|
||||
|
||||
impl AsyncReader for ItemReader {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
match self {
|
||||
Self::File(x) => std::io::Read::read(x, buf),
|
||||
Self::Vec(x) => std::io::Read::read(x, buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,6 +28,7 @@ impl AsyncSeekReader for ItemReader {
|
||||
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
||||
match self {
|
||||
Self::File(x) => x.seek(pos),
|
||||
Self::Vec(x) => x.seek(pos),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,75 @@
|
||||
use mime::Mime;
|
||||
use pile_config::objectpath::{ObjectPath, PathSegment};
|
||||
use pile_io::SyncReadBridge;
|
||||
use serde_json::{Map, Value};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
extract::{
|
||||
item::{ImageExtractor, ItemExtractor},
|
||||
blob::BinaryExtractor,
|
||||
item::ItemExtractor,
|
||||
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
||||
string::StringExtractor,
|
||||
traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
},
|
||||
value::Item,
|
||||
value::{Item, ItemReader},
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ArcBytes(pub Arc<Vec<u8>>);
|
||||
impl Debug for ArcBytes {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ArcBytes")
|
||||
.field("len()", &self.0.len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ArcBytes {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum BinaryPileValue {
|
||||
/// A binary blob
|
||||
Blob { mime: Mime, bytes: ArcBytes },
|
||||
|
||||
/// A pointer to a file
|
||||
File { mime: Mime, path: PathBuf },
|
||||
}
|
||||
|
||||
impl BinaryPileValue {
|
||||
/// Open the item for reading.
|
||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||
match self {
|
||||
Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
|
||||
Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||
let read = self.read().await?;
|
||||
let mut read = SyncReadBridge::new_current(read);
|
||||
let out = tokio::task::spawn_blocking(move || {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
std::io::copy(&mut read, &mut hasher)?;
|
||||
return Ok::<_, std::io::Error>(hasher.finalize());
|
||||
})
|
||||
.await??;
|
||||
return Ok(out);
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
match self {
|
||||
Self::Blob { mime, .. } => mime,
|
||||
Self::File { mime, .. } => mime,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An immutable, cheaply-cloneable, lazily-computed value.
|
||||
/// Very similar to [serde_json::Value].
|
||||
pub enum PileValue {
|
||||
@@ -27,12 +83,6 @@ pub enum PileValue {
|
||||
/// An array of values
|
||||
Array(Arc<Vec<PileValue>>),
|
||||
|
||||
/// A binary blob
|
||||
Blob {
|
||||
mime: Mime,
|
||||
bytes: Arc<Vec<u8>>,
|
||||
},
|
||||
|
||||
/// A lazily-computed map of {label: value}
|
||||
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
||||
|
||||
@@ -41,6 +91,9 @@ pub enum PileValue {
|
||||
|
||||
/// A pointer to an item in this dataset
|
||||
Item(Item),
|
||||
|
||||
/// Binary data
|
||||
Binary(BinaryPileValue),
|
||||
}
|
||||
|
||||
impl Clone for PileValue {
|
||||
@@ -53,11 +106,8 @@ impl Clone for PileValue {
|
||||
Self::Array(x) => Self::Array(x.clone()),
|
||||
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
||||
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
||||
Self::Blob { mime, bytes } => Self::Blob {
|
||||
mime: mime.clone(),
|
||||
bytes: bytes.clone(),
|
||||
},
|
||||
Self::Item(i) => Self::Item(i.clone()),
|
||||
Self::Binary(b) => Self::Binary(b.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -70,13 +120,10 @@ impl PileValue {
|
||||
Self::I64(_) => Arc::new(MapExtractor::default()),
|
||||
Self::Array(_) => Arc::new(MapExtractor::default()),
|
||||
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
||||
Self::Blob { mime, bytes } => {
|
||||
// TODO: make a blobextractor (with pdf, epub, etc; like item)
|
||||
Arc::new(ImageExtractor::from_blob(bytes.clone(), mime.clone()))
|
||||
}
|
||||
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
||||
Self::ObjectExtractor(e) => e.clone(),
|
||||
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
||||
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,12 +134,12 @@ impl PileValue {
|
||||
Self::I64(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
||||
Self::ListExtractor(e) => e.clone(),
|
||||
Self::ObjectExtractor(e) => e
|
||||
.as_list()
|
||||
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Binary(_) => Arc::new(VecExtractor::default()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,14 +244,17 @@ impl PileValue {
|
||||
Ok(match self {
|
||||
Self::Null => None,
|
||||
|
||||
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
|
||||
Some(Value::Number(1u64.into()))
|
||||
}
|
||||
Self::U64(_)
|
||||
| Self::I64(_)
|
||||
| Self::String(_)
|
||||
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
|
||||
|
||||
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
||||
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
|
||||
|
||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||
Self::ObjectExtractor(_)
|
||||
| Self::Item(_)
|
||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||
let e = self.object_extractor();
|
||||
let keys = e.fields().await?;
|
||||
let mut map = Map::new();
|
||||
@@ -241,8 +291,8 @@ impl PileValue {
|
||||
Self::String(x) => Value::String(x.to_string()),
|
||||
|
||||
// TODO: replace with something meaningful?
|
||||
Self::Blob { mime, bytes } => {
|
||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
|
||||
Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
|
||||
}
|
||||
|
||||
Self::Array(_) | Self::ListExtractor(_) => {
|
||||
@@ -250,7 +300,9 @@ impl PileValue {
|
||||
return e.to_json(state).await;
|
||||
}
|
||||
|
||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||
Self::ObjectExtractor(_)
|
||||
| Self::Item(_)
|
||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||
let e = self.object_extractor();
|
||||
return e.to_json(state).await;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user