Refactor grouping

This commit is contained in:
2026-03-28 11:20:16 -07:00
parent 9967e066bb
commit 5527b61d39
40 changed files with 466 additions and 630 deletions

View File

@@ -1,74 +1,45 @@
use mime::Mime;
use pile_config::Label;
use pile_io::SyncReadBridge;
use smartstring::{LazyCompact, SmartString};
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
use std::{collections::HashMap, sync::Arc};
use crate::{source::DirDataSource, value::ItemReader};
use crate::{source::DirDataSource, value::PileValue};
//
// MARK: item
//
/// A cheaply-cloneable pointer to an item in a dataset
#[derive(Debug, Clone)]
#[derive(Clone)]
pub enum Item {
File {
key: SmartString<LazyCompact>,
source: Arc<DirDataSource>,
mime: Mime,
path: PathBuf,
group: Arc<HashMap<Label, Box<Item>>>,
files: HashMap<Label, PileValue>,
},
}
impl Item {
/// Open the item for reading.
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
Ok(match self {
Self::File { path, .. } => ItemReader::File(File::open(path)?),
})
// Hand-written `Debug` for `Item`.
//
// Prints the item's key and only the labels of its `files` map; the mapped
// values and all remaining fields are left out of the output.
impl std::fmt::Debug for Item {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::File { key, files, .. } => {
                // Only the labels are shown, not the values they map to.
                let labels: Vec<_> = files.keys().collect();

                let mut out = f.debug_struct("Item::File");
                out.field("key", key);
                out.field("files", &labels);
                out.finish()
            }
        }
    }
}
impl Item {
pub fn source_name(&self) -> &pile_config::Label {
match self {
Self::File { source, .. } => &source.name,
}
}
#[expect(clippy::expect_used)]
pub fn key(&self) -> SmartString<LazyCompact> {
match self {
Self::File { source, path, .. } => path
.strip_prefix(&source.dir)
.expect("item must be inside source")
.to_str()
.expect("path is not utf-8")
.into(),
}
}
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
let read = self.read().await?;
let mut read = SyncReadBridge::new_current(read);
let out = tokio::task::spawn_blocking(move || {
let mut hasher = blake3::Hasher::new();
std::io::copy(&mut read, &mut hasher)?;
return Ok::<_, std::io::Error>(hasher.finalize());
})
.await??;
return Ok(out);
}
pub fn mime(&self) -> &Mime {
match self {
Self::File { mime, .. } => mime,
}
}
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
match self {
Self::File { group, .. } => group,
Self::File { key, .. } => key.clone(),
}
}
}

View File

@@ -1,5 +1,10 @@
use pile_io::{AsyncReader, AsyncSeekReader};
use std::{fs::File, io::Seek};
use std::{
fs::File,
io::{Cursor, Seek},
};
use crate::value::ArcBytes;
//
// MARK: itemreader
@@ -7,12 +12,14 @@ use std::{fs::File, io::Seek};
/// A handle for reading an item's bytes.
///
/// Both variants wrap a synchronous `std::io` reader; the `AsyncReader` and
/// `AsyncSeekReader` impls in this file forward to the wrapped value.
pub enum ItemReader {
/// Reads from an open [`File`].
File(File),
/// Reads from an in-memory [`ArcBytes`] buffer through a [`Cursor`].
Vec(Cursor<ArcBytes>),
}
impl AsyncReader for ItemReader {
    /// Read bytes into `buf` from whichever backing reader this value wraps,
    /// returning the number of bytes read.
    ///
    /// The call is forwarded to the synchronous [`std::io::Read::read`] of the
    /// wrapped reader; nothing is awaited inside this method.
    async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        // Select the concrete reader once, then issue a single read call.
        let source: &mut dyn std::io::Read = match self {
            Self::File(file) => file,
            Self::Vec(cursor) => cursor,
        };

        std::io::Read::read(source, buf)
    }
}
@@ -21,6 +28,7 @@ impl AsyncSeekReader for ItemReader {
/// Move the read position of the wrapped reader to `pos` and return the
/// new offset from the start of the stream.
///
/// The call is forwarded to the synchronous [`std::io::Seek::seek`] of the
/// wrapped reader; nothing is awaited inside this method.
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
    // Select the concrete seeker once, then issue a single seek call.
    let target: &mut dyn Seek = match self {
        Self::File(file) => file,
        Self::Vec(cursor) => cursor,
    };

    Seek::seek(target, pos)
}
}

View File

@@ -1,19 +1,75 @@
use mime::Mime;
use pile_config::objectpath::{ObjectPath, PathSegment};
use pile_io::SyncReadBridge;
use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
use crate::{
extract::{
item::{ImageExtractor, ItemExtractor},
blob::BinaryExtractor,
item::ItemExtractor,
misc::{ArrayExtractor, MapExtractor, VecExtractor},
string::StringExtractor,
traits::{ExtractState, ListExtractor, ObjectExtractor},
},
value::Item,
value::{Item, ItemReader},
};
/// A reference-counted, immutable byte buffer.
///
/// Cloning only bumps the [`Arc`] reference count; the bytes themselves are
/// shared between clones.
#[derive(Clone)]
pub struct ArcBytes(pub Arc<Vec<u8>>);

/// Debug output shows only the buffer length, never the bytes themselves.
impl Debug for ArcBytes {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let byte_count = self.0.len();

        let mut out = f.debug_struct("ArcBytes");
        out.field("len()", &byte_count);
        out.finish()
    }
}

/// Borrow the buffer as a plain byte slice (this is what lets
/// `std::io::Cursor<ArcBytes>` implement `Read`).
impl AsRef<[u8]> for ArcBytes {
    fn as_ref(&self) -> &[u8] {
        self.0.as_slice()
    }
}
/// Binary data, either held in memory or stored in a file on disk.
///
/// Both variants carry the MIME type of the data alongside it.
#[derive(Debug, Clone)]
pub enum BinaryPileValue {
/// A binary blob held in memory
Blob { mime: Mime, bytes: ArcBytes },
/// A pointer to a file
File { mime: Mime, path: PathBuf },
}
impl BinaryPileValue {
/// Open the item for reading.
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
match self {
Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
}
}
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
let read = self.read().await?;
let mut read = SyncReadBridge::new_current(read);
let out = tokio::task::spawn_blocking(move || {
let mut hasher = blake3::Hasher::new();
std::io::copy(&mut read, &mut hasher)?;
return Ok::<_, std::io::Error>(hasher.finalize());
})
.await??;
return Ok(out);
}
pub fn mime(&self) -> &Mime {
match self {
Self::Blob { mime, .. } => mime,
Self::File { mime, .. } => mime,
}
}
}
/// An immutable, cheaply-cloneable, lazily-computed value.
/// Very similar to [serde_json::Value].
pub enum PileValue {
@@ -27,12 +83,6 @@ pub enum PileValue {
/// An array of values
Array(Arc<Vec<PileValue>>),
/// A binary blob
Blob {
mime: Mime,
bytes: Arc<Vec<u8>>,
},
/// A lazily-computed map of {label: value}
ObjectExtractor(Arc<dyn ObjectExtractor>),
@@ -41,6 +91,9 @@ pub enum PileValue {
/// A pointer to an item in this dataset
Item(Item),
/// Binary data
Binary(BinaryPileValue),
}
impl Clone for PileValue {
@@ -53,11 +106,8 @@ impl Clone for PileValue {
Self::Array(x) => Self::Array(x.clone()),
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
Self::Blob { mime, bytes } => Self::Blob {
mime: mime.clone(),
bytes: bytes.clone(),
},
Self::Item(i) => Self::Item(i.clone()),
Self::Binary(b) => Self::Binary(b.clone()),
}
}
}
@@ -70,13 +120,10 @@ impl PileValue {
Self::I64(_) => Arc::new(MapExtractor::default()),
Self::Array(_) => Arc::new(MapExtractor::default()),
Self::String(s) => Arc::new(StringExtractor::new(s)),
Self::Blob { mime, bytes } => {
// TODO: make a blobextractor (with pdf, epub, etc; like item)
Arc::new(ImageExtractor::from_blob(bytes.clone(), mime.clone()))
}
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
Self::ObjectExtractor(e) => e.clone(),
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
}
}
@@ -87,12 +134,12 @@ impl PileValue {
Self::I64(_) => Arc::new(VecExtractor::default()),
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
Self::String(_) => Arc::new(VecExtractor::default()),
Self::Blob { .. } => Arc::new(VecExtractor::default()),
Self::ListExtractor(e) => e.clone(),
Self::ObjectExtractor(e) => e
.as_list()
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
Self::Item(_) => Arc::new(VecExtractor::default()),
Self::Binary(_) => Arc::new(VecExtractor::default()),
}
}
@@ -197,14 +244,17 @@ impl PileValue {
Ok(match self {
Self::Null => None,
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
Some(Value::Number(1u64.into()))
}
Self::U64(_)
| Self::I64(_)
| Self::String(_)
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
Self::ObjectExtractor(_) | Self::Item(_) => {
Self::ObjectExtractor(_)
| Self::Item(_)
| Self::Binary(BinaryPileValue::File { .. }) => {
let e = self.object_extractor();
let keys = e.fields().await?;
let mut map = Map::new();
@@ -241,8 +291,8 @@ impl PileValue {
Self::String(x) => Value::String(x.to_string()),
// TODO: replace with something meaningful?
Self::Blob { mime, bytes } => {
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
}
Self::Array(_) | Self::ListExtractor(_) => {
@@ -250,7 +300,9 @@ impl PileValue {
return e.to_json(state).await;
}
Self::ObjectExtractor(_) | Self::Item(_) => {
Self::ObjectExtractor(_)
| Self::Item(_)
| Self::Binary(BinaryPileValue::File { .. }) => {
let e = self.object_extractor();
return e.to_json(state).await;
}