Add ListExtractor
This commit is contained in:
@@ -184,7 +184,7 @@ impl Datasets {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
let extractor = MetaExtractor::new(&item);
|
let extractor = MetaExtractor::new(&item);
|
||||||
let root = PileValue::Extractor(Arc::new(extractor));
|
let root = PileValue::ObjectExtractor(Arc::new(extractor));
|
||||||
let Some(value) = root.query(path).await? else {
|
let Some(value) = root.query(path).await? else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use pile_config::Label;
|
|||||||
use std::{collections::HashMap, sync::OnceLock};
|
use std::{collections::HashMap, sync::OnceLock};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct EpubMetaExtractor<'a> {
|
pub struct EpubMetaExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -78,7 +78,7 @@ impl<'a> EpubMetaExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for EpubMetaExtractor<'_> {
|
impl ObjectExtractor for EpubMetaExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use pile_config::Label;
|
|||||||
use std::{collections::HashMap, sync::OnceLock};
|
use std::{collections::HashMap, sync::OnceLock};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct EpubTextExtractor<'a> {
|
pub struct EpubTextExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -88,7 +88,7 @@ fn strip_html(html: &str) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for EpubTextExtractor<'_> {
|
impl ObjectExtractor for EpubTextExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ pub use epub_text::*;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Item, PileValue,
|
Item, PileValue,
|
||||||
extract::{Extractor, MapExtractor},
|
extract::{MapExtractor, ObjectExtractor},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubExtractor<'a> {
|
pub struct EpubExtractor<'a> {
|
||||||
@@ -23,11 +23,11 @@ impl<'a> EpubExtractor<'a> {
|
|||||||
inner: HashMap::from([
|
inner: HashMap::from([
|
||||||
(
|
(
|
||||||
Label::new("text").unwrap(),
|
Label::new("text").unwrap(),
|
||||||
PileValue::Extractor(Arc::new(EpubTextExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(EpubTextExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("meta").unwrap(),
|
Label::new("meta").unwrap(),
|
||||||
PileValue::Extractor(Arc::new(EpubMetaExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(EpubMetaExtractor::new(item))),
|
||||||
),
|
),
|
||||||
]),
|
]),
|
||||||
};
|
};
|
||||||
@@ -37,7 +37,7 @@ impl<'a> EpubExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for EpubExtractor<'_> {
|
impl ObjectExtractor for EpubExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &pile_config::Label,
|
name: &pile_config::Label,
|
||||||
@@ -45,7 +45,7 @@ impl Extractor for EpubExtractor<'_> {
|
|||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
if name.as_str() == "text" {
|
if name.as_str() == "text" {
|
||||||
match self.inner.inner.get(name).unwrap() {
|
match self.inner.inner.get(name).unwrap() {
|
||||||
PileValue::Extractor(x) => return x.field(name).await,
|
PileValue::ObjectExtractor(x) => return x.field(name).await,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use pile_config::Label;
|
|||||||
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct ExifExtractor<'a> {
|
pub struct ExifExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -78,7 +78,7 @@ fn tag_to_label(tag: &str) -> Option<Label> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for ExifExtractor<'_> {
|
impl ObjectExtractor for ExifExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -7,14 +7,17 @@ use std::{
|
|||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{
|
||||||
|
Item, PileValue, SyncReadBridge,
|
||||||
|
extract::{ListExtractor, ObjectExtractor},
|
||||||
|
};
|
||||||
|
|
||||||
pub struct FlacExtractor<'a> {
|
pub struct FlacImagesExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
output: OnceLock<Vec<PileValue<'a>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> FlacExtractor<'a> {
|
impl<'a> FlacImagesExtractor<'a> {
|
||||||
pub fn new(item: &'a Item) -> Self {
|
pub fn new(item: &'a Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item,
|
item,
|
||||||
@@ -22,6 +25,77 @@ impl<'a> FlacExtractor<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&Vec<PileValue<'a>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
let raw_images = tokio::task::spawn_blocking(move || {
|
||||||
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
|
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
|
||||||
|
for block in reader {
|
||||||
|
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||||
|
FlacBlock::Picture(picture) => {
|
||||||
|
images.push((picture.mime, picture.img_data));
|
||||||
|
}
|
||||||
|
FlacBlock::AudioFrame(_) => break,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok::<_, std::io::Error>(images)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)??;
|
||||||
|
|
||||||
|
let images = raw_images
|
||||||
|
.into_iter()
|
||||||
|
.map(|(mime, data)| PileValue::Blob {
|
||||||
|
mime,
|
||||||
|
bytes: Arc::new(data),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let _ = self.output.set(images);
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
return Ok(self.output.get().unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ListExtractor for FlacImagesExtractor<'_> {
|
||||||
|
async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.get(idx))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FlacExtractor<'a> {
|
||||||
|
item: &'a Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue<'a>>>,
|
||||||
|
images: Option<PileValue<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> FlacExtractor<'a> {
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
let is_flac = match item {
|
||||||
|
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
|
||||||
|
Item::S3 { key, .. } => key.ends_with(".flac"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let images =
|
||||||
|
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
|
||||||
|
|
||||||
|
Self {
|
||||||
|
item,
|
||||||
|
output: OnceLock::new(),
|
||||||
|
images,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a>>, std::io::Error> {
|
||||||
if let Some(x) = self.output.get() {
|
if let Some(x) = self.output.get() {
|
||||||
return Ok(x);
|
return Ok(x);
|
||||||
@@ -39,10 +113,9 @@ impl<'a> FlacExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let (raw_tags, raw_images) = tokio::task::spawn_blocking(move || {
|
let raw_tags = tokio::task::spawn_blocking(move || {
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
let mut tags: Vec<(String, String)> = Vec::new();
|
let mut tags: Vec<(String, String)> = Vec::new();
|
||||||
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
|
|
||||||
for block in reader {
|
for block in reader {
|
||||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||||
FlacBlock::VorbisComment(comment) => {
|
FlacBlock::VorbisComment(comment) => {
|
||||||
@@ -50,14 +123,11 @@ impl<'a> FlacExtractor<'a> {
|
|||||||
tags.push((k.to_string().to_lowercase(), v.into()));
|
tags.push((k.to_string().to_lowercase(), v.into()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FlacBlock::Picture(picture) => {
|
|
||||||
images.push((picture.mime, picture.img_data));
|
|
||||||
}
|
|
||||||
FlacBlock::AudioFrame(_) => break,
|
FlacBlock::AudioFrame(_) => break,
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok::<_, std::io::Error>((tags, images))
|
Ok::<_, std::io::Error>(tags)
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(std::io::Error::other)??;
|
.map_err(std::io::Error::other)??;
|
||||||
@@ -71,24 +141,11 @@ impl<'a> FlacExtractor<'a> {
|
|||||||
.push(PileValue::String(v.into()));
|
.push(PileValue::String(v.into()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut output: HashMap<Label, PileValue<'a>> = output
|
let output: HashMap<Label, PileValue<'a>> = output
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(k, v)| (k, PileValue::Array(v)))
|
.map(|(k, v)| (k, PileValue::Array(v)))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
if !raw_images.is_empty()
|
|
||||||
&& let Some(label) = Label::new("images".to_owned())
|
|
||||||
{
|
|
||||||
let images = raw_images
|
|
||||||
.into_iter()
|
|
||||||
.map(|(mime, data)| PileValue::Blob {
|
|
||||||
mime,
|
|
||||||
bytes: Arc::new(data),
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
output.insert(label, PileValue::Array(images));
|
|
||||||
}
|
|
||||||
|
|
||||||
let _ = self.output.set(output);
|
let _ = self.output.set(output);
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
return Ok(self.output.get().unwrap());
|
return Ok(self.output.get().unwrap());
|
||||||
@@ -96,15 +153,25 @@ impl<'a> FlacExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for FlacExtractor<'_> {
|
impl ObjectExtractor for FlacExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
if name.as_str() == "images"
|
||||||
|
&& let Some(ref images) = self.images
|
||||||
|
{
|
||||||
|
return Ok(Some(images));
|
||||||
|
}
|
||||||
Ok(self.get_inner().await?.get(name))
|
Ok(self.get_inner().await?.get(name))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
|
||||||
|
if self.images.is_some() {
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
fields.push(Label::new("images").unwrap());
|
||||||
|
}
|
||||||
|
Ok(fields)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::{collections::HashMap, path::Component, sync::OnceLock};
|
use std::{collections::HashMap, path::Component, sync::OnceLock};
|
||||||
|
|
||||||
use crate::{Item, PileValue, extract::Extractor};
|
use crate::{Item, PileValue, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct FsExtractor<'a> {
|
pub struct FsExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -62,7 +62,7 @@ impl<'a> FsExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for FsExtractor<'_> {
|
impl ObjectExtractor for FsExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use id3::Tag;
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
|
use std::{borrow::Cow, collections::HashMap, io::BufReader, sync::OnceLock};
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct Id3Extractor<'a> {
|
pub struct Id3Extractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -114,7 +114,7 @@ fn frame_id_to_field(id: &str) -> Cow<'static, str> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for Id3Extractor<'_> {
|
impl ObjectExtractor for Id3Extractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use crate::{PileValue, extract::Extractor};
|
use crate::{PileValue, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct MapExtractor<'a> {
|
pub struct MapExtractor<'a> {
|
||||||
pub(crate) inner: HashMap<Label, PileValue<'a>>,
|
pub(crate) inner: HashMap<Label, PileValue<'a>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for MapExtractor<'_> {
|
impl ObjectExtractor for MapExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ use crate::Item;
|
|||||||
/// Metadata is exposed as an immutable map of {label: value},
|
/// Metadata is exposed as an immutable map of {label: value},
|
||||||
/// much like a json object.
|
/// much like a json object.
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
pub trait Extractor: Send + Sync {
|
pub trait ObjectExtractor: Send + Sync {
|
||||||
/// Get the field at `name` from `item`.
|
/// Get the field at `name` from `item`.
|
||||||
/// - returns `None` if `name` is not a valid field
|
/// - returns `None` if `name` is not a valid field
|
||||||
/// - returns `Some(Null)` if `name` is not available
|
/// - returns `Some(Null)` if `name` is not available
|
||||||
@@ -50,14 +50,31 @@ pub trait Extractor: Send + Sync {
|
|||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// An attachment that extracts metadata from an [Item].
|
||||||
|
///
|
||||||
|
/// Metadata is exposed as an immutable list of values.
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
pub trait ListExtractor: Send + Sync {
|
||||||
|
/// Get the item at index `idx`.
|
||||||
|
/// Indices start at zero, and must be consecutive.
|
||||||
|
/// - returns `None` if `idx` is out of range
|
||||||
|
/// - returns `Some(Null)` if `None` is at `idx`
|
||||||
|
async fn get<'a>(
|
||||||
|
&'a self,
|
||||||
|
idx: usize,
|
||||||
|
) -> Result<Option<&'a crate::PileValue<'a>>, std::io::Error>;
|
||||||
|
|
||||||
|
async fn len(&self) -> Result<usize, std::io::Error>;
|
||||||
|
|
||||||
|
async fn is_empty(&self) -> Result<bool, std::io::Error> {
|
||||||
|
Ok(self.len().await? == 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct MetaExtractor<'a> {
|
pub struct MetaExtractor<'a> {
|
||||||
inner: MapExtractor<'a>,
|
inner: MapExtractor<'a>,
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: file
|
|
||||||
//
|
|
||||||
|
|
||||||
impl<'a> MetaExtractor<'a> {
|
impl<'a> MetaExtractor<'a> {
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
pub fn new(item: &'a Item) -> Self {
|
pub fn new(item: &'a Item) -> Self {
|
||||||
@@ -65,35 +82,35 @@ impl<'a> MetaExtractor<'a> {
|
|||||||
inner: HashMap::from([
|
inner: HashMap::from([
|
||||||
(
|
(
|
||||||
Label::new("flac").unwrap(),
|
Label::new("flac").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(FlacExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("id3").unwrap(),
|
Label::new("id3").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(Id3Extractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("fs").unwrap(),
|
Label::new("fs").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(FsExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("epub").unwrap(),
|
Label::new("epub").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(EpubExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("exif").unwrap(),
|
Label::new("exif").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(ExifExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(PdfExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("toml").unwrap(),
|
Label::new("toml").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(TomlExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("sidecar").unwrap(),
|
Label::new("sidecar").unwrap(),
|
||||||
crate::PileValue::Extractor(Arc::new(SidecarExtractor::new(item))),
|
crate::PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
|
||||||
),
|
),
|
||||||
]),
|
]),
|
||||||
};
|
};
|
||||||
@@ -103,7 +120,7 @@ impl<'a> MetaExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for MetaExtractor<'_> {
|
impl ObjectExtractor for MetaExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &pile_config::Label,
|
name: &pile_config::Label,
|
||||||
|
|||||||
@@ -6,6 +6,11 @@ mod pdf_cover;
|
|||||||
#[cfg(feature = "pdfium")]
|
#[cfg(feature = "pdfium")]
|
||||||
pub use pdf_cover::*;
|
pub use pdf_cover::*;
|
||||||
|
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
mod pdf_pages;
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
pub use pdf_pages::*;
|
||||||
|
|
||||||
mod pdf_meta;
|
mod pdf_meta;
|
||||||
pub use pdf_meta::*;
|
pub use pdf_meta::*;
|
||||||
|
|
||||||
@@ -14,7 +19,7 @@ pub use pdf_text::*;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Item, PileValue,
|
Item, PileValue,
|
||||||
extract::{Extractor, MapExtractor},
|
extract::{MapExtractor, ObjectExtractor},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfExtractor<'a> {
|
pub struct PdfExtractor<'a> {
|
||||||
@@ -27,16 +32,21 @@ impl<'a> PdfExtractor<'a> {
|
|||||||
let mut inner_map = HashMap::new();
|
let mut inner_map = HashMap::new();
|
||||||
inner_map.insert(
|
inner_map.insert(
|
||||||
Label::new("text").unwrap(),
|
Label::new("text").unwrap(),
|
||||||
PileValue::Extractor(Arc::new(PdfTextExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(PdfTextExtractor::new(item))),
|
||||||
);
|
);
|
||||||
inner_map.insert(
|
inner_map.insert(
|
||||||
Label::new("meta").unwrap(),
|
Label::new("meta").unwrap(),
|
||||||
PileValue::Extractor(Arc::new(PdfMetaExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(PdfMetaExtractor::new(item))),
|
||||||
);
|
);
|
||||||
#[cfg(feature = "pdfium")]
|
#[cfg(feature = "pdfium")]
|
||||||
inner_map.insert(
|
inner_map.insert(
|
||||||
Label::new("cover").unwrap(),
|
Label::new("cover").unwrap(),
|
||||||
PileValue::Extractor(Arc::new(PdfCoverExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(PdfCoverExtractor::new(item))),
|
||||||
|
);
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
inner_map.insert(
|
||||||
|
Label::new("pages").unwrap(),
|
||||||
|
PileValue::ListExtractor(Arc::new(PdfPagesExtractor::new(item))),
|
||||||
);
|
);
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
@@ -46,7 +56,7 @@ impl<'a> PdfExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for PdfExtractor<'_> {
|
impl ObjectExtractor for PdfExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &pile_config::Label,
|
name: &pile_config::Label,
|
||||||
@@ -54,7 +64,7 @@ impl Extractor for PdfExtractor<'_> {
|
|||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
if name.as_str() == "text" {
|
if name.as_str() == "text" {
|
||||||
match self.inner.inner.get(name).unwrap() {
|
match self.inner.inner.get(name).unwrap() {
|
||||||
PileValue::Extractor(x) => return x.field(name).await,
|
PileValue::ObjectExtractor(x) => return x.field(name).await,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -63,7 +73,7 @@ impl Extractor for PdfExtractor<'_> {
|
|||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
if name.as_str() == "cover" {
|
if name.as_str() == "cover" {
|
||||||
match self.inner.inner.get(name).unwrap() {
|
match self.inner.inner.get(name).unwrap() {
|
||||||
PileValue::Extractor(x) => return x.field(name).await,
|
PileValue::ObjectExtractor(x) => return x.field(name).await,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -78,6 +88,8 @@ impl Extractor for PdfExtractor<'_> {
|
|||||||
Label::new("meta").unwrap(),
|
Label::new("meta").unwrap(),
|
||||||
#[cfg(feature = "pdfium")]
|
#[cfg(feature = "pdfium")]
|
||||||
Label::new("cover").unwrap(),
|
Label::new("cover").unwrap(),
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
Label::new("pages").unwrap(),
|
||||||
])
|
])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use std::{
|
|||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
|
||||||
|
|
||||||
pub struct PdfCoverExtractor<'a> {
|
pub struct PdfCoverExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -84,7 +84,7 @@ impl<'a> PdfCoverExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for PdfCoverExtractor<'_> {
|
impl ObjectExtractor for PdfCoverExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -4,7 +4,8 @@ use pile_config::Label;
|
|||||||
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::extract::ObjectExtractor;
|
||||||
|
use crate::{Item, PileValue, SyncReadBridge};
|
||||||
|
|
||||||
pub struct PdfMetaExtractor<'a> {
|
pub struct PdfMetaExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -40,6 +41,8 @@ impl<'a> PdfMetaExtractor<'a> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let page_count = file.num_pages();
|
||||||
|
|
||||||
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
|
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
|
||||||
|
|
||||||
if let Some(info) = &file.trailer.info_dict {
|
if let Some(info) = &file.trailer.info_dict {
|
||||||
@@ -64,12 +67,12 @@ impl<'a> PdfMetaExtractor<'a> {
|
|||||||
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
|
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok::<_, std::io::Error>(meta)
|
Ok::<_, std::io::Error>((page_count, meta))
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(std::io::Error::other)?;
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
let raw_meta = match raw_meta {
|
let (page_count, raw_meta) = match raw_meta {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||||
@@ -79,6 +82,12 @@ impl<'a> PdfMetaExtractor<'a> {
|
|||||||
|
|
||||||
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
let mut output: HashMap<Label, PileValue<'a>> = HashMap::new();
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
output.insert(
|
||||||
|
Label::new("pages").unwrap(),
|
||||||
|
PileValue::U64(page_count as u64),
|
||||||
|
);
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
for (key, val) in raw_meta {
|
for (key, val) in raw_meta {
|
||||||
let label = Label::new(key).unwrap();
|
let label = Label::new(key).unwrap();
|
||||||
@@ -106,7 +115,7 @@ fn format_date(d: &Date) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for PdfMetaExtractor<'_> {
|
impl ObjectExtractor for PdfMetaExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
119
crates/pile-dataset/src/extract/pdf/pdf_pages.rs
Normal file
119
crates/pile-dataset/src/extract/pdf/pdf_pages.rs
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
use image::ImageFormat;
|
||||||
|
use pdfium_render::prelude::*;
|
||||||
|
use std::{
|
||||||
|
io::{BufReader, Cursor},
|
||||||
|
sync::{Arc, OnceLock},
|
||||||
|
};
|
||||||
|
use tracing::trace;
|
||||||
|
|
||||||
|
use crate::{Item, PileValue, SyncReadBridge, extract::ListExtractor};
|
||||||
|
|
||||||
|
pub struct PdfPagesExtractor<'a> {
|
||||||
|
item: &'a Item,
|
||||||
|
bytes: OnceLock<Arc<Vec<u8>>>,
|
||||||
|
pages: OnceLock<Vec<OnceLock<PileValue<'a>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> PdfPagesExtractor<'a> {
|
||||||
|
pub fn new(item: &'a Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item,
|
||||||
|
bytes: OnceLock::new(),
|
||||||
|
pages: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_bytes(&self) -> Result<&Arc<Vec<u8>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.bytes.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
let bytes = tokio::task::spawn_blocking(move || {
|
||||||
|
let mut b = Vec::new();
|
||||||
|
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
|
||||||
|
Ok::<_, std::io::Error>(b)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)??;
|
||||||
|
let _ = self.bytes.set(Arc::new(bytes));
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
return Ok(self.bytes.get().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn init_pages(&self) -> Result<&Vec<OnceLock<PileValue<'a>>>, std::io::Error> {
|
||||||
|
if let Some(x) = self.pages.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
let bytes = Arc::clone(self.get_bytes().await?);
|
||||||
|
let count = tokio::task::spawn_blocking(move || {
|
||||||
|
let pdfium = Pdfium::default();
|
||||||
|
let doc = pdfium
|
||||||
|
.load_pdf_from_byte_slice(&bytes, None)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
Ok::<_, std::io::Error>(doc.pages().len() as usize)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
let slots = match count {
|
||||||
|
Ok(n) => (0..n).map(|_| OnceLock::new()).collect(),
|
||||||
|
Err(error) => {
|
||||||
|
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return Ok(self.pages.get_or_init(|| slots));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ListExtractor for PdfPagesExtractor<'_> {
|
||||||
|
async fn get<'a>(&'a self, idx: usize) -> Result<Option<&'a PileValue<'a>>, std::io::Error> {
|
||||||
|
let pages = self.init_pages().await?;
|
||||||
|
let Some(slot) = pages.get(idx) else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
if let Some(v) = slot.get() {
|
||||||
|
return Ok(Some(v));
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = Arc::clone(self.get_bytes().await?);
|
||||||
|
let png = tokio::task::spawn_blocking(move || {
|
||||||
|
let pdfium = Pdfium::default();
|
||||||
|
let doc = pdfium
|
||||||
|
.load_pdf_from_byte_slice(&bytes, None)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
let render_config = PdfRenderConfig::new().set_target_width(1024);
|
||||||
|
let page = doc
|
||||||
|
.pages()
|
||||||
|
.get(idx as u16)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
let image = page
|
||||||
|
.render_with_config(&render_config)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
|
||||||
|
.as_image();
|
||||||
|
let mut png_bytes = Vec::new();
|
||||||
|
image
|
||||||
|
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||||
|
.map_err(|e| std::io::Error::other(e.to_string()))?;
|
||||||
|
Ok::<_, std::io::Error>(png_bytes)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
|
let value = match png {
|
||||||
|
Ok(bytes) => PileValue::Blob {
|
||||||
|
mime: mime::IMAGE_PNG,
|
||||||
|
bytes: Arc::new(bytes),
|
||||||
|
},
|
||||||
|
Err(error) => {
|
||||||
|
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||||
|
PileValue::Null
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return Ok(Some(slot.get_or_init(|| value)));
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
|
Ok(self.init_pages().await?.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,7 +4,8 @@ use pile_config::Label;
|
|||||||
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
use std::{collections::HashMap, io::BufReader, sync::OnceLock};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{Item, PileValue, SyncReadBridge, extract::Extractor};
|
use crate::extract::ObjectExtractor;
|
||||||
|
use crate::{Item, PileValue, SyncReadBridge};
|
||||||
|
|
||||||
pub struct PdfTextExtractor<'a> {
|
pub struct PdfTextExtractor<'a> {
|
||||||
item: &'a Item,
|
item: &'a Item,
|
||||||
@@ -94,7 +95,7 @@ impl<'a> PdfTextExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for PdfTextExtractor<'_> {
|
impl ObjectExtractor for PdfTextExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use std::sync::OnceLock;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Item, PileValue,
|
Item, PileValue,
|
||||||
extract::{Extractor, TomlExtractor},
|
extract::{ObjectExtractor, TomlExtractor},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct SidecarExtractor<'a> {
|
pub struct SidecarExtractor<'a> {
|
||||||
@@ -21,7 +21,7 @@ impl<'a> SidecarExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for SidecarExtractor<'_> {
|
impl ObjectExtractor for SidecarExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::{collections::HashMap, sync::OnceLock};
|
use std::{collections::HashMap, sync::OnceLock};
|
||||||
|
|
||||||
use crate::{AsyncReader, Item, PileValue, extract::Extractor};
|
use crate::{AsyncReader, Item, PileValue, extract::ObjectExtractor};
|
||||||
|
|
||||||
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
|
fn toml_to_pile(value: toml::Value) -> PileValue<'static> {
|
||||||
match value {
|
match value {
|
||||||
@@ -52,7 +52,7 @@ impl<'a> TomlExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl Extractor for TomlExtractor<'_> {
|
impl ObjectExtractor for TomlExtractor<'_> {
|
||||||
async fn field<'a>(
|
async fn field<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
name: &Label,
|
name: &Label,
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ impl DbFtsIndex {
|
|||||||
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
|
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
|
||||||
doc.add_text(self.schema.get_field("_meta_key")?, key);
|
doc.add_text(self.schema.get_field("_meta_key")?, key);
|
||||||
|
|
||||||
let extractor = PileValue::Extractor(Arc::new(MetaExtractor::new(item)));
|
let extractor = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(item)));
|
||||||
|
|
||||||
let mut empty = true;
|
let mut empty = true;
|
||||||
for name in self.fts_cfg().fields.keys() {
|
for name in self.fts_cfg().fields.keys() {
|
||||||
@@ -145,6 +145,8 @@ impl DbFtsIndex {
|
|||||||
loop {
|
loop {
|
||||||
val = match val {
|
val = match val {
|
||||||
PileValue::String(x) => return Ok(Some(x.to_string())),
|
PileValue::String(x) => return Ok(Some(x.to_string())),
|
||||||
|
PileValue::U64(x) => return Ok(Some(x.to_string())),
|
||||||
|
PileValue::I64(x) => return Ok(Some(x.to_string())),
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
PileValue::Array(ref mut x) => {
|
PileValue::Array(ref mut x) => {
|
||||||
@@ -177,7 +179,7 @@ impl DbFtsIndex {
|
|||||||
continue 'outer;
|
continue 'outer;
|
||||||
}
|
}
|
||||||
|
|
||||||
PileValue::Extractor(_) => {
|
PileValue::ObjectExtractor(_) => {
|
||||||
trace!(
|
trace!(
|
||||||
message = "Skipping field, is object",
|
message = "Skipping field, is object",
|
||||||
field = field_name.to_string(),
|
field = field_name.to_string(),
|
||||||
@@ -186,6 +188,15 @@ impl DbFtsIndex {
|
|||||||
continue 'outer;
|
continue 'outer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PileValue::ListExtractor(_) => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is ListExtractor",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
PileValue::Blob { .. } => {
|
PileValue::Blob { .. } => {
|
||||||
trace!(
|
trace!(
|
||||||
message = "Skipping field, is blob",
|
message = "Skipping field, is blob",
|
||||||
@@ -300,8 +311,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
|
|||||||
|
|
||||||
FieldSpecPost::SetCase { case: Case::Lower } => match val {
|
FieldSpecPost::SetCase { case: Case::Lower } => match val {
|
||||||
PileValue::Null => return None,
|
PileValue::Null => return None,
|
||||||
|
PileValue::U64(_) => return None,
|
||||||
|
PileValue::I64(_) => return None,
|
||||||
PileValue::Blob { .. } => return None,
|
PileValue::Blob { .. } => return None,
|
||||||
PileValue::Extractor(_) => return None,
|
PileValue::ObjectExtractor(_) => return None,
|
||||||
|
PileValue::ListExtractor(_) => return None,
|
||||||
PileValue::String(x) => PileValue::String(x.to_lowercase().into()),
|
PileValue::String(x) => PileValue::String(x.to_lowercase().into()),
|
||||||
|
|
||||||
PileValue::Array(x) => {
|
PileValue::Array(x) => {
|
||||||
@@ -311,8 +325,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
|
|||||||
|
|
||||||
FieldSpecPost::SetCase { case: Case::Upper } => match val {
|
FieldSpecPost::SetCase { case: Case::Upper } => match val {
|
||||||
PileValue::Null => return None,
|
PileValue::Null => return None,
|
||||||
|
PileValue::U64(_) => return None,
|
||||||
|
PileValue::I64(_) => return None,
|
||||||
PileValue::Blob { .. } => return None,
|
PileValue::Blob { .. } => return None,
|
||||||
PileValue::Extractor(_) => return None,
|
PileValue::ObjectExtractor(_) => return None,
|
||||||
|
PileValue::ListExtractor(_) => return None,
|
||||||
PileValue::String(x) => PileValue::String(x.to_uppercase().into()),
|
PileValue::String(x) => PileValue::String(x.to_uppercase().into()),
|
||||||
|
|
||||||
PileValue::Array(x) => {
|
PileValue::Array(x) => {
|
||||||
@@ -322,8 +339,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
|
|||||||
|
|
||||||
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
|
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
|
||||||
PileValue::Null => return None,
|
PileValue::Null => return None,
|
||||||
|
PileValue::U64(_) => return None,
|
||||||
|
PileValue::I64(_) => return None,
|
||||||
PileValue::Blob { .. } => return None,
|
PileValue::Blob { .. } => return None,
|
||||||
PileValue::Extractor(_) => return None,
|
PileValue::ObjectExtractor(_) => return None,
|
||||||
|
PileValue::ListExtractor(_) => return None,
|
||||||
|
|
||||||
PileValue::String(x) => {
|
PileValue::String(x) => {
|
||||||
PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into())
|
PileValue::String(x.strip_suffix(trim_suffix).unwrap_or(x).into())
|
||||||
@@ -336,8 +356,11 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
|
|||||||
|
|
||||||
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
|
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
|
||||||
PileValue::Null => return None,
|
PileValue::Null => return None,
|
||||||
|
PileValue::U64(_) => return None,
|
||||||
|
PileValue::I64(_) => return None,
|
||||||
PileValue::Blob { .. } => return None,
|
PileValue::Blob { .. } => return None,
|
||||||
PileValue::Extractor(_) => return None,
|
PileValue::ObjectExtractor(_) => return None,
|
||||||
|
PileValue::ListExtractor(_) => return None,
|
||||||
|
|
||||||
PileValue::String(x) => {
|
PileValue::String(x) => {
|
||||||
PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into())
|
PileValue::String(x.strip_prefix(trim_prefix).unwrap_or(x).into())
|
||||||
@@ -350,10 +373,14 @@ pub fn apply<'a>(post: &FieldSpecPost, val: &PileValue<'a>) -> Option<PileValue<
|
|||||||
|
|
||||||
FieldSpecPost::Join { join } => match val {
|
FieldSpecPost::Join { join } => match val {
|
||||||
PileValue::Null => return None,
|
PileValue::Null => return None,
|
||||||
|
PileValue::U64(_) => return None,
|
||||||
|
PileValue::I64(_) => return None,
|
||||||
PileValue::Blob { .. } => return None,
|
PileValue::Blob { .. } => return None,
|
||||||
PileValue::Extractor(_) => return None,
|
PileValue::ObjectExtractor(_) => return None,
|
||||||
|
PileValue::ListExtractor(_) => return None,
|
||||||
|
|
||||||
PileValue::String(x) => PileValue::String(x.clone()),
|
PileValue::String(x) => PileValue::String(x.clone()),
|
||||||
|
|
||||||
PileValue::Array(x) => PileValue::String(
|
PileValue::Array(x) => PileValue::String(
|
||||||
x.iter()
|
x.iter()
|
||||||
.map(|x| apply(post, x))
|
.map(|x| apply(post, x))
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ pub async fn get_field(
|
|||||||
};
|
};
|
||||||
|
|
||||||
let extractor = MetaExtractor::new(&item);
|
let extractor = MetaExtractor::new(&item);
|
||||||
let root: PileValue<'_> = PileValue::Extractor(Arc::new(extractor));
|
let root: PileValue<'_> = PileValue::ObjectExtractor(Arc::new(extractor));
|
||||||
|
|
||||||
let value = match root.query(&path).await {
|
let value = match root.query(&path).await {
|
||||||
Ok(Some(v)) => v,
|
Ok(Some(v)) => v,
|
||||||
|
|||||||
@@ -4,11 +4,13 @@ use serde_json::{Map, Value};
|
|||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::extract::Extractor;
|
use crate::extract::{ListExtractor, ObjectExtractor};
|
||||||
|
|
||||||
/// An immutable, lazily-computed value similar to [serde_json::Value].
|
/// An immutable, lazily-computed value similar to [serde_json::Value].
|
||||||
pub enum PileValue<'a> {
|
pub enum PileValue<'a> {
|
||||||
Null,
|
Null,
|
||||||
|
U64(u64),
|
||||||
|
I64(i64),
|
||||||
|
|
||||||
/// A string
|
/// A string
|
||||||
String(SmartString<LazyCompact>),
|
String(SmartString<LazyCompact>),
|
||||||
@@ -23,16 +25,22 @@ pub enum PileValue<'a> {
|
|||||||
},
|
},
|
||||||
|
|
||||||
/// A lazily-computed map of {label: value}
|
/// A lazily-computed map of {label: value}
|
||||||
Extractor(Arc<dyn Extractor + 'a>),
|
ObjectExtractor(Arc<dyn ObjectExtractor + 'a>),
|
||||||
|
|
||||||
|
/// A lazily-computed array
|
||||||
|
ListExtractor(Arc<dyn ListExtractor + 'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for PileValue<'_> {
|
impl Clone for PileValue<'_> {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
match self {
|
match self {
|
||||||
Self::Null => Self::Null,
|
Self::Null => Self::Null,
|
||||||
|
Self::U64(x) => Self::U64(*x),
|
||||||
|
Self::I64(x) => Self::I64(*x),
|
||||||
Self::String(x) => Self::String(x.clone()),
|
Self::String(x) => Self::String(x.clone()),
|
||||||
Self::Array(x) => Self::Array(x.clone()),
|
Self::Array(x) => Self::Array(x.clone()),
|
||||||
Self::Extractor(x) => Self::Extractor(x.clone()),
|
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
||||||
|
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
||||||
Self::Blob { mime, bytes } => Self::Blob {
|
Self::Blob { mime, bytes } => Self::Blob {
|
||||||
mime: mime.clone(),
|
mime: mime.clone(),
|
||||||
bytes: bytes.clone(),
|
bytes: bytes.clone(),
|
||||||
@@ -52,10 +60,13 @@ impl<'a> PileValue<'a> {
|
|||||||
out = match &out {
|
out = match &out {
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
Some(Self::Null) => None,
|
Some(Self::Null) => None,
|
||||||
|
Some(Self::U64(_)) => None,
|
||||||
|
Some(Self::I64(_)) => None,
|
||||||
Some(Self::Array(_)) => None,
|
Some(Self::Array(_)) => None,
|
||||||
Some(Self::String(_)) => None,
|
Some(Self::String(_)) => None,
|
||||||
Some(Self::Blob { .. }) => None,
|
Some(Self::Blob { .. }) => None,
|
||||||
Some(Self::Extractor(e)) => e.field(field).await?,
|
Some(Self::ListExtractor(_)) => None,
|
||||||
|
Some(Self::ObjectExtractor(e)) => e.field(field).await?,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,6 +74,8 @@ impl<'a> PileValue<'a> {
|
|||||||
out = match &out {
|
out = match &out {
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
Some(Self::Null) => None,
|
Some(Self::Null) => None,
|
||||||
|
Some(Self::U64(_)) => None,
|
||||||
|
Some(Self::I64(_)) => None,
|
||||||
Some(Self::Blob { .. }) => None,
|
Some(Self::Blob { .. }) => None,
|
||||||
Some(Self::Array(v)) => {
|
Some(Self::Array(v)) => {
|
||||||
let idx = if *idx >= 0 {
|
let idx = if *idx >= 0 {
|
||||||
@@ -74,7 +87,19 @@ impl<'a> PileValue<'a> {
|
|||||||
idx.and_then(|idx| v.get(idx))
|
idx.and_then(|idx| v.get(idx))
|
||||||
}
|
}
|
||||||
Some(Self::String(_)) => None,
|
Some(Self::String(_)) => None,
|
||||||
Some(Self::Extractor(_)) => None,
|
Some(Self::ObjectExtractor(_)) => None,
|
||||||
|
Some(Self::ListExtractor(e)) => {
|
||||||
|
let idx = if *idx >= 0 {
|
||||||
|
usize::try_from(*idx).ok()
|
||||||
|
} else {
|
||||||
|
usize::try_from(e.len().await? as i64 - idx).ok()
|
||||||
|
};
|
||||||
|
|
||||||
|
match idx {
|
||||||
|
Some(idx) => e.get(idx).await?,
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -93,6 +118,8 @@ impl<'a> PileValue<'a> {
|
|||||||
pub async fn to_json(&self) -> Result<Value, std::io::Error> {
|
pub async fn to_json(&self) -> Result<Value, std::io::Error> {
|
||||||
Ok(match self {
|
Ok(match self {
|
||||||
Self::Null => Value::Null,
|
Self::Null => Value::Null,
|
||||||
|
Self::U64(x) => Value::Number((*x).into()),
|
||||||
|
Self::I64(x) => Value::Number((*x).into()),
|
||||||
|
|
||||||
// TODO: replace with something meaningful
|
// TODO: replace with something meaningful
|
||||||
Self::Blob { mime, bytes } => {
|
Self::Blob { mime, bytes } => {
|
||||||
@@ -108,7 +135,7 @@ impl<'a> PileValue<'a> {
|
|||||||
Value::Array(arr)
|
Value::Array(arr)
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::Extractor(e) => {
|
Self::ObjectExtractor(e) => {
|
||||||
let keys = e.fields().await?;
|
let keys = e.fields().await?;
|
||||||
let mut map = Map::new();
|
let mut map = Map::new();
|
||||||
for k in &keys {
|
for k in &keys {
|
||||||
@@ -120,6 +147,20 @@ impl<'a> PileValue<'a> {
|
|||||||
}
|
}
|
||||||
Value::Object(map)
|
Value::Object(map)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Self::ListExtractor(e) => {
|
||||||
|
let len = e.len().await?;
|
||||||
|
let mut list = Vec::with_capacity(len);
|
||||||
|
for i in 0..len {
|
||||||
|
#[expect(clippy::expect_used)]
|
||||||
|
let v = e.get(i)
|
||||||
|
.await?
|
||||||
|
.expect("value must be present according to length");
|
||||||
|
list.push(Box::pin(v.to_json()).await?);
|
||||||
|
}
|
||||||
|
|
||||||
|
Value::Array(list)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ impl CliCmd for AnnotateCommand {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let meta = MetaExtractor::new(&item);
|
let meta = MetaExtractor::new(&item);
|
||||||
let extractor = PileValue::Extractor(Arc::new(meta));
|
let extractor = PileValue::ObjectExtractor(Arc::new(meta));
|
||||||
|
|
||||||
let Some(value) =
|
let Some(value) =
|
||||||
index.get_field(&extractor, &field).await.with_context(|| {
|
index.get_field(&extractor, &field).await.with_context(|| {
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ impl CliCmd for ProbeCommand {
|
|||||||
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let value = PileValue::Extractor(Arc::new(MetaExtractor::new(&item)));
|
let value = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(&item)));
|
||||||
value
|
value
|
||||||
.to_json()
|
.to_json()
|
||||||
.await
|
.await
|
||||||
|
|||||||
Reference in New Issue
Block a user