Extractor rewrite
This commit is contained in:
95
crates/pile-value/src/extract/item/epub/epub_meta.rs
Normal file
95
crates/pile-value/src/extract/item/epub/epub_meta.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct EpubMetaExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||
let doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let fields: &[&'static str] = &[
|
||||
"title",
|
||||
"creator",
|
||||
"description",
|
||||
"language",
|
||||
"publisher",
|
||||
"date",
|
||||
"subject",
|
||||
"identifier",
|
||||
];
|
||||
|
||||
let meta: Vec<(&'static str, Option<String>)> =
|
||||
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
|
||||
|
||||
Ok::<_, std::io::Error>(meta)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_meta = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for (key, val) in raw_meta {
|
||||
let label = Label::new(key).unwrap();
|
||||
let value = match val {
|
||||
Some(s) => PileValue::String(Arc::new(s.into())),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
output.insert(label, value);
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for EpubMetaExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
105
crates/pile-value/src/extract/item/epub/epub_text.rs
Normal file
105
crates/pile-value/src/extract/item/epub/epub_text.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use epub::doc::EpubDoc;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct EpubTextExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
loop {
|
||||
if let Ok(content) = doc.get_current_str() {
|
||||
text_parts.push(strip_html(&content));
|
||||
}
|
||||
if doc.go_next().is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::String(Arc::new(raw_text.into())),
|
||||
)]);
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
/// Strip HTML/XHTML tags from a string, leaving only text nodes.
///
/// Everything from a `<` up to and including the next `>` is treated as markup
/// and dropped. A `>` that appears outside of a tag is ordinary text and is
/// kept. Character entities (such as `&amp;`) are not decoded.
fn strip_html(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;

    for c in html.chars() {
        match c {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup; a stray one is text.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }

    result
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for EpubTextExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
46
crates/pile-value/src/extract/item/epub/mod.rs
Normal file
46
crates/pile-value/src/extract/item/epub/mod.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod epub_meta;
|
||||
pub use epub_meta::*;
|
||||
|
||||
mod epub_text;
|
||||
pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubExtractor {
|
||||
text: Arc<EpubTextExtractor>,
|
||||
meta: Arc<EpubMetaExtractor>,
|
||||
}
|
||||
|
||||
impl EpubExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
text: Arc::new(EpubTextExtractor::new(item)),
|
||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for EpubExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
match name.as_str() {
|
||||
"text" => self.text.field(name).await,
|
||||
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
96
crates/pile-value/src/extract/item/exif.rs
Normal file
96
crates/pile-value/src/extract/item/exif.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct ExifExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl ExifExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_fields = tokio::task::spawn_blocking(move || {
|
||||
let mut br = BufReader::new(reader);
|
||||
let exif = exif::Reader::new()
|
||||
.read_from_container(&mut br)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let fields: Vec<(String, String)> = exif
|
||||
.fields()
|
||||
.map(|f| {
|
||||
(
|
||||
f.tag.to_string(),
|
||||
f.display_value().with_unit(&exif).to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok::<_, std::io::Error>(fields)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_fields = match raw_fields {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||
|
||||
for (tag_name, value) in raw_fields {
|
||||
let Some(label) = tag_to_label(&tag_name) else {
|
||||
continue;
|
||||
};
|
||||
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
|
||||
output
|
||||
.entry(label)
|
||||
.or_insert_with(|| PileValue::String(Arc::new(value.into())));
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
fn tag_to_label(tag: &str) -> Option<Label> {
|
||||
let sanitized: String = tag
|
||||
.chars()
|
||||
.map(|c| if c == ' ' { '_' } else { c })
|
||||
.filter(|c| Label::VALID_CHARS.contains(*c))
|
||||
.collect();
|
||||
Label::new(sanitized)
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ExifExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
162
crates/pile-value/src/extract/item/flac.rs
Normal file
162
crates/pile-value/src/extract/item/flac.rs
Normal file
@@ -0,0 +1,162 @@
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_flac::{FlacBlock, FlacReader};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ListExtractor, ObjectExtractor},
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct FlacImagesExtractor {
|
||||
item: Item,
|
||||
}
|
||||
|
||||
impl FlacImagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
async fn get_images(&self) -> Result<Vec<PileValue>, std::io::Error> {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_images = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
|
||||
for block in reader {
|
||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||
FlacBlock::Picture(picture) => {
|
||||
images.push((picture.mime, picture.img_data));
|
||||
}
|
||||
FlacBlock::AudioFrame(_) => break,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok::<_, std::io::Error>(images)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
|
||||
Ok(raw_images
|
||||
.into_iter()
|
||||
.map(|(mime, data)| PileValue::Blob {
|
||||
mime,
|
||||
bytes: Arc::new(data),
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for FlacImagesExtractor {
|
||||
async fn get<'a>(&'a self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_images().await?.into_iter().nth(idx))
|
||||
}
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||
Ok(self.get_images().await?.len())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FlacExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
images: Option<PileValue>,
|
||||
}
|
||||
|
||||
impl FlacExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let is_flac = match item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
|
||||
Item::S3 { key, .. } => key.ends_with(".flac"),
|
||||
};
|
||||
|
||||
let images =
|
||||
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
|
||||
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
images,
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = match &self.item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
|
||||
Item::S3 { key, .. } => key.to_string(),
|
||||
};
|
||||
|
||||
if !key.ends_with(".flac") {
|
||||
let _ = self.output.set(HashMap::new());
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_tags = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
let mut tags: Vec<(String, String)> = Vec::new();
|
||||
for block in reader {
|
||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||
FlacBlock::VorbisComment(comment) => {
|
||||
for (k, v) in comment.comment.comments {
|
||||
tags.push((k.to_string().to_lowercase(), v.into()));
|
||||
}
|
||||
}
|
||||
FlacBlock::AudioFrame(_) => break,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok::<_, std::io::Error>(tags)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
|
||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||
for (k, v) in raw_tags {
|
||||
if let Some(label) = Label::new(k) {
|
||||
output
|
||||
.entry(label)
|
||||
.or_default()
|
||||
.push(PileValue::String(Arc::new(v.into())));
|
||||
}
|
||||
}
|
||||
let output: HashMap<Label, PileValue> = output
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||
.collect();
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for FlacExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if name.as_str() == "images"
|
||||
&& let Some(ref images) = self.images
|
||||
{
|
||||
return Ok(Some(images.clone()));
|
||||
}
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
|
||||
if self.images.is_some() {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
fields.push(Label::new("images").unwrap());
|
||||
}
|
||||
Ok(fields)
|
||||
}
|
||||
}
|
||||
80
crates/pile-value/src/extract/item/fs.rs
Normal file
80
crates/pile-value/src/extract/item/fs.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::Component,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct FsExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl FsExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let Item::File { path, .. } = &self.item else {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([
|
||||
(
|
||||
Label::new("extension").unwrap(),
|
||||
path.extension()
|
||||
.and_then(|x| x.to_str())
|
||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
(
|
||||
Label::new("path").unwrap(),
|
||||
path.to_str()
|
||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
(
|
||||
Label::new("segments").unwrap(),
|
||||
path.components()
|
||||
.map(|x| match x {
|
||||
Component::CurDir => Some(".".to_owned()),
|
||||
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
|
||||
Component::ParentDir => Some("..".to_owned()),
|
||||
Component::RootDir => Some("/".to_owned()),
|
||||
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
|
||||
})
|
||||
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
|
||||
.collect::<Option<Vec<_>>>()
|
||||
.map(|v| PileValue::Array(Arc::new(v)))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for FsExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
135
crates/pile-value/src/extract/item/id3.rs
Normal file
135
crates/pile-value/src/extract/item/id3.rs
Normal file
@@ -0,0 +1,135 @@
|
||||
use id3::Tag;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct Id3Extractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl Id3Extractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||
.await
|
||||
{
|
||||
Ok(Ok(tag)) => tag,
|
||||
|
||||
Ok(Err(id3::Error {
|
||||
kind: id3::ErrorKind::NoTag,
|
||||
..
|
||||
})) => {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
Ok(Err(id3::Error {
|
||||
kind: id3::ErrorKind::Io(e),
|
||||
..
|
||||
})) => return Err(e),
|
||||
|
||||
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||
for frame in tag.frames() {
|
||||
if let Some(texts) = frame.content().text_values() {
|
||||
let name = frame_id_to_field(frame.id());
|
||||
if let Some(key) = Label::new(name) {
|
||||
for text in texts {
|
||||
output
|
||||
.entry(key.clone())
|
||||
.or_default()
|
||||
.push(PileValue::String(Arc::new(text.into())));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let output = output
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||
.collect();
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
/// Translate an ID3 frame ID into the matching Vorbis Comment field name.
///
/// Known IDs yield a borrowed static name. Any other ID is returned
/// lowercased as an owned string.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
    // spell:off
    let known: Option<&'static str> = match id {
        "TIT2" => Some("title"),
        "TIT1" => Some("grouping"),
        "TIT3" => Some("subtitle"),
        "TPE1" => Some("artist"),
        "TPE2" => Some("albumartist"),
        "TPE3" => Some("conductor"),
        "TOPE" => Some("originalartist"),
        "TALB" => Some("album"),
        "TOAL" => Some("originalalbum"),
        "TRCK" => Some("tracknumber"),
        "TPOS" => Some("discnumber"),
        "TSST" => Some("discsubtitle"),
        "TDRC" | "TYER" => Some("date"),
        "TDOR" | "TORY" => Some("originaldate"),
        "TCON" => Some("genre"),
        "TCOM" => Some("composer"),
        "TEXT" => Some("lyricist"),
        "TPUB" => Some("label"),
        "TSRC" => Some("isrc"),
        "TBPM" => Some("bpm"),
        "TLAN" => Some("language"),
        "TMED" => Some("media"),
        "TMOO" => Some("mood"),
        "TCOP" => Some("copyright"),
        "TENC" => Some("encodedby"),
        "TSSE" => Some("encodersettings"),
        "TSOA" => Some("albumsort"),
        "TSOP" => Some("artistsort"),
        "TSOT" => Some("titlesort"),
        "MVNM" => Some("movement"),
        "MVIN" => Some("movementnumber"),
        _ => None,
    };
    // spell:on

    match known {
        Some(field) => Cow::Borrowed(field),
        None => Cow::Owned(id.to_lowercase()),
    }
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for Id3Extractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
99
crates/pile-value/src/extract/item/mod.rs
Normal file
99
crates/pile-value/src/extract/item/mod.rs
Normal file
@@ -0,0 +1,99 @@
|
||||
mod flac;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
pub use flac::*;
|
||||
|
||||
mod id3;
|
||||
pub use id3::*;
|
||||
|
||||
mod fs;
|
||||
pub use fs::*;
|
||||
|
||||
mod epub;
|
||||
pub use epub::*;
|
||||
|
||||
mod exif;
|
||||
pub use exif::*;
|
||||
|
||||
mod pdf;
|
||||
pub use pdf::*;
|
||||
|
||||
mod toml;
|
||||
use pile_config::Label;
|
||||
pub use toml::*;
|
||||
|
||||
mod sidecar;
|
||||
pub use sidecar::*;
|
||||
|
||||
use crate::{
|
||||
extract::{misc::MapExtractor, traits::ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct ItemExtractor {
|
||||
inner: MapExtractor,
|
||||
}
|
||||
|
||||
impl ItemExtractor {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("flac").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("id3").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("fs").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("epub").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("exif").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("pdf").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("toml").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("sidecar").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ItemExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
self.inner.field(name).await
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
return Ok(vec![
|
||||
Label::new("flac").unwrap(),
|
||||
Label::new("id3").unwrap(),
|
||||
Label::new("fs").unwrap(),
|
||||
Label::new("epub").unwrap(),
|
||||
Label::new("exif").unwrap(),
|
||||
Label::new("pdf").unwrap(),
|
||||
Label::new("sidecar").unwrap(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
mod pdf_pages;
|
||||
#[cfg(feature = "pdfium")]
|
||||
pub use pdf_pages::*;
|
||||
|
||||
mod pdf_meta;
|
||||
pub use pdf_meta::*;
|
||||
|
||||
mod pdf_text;
|
||||
pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct PdfExtractor {
|
||||
text: Arc<PdfTextExtractor>,
|
||||
meta: Arc<PdfMetaExtractor>,
|
||||
#[cfg(feature = "pdfium")]
|
||||
pages: Arc<PdfPagesExtractor>,
|
||||
}
|
||||
|
||||
impl PdfExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
text: Arc::new(PdfTextExtractor::new(item)),
|
||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||
#[cfg(feature = "pdfium")]
|
||||
pages: Arc::new(PdfPagesExtractor::new(item)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfExtractor {
|
||||
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
match name.as_str() {
|
||||
"text" => self.text.field(name).await,
|
||||
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
#[cfg(feature = "pdfium")]
|
||||
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("cover").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("pages").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
132
crates/pile-value/src/extract/item/pdf/pdf_meta.rs
Normal file
132
crates/pile-value/src/extract/item/pdf/pdf_meta.rs
Normal file
@@ -0,0 +1,132 @@
|
||||
use pdf::file::FileOptions;
|
||||
use pdf::primitive::{Date, TimeRel};
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct PdfMetaExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
error.to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let page_count = file.num_pages();
|
||||
|
||||
let mut meta: Vec<(&'static str, Option<String>)> = Vec::new();
|
||||
|
||||
if let Some(info) = &file.trailer.info_dict {
|
||||
use pdf::primitive::PdfString;
|
||||
let fields: &[(&'static str, Option<&PdfString>)] = &[
|
||||
("title", info.title.as_ref()),
|
||||
("author", info.author.as_ref()),
|
||||
("subject", info.subject.as_ref()),
|
||||
("keywords", info.keywords.as_ref()),
|
||||
("creator", info.creator.as_ref()),
|
||||
("producer", info.producer.as_ref()),
|
||||
];
|
||||
|
||||
for (key, val) in fields {
|
||||
meta.push((key, val.map(|s| s.to_string_lossy())));
|
||||
}
|
||||
|
||||
meta.push((
|
||||
"creation_date",
|
||||
info.creation_date.as_ref().map(format_date),
|
||||
));
|
||||
meta.push(("mod_date", info.mod_date.as_ref().map(format_date)));
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>((page_count, meta))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let (page_count, raw_meta) = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
output.insert(
|
||||
Label::new("pages").unwrap(),
|
||||
PileValue::U64(page_count as u64),
|
||||
);
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for (key, val) in raw_meta {
|
||||
let label = Label::new(key).unwrap();
|
||||
let value = match val {
|
||||
Some(s) => PileValue::String(Arc::new(s.into())),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
output.insert(label, value);
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
fn format_date(d: &Date) -> String {
|
||||
let tz = match d.rel {
|
||||
TimeRel::Universal => "Z".to_owned(),
|
||||
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
};
|
||||
format!(
|
||||
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
|
||||
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
|
||||
)
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfMetaExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
107
crates/pile-value/src/extract/item/pdf/pdf_pages.rs
Normal file
107
crates/pile-value/src/extract/item/pdf/pdf_pages.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use image::ImageFormat;
|
||||
use pdfium_render::prelude::*;
|
||||
use std::{
|
||||
io::{BufReader, Cursor},
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ListExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct PdfPagesExtractor {
|
||||
item: Item,
|
||||
}
|
||||
|
||||
impl PdfPagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
async fn get_bytes(&self) -> Result<Vec<u8>, std::io::Error> {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut b = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut b)?;
|
||||
Ok::<_, std::io::Error>(b)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for PdfPagesExtractor {
|
||||
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
let bytes = self.get_bytes().await?;
|
||||
let png = tokio::task::spawn_blocking(move || {
|
||||
let pdfium = Pdfium::default();
|
||||
let doc = pdfium
|
||||
.load_pdf_from_byte_slice(&bytes, None)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
if idx >= doc.pages().len() as usize {
|
||||
return Ok::<_, std::io::Error>(None);
|
||||
}
|
||||
let render_config = PdfRenderConfig::new().set_target_width(1024);
|
||||
let page = doc
|
||||
.pages()
|
||||
.get(idx as u16)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let image = page
|
||||
.render_with_config(&render_config)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
|
||||
.as_image();
|
||||
let mut png_bytes = Vec::new();
|
||||
image
|
||||
.write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||
.map_err(|e| std::io::Error::other(e.to_string()))?;
|
||||
Ok(Some(png_bytes))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let value = match png {
|
||||
Ok(None) => return Ok(None),
|
||||
Ok(Some(bytes)) => PileValue::Blob {
|
||||
mime: mime::IMAGE_PNG,
|
||||
bytes: Arc::new(bytes),
|
||||
},
|
||||
Err(error) => {
|
||||
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||
PileValue::Null
|
||||
}
|
||||
};
|
||||
Ok(Some(value))
|
||||
}
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||
let bytes = self.get_bytes().await?;
|
||||
let count = tokio::task::spawn_blocking(move || {
|
||||
let pdfium = Pdfium::default();
|
||||
let doc = pdfium
|
||||
.load_pdf_from_byte_slice(&bytes, None)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
Ok::<_, std::io::Error>(doc.pages().len() as usize)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
match count {
|
||||
Ok(n) => Ok(n),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Override, extracting all pages is very slow,
|
||||
// and we can't display binary in json anyway
|
||||
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||
Ok(serde_json::Value::String(format!(
|
||||
"<PdfPages ({} pages)>",
|
||||
self.len().await?
|
||||
)))
|
||||
}
|
||||
}
|
||||
112
crates/pile-value/src/extract/item/pdf/pdf_text.rs
Normal file
112
crates/pile-value/src/extract/item/pdf/pdf_text.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue, SyncReadBridge},
|
||||
};
|
||||
|
||||
pub struct PdfTextExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
|
||||
|
||||
let file = match FileOptions::cached().load(bytes) {
|
||||
Ok(x) => x,
|
||||
Err(pdf::PdfError::Io { source }) => return Err(source),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
error.to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
for page in file.pages() {
|
||||
let page = page.map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
if let Some(content) = &page.contents {
|
||||
let ops = content.operations(&file.resolver()).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
Op::TextDraw { text } => {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
Op::TextDrawAdjusted { array } => {
|
||||
for item in array {
|
||||
if let TextDrawAdjusted::Text(text) = item {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::String(Arc::new(raw_text.into())),
|
||||
)]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for PdfTextExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
45
crates/pile-value/src/extract/item/sidecar.rs
Normal file
45
crates/pile-value/src/extract/item/sidecar.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use super::TomlExtractor;
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct SidecarExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<Option<TomlExtractor>>,
|
||||
}
|
||||
|
||||
impl SidecarExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for SidecarExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
match self
|
||||
.output
|
||||
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||
{
|
||||
Some(x) => Ok(x.field(name).await?),
|
||||
None => Ok(Some(PileValue::Null)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
match self
|
||||
.output
|
||||
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||
{
|
||||
Some(x) => Ok(x.fields().await?),
|
||||
None => Ok(Vec::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
78
crates/pile-value/src/extract/item/toml.rs
Normal file
78
crates/pile-value/src/extract/item/toml.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{AsyncReader, Item, PileValue},
|
||||
};
|
||||
|
||||
fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||
match value {
|
||||
toml::Value::String(s) => PileValue::String(Arc::new(s.into())),
|
||||
toml::Value::Integer(i) => PileValue::String(Arc::new(i.to_string().into())),
|
||||
toml::Value::Float(f) => PileValue::String(Arc::new(f.to_string().into())),
|
||||
toml::Value::Boolean(b) => PileValue::String(Arc::new(b.to_string().into())),
|
||||
toml::Value::Datetime(d) => PileValue::String(Arc::new(d.to_string().into())),
|
||||
toml::Value::Array(a) => {
|
||||
PileValue::Array(Arc::new(a.into_iter().map(toml_to_pile).collect()))
|
||||
}
|
||||
toml::Value::Table(_) => PileValue::Null,
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TomlExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl TomlExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let mut reader = match self.item.read().await {
|
||||
Ok(r) => r,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
let bytes = reader.read_to_end().await?;
|
||||
let toml: toml::Value = match toml::from_slice(&bytes) {
|
||||
Ok(x) => x,
|
||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let output: HashMap<Label, PileValue> = match toml {
|
||||
toml::Value::Table(t) => t
|
||||
.into_iter()
|
||||
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, toml_to_pile(v))))
|
||||
.collect(),
|
||||
_ => HashMap::new(),
|
||||
};
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for TomlExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
24
crates/pile-value/src/extract/misc/list.rs
Normal file
24
crates/pile-value/src/extract/misc/list.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{extract::traits::ListExtractor, value::PileValue};
|
||||
|
||||
pub struct ArrayExtractor {
|
||||
inner: Arc<Vec<PileValue>>,
|
||||
}
|
||||
|
||||
impl ArrayExtractor {
|
||||
pub fn new(inner: Arc<Vec<PileValue>>) -> Self {
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for ArrayExtractor {
|
||||
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.inner.get(idx).cloned())
|
||||
}
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||
Ok(self.inner.len())
|
||||
}
|
||||
}
|
||||
20
crates/pile-value/src/extract/misc/map.rs
Normal file
20
crates/pile-value/src/extract/misc/map.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use pile_config::Label;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{extract::traits::ObjectExtractor, value::PileValue};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MapExtractor {
|
||||
pub inner: HashMap<Label, PileValue>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for MapExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.inner.get(name).cloned())
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.inner.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
8
crates/pile-value/src/extract/misc/mod.rs
Normal file
8
crates/pile-value/src/extract/misc/mod.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
mod list;
|
||||
pub use list::*;
|
||||
|
||||
mod vec;
|
||||
pub use vec::*;
|
||||
|
||||
mod map;
|
||||
pub use map::*;
|
||||
17
crates/pile-value/src/extract/misc/vec.rs
Normal file
17
crates/pile-value/src/extract/misc/vec.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
use crate::{extract::traits::ListExtractor, value::PileValue};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VecExtractor {
|
||||
pub inner: Vec<PileValue>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for VecExtractor {
|
||||
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(self.inner.get(idx).cloned())
|
||||
}
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||
Ok(self.inner.len())
|
||||
}
|
||||
}
|
||||
4
crates/pile-value/src/extract/mod.rs
Normal file
4
crates/pile-value/src/extract/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
pub mod item;
|
||||
pub mod misc;
|
||||
pub mod string;
|
||||
pub mod traits;
|
||||
51
crates/pile-value/src/extract/string.rs
Normal file
51
crates/pile-value/src/extract/string.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use pile_config::Label;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{extract::traits::ObjectExtractor, value::PileValue};
|
||||
|
||||
pub struct StringExtractor {
|
||||
item: Arc<SmartString<LazyCompact>>,
|
||||
}
|
||||
|
||||
impl StringExtractor {
|
||||
pub fn new(item: &Arc<SmartString<LazyCompact>>) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for StringExtractor {
|
||||
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||
Ok(match name.as_str() {
|
||||
"trim" => Some(PileValue::String(Arc::new(
|
||||
self.item.as_str().trim().into(),
|
||||
))),
|
||||
|
||||
"upper" => Some(PileValue::String(Arc::new(
|
||||
self.item.as_str().to_lowercase().into(),
|
||||
))),
|
||||
|
||||
"lower" => Some(PileValue::String(Arc::new(
|
||||
self.item.as_str().to_uppercase().into(),
|
||||
))),
|
||||
|
||||
"nonempty" => Some(match self.item.is_empty() {
|
||||
true => PileValue::Null,
|
||||
false => PileValue::String(self.item.clone()),
|
||||
}),
|
||||
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
return Ok(vec![
|
||||
Label::new("trim").unwrap(),
|
||||
Label::new("upper").unwrap(),
|
||||
Label::new("lower").unwrap(),
|
||||
Label::new("nonempty").unwrap(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
68
crates/pile-value/src/extract/traits.rs
Normal file
68
crates/pile-value/src/extract/traits.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
/// An attachment that extracts metadata from an [Item].
|
||||
///
|
||||
/// Metadata is exposed as an immutable map of {label: value},
|
||||
/// much like a json object.
|
||||
#[async_trait::async_trait]
|
||||
pub trait ObjectExtractor: Send + Sync {
|
||||
/// Get the field at `name` from `item`.
|
||||
/// - returns `None` if `name` is not a valid field
|
||||
/// - returns `Some(Null)` if `name` is not available
|
||||
async fn field(
|
||||
&self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
||||
|
||||
/// Return all fields in this extractor.
|
||||
/// `Self::field` must return [Some] for all these keys
|
||||
/// and [None] for all others.
|
||||
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
||||
|
||||
/// Convert this to a JSON value.
|
||||
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||
let keys = self.fields().await?;
|
||||
let mut map = serde_json::Map::new();
|
||||
for k in &keys {
|
||||
let v = match self.field(k).await? {
|
||||
Some(x) => x,
|
||||
None => continue,
|
||||
};
|
||||
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
|
||||
}
|
||||
|
||||
Ok(serde_json::Value::Object(map))
|
||||
}
|
||||
}
|
||||
|
||||
/// An attachment that extracts metadata from an [Item].
|
||||
///
|
||||
/// Metadata is exposed as an immutable list of values.
|
||||
#[async_trait::async_trait]
|
||||
pub trait ListExtractor: Send + Sync {
|
||||
/// Get the item at index `idx`.
|
||||
/// Indices start at zero, and must be consecutive.
|
||||
/// - returns `None` if `idx` is out of range
|
||||
/// - returns `Some(Null)` if `None` is at `idx`
|
||||
async fn get(&self, idx: usize) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
||||
|
||||
async fn len(&self) -> Result<usize, std::io::Error>;
|
||||
|
||||
async fn is_empty(&self) -> Result<bool, std::io::Error> {
|
||||
Ok(self.len().await? == 0)
|
||||
}
|
||||
|
||||
/// Convert this list to a JSON value.
|
||||
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||
let len = self.len().await?;
|
||||
let mut list = Vec::with_capacity(len);
|
||||
for i in 0..len {
|
||||
#[expect(clippy::expect_used)]
|
||||
let v = self
|
||||
.get(i)
|
||||
.await?
|
||||
.expect("value must be present according to length");
|
||||
list.push(Box::pin(v.to_json()).await?);
|
||||
}
|
||||
|
||||
Ok(serde_json::Value::Array(list))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user