Refactor errors
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2559,6 +2559,7 @@ dependencies = [
|
||||
name = "pile-value"
|
||||
version = "0.0.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"aws-sdk-s3",
|
||||
"blake3",
|
||||
|
||||
@@ -11,6 +11,7 @@ workspace = true
|
||||
pile-config = { workspace = true }
|
||||
pile-flac = { workspace = true }
|
||||
|
||||
anyhow = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
walkdir = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
|
||||
@@ -29,16 +29,9 @@ impl EpubMetaExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||
let doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let doc = EpubDoc::from_reader(reader)?;
|
||||
|
||||
let fields: &[&'static str] = &[
|
||||
"title",
|
||||
@@ -54,17 +47,19 @@ impl EpubMetaExtractor {
|
||||
let meta: Vec<(&'static str, Option<String>)> =
|
||||
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
|
||||
|
||||
Ok::<_, std::io::Error>(meta)
|
||||
Ok::<_, anyhow::Error>(meta)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
.await?;
|
||||
|
||||
let raw_meta = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::debug;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
@@ -29,16 +29,9 @@ impl EpubTextExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("epub")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_text = tokio::task::spawn_blocking(move || {
|
||||
let mut doc = EpubDoc::from_reader(reader)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let mut doc = EpubDoc::from_reader(reader)?;
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
@@ -51,17 +44,19 @@ impl EpubTextExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||
Ok::<_, anyhow::Error>(text_parts.join(" "))
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
.await?;
|
||||
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
|
||||
@@ -34,7 +34,13 @@ impl ObjectExtractor for EpubExtractor {
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
match (name.as_str(), args) {
|
||||
("text", args) => self.text.field(name, args).await,
|
||||
("text", args) => Ok(Some(
|
||||
self.text
|
||||
.field(name, args)
|
||||
.await
|
||||
.map(|x| x.unwrap_or(PileValue::Null))?,
|
||||
)),
|
||||
|
||||
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
_ => Ok(None),
|
||||
}
|
||||
|
||||
@@ -32,9 +32,7 @@ impl ExifExtractor {
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_fields = tokio::task::spawn_blocking(move || {
|
||||
let mut br = BufReader::new(reader);
|
||||
let exif = exif::Reader::new()
|
||||
.read_from_container(&mut br)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
let exif = exif::Reader::new().read_from_container(&mut br)?;
|
||||
|
||||
let fields: Vec<(String, String)> = exif
|
||||
.fields()
|
||||
@@ -46,13 +44,13 @@ impl ExifExtractor {
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok::<_, std::io::Error>(fields)
|
||||
Ok::<_, exif::Error>(fields)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
.await?;
|
||||
|
||||
let raw_fields = match raw_fields {
|
||||
Ok(x) => x,
|
||||
Err(exif::Error::Io(x)) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
@@ -65,6 +63,7 @@ impl ExifExtractor {
|
||||
let Some(label) = tag_to_label(&tag_name) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
|
||||
output
|
||||
.entry(label)
|
||||
@@ -91,6 +90,12 @@ impl ObjectExtractor for ExifExtractor {
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.item.key().as_str(),
|
||||
"Getting field {name:?} from ExifExtractor",
|
||||
);
|
||||
|
||||
if args.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_flac::{FlacBlock, FlacReader};
|
||||
use pile_flac::{FlacBlock, FlacDecodeError, FlacReader};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ListExtractor, ObjectExtractor},
|
||||
@@ -31,16 +32,17 @@ impl FlacImagesExtractor {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
let mut count = 0usize;
|
||||
for block in reader {
|
||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||
FlacBlock::AudioFrame(_) => break,
|
||||
FlacBlock::Picture(_) => count += 1,
|
||||
match block {
|
||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
||||
Ok(FlacBlock::Picture(_)) => count += 1,
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(_) => return Ok(0),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok::<_, std::io::Error>(count)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
.await??;
|
||||
|
||||
return Ok(count);
|
||||
}
|
||||
@@ -49,14 +51,20 @@ impl FlacImagesExtractor {
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for FlacImagesExtractor {
|
||||
async fn get<'a>(&'a self, mut idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
"Getting index {idx} from FlacImagesExtractor",
|
||||
);
|
||||
|
||||
let key = self.item.key();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let image = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
let mut out: Option<(Mime, Vec<u8>)> = None;
|
||||
'blocks: for block in reader {
|
||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||
FlacBlock::AudioFrame(_) => break,
|
||||
FlacBlock::Picture(picture) => {
|
||||
match block {
|
||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
||||
Ok(FlacBlock::Picture(picture)) => {
|
||||
if idx > 0 {
|
||||
idx -= 1;
|
||||
continue;
|
||||
@@ -66,6 +74,16 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
break 'blocks;
|
||||
}
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC images",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
@@ -93,23 +111,15 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
pub struct FlacExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
images: Option<PileValue>,
|
||||
images: PileValue,
|
||||
}
|
||||
|
||||
impl FlacExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let is_flac = match item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
|
||||
Item::S3 { key, .. } => key.ends_with(".flac"),
|
||||
};
|
||||
|
||||
let images =
|
||||
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
|
||||
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
images,
|
||||
images: PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,54 +128,55 @@ impl FlacExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = match &self.item {
|
||||
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
|
||||
Item::S3 { key, .. } => key.to_string(),
|
||||
};
|
||||
|
||||
if !key.ends_with(".flac") {
|
||||
let _ = self.output.set(HashMap::new());
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
}
|
||||
trace!(
|
||||
message = "Reading FLAC tags",
|
||||
key = self.item.key().as_str()
|
||||
);
|
||||
|
||||
let key = self.item.key();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let raw_tags = tokio::task::spawn_blocking(move || {
|
||||
let output = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
let mut tags: Vec<(String, String)> = Vec::new();
|
||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||
|
||||
for block in reader {
|
||||
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||
FlacBlock::VorbisComment(comment) => {
|
||||
match block {
|
||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
||||
Ok(FlacBlock::VorbisComment(comment)) => {
|
||||
for (k, v) in comment.comment.comments {
|
||||
tags.push((k.to_string().to_lowercase(), v.into()));
|
||||
if let Some(label) = Label::new(k.to_string().to_lowercase()) {
|
||||
output
|
||||
.entry(label)
|
||||
.or_default()
|
||||
.push(PileValue::String(Arc::new(v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
FlacBlock::AudioFrame(_) => break,
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC metadata",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok::<_, std::io::Error>(tags)
|
||||
|
||||
let output: HashMap<Label, PileValue> = output
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||
.collect();
|
||||
|
||||
Ok::<HashMap<Label, PileValue>, std::io::Error>(output)
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
.await??;
|
||||
|
||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||
for (k, v) in raw_tags {
|
||||
if let Some(label) = Label::new(k) {
|
||||
output
|
||||
.entry(label)
|
||||
.or_default()
|
||||
.push(PileValue::String(Arc::new(v.into())));
|
||||
}
|
||||
}
|
||||
let output: HashMap<Label, PileValue> = output
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||
.collect();
|
||||
|
||||
let _ = self.output.set(output);
|
||||
#[expect(clippy::unwrap_used)]
|
||||
return Ok(self.output.get().unwrap());
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,20 +191,21 @@ impl ObjectExtractor for FlacExtractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if name.as_str() == "images"
|
||||
&& let Some(ref images) = self.images
|
||||
{
|
||||
return Ok(Some(images.clone()));
|
||||
if name.as_str() == "images" {
|
||||
return Ok(Some(self.images.clone()));
|
||||
}
|
||||
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
|
||||
if self.images.is_some() {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
fields.push(Label::new("images").unwrap());
|
||||
}
|
||||
Ok(fields)
|
||||
Ok(self
|
||||
.get_inner()
|
||||
.await?
|
||||
.keys()
|
||||
.cloned()
|
||||
.chain([Label::new("images").unwrap()])
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
@@ -5,11 +9,6 @@ use std::{
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct FsExtractor {
|
||||
item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
@@ -32,6 +31,26 @@ impl FsExtractor {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
};
|
||||
|
||||
let mut root = false;
|
||||
let components = path
|
||||
.components()
|
||||
.map(|x| match x {
|
||||
Component::CurDir => None,
|
||||
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
|
||||
Component::ParentDir => Some("..".to_owned()),
|
||||
Component::RootDir => {
|
||||
root = true;
|
||||
None
|
||||
}
|
||||
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
|
||||
})
|
||||
.collect::<Option<Vec<_>>>();
|
||||
|
||||
let mut path_str = components.as_ref().map(|x| x.join("/"));
|
||||
if root {
|
||||
path_str = path_str.map(|x| format!("/{x}"));
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([
|
||||
(
|
||||
@@ -43,23 +62,20 @@ impl FsExtractor {
|
||||
),
|
||||
(
|
||||
Label::new("path").unwrap(),
|
||||
path.to_str()
|
||||
path_str
|
||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
(
|
||||
Label::new("segments").unwrap(),
|
||||
path.components()
|
||||
.map(|x| match x {
|
||||
Component::CurDir => Some(".".to_owned()),
|
||||
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
|
||||
Component::ParentDir => Some("..".to_owned()),
|
||||
Component::RootDir => Some("/".to_owned()),
|
||||
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
|
||||
components
|
||||
.map(|x| {
|
||||
PileValue::Array(Arc::new(
|
||||
x.iter()
|
||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||
.collect(),
|
||||
))
|
||||
})
|
||||
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
|
||||
.collect::<Option<Vec<_>>>()
|
||||
.map(|v| PileValue::Array(Arc::new(v)))
|
||||
.unwrap_or(PileValue::Null),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::{
|
||||
io::BufReader,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ObjectExtractor,
|
||||
@@ -30,32 +31,29 @@ impl Id3Extractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let ext = key.as_str().rsplit('.').next();
|
||||
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
trace!(message = "Reading id3 tags", key = self.item.key().as_str());
|
||||
|
||||
let key = self.item.key();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||
.await
|
||||
{
|
||||
Ok(Ok(tag)) => tag,
|
||||
|
||||
Ok(Err(id3::Error {
|
||||
kind: id3::ErrorKind::NoTag,
|
||||
..
|
||||
})) => {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
|
||||
Err(e) => return Err(e.into()),
|
||||
Ok(Err(id3::Error {
|
||||
kind: id3::ErrorKind::Io(e),
|
||||
..
|
||||
})) => return Err(e),
|
||||
|
||||
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
|
||||
Err(e) => return Err(e.into()),
|
||||
Ok(Err(error)) => {
|
||||
trace!(
|
||||
message = "Could not parse id3 tags",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
|
||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::Arc;
|
||||
use tracing::trace;
|
||||
|
||||
#[cfg(feature = "pdfium")]
|
||||
mod pdf_pages;
|
||||
@@ -42,6 +43,12 @@ impl ObjectExtractor for PdfExtractor {
|
||||
name: &pile_config::Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.text.item.key().as_str(),
|
||||
"Getting field {name:?} from PdfExtractor",
|
||||
);
|
||||
|
||||
match (name.as_str(), args) {
|
||||
("text", args) => self.text.field(name, args).await,
|
||||
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||
@@ -57,8 +64,6 @@ impl ObjectExtractor for PdfExtractor {
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("cover").unwrap(),
|
||||
#[cfg(feature = "pdfium")]
|
||||
Label::new("pages").unwrap(),
|
||||
])
|
||||
}
|
||||
|
||||
@@ -35,6 +35,11 @@ impl PdfPagesExtractor {
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for PdfPagesExtractor {
|
||||
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
"Getting index {idx} from PdfPagesExtractor",
|
||||
);
|
||||
|
||||
let bytes = self.get_bytes().await?;
|
||||
let png = tokio::task::spawn_blocking(move || {
|
||||
let pdfium = Pdfium::default();
|
||||
|
||||
@@ -14,7 +14,7 @@ use crate::{
|
||||
};
|
||||
|
||||
pub struct PdfTextExtractor {
|
||||
item: Item,
|
||||
pub(super) item: Item,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use pile_config::Label;
|
||||
use std::sync::OnceLock;
|
||||
use tracing::trace;
|
||||
|
||||
use super::TomlExtractor;
|
||||
use crate::{
|
||||
@@ -28,6 +29,12 @@ impl ObjectExtractor for SidecarExtractor {
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.item.key().as_str(),
|
||||
"Getting field {name:?} from SidecarExtractor",
|
||||
);
|
||||
|
||||
match self
|
||||
.output
|
||||
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||
|
||||
@@ -41,13 +41,7 @@ impl TomlExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let mut reader = match self.item.read().await {
|
||||
Ok(r) => r,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
let mut reader = self.item.read().await?;
|
||||
let bytes = reader.read_to_end().await?;
|
||||
let toml: toml::Value = match toml::from_slice(&bytes) {
|
||||
Ok(x) => x,
|
||||
|
||||
@@ -79,6 +79,7 @@ impl ObjectExtractor for StringExtractor {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[expect(clippy::expect_used)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -7,6 +7,12 @@ pub trait ObjectExtractor: Send + Sync {
|
||||
/// Get the field at `name` from `item`.
|
||||
/// - returns `None` if `name` is not a valid field
|
||||
/// - returns `Some(Null)` if `name` is not available
|
||||
///
|
||||
/// For extractors that parse binary, this fn should return
|
||||
/// an error only if we failed to obtain the data we need (permission denied, etc).
|
||||
///
|
||||
/// If the underlying data has an invalid format (e.g., running a pdf extractor on a non-pdf file),
|
||||
/// this fn should return `Ok(Some(Null))`.
|
||||
async fn field(
|
||||
&self,
|
||||
name: &pile_config::Label,
|
||||
|
||||
@@ -199,30 +199,14 @@ impl PileValue {
|
||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
Self::Array(_) | Self::ListExtractor(_) => {
|
||||
let e = self.list_extractor();
|
||||
let len = e.len().await?;
|
||||
let mut arr = Vec::new();
|
||||
for i in 0..len {
|
||||
let v = e.get(i).await?.expect("item must be present");
|
||||
arr.push(Box::pin(v.to_json()).await?);
|
||||
}
|
||||
Value::Array(arr)
|
||||
return e.to_json().await;
|
||||
}
|
||||
|
||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||
let e = self.object_extractor();
|
||||
let keys = e.fields().await?;
|
||||
let mut map = Map::new();
|
||||
for k in &keys {
|
||||
let v = match e.field(k, None).await? {
|
||||
Some(x) => x,
|
||||
None => continue,
|
||||
};
|
||||
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
|
||||
}
|
||||
Value::Object(map)
|
||||
return e.to_json().await;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user