Refactor errors
Some checks failed
CI / Typos (push) Successful in 28s
CI / Clippy (push) Failing after 1m21s
CI / Build and test (all features) (push) Successful in 4m18s
CI / Build and test (push) Successful in 6m10s

This commit is contained in:
2026-03-12 23:04:59 -07:00
parent 15e56d895c
commit 95a547045d
17 changed files with 192 additions and 161 deletions

1
Cargo.lock generated
View File

@@ -2559,6 +2559,7 @@ dependencies = [
name = "pile-value"
version = "0.0.2"
dependencies = [
"anyhow",
"async-trait",
"aws-sdk-s3",
"blake3",

View File

@@ -11,6 +11,7 @@ workspace = true
pile-config = { workspace = true }
pile-flac = { workspace = true }
anyhow = { workspace = true }
serde_json = { workspace = true }
walkdir = { workspace = true }
tracing = { workspace = true }

View File

@@ -29,16 +29,9 @@ impl EpubMetaExtractor {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let doc = EpubDoc::from_reader(reader)?;
let fields: &[&'static str] = &[
"title",
@@ -54,17 +47,19 @@ impl EpubMetaExtractor {
let meta: Vec<(&'static str, Option<String>)> =
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
Ok::<_, std::io::Error>(meta)
Ok::<_, anyhow::Error>(meta)
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_meta = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x),
Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
},
};
let mut output: HashMap<Label, PileValue> = HashMap::new();

View File

@@ -4,7 +4,7 @@ use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::debug;
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
@@ -29,16 +29,9 @@ impl EpubTextExtractor {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_text = tokio::task::spawn_blocking(move || {
let mut doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut doc = EpubDoc::from_reader(reader)?;
let mut text_parts: Vec<String> = Vec::new();
@@ -51,17 +44,19 @@ impl EpubTextExtractor {
}
}
Ok::<_, std::io::Error>(text_parts.join(" "))
Ok::<_, anyhow::Error>(text_parts.join(" "))
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x),
Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
},
};
#[expect(clippy::unwrap_used)]

View File

@@ -34,7 +34,13 @@ impl ObjectExtractor for EpubExtractor {
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
match (name.as_str(), args) {
("text", args) => self.text.field(name, args).await,
("text", args) => Ok(Some(
self.text
.field(name, args)
.await
.map(|x| x.unwrap_or(PileValue::Null))?,
)),
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
_ => Ok(None),
}

View File

@@ -32,9 +32,7 @@ impl ExifExtractor {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_fields = tokio::task::spawn_blocking(move || {
let mut br = BufReader::new(reader);
let exif = exif::Reader::new()
.read_from_container(&mut br)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let exif = exif::Reader::new().read_from_container(&mut br)?;
let fields: Vec<(String, String)> = exif
.fields()
@@ -46,13 +44,13 @@ impl ExifExtractor {
})
.collect();
Ok::<_, std::io::Error>(fields)
Ok::<_, exif::Error>(fields)
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_fields = match raw_fields {
Ok(x) => x,
Err(exif::Error::Io(x)) => return Err(x),
Err(error) => {
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
@@ -65,6 +63,7 @@ impl ExifExtractor {
let Some(label) = tag_to_label(&tag_name) else {
continue;
};
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
output
.entry(label)
@@ -91,6 +90,12 @@ impl ObjectExtractor for ExifExtractor {
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
key = self.item.key().as_str(),
"Getting field {name:?} from ExifExtractor",
);
if args.is_some() {
return Ok(None);
}

View File

@@ -1,11 +1,12 @@
use mime::Mime;
use pile_config::Label;
use pile_flac::{FlacBlock, FlacReader};
use pile_flac::{FlacBlock, FlacDecodeError, FlacReader};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::{ListExtractor, ObjectExtractor},
@@ -31,16 +32,17 @@ impl FlacImagesExtractor {
let reader = FlacReader::new(BufReader::new(reader));
let mut count = 0usize;
for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::AudioFrame(_) => break,
FlacBlock::Picture(_) => count += 1,
match block {
Ok(FlacBlock::AudioFrame(_)) => break,
Ok(FlacBlock::Picture(_)) => count += 1,
Err(FlacDecodeError::IoError(err)) => return Err(err),
Err(_) => return Ok(0),
_ => {}
}
}
Ok::<_, std::io::Error>(count)
})
.await
.map_err(std::io::Error::other)??;
.await??;
return Ok(count);
}
@@ -49,14 +51,20 @@ impl FlacImagesExtractor {
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor {
async fn get<'a>(&'a self, mut idx: usize) -> Result<Option<PileValue>, std::io::Error> {
trace!(
key = self.item.key().as_str(),
"Getting index {idx} from FlacImagesExtractor",
);
let key = self.item.key();
let reader = SyncReadBridge::new_current(self.item.read().await?);
let image = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader));
let mut out: Option<(Mime, Vec<u8>)> = None;
'blocks: for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::AudioFrame(_) => break,
FlacBlock::Picture(picture) => {
match block {
Ok(FlacBlock::AudioFrame(_)) => break,
Ok(FlacBlock::Picture(picture)) => {
if idx > 0 {
idx -= 1;
continue;
@@ -66,6 +74,16 @@ impl ListExtractor for FlacImagesExtractor {
break 'blocks;
}
Err(FlacDecodeError::IoError(err)) => return Err(err),
Err(error) => {
trace!(
message = "Could not parse FLAC images",
key = key.as_str(),
?error
);
return Ok(None);
}
_ => {}
}
}
@@ -93,23 +111,15 @@ impl ListExtractor for FlacImagesExtractor {
pub struct FlacExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
images: Option<PileValue>,
images: PileValue,
}
impl FlacExtractor {
pub fn new(item: &Item) -> Self {
let is_flac = match item {
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
Item::S3 { key, .. } => key.ends_with(".flac"),
};
let images =
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
Self {
item: item.clone(),
output: OnceLock::new(),
images,
images: PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))),
}
}
@@ -118,54 +128,55 @@ impl FlacExtractor {
return Ok(x);
}
let key = match &self.item {
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
Item::S3 { key, .. } => key.to_string(),
};
if !key.ends_with(".flac") {
let _ = self.output.set(HashMap::new());
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
}
trace!(
message = "Reading FLAC tags",
key = self.item.key().as_str()
);
let key = self.item.key();
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_tags = tokio::task::spawn_blocking(move || {
let output = tokio::task::spawn_blocking(move || {
let reader = FlacReader::new(BufReader::new(reader));
let mut tags: Vec<(String, String)> = Vec::new();
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for block in reader {
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
FlacBlock::VorbisComment(comment) => {
match block {
Ok(FlacBlock::AudioFrame(_)) => break,
Ok(FlacBlock::VorbisComment(comment)) => {
for (k, v) in comment.comment.comments {
tags.push((k.to_string().to_lowercase(), v.into()));
if let Some(label) = Label::new(k.to_string().to_lowercase()) {
output
.entry(label)
.or_default()
.push(PileValue::String(Arc::new(v)));
}
}
}
FlacBlock::AudioFrame(_) => break,
Err(FlacDecodeError::IoError(err)) => return Err(err),
Err(error) => {
trace!(
message = "Could not parse FLAC metadata",
key = key.as_str(),
?error
);
return Ok(HashMap::new());
}
_ => {}
}
}
Ok::<_, std::io::Error>(tags)
let output: HashMap<Label, PileValue> = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
Ok::<HashMap<Label, PileValue>, std::io::Error>(output)
})
.await
.map_err(std::io::Error::other)??;
.await??;
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for (k, v) in raw_tags {
if let Some(label) = Label::new(k) {
output
.entry(label)
.or_default()
.push(PileValue::String(Arc::new(v.into())));
}
}
let output: HashMap<Label, PileValue> = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
let _ = self.output.set(output);
#[expect(clippy::unwrap_used)]
return Ok(self.output.get().unwrap());
return Ok(self.output.get_or_init(|| output));
}
}
@@ -180,20 +191,21 @@ impl ObjectExtractor for FlacExtractor {
return Ok(None);
}
if name.as_str() == "images"
&& let Some(ref images) = self.images
{
return Ok(Some(images.clone()));
if name.as_str() == "images" {
return Ok(Some(self.images.clone()));
}
Ok(self.get_inner().await?.get(name).cloned())
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
if self.images.is_some() {
#[expect(clippy::unwrap_used)]
fields.push(Label::new("images").unwrap());
}
Ok(fields)
Ok(self
.get_inner()
.await?
.keys()
.cloned()
.chain([Label::new("images").unwrap()])
.collect::<Vec<_>>())
}
}

View File

@@ -1,3 +1,7 @@
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
use pile_config::Label;
use std::{
collections::HashMap,
@@ -5,11 +9,6 @@ use std::{
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::ObjectExtractor,
value::{Item, PileValue},
};
pub struct FsExtractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
@@ -32,6 +31,26 @@ impl FsExtractor {
return Ok(self.output.get_or_init(HashMap::new));
};
let mut root = false;
let components = path
.components()
.map(|x| match x {
Component::CurDir => None,
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
Component::ParentDir => Some("..".to_owned()),
Component::RootDir => {
root = true;
None
}
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
})
.collect::<Option<Vec<_>>>();
let mut path_str = components.as_ref().map(|x| x.join("/"));
if root {
path_str = path_str.map(|x| format!("/{x}"));
}
#[expect(clippy::unwrap_used)]
let output = HashMap::from([
(
@@ -43,23 +62,20 @@ impl FsExtractor {
),
(
Label::new("path").unwrap(),
path.to_str()
path_str
.map(|x| PileValue::String(Arc::new(x.into())))
.unwrap_or(PileValue::Null),
),
(
Label::new("segments").unwrap(),
path.components()
.map(|x| match x {
Component::CurDir => Some(".".to_owned()),
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
Component::ParentDir => Some("..".to_owned()),
Component::RootDir => Some("/".to_owned()),
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
components
.map(|x| {
PileValue::Array(Arc::new(
x.iter()
.map(|x| PileValue::String(Arc::new(x.into())))
.collect(),
))
})
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
.collect::<Option<Vec<_>>>()
.map(|v| PileValue::Array(Arc::new(v)))
.unwrap_or(PileValue::Null),
),
]);

View File

@@ -6,6 +6,7 @@ use std::{
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::ObjectExtractor,
@@ -30,32 +31,29 @@ impl Id3Extractor {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
return Ok(self.output.get_or_init(HashMap::new));
}
trace!(message = "Reading id3 tags", key = self.item.key().as_str());
let key = self.item.key();
let reader = SyncReadBridge::new_current(self.item.read().await?);
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
.await
{
Ok(Ok(tag)) => tag,
Ok(Err(id3::Error {
kind: id3::ErrorKind::NoTag,
..
})) => {
return Ok(self.output.get_or_init(HashMap::new));
}
Err(e) => return Err(e.into()),
Ok(Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
})) => return Err(e),
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
Err(e) => return Err(e.into()),
Ok(Err(error)) => {
trace!(
message = "Could not parse id3 tags",
key = key.as_str(),
?error
);
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();

View File

@@ -1,5 +1,6 @@
use pile_config::Label;
use std::sync::Arc;
use tracing::trace;
#[cfg(feature = "pdfium")]
mod pdf_pages;
@@ -42,6 +43,12 @@ impl ObjectExtractor for PdfExtractor {
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
key = self.text.item.key().as_str(),
"Getting field {name:?} from PdfExtractor",
);
match (name.as_str(), args) {
("text", args) => self.text.field(name, args).await,
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
@@ -57,8 +64,6 @@ impl ObjectExtractor for PdfExtractor {
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("cover").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("pages").unwrap(),
])
}

View File

@@ -35,6 +35,11 @@ impl PdfPagesExtractor {
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
trace!(
key = self.item.key().as_str(),
"Getting index {idx} from PdfPagesExtractor",
);
let bytes = self.get_bytes().await?;
let png = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();

View File

@@ -14,7 +14,7 @@ use crate::{
};
pub struct PdfTextExtractor {
item: Item,
pub(super) item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}

View File

@@ -1,5 +1,6 @@
use pile_config::Label;
use std::sync::OnceLock;
use tracing::trace;
use super::TomlExtractor;
use crate::{
@@ -28,6 +29,12 @@ impl ObjectExtractor for SidecarExtractor {
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
key = self.item.key().as_str(),
"Getting field {name:?} from SidecarExtractor",
);
match self
.output
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))

View File

@@ -41,13 +41,7 @@ impl TomlExtractor {
return Ok(x);
}
let mut reader = match self.item.read().await {
Ok(r) => r,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
return Ok(self.output.get_or_init(HashMap::new));
}
Err(e) => return Err(e),
};
let mut reader = self.item.read().await?;
let bytes = reader.read_to_end().await?;
let toml: toml::Value = match toml::from_slice(&bytes) {
Ok(x) => x,

View File

@@ -79,6 +79,7 @@ impl ObjectExtractor for StringExtractor {
}
#[cfg(test)]
#[expect(clippy::expect_used)]
mod tests {
use super::*;

View File

@@ -7,6 +7,12 @@ pub trait ObjectExtractor: Send + Sync {
/// Get the field at `name` from `item`.
/// - returns `None` if `name` is not a valid field
/// - returns `Some(Null)` if `name` is not available
///
/// For extractors that parse binary, this fn should return
/// an error only if we failed to obtain the data we need (permission denied, etc).
///
/// If the underlying data has an invalid format (e.g., running a pdf extractor on a non-pdf file),
/// this fn should return `Ok(Some(Null))`.
async fn field(
&self,
name: &pile_config::Label,

View File

@@ -199,30 +199,14 @@ impl PileValue {
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
}
#[expect(clippy::expect_used)]
Self::Array(_) | Self::ListExtractor(_) => {
let e = self.list_extractor();
let len = e.len().await?;
let mut arr = Vec::new();
for i in 0..len {
let v = e.get(i).await?.expect("item must be present");
arr.push(Box::pin(v.to_json()).await?);
}
Value::Array(arr)
return e.to_json().await;
}
Self::ObjectExtractor(_) | Self::Item(_) => {
let e = self.object_extractor();
let keys = e.fields().await?;
let mut map = Map::new();
for k in &keys {
let v = match e.field(k, None).await? {
Some(x) => x,
None => continue,
};
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
}
Value::Object(map)
return e.to_json().await;
}
})
}