Add pdf extractors
This commit is contained in:
@@ -22,6 +22,11 @@ impl<'a> FlacExtractor<'a> {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
// If this isn't a flac file, ignore it.
|
||||
if self.item.path.extension().map(|x| x.to_str()).flatten() != Some("flac") {
|
||||
return Ok(self.output.get_or_init(|| HashMap::new()));
|
||||
}
|
||||
|
||||
let file = File::open(&self.item.path)?;
|
||||
let reader = FlacReader::new(BufReader::new(file));
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::collections::HashMap;
|
||||
use crate::{Item, PileValue, extract::Extractor};
|
||||
|
||||
pub struct MapExtractor<'a, I: Item> {
|
||||
pub(super) inner: HashMap<Label, PileValue<'a, I>>,
|
||||
pub(crate) inner: HashMap<Label, PileValue<'a, I>>,
|
||||
}
|
||||
|
||||
impl<I: Item> Extractor<I> for MapExtractor<'_, I> {
|
||||
|
||||
@@ -7,6 +7,9 @@ pub use flac::*;
|
||||
mod fs;
|
||||
pub use fs::*;
|
||||
|
||||
mod pdf;
|
||||
pub use pdf::*;
|
||||
|
||||
mod sidecar;
|
||||
pub use sidecar::*;
|
||||
|
||||
@@ -49,6 +52,10 @@ impl<'a> MetaExtractor<'a, crate::FileItem> {
|
||||
Label::new("fs").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(FsExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("pdf").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(PdfExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("sidecar").unwrap(),
|
||||
crate::PileValue::Extractor(Rc::new(SidecarExtractor::new(item))),
|
||||
@@ -73,6 +80,7 @@ impl Extractor<crate::FileItem> for MetaExtractor<'_, crate::FileItem> {
|
||||
return Ok(vec![
|
||||
Label::new("flac").unwrap(),
|
||||
Label::new("fs").unwrap(),
|
||||
Label::new("pdf").unwrap(),
|
||||
Label::new("sidecar").unwrap(),
|
||||
]);
|
||||
}
|
||||
|
||||
61
crates/pile-dataset/src/extract/pdf/mod.rs
Normal file
61
crates/pile-dataset/src/extract/pdf/mod.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, rc::Rc};
|
||||
|
||||
mod pdf_meta;
|
||||
pub use pdf_meta::*;
|
||||
|
||||
mod pdf_text;
|
||||
pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
FileItem, PileValue,
|
||||
extract::{Extractor, MapExtractor},
|
||||
};
|
||||
|
||||
pub struct PdfExtractor<'a> {
|
||||
inner: MapExtractor<'a, FileItem>,
|
||||
}
|
||||
|
||||
impl<'a> PdfExtractor<'a> {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
Label::new("text").unwrap(),
|
||||
PileValue::Extractor(Rc::new(PdfTextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("meta").unwrap(),
|
||||
PileValue::Extractor(Rc::new(PdfMetaExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfExtractor<'_> {
|
||||
fn field<'a>(
|
||||
&'a self,
|
||||
name: &pile_config::Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
if name.as_str() == "text" {
|
||||
match self.inner.inner.get(name).unwrap() {
|
||||
PileValue::Extractor(x) => return x.field(name),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
self.inner.field(name)
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(vec![
|
||||
Label::new("text").unwrap(),
|
||||
Label::new("meta").unwrap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
98
crates/pile-dataset/src/extract/pdf/pdf_meta.rs
Normal file
98
crates/pile-dataset/src/extract/pdf/pdf_meta.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
use pdf::file::FileOptions;
|
||||
use pdf::primitive::{Date, TimeRel};
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
|
||||
pub struct PdfMetaExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfMetaExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let file = FileOptions::cached()
|
||||
.open(&self.item.path)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let mut output: HashMap<Label, PileValue<'a, FileItem>> = HashMap::new();
|
||||
|
||||
if let Some(info) = &file.trailer.info_dict {
|
||||
let fields: &[(&str, Option<&_>)] = &[
|
||||
("title", info.title.as_ref()),
|
||||
("author", info.author.as_ref()),
|
||||
("subject", info.subject.as_ref()),
|
||||
("keywords", info.keywords.as_ref()),
|
||||
("creator", info.creator.as_ref()),
|
||||
("producer", info.producer.as_ref()),
|
||||
];
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
for (key, val) in fields {
|
||||
let label = Label::new(*key).unwrap();
|
||||
let value = match val {
|
||||
Some(s) => PileValue::String(s.to_string_lossy().into()),
|
||||
None => PileValue::Null,
|
||||
};
|
||||
output.insert(label, value);
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
{
|
||||
output.insert(
|
||||
Label::new("creation_date").unwrap(),
|
||||
info.creation_date
|
||||
.as_ref()
|
||||
.map(|d| PileValue::String(format_date(d).into()))
|
||||
.unwrap_or(PileValue::Null),
|
||||
);
|
||||
output.insert(
|
||||
Label::new("mod_date").unwrap(),
|
||||
info.mod_date
|
||||
.as_ref()
|
||||
.map(|d| PileValue::String(format_date(d).into()))
|
||||
.unwrap_or(PileValue::Null),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
fn format_date(d: &Date) -> String {
|
||||
let tz = match d.rel {
|
||||
TimeRel::Universal => "Z".to_owned(),
|
||||
TimeRel::Later => format!("+{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
TimeRel::Earlier => format!("-{:02}:{:02}", d.tz_hour, d.tz_minute),
|
||||
};
|
||||
format!(
|
||||
"{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
|
||||
d.year, d.month, d.day, d.hour, d.minute, d.second, tz
|
||||
)
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfMetaExtractor<'_> {
|
||||
fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
79
crates/pile-dataset/src/extract/pdf/pdf_text.rs
Normal file
79
crates/pile-dataset/src/extract/pdf/pdf_text.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use pdf::content::{Op, TextDrawAdjusted};
|
||||
use pdf::file::FileOptions;
|
||||
use pile_config::Label;
|
||||
use std::{collections::HashMap, sync::OnceLock};
|
||||
|
||||
use crate::{FileItem, PileValue, extract::Extractor};
|
||||
|
||||
pub struct PdfTextExtractor<'a> {
|
||||
item: &'a FileItem,
|
||||
output: OnceLock<HashMap<Label, PileValue<'a, FileItem>>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfTextExtractor<'a> {
|
||||
pub fn new(item: &'a FileItem) -> Self {
|
||||
Self {
|
||||
item,
|
||||
output: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue<'a, FileItem>>, std::io::Error> {
|
||||
if let Some(x) = self.output.get() {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let file = FileOptions::cached()
|
||||
.open(&self.item.path)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
let mut text_parts: Vec<String> = Vec::new();
|
||||
|
||||
for page in file.pages() {
|
||||
let page = page
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||
|
||||
if let Some(content) = &page.contents {
|
||||
let ops = content.operations(&file.resolver()).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
for op in ops {
|
||||
match op {
|
||||
Op::TextDraw { text } => {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
Op::TextDrawAdjusted { array } => {
|
||||
for item in array {
|
||||
if let TextDrawAdjusted::Text(text) = item {
|
||||
text_parts.push(text.to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let text = text_parts.join(" ");
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let output = HashMap::from([(Label::new("text").unwrap(), PileValue::String(text.into()))]);
|
||||
|
||||
return Ok(self.output.get_or_init(|| output));
|
||||
}
|
||||
}
|
||||
|
||||
impl Extractor<FileItem> for PdfTextExtractor<'_> {
|
||||
fn field<'a>(
|
||||
&'a self,
|
||||
name: &Label,
|
||||
) -> Result<Option<&'a PileValue<'a, FileItem>>, std::io::Error> {
|
||||
Ok(self.get_inner()?.get(name))
|
||||
}
|
||||
|
||||
fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner()?.keys().cloned().collect())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user