Compare commits
1 Commits
8a8e0a2770
...
e7afca3010
| Author | SHA1 | Date | |
|---|---|---|---|
| e7afca3010 |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2574,6 +2574,7 @@ dependencies = [
|
|||||||
"pdfium-render",
|
"pdfium-render",
|
||||||
"pile-config",
|
"pile-config",
|
||||||
"pile-flac",
|
"pile-flac",
|
||||||
|
"regex",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"smartstring",
|
"smartstring",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
|||||||
@@ -127,6 +127,7 @@ mime = "0.3.17"
|
|||||||
mime_guess = "2.0.5"
|
mime_guess = "2.0.5"
|
||||||
paste = "1.0.15"
|
paste = "1.0.15"
|
||||||
smartstring = "1.0.1"
|
smartstring = "1.0.1"
|
||||||
|
regex = "1"
|
||||||
chrono = "0.4.43"
|
chrono = "0.4.43"
|
||||||
parking_lot = "0.12.5"
|
parking_lot = "0.12.5"
|
||||||
rayon = "1.11.0"
|
rayon = "1.11.0"
|
||||||
|
|||||||
37
README.md
Normal file
37
README.md
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# TODO
|
||||||
|
|
||||||
|
- redo sidecars
|
||||||
|
- sidecar manipulation commands:
|
||||||
|
- fill (defaults, defined in toml)
|
||||||
|
- delete
|
||||||
|
- writing to files would be nice, but we want to be able to revert changes
|
||||||
|
- make sure to `upload` sidecars
|
||||||
|
- json extractor
|
||||||
|
|
||||||
|
- incremental index
|
||||||
|
- better errors (s3 not found)
|
||||||
|
|
||||||
|
## later
|
||||||
|
|
||||||
|
- search ui (betalupi books + handouts)
|
||||||
|
- lists
|
||||||
|
- export
|
||||||
|
- libgen search?
|
||||||
|
- remote encryption
|
||||||
|
- publish (arch, nix, crates)
|
||||||
|
|
||||||
|
- extractors
|
||||||
|
- ogg
|
||||||
|
- regex (from any string, filename, etc)
|
||||||
|
- whisper (speech-to-text)
|
||||||
|
- ocr (pdf pages, pass through fn!)
|
||||||
|
- list, fields on schemas instead of fields
|
||||||
|
- blobs as items or items as blobs?
|
||||||
|
- default args for each label (to_json, count)?
|
||||||
|
- which fields to include in json?
|
||||||
|
- nonempty (array)
|
||||||
|
|
||||||
|
- redo docs
|
||||||
|
- source types
|
||||||
|
- sidecars (rename?)
|
||||||
|
- doc all keys
|
||||||
@@ -18,6 +18,7 @@ tracing = { workspace = true }
|
|||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
toml = { workspace = true }
|
toml = { workspace = true }
|
||||||
smartstring = { workspace = true }
|
smartstring = { workspace = true }
|
||||||
|
regex = { workspace = true }
|
||||||
blake3 = { workspace = true }
|
blake3 = { workspace = true }
|
||||||
epub = { workspace = true }
|
epub = { workspace = true }
|
||||||
kamadak-exif = { workspace = true }
|
kamadak-exif = { workspace = true }
|
||||||
|
|||||||
@@ -28,6 +28,9 @@ pub use toml::*;
|
|||||||
mod group;
|
mod group;
|
||||||
pub use group::*;
|
pub use group::*;
|
||||||
|
|
||||||
|
mod text;
|
||||||
|
pub use text::*;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::{
|
extract::{
|
||||||
misc::MapExtractor,
|
misc::MapExtractor,
|
||||||
@@ -77,6 +80,10 @@ impl ItemExtractor {
|
|||||||
Label::new("toml").unwrap(),
|
Label::new("toml").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
Label::new("text").unwrap(),
|
||||||
|
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
Label::new("groups").unwrap(),
|
Label::new("groups").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
|
||||||
@@ -110,6 +117,7 @@ impl ObjectExtractor for ItemExtractor {
|
|||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
Label::new("json").unwrap(),
|
Label::new("json").unwrap(),
|
||||||
Label::new("toml").unwrap(),
|
Label::new("toml").unwrap(),
|
||||||
|
Label::new("text").unwrap(),
|
||||||
Label::new("groups").unwrap(),
|
Label::new("groups").unwrap(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|||||||
67
crates/pile-value/src/extract/item/text.rs
Normal file
67
crates/pile-value/src/extract/item/text.rs
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::sync::{Arc, OnceLock};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
|
value::{AsyncReader, Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct TextExtractor {
|
||||||
|
item: Item,
|
||||||
|
output: OnceLock<PileValue>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TextExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for TextExtractor {
|
||||||
|
async fn field(
|
||||||
|
&self,
|
||||||
|
state: &ExtractState,
|
||||||
|
name: &Label,
|
||||||
|
args: Option<&str>,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
if args.is_some() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !state.ignore_mime
|
||||||
|
&& (self.item.mime().type_() != mime::TEXT
|
||||||
|
&& self.item.mime().type_() != mime::APPLICATION)
|
||||||
|
{
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
if name.as_str() != "text" {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(Some(x.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut reader = self.item.read().await?;
|
||||||
|
let bytes = reader.read_to_end().await?;
|
||||||
|
let string = String::from_utf8(bytes).ok();
|
||||||
|
let value = match string {
|
||||||
|
Some(x) => PileValue::String(Arc::new(x.into())),
|
||||||
|
None => PileValue::Null,
|
||||||
|
};
|
||||||
|
|
||||||
|
return Ok(Some(self.output.get_or_init(|| value).clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(vec![Label::new("text").unwrap()])
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
pub mod item;
|
pub mod item;
|
||||||
pub mod misc;
|
pub mod misc;
|
||||||
|
pub mod regex;
|
||||||
pub mod string;
|
pub mod string;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|||||||
107
crates/pile-value/src/extract/regex.rs
Normal file
107
crates/pile-value/src/extract/regex.rs
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use pile_config::Label;
|
||||||
|
use regex::Regex;
|
||||||
|
use smartstring::{LazyCompact, SmartString};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||||
|
value::PileValue,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct RegexData {
|
||||||
|
regex: Arc<Regex>,
|
||||||
|
/// Captured substrings indexed by group index (0 = whole match).
|
||||||
|
captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexData {
|
||||||
|
fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
||||||
|
let caps = regex.captures(input)?;
|
||||||
|
let captures = caps
|
||||||
|
.iter()
|
||||||
|
.map(|m| m.map(|m| Arc::new(m.as_str().into())))
|
||||||
|
.collect();
|
||||||
|
Some(Self { regex, captures })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Exposes named capture groups as object fields.
|
||||||
|
pub struct RegexExtractor(Arc<RegexData>);
|
||||||
|
|
||||||
|
impl RegexExtractor {
|
||||||
|
/// Run `regex` against `input`. Returns `None` if there is no match.
|
||||||
|
pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
||||||
|
Some(Self(Arc::new(RegexData::new(regex, input)?)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for RegexExtractor {
|
||||||
|
async fn field(
|
||||||
|
&self,
|
||||||
|
_state: &ExtractState,
|
||||||
|
name: &Label,
|
||||||
|
args: Option<&str>,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
if args.is_some() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(idx) = self
|
||||||
|
.0
|
||||||
|
.regex
|
||||||
|
.capture_names()
|
||||||
|
.position(|n| n == Some(name.as_str()))
|
||||||
|
else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Some(
|
||||||
|
match self.0.captures.get(idx).and_then(|v| v.as_ref()) {
|
||||||
|
Some(s) => PileValue::String(s.clone()),
|
||||||
|
None => PileValue::Null,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
Ok(self
|
||||||
|
.0
|
||||||
|
.regex
|
||||||
|
.capture_names()
|
||||||
|
.flatten()
|
||||||
|
.map(|n| Label::new(n).unwrap())
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
|
||||||
|
Some(Arc::new(RegexListExtractor(self.0.clone())))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Exposes unnamed (positional) capture groups as a list (group 0 excluded).
|
||||||
|
pub struct RegexListExtractor(Arc<RegexData>);
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ListExtractor for RegexListExtractor {
|
||||||
|
async fn get(
|
||||||
|
&self,
|
||||||
|
_state: &ExtractState,
|
||||||
|
idx: usize,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
let raw_idx = idx + 1;
|
||||||
|
let Some(slot) = self.0.captures.get(raw_idx) else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
Ok(Some(match slot {
|
||||||
|
Some(s) => PileValue::String(s.clone()),
|
||||||
|
None => PileValue::Null,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
|
||||||
|
Ok(self.0.captures.len().saturating_sub(1))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,9 +1,13 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
|
use regex::Regex;
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::{
|
||||||
|
regex::RegexExtractor,
|
||||||
|
traits::{ExtractState, ObjectExtractor},
|
||||||
|
},
|
||||||
value::PileValue,
|
value::PileValue,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -67,6 +71,16 @@ impl ObjectExtractor for StringExtractor {
|
|||||||
.collect(),
|
.collect(),
|
||||||
))),
|
))),
|
||||||
|
|
||||||
|
("regex", Some(pattern)) => {
|
||||||
|
let Ok(re) = Regex::new(pattern) else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
Some(match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
|
||||||
|
Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
|
||||||
|
None => PileValue::Null,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
_ => None,
|
_ => None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -78,6 +92,9 @@ impl ObjectExtractor for StringExtractor {
|
|||||||
Label::new("upper").unwrap(),
|
Label::new("upper").unwrap(),
|
||||||
Label::new("lower").unwrap(),
|
Label::new("lower").unwrap(),
|
||||||
Label::new("nonempty").unwrap(),
|
Label::new("nonempty").unwrap(),
|
||||||
|
Label::new("trimprefix").unwrap(),
|
||||||
|
Label::new("trimsuffix").unwrap(),
|
||||||
|
Label::new("split").unwrap(),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,6 +35,11 @@ pub trait ObjectExtractor: Send + Sync {
|
|||||||
/// and [None] for all others.
|
/// and [None] for all others.
|
||||||
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
||||||
|
|
||||||
|
/// Return a list view of this extractor, if supported.
|
||||||
|
fn as_list(&self) -> Option<std::sync::Arc<dyn ListExtractor>> {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert this to a JSON value.
|
/// Convert this to a JSON value.
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
||||||
let keys = self.fields().await?;
|
let keys = self.fields().await?;
|
||||||
|
|||||||
@@ -86,7 +86,9 @@ impl PileValue {
|
|||||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||||
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
||||||
Self::ListExtractor(e) => e.clone(),
|
Self::ListExtractor(e) => e.clone(),
|
||||||
Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()),
|
Self::ObjectExtractor(e) => {
|
||||||
|
e.as_list().unwrap_or_else(|| Arc::new(VecExtractor::default()))
|
||||||
|
}
|
||||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user