From e7afca3010a6e17f3fabdfc7db6e7e348ad539bd Mon Sep 17 00:00:00 2001 From: rm-dr <96270320+rm-dr@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:36:54 -0700 Subject: [PATCH] TMP --- Cargo.lock | 1 + Cargo.toml | 1 + README.md | 37 +++++++ crates/pile-value/Cargo.toml | 1 + crates/pile-value/src/extract/item/mod.rs | 8 ++ crates/pile-value/src/extract/item/text.rs | 67 +++++++++++++ crates/pile-value/src/extract/mod.rs | 1 + crates/pile-value/src/extract/regex.rs | 107 +++++++++++++++++++++ crates/pile-value/src/extract/string.rs | 19 +++- crates/pile-value/src/extract/traits.rs | 5 + crates/pile-value/src/value/value.rs | 4 +- 11 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 README.md create mode 100644 crates/pile-value/src/extract/item/text.rs create mode 100644 crates/pile-value/src/extract/regex.rs diff --git a/Cargo.lock b/Cargo.lock index 909351a..28a5767 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2574,6 +2574,7 @@ dependencies = [ "pdfium-render", "pile-config", "pile-flac", + "regex", "serde_json", "smartstring", "tokio", diff --git a/Cargo.toml b/Cargo.toml index bf33874..70ba53b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,6 +127,7 @@ mime = "0.3.17" mime_guess = "2.0.5" paste = "1.0.15" smartstring = "1.0.1" +regex = "1" chrono = "0.4.43" parking_lot = "0.12.5" rayon = "1.11.0" diff --git a/README.md b/README.md new file mode 100644 index 0000000..dca0236 --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +# TODO + +- redo sidecars + - sidecar manipulation commands: + - fill (defaults, defined in toml) + - delete + - writing to files would be nice, but we want to be able to revert changes + - make sure to `upload` sidecars + - json extractor + +- incremental index +- better errors (s3 not found) + +## later + +- search ui (betalupi books + handouts) +- lists +- export +- libgen search? 
+- remote encryption +- publish (arch, nix, crates) + +- extractors + - ogg + - regex (from any string, filename, etc) + - whisper (speech-to-text) + - ocr (pdf pages, pass through fn!) + - list, fields on schemas instead of fields + - blobs as items or items as blobs? + - default args for each label (to_json, count)? + - which fields to include in json? + - nonempty (array) + +- redo docs + - source types + - sidecars (rename?) + - doc all keys diff --git a/crates/pile-value/Cargo.toml b/crates/pile-value/Cargo.toml index c079484..6f96cd7 100644 --- a/crates/pile-value/Cargo.toml +++ b/crates/pile-value/Cargo.toml @@ -18,6 +18,7 @@ tracing = { workspace = true } chrono = { workspace = true } toml = { workspace = true } smartstring = { workspace = true } +regex = { workspace = true } blake3 = { workspace = true } epub = { workspace = true } kamadak-exif = { workspace = true } diff --git a/crates/pile-value/src/extract/item/mod.rs b/crates/pile-value/src/extract/item/mod.rs index 907d9d3..8496d25 100644 --- a/crates/pile-value/src/extract/item/mod.rs +++ b/crates/pile-value/src/extract/item/mod.rs @@ -28,6 +28,9 @@ pub use toml::*; mod group; pub use group::*; +mod text; +pub use text::*; + use crate::{ extract::{ misc::MapExtractor, @@ -77,6 +80,10 @@ impl ItemExtractor { Label::new("toml").unwrap(), PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))), ), + ( + Label::new("text").unwrap(), + PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))), + ), ( Label::new("groups").unwrap(), PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))), @@ -110,6 +117,7 @@ impl ObjectExtractor for ItemExtractor { Label::new("pdf").unwrap(), Label::new("json").unwrap(), Label::new("toml").unwrap(), + Label::new("text").unwrap(), Label::new("groups").unwrap(), ]); } diff --git a/crates/pile-value/src/extract/item/text.rs b/crates/pile-value/src/extract/item/text.rs new file mode 100644 index 0000000..c58b342 --- /dev/null +++ 
b/crates/pile-value/src/extract/item/text.rs
@@ -0,0 +1,67 @@
+use pile_config::Label;
+use std::sync::{Arc, OnceLock};
+
+use crate::{
+    extract::traits::{ExtractState, ObjectExtractor},
+    value::{AsyncReader, Item, PileValue},
+};
+
+/// Exposes the raw contents of an item as a single `text` field.
+pub struct TextExtractor {
+    item: Item,
+    /// Result of the first completed read: a string, or `Null` if not UTF-8.
+    output: OnceLock<PileValue>,
+}
+
+impl TextExtractor {
+    /// Create an extractor for `item`; nothing is read until `text` is requested.
+    pub fn new(item: &Item) -> Self {
+        Self {
+            item: item.clone(),
+            output: OnceLock::new(),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ObjectExtractor for TextExtractor {
+    async fn field(
+        &self,
+        state: &ExtractState,
+        name: &Label,
+        args: Option<&str>,
+    ) -> Result<Option<PileValue>, std::io::Error> {
+        // This extractor has exactly one field, and it takes no arguments.
+        if args.is_some() || name.as_str() != "text" {
+            return Ok(None);
+        }
+
+        // Unless mime checks are disabled, only `text/*` and `application/*` items qualify.
+        let mime_ok = state.ignore_mime
+            || self.item.mime().type_() == mime::TEXT
+            || self.item.mime().type_() == mime::APPLICATION;
+        if !mime_ok {
+            return Ok(None);
+        }
+
+        if let Some(cached) = self.output.get() {
+            return Ok(Some(cached.clone()));
+        }
+
+        let mut reader = self.item.read().await?;
+        let bytes = reader.read_to_end().await?;
+        // Contents that are not valid UTF-8 are reported as `Null`.
+        let value = match String::from_utf8(bytes) {
+            Ok(text) => PileValue::String(Arc::new(text.into())),
+            Err(_) => PileValue::Null,
+        };
+
+        // If a concurrent call initialized the cell first, its value wins.
+        Ok(Some(self.output.get_or_init(|| value).clone()))
+    }
+
+    #[expect(clippy::unwrap_used)]
+    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
+        Ok(vec![Label::new("text").unwrap()])
+    }
+}
diff --git a/crates/pile-value/src/extract/mod.rs b/crates/pile-value/src/extract/mod.rs
index 2e91ea1..be4e103 100644
--- a/crates/pile-value/src/extract/mod.rs
+++ b/crates/pile-value/src/extract/mod.rs
@@ -1,4 +1,5 @@
 pub mod item;
 pub mod misc;
+pub mod regex;
 pub mod string;
 pub mod traits;
diff --git a/crates/pile-value/src/extract/regex.rs b/crates/pile-value/src/extract/regex.rs
new file mode 100644
index 0000000..998b04c
--- /dev/null
+++ b/crates/pile-value/src/extract/regex.rs
@@ -0,0 +1,107 @@
+use std::sync::Arc;
+
+use pile_config::Label;
+use regex::Regex;
+use 
smartstring::{LazyCompact, SmartString};
+
+use crate::{
+    extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
+    value::PileValue,
+};
+
+/// Result of one successful regex match, shared by the object and list views.
+struct RegexData {
+    regex: Arc<Regex>,
+    /// Captured substrings indexed by group index (0 = whole match).
+    captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
+}
+
+impl RegexData {
+    /// Match `regex` against `input` once; `None` if it does not match.
+    fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
+        let caps = regex.captures(input)?;
+        let captures = caps
+            .iter()
+            .map(|m| m.map(|m| Arc::new(m.as_str().into())))
+            .collect();
+        Some(Self { regex, captures })
+    }
+}
+
+/// Exposes named capture groups as object fields.
+pub struct RegexExtractor(Arc<RegexData>);
+
+impl RegexExtractor {
+    /// Run `regex` against `input`. Returns `None` if there is no match.
+    pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
+        Some(Self(Arc::new(RegexData::new(regex, input)?)))
+    }
+}
+
+#[async_trait::async_trait]
+impl ObjectExtractor for RegexExtractor {
+    async fn field(
+        &self,
+        _state: &ExtractState,
+        name: &Label,
+        args: Option<&str>,
+    ) -> Result<Option<PileValue>, std::io::Error> {
+        if args.is_some() {
+            return Ok(None);
+        }
+
+        // `capture_names` yields one entry per group (unnamed ones as `None`), so a
+        // name's position is also its index into `captures`.
+        let mut names = self.0.regex.capture_names();
+        let Some(idx) = names.position(|n| n == Some(name.as_str())) else {
+            return Ok(None);
+        };
+
+        // A group that exists but did not take part in the match yields `Null`.
+        Ok(Some(match self.0.captures.get(idx) {
+            Some(Some(s)) => PileValue::String(Arc::clone(s)),
+            _ => PileValue::Null,
+        }))
+    }
+
+    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
+        // Group names that are not valid labels cannot be requested through
+        // `field`, so skip them rather than panic on a user-supplied pattern.
+        Ok(self
+            .0
+            .regex
+            .capture_names()
+            .flatten()
+            .flat_map(|n| Label::new(n))
+            .collect())
+    }
+
+    fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
+        Some(Arc::new(RegexListExtractor(self.0.clone())))
+    }
+}
+
+/// Exposes every capture group, named or not, as a list by position (group 0 excluded).
+pub struct RegexListExtractor(Arc<RegexData>);
+
+#[async_trait::async_trait]
+impl ListExtractor for RegexListExtractor {
+    async fn get(
+        &self,
+        _state: &ExtractState,
+        idx: usize,
+    ) -> Result<Option<PileValue>, std::io::Error> {
+        // Slot 0 holds the whole match, so list index `idx` lives in slot `idx + 1`.
+        // `checked_add` keeps `usize::MAX` from overflowing; it is simply out of range.
+        let slot = idx.checked_add(1).and_then(|i| self.0.captures.get(i));
+        Ok(slot.map(|group| match group {
+            Some(s) => PileValue::String(Arc::clone(s)),
+            None => PileValue::Null,
+        }))
+    }
+
+    async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
+        // Every slot except the whole-match slot 0 is a list entry.
+        Ok(self.0.captures.len().saturating_sub(1))
+    }
+}
diff --git a/crates/pile-value/src/extract/string.rs b/crates/pile-value/src/extract/string.rs
index 63fd0fa..512ff1c 100644
--- a/crates/pile-value/src/extract/string.rs
+++ b/crates/pile-value/src/extract/string.rs
@@ -1,9 +1,13 @@
 use pile_config::Label;
+use regex::Regex;
 use smartstring::{LazyCompact, SmartString};
 use std::sync::Arc;
 
 use crate::{
-    extract::traits::{ExtractState, ObjectExtractor},
+    extract::{
+        regex::RegexExtractor,
+        traits::{ExtractState, ObjectExtractor},
+    },
     value::PileValue,
 };
 
@@ -67,6 +71,16 @@ impl ObjectExtractor for StringExtractor {
                     .collect(),
             ))),
 
+            ("regex", Some(pattern)) => {
+                let Ok(re) = Regex::new(pattern) else {
+                    return Ok(None);
+                };
+                Some(match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
+                    Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
+                    None => PileValue::Null,
+                })
+            }
+
             _ => None,
         })
     }
@@ -78,6 +92,9 @@ impl ObjectExtractor for StringExtractor {
             Label::new("upper").unwrap(),
             Label::new("lower").unwrap(),
             Label::new("nonempty").unwrap(),
+            Label::new("trimprefix").unwrap(),
+            Label::new("trimsuffix").unwrap(),
+            Label::new("split").unwrap(),
         ]);
     }
 }
diff --git a/crates/pile-value/src/extract/traits.rs b/crates/pile-value/src/extract/traits.rs
index fdb232a..1dbd803 100644
--- a/crates/pile-value/src/extract/traits.rs
+++ b/crates/pile-value/src/extract/traits.rs
@@ -35,6 +35,11 @@ pub trait ObjectExtractor: Send + Sync {
     /// and [None] for all others.
async fn fields(&self) -> Result, std::io::Error>; + /// Return a list view of this extractor, if supported. + fn as_list(&self) -> Option> { + None + } + /// Convert this to a JSON value. async fn to_json(&self, state: &ExtractState) -> Result { let keys = self.fields().await?; diff --git a/crates/pile-value/src/value/value.rs b/crates/pile-value/src/value/value.rs index d66d3a1..7d6d96f 100644 --- a/crates/pile-value/src/value/value.rs +++ b/crates/pile-value/src/value/value.rs @@ -86,7 +86,9 @@ impl PileValue { Self::String(_) => Arc::new(VecExtractor::default()), Self::Blob { .. } => Arc::new(VecExtractor::default()), Self::ListExtractor(e) => e.clone(), - Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()), + Self::ObjectExtractor(e) => { + e.as_list().unwrap_or_else(|| Arc::new(VecExtractor::default())) + } Self::Item(_) => Arc::new(VecExtractor::default()), } }