Compare commits
10 Commits
9967e066bb
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 450ea7aa86 | |||
| 3bc66ddc48 | |||
| 251d130987 | |||
| 0281a33f86 | |||
| d3ab2684f4 | |||
| 4d4e9c93a2 | |||
| e6e340d082 | |||
| 8b4dfb1a1a | |||
| 60dc755561 | |||
| 5527b61d39 |
44
Cargo.lock
generated
44
Cargo.lock
generated
@@ -1994,6 +1994,7 @@ dependencies = [
|
|||||||
"indicatif",
|
"indicatif",
|
||||||
"pile-config",
|
"pile-config",
|
||||||
"pile-dataset",
|
"pile-dataset",
|
||||||
|
"pile-serve",
|
||||||
"pile-toolbox",
|
"pile-toolbox",
|
||||||
"pile-value",
|
"pile-value",
|
||||||
"serde",
|
"serde",
|
||||||
@@ -2016,8 +2017,7 @@ version = "0.0.2"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"axum",
|
"axum",
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures-core",
|
"pile-serve",
|
||||||
"pile-dataset",
|
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
@@ -2038,23 +2038,17 @@ dependencies = [
|
|||||||
name = "pile-dataset"
|
name = "pile-dataset"
|
||||||
version = "0.0.2"
|
version = "0.0.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"axum",
|
|
||||||
"chrono",
|
"chrono",
|
||||||
"percent-encoding",
|
|
||||||
"pile-config",
|
"pile-config",
|
||||||
"pile-io",
|
|
||||||
"pile-toolbox",
|
"pile-toolbox",
|
||||||
"pile-value",
|
"pile-value",
|
||||||
"serde",
|
"regex",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
|
||||||
"toml",
|
"toml",
|
||||||
"tracing",
|
"tracing",
|
||||||
"utoipa",
|
|
||||||
"utoipa-swagger-ui",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2078,6 +2072,24 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pile-serve"
|
||||||
|
version = "0.0.2"
|
||||||
|
dependencies = [
|
||||||
|
"axum",
|
||||||
|
"percent-encoding",
|
||||||
|
"pile-config",
|
||||||
|
"pile-dataset",
|
||||||
|
"pile-value",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
|
"tracing",
|
||||||
|
"utoipa",
|
||||||
|
"utoipa-swagger-ui",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pile-toolbox"
|
name = "pile-toolbox"
|
||||||
version = "0.0.2"
|
version = "0.0.2"
|
||||||
@@ -2098,6 +2110,7 @@ dependencies = [
|
|||||||
"id3",
|
"id3",
|
||||||
"image",
|
"image",
|
||||||
"kamadak-exif",
|
"kamadak-exif",
|
||||||
|
"md5",
|
||||||
"mime",
|
"mime",
|
||||||
"mime_guess",
|
"mime_guess",
|
||||||
"pdf",
|
"pdf",
|
||||||
@@ -2109,6 +2122,8 @@ dependencies = [
|
|||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"sha1",
|
||||||
|
"sha2 0.11.0-rc.5",
|
||||||
"smartstring",
|
"smartstring",
|
||||||
"strum",
|
"strum",
|
||||||
"tokio",
|
"tokio",
|
||||||
@@ -2654,6 +2669,17 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha1"
|
||||||
|
version = "0.10.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"cpufeatures",
|
||||||
|
"digest 0.10.7",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha2"
|
name = "sha2"
|
||||||
version = "0.10.9"
|
version = "0.10.9"
|
||||||
|
|||||||
@@ -71,6 +71,7 @@ pile-dataset = { path = "crates/pile-dataset" }
|
|||||||
pile-value = { path = "crates/pile-value" }
|
pile-value = { path = "crates/pile-value" }
|
||||||
pile-io = { path = "crates/pile-io" }
|
pile-io = { path = "crates/pile-io" }
|
||||||
pile-client = { path = "crates/pile-client" }
|
pile-client = { path = "crates/pile-client" }
|
||||||
|
pile-serve = { path = "crates/pile-serve" }
|
||||||
|
|
||||||
# MARK: Clients & servers
|
# MARK: Clients & servers
|
||||||
tantivy = "0.25.0"
|
tantivy = "0.25.0"
|
||||||
@@ -87,7 +88,7 @@ utoipa-swagger-ui = { version = "9.0.2", features = [
|
|||||||
"debug-embed",
|
"debug-embed",
|
||||||
"vendored",
|
"vendored",
|
||||||
] }
|
] }
|
||||||
reqwest = { version = "0.12", features = ["blocking"] }
|
reqwest = { version = "0.12", features = ["blocking", "json", "stream"] }
|
||||||
tracing-loki = "0.2.6"
|
tracing-loki = "0.2.6"
|
||||||
|
|
||||||
# MARK: Async & Parallelism
|
# MARK: Async & Parallelism
|
||||||
@@ -111,6 +112,8 @@ bytes = "1"
|
|||||||
toml = "1.0.3"
|
toml = "1.0.3"
|
||||||
toml_edit = "0.25.4"
|
toml_edit = "0.25.4"
|
||||||
sha2 = "0.11.0-rc.5"
|
sha2 = "0.11.0-rc.5"
|
||||||
|
sha1 = "0.10"
|
||||||
|
md5 = "0.7"
|
||||||
blake3 = "1.8.3"
|
blake3 = "1.8.3"
|
||||||
dotenvy = "0.15.7"
|
dotenvy = "0.15.7"
|
||||||
envy = "0.4.2"
|
envy = "0.4.2"
|
||||||
|
|||||||
@@ -8,10 +8,9 @@ edition = { workspace = true }
|
|||||||
workspace = true
|
workspace = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
pile-dataset = { workspace = true, features = ["axum"] }
|
pile-serve = { workspace = true }
|
||||||
|
|
||||||
reqwest = { version = "0.12", features = ["json", "stream"] }
|
reqwest = { workspace = true }
|
||||||
futures-core = "0.3"
|
|
||||||
serde = { workspace = true }
|
serde = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
bytes = { workspace = true }
|
bytes = { workspace = true }
|
||||||
|
|||||||
@@ -3,14 +3,12 @@ use axum::{
|
|||||||
routing::any,
|
routing::any,
|
||||||
};
|
};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures_core::Stream;
|
|
||||||
use reqwest::{Client, StatusCode, header};
|
use reqwest::{Client, StatusCode, header};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::pin::Pin;
|
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tracing::{trace, warn};
|
use tracing::{trace, warn};
|
||||||
|
|
||||||
pub use pile_dataset::serve::{
|
pub use pile_serve::{
|
||||||
ApiValue, FieldSpec, FieldsResponse, ItemsResponse, LookupRequest, LookupResponse,
|
ApiValue, FieldSpec, FieldsResponse, ItemsResponse, LookupRequest, LookupResponse,
|
||||||
SchemaResponse,
|
SchemaResponse,
|
||||||
};
|
};
|
||||||
@@ -120,26 +118,6 @@ impl DatasetClient {
|
|||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
check_status(resp).await?.json().await.map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `GET /item` — stream the raw bytes of an item.
|
|
||||||
///
|
|
||||||
/// The returned stream yields chunks as they arrive from the server.
|
|
||||||
pub async fn get_item(
|
|
||||||
&self,
|
|
||||||
source: &str,
|
|
||||||
key: &str,
|
|
||||||
) -> Result<Pin<Box<dyn Stream<Item = Result<Bytes, reqwest::Error>> + Send>>, ClientError> {
|
|
||||||
let url = format!("{}/item", self.base_url);
|
|
||||||
trace!(url, source, key, "GET /item");
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.get(url)
|
|
||||||
.query(&[("source", source), ("key", key)])
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
Ok(Box::pin(check_status(resp).await?.bytes_stream()))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /extract` — extract a field from an item by object path (e.g. `$.flac.title`).
|
/// `GET /extract` — extract a field from an item by object path (e.g. `$.flac.title`).
|
||||||
pub async fn get_extract(
|
pub async fn get_extract(
|
||||||
&self,
|
&self,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
|
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
|
||||||
|
|
||||||
use crate::{objectpath::ObjectPath, pattern::GroupPattern};
|
use crate::objectpath::ObjectPath;
|
||||||
|
|
||||||
mod misc;
|
mod misc;
|
||||||
pub use misc::*;
|
pub use misc::*;
|
||||||
@@ -15,6 +15,15 @@ fn default_true() -> bool {
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn default_base() -> String {
|
||||||
|
"(.*)".to_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
pub fn default_files() -> HashMap<Label, String> {
|
||||||
|
[(Label::new("item").unwrap(), "{base}".to_owned())].into()
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[expect(clippy::expect_used)]
|
#[expect(clippy::expect_used)]
|
||||||
fn init_db_toml_valid() {
|
fn init_db_toml_valid() {
|
||||||
@@ -51,9 +60,17 @@ pub enum Source {
|
|||||||
/// Must be relative.
|
/// Must be relative.
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
|
|
||||||
/// How to group files into items in this source
|
/// Regex that extracts an item key from a file path.
|
||||||
#[serde(default)]
|
/// - File paths are relative to `path`.
|
||||||
pattern: GroupPattern,
|
/// - The first group in this regex is the file's item key.
|
||||||
|
#[serde(default = "default_base")]
|
||||||
|
base_pattern: String,
|
||||||
|
|
||||||
|
/// Map of files included in each item.'
|
||||||
|
/// `{base}` is replaced with the string extracted by base_pattern.
|
||||||
|
/// Default is `{ item: "{base}" }`
|
||||||
|
#[serde(default = "default_files")]
|
||||||
|
files: HashMap<Label, String>,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ workspace = true
|
|||||||
pile-config = { workspace = true }
|
pile-config = { workspace = true }
|
||||||
pile-toolbox = { workspace = true }
|
pile-toolbox = { workspace = true }
|
||||||
pile-value = { workspace = true }
|
pile-value = { workspace = true }
|
||||||
pile-io = { workspace = true }
|
|
||||||
|
|
||||||
|
regex = { workspace = true }
|
||||||
serde_json = { workspace = true }
|
serde_json = { workspace = true }
|
||||||
tantivy = { workspace = true }
|
tantivy = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
@@ -20,21 +20,7 @@ chrono = { workspace = true }
|
|||||||
toml = { workspace = true }
|
toml = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
tokio-stream = { workspace = true }
|
|
||||||
|
|
||||||
serde = { workspace = true, optional = true }
|
|
||||||
axum = { workspace = true, optional = true }
|
|
||||||
percent-encoding = { workspace = true, optional = true }
|
|
||||||
utoipa = { workspace = true, optional = true }
|
|
||||||
utoipa-swagger-ui = { workspace = true, optional = true }
|
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
pdfium = ["pile-value/pdfium"]
|
pdfium = ["pile-value/pdfium"]
|
||||||
axum = [
|
|
||||||
"dep:axum",
|
|
||||||
"dep:utoipa",
|
|
||||||
"dep:utoipa-swagger-ui",
|
|
||||||
"dep:serde",
|
|
||||||
"dep:percent-encoding",
|
|
||||||
]
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use pile_config::{
|
use pile_config::{
|
||||||
ConfigToml, DatasetConfig, Label, Source, objectpath::ObjectPath, pattern::GroupPattern,
|
ConfigToml, DatasetConfig, Label, Source, default_base, default_files, objectpath::ObjectPath,
|
||||||
};
|
};
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{
|
use pile_value::{
|
||||||
@@ -8,6 +8,7 @@ use pile_value::{
|
|||||||
source::{DataSource, DirDataSource, misc::path_ts_earliest},
|
source::{DataSource, DirDataSource, misc::path_ts_earliest},
|
||||||
value::{Item, PileValue},
|
value::{Item, PileValue},
|
||||||
};
|
};
|
||||||
|
use regex::Regex;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
||||||
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
||||||
@@ -107,7 +108,8 @@ impl Datasets {
|
|||||||
Source::Filesystem {
|
Source::Filesystem {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
path: path_parent.clone(),
|
path: path_parent.clone(),
|
||||||
pattern: GroupPattern::default(),
|
base_pattern: default_base(),
|
||||||
|
files: default_files(),
|
||||||
},
|
},
|
||||||
)]
|
)]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -125,18 +127,37 @@ impl Datasets {
|
|||||||
Source::Filesystem {
|
Source::Filesystem {
|
||||||
enabled,
|
enabled,
|
||||||
path,
|
path,
|
||||||
pattern,
|
base_pattern,
|
||||||
|
files,
|
||||||
} => {
|
} => {
|
||||||
let target = match enabled {
|
let target = match enabled {
|
||||||
true => &mut sources,
|
true => &mut sources,
|
||||||
false => &mut disabled_sources,
|
false => &mut disabled_sources,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
||||||
|
std::io::Error::new(
|
||||||
|
ErrorKind::InvalidInput,
|
||||||
|
format!("invalid base_pattern: {e}"),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
if base_regex.captures_len() != 2 {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
ErrorKind::InvalidInput,
|
||||||
|
"base_pattern must have exactly one capture group",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
target.insert(
|
target.insert(
|
||||||
label.clone(),
|
label.clone(),
|
||||||
Dataset::Dir(
|
Dataset::Dir(
|
||||||
DirDataSource::new(label, path_parent.join(path), pattern.clone())
|
DirDataSource::new(
|
||||||
.await?,
|
label,
|
||||||
|
path_parent.join(path),
|
||||||
|
base_regex,
|
||||||
|
files.clone(),
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -194,18 +215,37 @@ impl Datasets {
|
|||||||
Source::Filesystem {
|
Source::Filesystem {
|
||||||
enabled,
|
enabled,
|
||||||
path,
|
path,
|
||||||
pattern,
|
base_pattern,
|
||||||
|
files,
|
||||||
} => {
|
} => {
|
||||||
let target = match enabled {
|
let target = match enabled {
|
||||||
true => &mut sources,
|
true => &mut sources,
|
||||||
false => &mut disabled_sources,
|
false => &mut disabled_sources,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
||||||
|
std::io::Error::new(
|
||||||
|
ErrorKind::InvalidInput,
|
||||||
|
format!("invalid base_pattern: {e}"),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
if base_regex.captures_len() != 2 {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
ErrorKind::InvalidInput,
|
||||||
|
"base_pattern must have exactly one capture group",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
target.insert(
|
target.insert(
|
||||||
label.clone(),
|
label.clone(),
|
||||||
Dataset::Dir(
|
Dataset::Dir(
|
||||||
DirDataSource::new(label, path_parent.join(path), pattern.clone())
|
DirDataSource::new(
|
||||||
.await?,
|
label,
|
||||||
|
path_parent.join(path),
|
||||||
|
base_regex,
|
||||||
|
files.clone(),
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -262,6 +302,7 @@ impl Datasets {
|
|||||||
_threads: usize,
|
_threads: usize,
|
||||||
flag: Option<CancelFlag>,
|
flag: Option<CancelFlag>,
|
||||||
) -> Result<(), CancelableTaskError<DatasetError>> {
|
) -> Result<(), CancelableTaskError<DatasetError>> {
|
||||||
|
let start = Instant::now();
|
||||||
let workdir = match self.path_workdir.as_ref() {
|
let workdir = match self.path_workdir.as_ref() {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => {
|
None => {
|
||||||
@@ -273,6 +314,14 @@ impl Datasets {
|
|||||||
let fts_tmp_dir = workdir.join(".tmp-fts");
|
let fts_tmp_dir = workdir.join(".tmp-fts");
|
||||||
let fts_dir = workdir.join("fts");
|
let fts_dir = workdir.join("fts");
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
message = "Rebuilding fts index",
|
||||||
|
dataset = self.config.dataset.name.as_str(),
|
||||||
|
?fts_dir,
|
||||||
|
?fts_tmp_dir,
|
||||||
|
?workdir
|
||||||
|
);
|
||||||
|
|
||||||
if fts_tmp_dir.is_dir() {
|
if fts_tmp_dir.is_dir() {
|
||||||
warn!("Removing temporary index in {}", fts_dir.display());
|
warn!("Removing temporary index in {}", fts_dir.display());
|
||||||
std::fs::remove_dir_all(&fts_tmp_dir).map_err(DatasetError::from)?;
|
std::fs::remove_dir_all(&fts_tmp_dir).map_err(DatasetError::from)?;
|
||||||
@@ -352,9 +401,18 @@ impl Datasets {
|
|||||||
return Err(CancelableTaskError::Cancelled);
|
return Err(CancelableTaskError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Committing {total} documents");
|
|
||||||
index_writer.commit().map_err(DatasetError::from)?;
|
index_writer.commit().map_err(DatasetError::from)?;
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
message = "Rebuilt fts index",
|
||||||
|
dataset = self.config.dataset.name.as_str(),
|
||||||
|
?fts_dir,
|
||||||
|
?fts_tmp_dir,
|
||||||
|
?workdir,
|
||||||
|
n_docs = total,
|
||||||
|
time_ms = start.elapsed().as_millis()
|
||||||
|
);
|
||||||
|
|
||||||
if fts_dir.is_dir() {
|
if fts_dir.is_dir() {
|
||||||
warn!("Removing existing index in {}", fts_dir.display());
|
warn!("Removing existing index in {}", fts_dir.display());
|
||||||
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
|
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ async fn val_to_string(
|
|||||||
PileValue::Null => {}
|
PileValue::Null => {}
|
||||||
PileValue::ObjectExtractor(_) => {}
|
PileValue::ObjectExtractor(_) => {}
|
||||||
PileValue::Item(_) => {}
|
PileValue::Item(_) => {}
|
||||||
PileValue::Blob { .. } => {}
|
PileValue::Binary(_) => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Ok(Vec::new());
|
return Ok(Vec::new());
|
||||||
|
|||||||
@@ -2,6 +2,3 @@ mod dataset;
|
|||||||
pub use dataset::{Dataset, DatasetError, Datasets};
|
pub use dataset::{Dataset, DatasetError, Datasets};
|
||||||
|
|
||||||
pub mod index;
|
pub mod index;
|
||||||
|
|
||||||
#[cfg(feature = "axum")]
|
|
||||||
pub mod serve;
|
|
||||||
|
|||||||
@@ -1,202 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
body::Body,
|
|
||||||
extract::{Query, State},
|
|
||||||
http::{HeaderMap, StatusCode, header},
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
|
||||||
use serde::Deserialize;
|
|
||||||
use std::{io::SeekFrom, sync::Arc, time::Instant};
|
|
||||||
use tokio::sync::mpsc;
|
|
||||||
use tokio_stream::wrappers::ReceiverStream;
|
|
||||||
use tracing::debug;
|
|
||||||
use utoipa::ToSchema;
|
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Deserialize, ToSchema)]
|
|
||||||
pub struct ItemQuery {
|
|
||||||
source: String,
|
|
||||||
key: String,
|
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
download: bool,
|
|
||||||
name: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse a `Range: bytes=...` header value.
|
|
||||||
/// Returns `(start, end)` where either may be `None` (suffix form has `None` start).
|
|
||||||
fn parse_byte_range(s: &str) -> Option<(Option<u64>, Option<u64>)> {
|
|
||||||
let spec = s.strip_prefix("bytes=")?;
|
|
||||||
if spec.contains(',') {
|
|
||||||
return None; // multiple ranges not supported
|
|
||||||
}
|
|
||||||
if let Some(suffix) = spec.strip_prefix('-') {
|
|
||||||
return Some((None, Some(suffix.parse().ok()?)));
|
|
||||||
}
|
|
||||||
let mut parts = spec.splitn(2, '-');
|
|
||||||
let start: u64 = parts.next()?.parse().ok()?;
|
|
||||||
let end = parts
|
|
||||||
.next()
|
|
||||||
.and_then(|e| if e.is_empty() { None } else { e.parse().ok() });
|
|
||||||
Some((Some(start), end))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fetch the raw bytes of an item by source and key
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/item",
|
|
||||||
params(
|
|
||||||
("source" = String, Query, description = "Source label"),
|
|
||||||
("key" = String, Query, description = "Item key"),
|
|
||||||
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
|
|
||||||
),
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "Raw item bytes"),
|
|
||||||
(status = 206, description = "Partial content"),
|
|
||||||
(status = 400, description = "Invalid source label"),
|
|
||||||
(status = 404, description = "Item not found"),
|
|
||||||
(status = 416, description = "Range not satisfiable"),
|
|
||||||
(status = 500, description = "Internal server error"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn item_get(
|
|
||||||
State(state): State<Arc<Datasets>>,
|
|
||||||
Query(params): Query<ItemQuery>,
|
|
||||||
headers: HeaderMap,
|
|
||||||
) -> Response {
|
|
||||||
let start = Instant::now();
|
|
||||||
debug!(
|
|
||||||
message = "Serving /item",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key
|
|
||||||
);
|
|
||||||
|
|
||||||
let label = match Label::try_from(params.source.clone()) {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
let mime = item.mime().to_string();
|
|
||||||
|
|
||||||
let mut reader = match item.read().await {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let total = match reader.seek(SeekFrom::End(0)).await {
|
|
||||||
Ok(n) => n,
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let range = headers
|
|
||||||
.get(header::RANGE)
|
|
||||||
.and_then(|v| v.to_str().ok())
|
|
||||||
.and_then(parse_byte_range);
|
|
||||||
|
|
||||||
// Resolve (byte_start, byte_end, content_length, is_range)
|
|
||||||
let (byte_start, byte_end, length, is_range) = match range {
|
|
||||||
Some((Some(s), e)) => {
|
|
||||||
let e = e
|
|
||||||
.unwrap_or(total.saturating_sub(1))
|
|
||||||
.min(total.saturating_sub(1));
|
|
||||||
if s >= total || s > e {
|
|
||||||
return (
|
|
||||||
StatusCode::RANGE_NOT_SATISFIABLE,
|
|
||||||
[(header::CONTENT_RANGE, format!("bytes */{total}"))],
|
|
||||||
)
|
|
||||||
.into_response();
|
|
||||||
}
|
|
||||||
(s, e, e - s + 1, true)
|
|
||||||
}
|
|
||||||
Some((None, Some(suffix))) => {
|
|
||||||
let s = total.saturating_sub(suffix);
|
|
||||||
let e = total.saturating_sub(1);
|
|
||||||
(s, e, total.saturating_sub(s), true)
|
|
||||||
}
|
|
||||||
_ => (0, total.saturating_sub(1), total, false),
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Err(e) = reader.seek(SeekFrom::Start(byte_start)).await {
|
|
||||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Served /item",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key,
|
|
||||||
time_ms = start.elapsed().as_millis()
|
|
||||||
);
|
|
||||||
|
|
||||||
let (tx, rx) = mpsc::channel::<Result<Vec<u8>, std::io::Error>>(8);
|
|
||||||
|
|
||||||
tokio::spawn(async move {
|
|
||||||
let mut buf = vec![0u8; 65536];
|
|
||||||
let mut remaining = length;
|
|
||||||
loop {
|
|
||||||
if remaining == 0 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let to_read = (buf.len() as u64).min(remaining) as usize;
|
|
||||||
match reader.read(&mut buf[..to_read]).await {
|
|
||||||
Ok(0) => break,
|
|
||||||
Ok(n) => {
|
|
||||||
remaining -= n as u64;
|
|
||||||
if tx.send(Ok(buf[..n].to_vec())).await.is_err() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
let _ = tx.send(Err(e)).await;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let body = Body::from_stream(ReceiverStream::new(rx));
|
|
||||||
let status = if is_range {
|
|
||||||
StatusCode::PARTIAL_CONTENT
|
|
||||||
} else {
|
|
||||||
StatusCode::OK
|
|
||||||
};
|
|
||||||
|
|
||||||
let disposition_type = if params.download {
|
|
||||||
"attachment"
|
|
||||||
} else {
|
|
||||||
"inline"
|
|
||||||
};
|
|
||||||
let file_name = params.name.unwrap_or_else(|| {
|
|
||||||
params
|
|
||||||
.key
|
|
||||||
.rsplit('/')
|
|
||||||
.next()
|
|
||||||
.unwrap_or(¶ms.key)
|
|
||||||
.to_owned()
|
|
||||||
});
|
|
||||||
let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
|
|
||||||
|
|
||||||
let mut builder = axum::http::Response::builder()
|
|
||||||
.status(status)
|
|
||||||
.header(header::CONTENT_TYPE, mime)
|
|
||||||
.header(header::ACCEPT_RANGES, "bytes")
|
|
||||||
.header(header::CONTENT_LENGTH, length)
|
|
||||||
.header(header::CONTENT_DISPOSITION, disposition);
|
|
||||||
|
|
||||||
if is_range {
|
|
||||||
builder = builder.header(
|
|
||||||
header::CONTENT_RANGE,
|
|
||||||
format!("bytes {byte_start}-{byte_end}/{total}"),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
builder
|
|
||||||
.body(body)
|
|
||||||
.map(IntoResponse::into_response)
|
|
||||||
.unwrap_or_else(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response())
|
|
||||||
}
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Router,
|
|
||||||
routing::{get, post},
|
|
||||||
};
|
|
||||||
use std::sync::Arc;
|
|
||||||
use utoipa::OpenApi;
|
|
||||||
use utoipa_swagger_ui::SwaggerUi;
|
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
mod lookup;
|
|
||||||
pub use lookup::*;
|
|
||||||
|
|
||||||
mod item;
|
|
||||||
pub use item::*;
|
|
||||||
|
|
||||||
mod extract;
|
|
||||||
pub use extract::*;
|
|
||||||
|
|
||||||
mod items;
|
|
||||||
pub use items::*;
|
|
||||||
|
|
||||||
mod config_schema;
|
|
||||||
pub use config_schema::*;
|
|
||||||
|
|
||||||
mod schema_field;
|
|
||||||
pub use schema_field::*;
|
|
||||||
|
|
||||||
mod schema;
|
|
||||||
pub use schema::*;
|
|
||||||
|
|
||||||
#[derive(OpenApi)]
|
|
||||||
#[openapi(
|
|
||||||
tags(),
|
|
||||||
paths(
|
|
||||||
lookup,
|
|
||||||
item_get,
|
|
||||||
get_extract,
|
|
||||||
items_list,
|
|
||||||
config_schema,
|
|
||||||
schema_field,
|
|
||||||
schema_all
|
|
||||||
),
|
|
||||||
components(schemas(
|
|
||||||
LookupRequest,
|
|
||||||
LookupResponse,
|
|
||||||
LookupResult,
|
|
||||||
ItemQuery,
|
|
||||||
ExtractQuery,
|
|
||||||
ItemsQuery,
|
|
||||||
ItemsResponse,
|
|
||||||
ItemRef
|
|
||||||
))
|
|
||||||
)]
|
|
||||||
pub(crate) struct Api;
|
|
||||||
|
|
||||||
impl Datasets {
|
|
||||||
#[inline]
|
|
||||||
pub fn router(self: Arc<Self>, with_docs: bool) -> Router<()> {
|
|
||||||
self.router_prefix(with_docs, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn router_prefix(self: Arc<Self>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
|
|
||||||
let mut router = Router::new()
|
|
||||||
.route("/lookup", post(lookup))
|
|
||||||
.route("/item", get(item_get))
|
|
||||||
.route("/extract", get(get_extract))
|
|
||||||
.route("/items", get(items_list))
|
|
||||||
.route("/config/schema", get(config_schema))
|
|
||||||
.route("/schema", get(schema_all))
|
|
||||||
.route("/schema/{field}", get(schema_field))
|
|
||||||
.with_state(self.clone());
|
|
||||||
|
|
||||||
if let Some(prefix) = prefix {
|
|
||||||
router = Router::new().nest(prefix, router);
|
|
||||||
}
|
|
||||||
|
|
||||||
if with_docs {
|
|
||||||
let docs_path = match prefix {
|
|
||||||
None => "/docs".into(),
|
|
||||||
Some(prefix) => format!("{prefix}/docs"),
|
|
||||||
};
|
|
||||||
|
|
||||||
let api = Api::openapi();
|
|
||||||
let api = match prefix {
|
|
||||||
None => api,
|
|
||||||
Some(prefix) => utoipa::openapi::OpenApi::default().nest(prefix, api),
|
|
||||||
};
|
|
||||||
|
|
||||||
let docs =
|
|
||||||
SwaggerUi::new(docs_path.clone()).url(format!("{}/openapi.json", docs_path), api);
|
|
||||||
|
|
||||||
router = router.merge(docs);
|
|
||||||
}
|
|
||||||
router
|
|
||||||
}
|
|
||||||
}
|
|
||||||
28
crates/pile-serve/Cargo.toml
Normal file
28
crates/pile-serve/Cargo.toml
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
[package]
|
||||||
|
name = "pile-serve"
|
||||||
|
version = { workspace = true }
|
||||||
|
rust-version = { workspace = true }
|
||||||
|
edition = { workspace = true }
|
||||||
|
|
||||||
|
[lints]
|
||||||
|
workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
pile-config = { workspace = true }
|
||||||
|
pile-value = { workspace = true }
|
||||||
|
pile-dataset = { workspace = true }
|
||||||
|
|
||||||
|
serde_json = { workspace = true }
|
||||||
|
tracing = { workspace = true }
|
||||||
|
tokio = { workspace = true }
|
||||||
|
tokio-util = { version = "0.7", features = ["io"] }
|
||||||
|
|
||||||
|
serde = { workspace = true }
|
||||||
|
axum = { workspace = true }
|
||||||
|
percent-encoding = { workspace = true }
|
||||||
|
utoipa = { workspace = true }
|
||||||
|
utoipa-swagger-ui = { workspace = true }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = []
|
||||||
|
pdfium = ["pile-value/pdfium"]
|
||||||
@@ -4,12 +4,11 @@ use axum::{
|
|||||||
http::StatusCode,
|
http::StatusCode,
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
|
use pile_dataset::Datasets;
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
pub use pile_config::FieldSpec;
|
pub use pile_config::FieldSpec;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
pub type FieldsResponse = HashMap<String, FieldSpec>;
|
pub type FieldsResponse = HashMap<String, FieldSpec>;
|
||||||
|
|
||||||
/// Retrieve this dataset's schema.
|
/// Retrieve this dataset's schema.
|
||||||
@@ -1,19 +1,23 @@
|
|||||||
use axum::{
|
use axum::{
|
||||||
Json,
|
Json,
|
||||||
|
body::Body,
|
||||||
extract::{Query, RawQuery, State},
|
extract::{Query, RawQuery, State},
|
||||||
http::{StatusCode, header},
|
http::{StatusCode, header},
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
use percent_encoding::percent_decode_str;
|
use percent_encoding::percent_decode_str;
|
||||||
use pile_config::{Label, objectpath::ObjectPath};
|
use pile_config::{Label, objectpath::ObjectPath};
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_dataset::Datasets;
|
||||||
|
use pile_value::{
|
||||||
|
extract::traits::ExtractState,
|
||||||
|
value::{BinaryPileValue, PileValue},
|
||||||
|
};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::{sync::Arc, time::Instant};
|
use std::{sync::Arc, time::Instant};
|
||||||
|
use tokio_util::io::ReaderStream;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use utoipa::ToSchema;
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Deserialize, ToSchema)]
|
#[derive(Deserialize, ToSchema)]
|
||||||
pub struct ExtractQuery {
|
pub struct ExtractQuery {
|
||||||
source: String,
|
source: String,
|
||||||
@@ -96,17 +100,24 @@ pub async fn get_extract(
|
|||||||
let mut value = None;
|
let mut value = None;
|
||||||
for path in &paths {
|
for path in &paths {
|
||||||
match item.query(&extract_state, path).await {
|
match item.query(&extract_state, path).await {
|
||||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
Ok(None) => continue,
|
||||||
|
|
||||||
|
Ok(Some(PileValue::Null)) => {
|
||||||
|
value = Some(PileValue::Null);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Some(v)) => {
|
Ok(Some(v)) => {
|
||||||
value = Some(v);
|
value = Some(v);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let Some(value) = value else {
|
let Some(value) = value else {
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -141,15 +152,30 @@ pub async fn get_extract(
|
|||||||
s.to_string(),
|
s.to_string(),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
PileValue::Blob { mime, bytes } => (
|
|
||||||
StatusCode::OK,
|
PileValue::Binary(binary) => {
|
||||||
[
|
let mime = binary.mime().to_string();
|
||||||
(header::CONTENT_TYPE, mime.to_string()),
|
let body = match binary {
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
||||||
],
|
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
||||||
bytes.as_ref().clone(),
|
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
||||||
)
|
Err(e) => {
|
||||||
.into_response(),
|
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
||||||
|
.into_response();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
(
|
||||||
|
StatusCode::OK,
|
||||||
|
[
|
||||||
|
(header::CONTENT_TYPE, mime),
|
||||||
|
(header::CONTENT_DISPOSITION, disposition),
|
||||||
|
],
|
||||||
|
body,
|
||||||
|
)
|
||||||
|
.into_response()
|
||||||
|
}
|
||||||
|
|
||||||
_ => match value.to_json(&extract_state).await {
|
_ => match value.to_json(&extract_state).await {
|
||||||
Ok(json) => (
|
Ok(json) => (
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
@@ -157,6 +183,7 @@ pub async fn get_extract(
|
|||||||
Json(json),
|
Json(json),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
|
|
||||||
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -4,13 +4,12 @@ use axum::{
|
|||||||
http::StatusCode,
|
http::StatusCode,
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
|
use pile_dataset::Datasets;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use utoipa::ToSchema;
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Deserialize, ToSchema)]
|
#[derive(Deserialize, ToSchema)]
|
||||||
pub struct ItemsQuery {
|
pub struct ItemsQuery {
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
89
crates/pile-serve/src/lib.rs
Normal file
89
crates/pile-serve/src/lib.rs
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
use axum::{
|
||||||
|
Router,
|
||||||
|
routing::{get, post},
|
||||||
|
};
|
||||||
|
use pile_dataset::Datasets;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use utoipa::OpenApi;
|
||||||
|
use utoipa_swagger_ui::SwaggerUi;
|
||||||
|
|
||||||
|
mod lookup;
|
||||||
|
pub use lookup::*;
|
||||||
|
|
||||||
|
mod extract;
|
||||||
|
pub use extract::*;
|
||||||
|
|
||||||
|
mod items;
|
||||||
|
pub use items::*;
|
||||||
|
|
||||||
|
mod config_schema;
|
||||||
|
pub use config_schema::*;
|
||||||
|
|
||||||
|
mod schema_field;
|
||||||
|
pub use schema_field::*;
|
||||||
|
|
||||||
|
mod schema;
|
||||||
|
pub use schema::*;
|
||||||
|
|
||||||
|
#[derive(OpenApi)]
|
||||||
|
#[openapi(
|
||||||
|
tags(),
|
||||||
|
paths(
|
||||||
|
lookup,
|
||||||
|
get_extract,
|
||||||
|
items_list,
|
||||||
|
config_schema,
|
||||||
|
schema_field,
|
||||||
|
schema_all
|
||||||
|
),
|
||||||
|
components(schemas(
|
||||||
|
LookupRequest,
|
||||||
|
LookupResponse,
|
||||||
|
LookupResult,
|
||||||
|
ExtractQuery,
|
||||||
|
ItemsQuery,
|
||||||
|
ItemsResponse,
|
||||||
|
ItemRef
|
||||||
|
))
|
||||||
|
)]
|
||||||
|
pub(crate) struct Api;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn router(ds: Arc<Datasets>, with_docs: bool) -> Router<()> {
|
||||||
|
router_prefix(ds, with_docs, None)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn router_prefix(ds: Arc<Datasets>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
|
||||||
|
let mut router = Router::new()
|
||||||
|
.route("/lookup", post(lookup))
|
||||||
|
.route("/extract", get(get_extract))
|
||||||
|
.route("/items", get(items_list))
|
||||||
|
.route("/config/schema", get(config_schema))
|
||||||
|
.route("/schema", get(schema_all))
|
||||||
|
.route("/schema/{field}", get(schema_field))
|
||||||
|
.with_state(ds.clone());
|
||||||
|
|
||||||
|
if let Some(prefix) = prefix {
|
||||||
|
router = Router::new().nest(prefix, router);
|
||||||
|
}
|
||||||
|
|
||||||
|
if with_docs {
|
||||||
|
let docs_path = match prefix {
|
||||||
|
None => "/docs".into(),
|
||||||
|
Some(prefix) => format!("{prefix}/docs"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let api = Api::openapi();
|
||||||
|
let api = match prefix {
|
||||||
|
None => api,
|
||||||
|
Some(prefix) => utoipa::openapi::OpenApi::default().nest(prefix, api),
|
||||||
|
};
|
||||||
|
|
||||||
|
let docs =
|
||||||
|
SwaggerUi::new(docs_path.clone()).url(format!("{}/openapi.json", docs_path), api);
|
||||||
|
|
||||||
|
router = router.merge(docs);
|
||||||
|
}
|
||||||
|
router
|
||||||
|
}
|
||||||
@@ -4,13 +4,12 @@ use axum::{
|
|||||||
http::StatusCode,
|
http::StatusCode,
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
|
use pile_dataset::Datasets;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{sync::Arc, time::Instant};
|
use std::{sync::Arc, time::Instant};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use utoipa::ToSchema;
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, ToSchema, Debug)]
|
#[derive(Serialize, Deserialize, ToSchema, Debug)]
|
||||||
pub struct LookupRequest {
|
pub struct LookupRequest {
|
||||||
pub query: String,
|
pub query: String,
|
||||||
@@ -5,13 +5,12 @@ use axum::{
|
|||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
|
use pile_dataset::Datasets;
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
use utoipa::IntoParams;
|
use utoipa::IntoParams;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Deserialize, IntoParams)]
|
#[derive(Deserialize, IntoParams)]
|
||||||
pub struct SchemaQuery {
|
pub struct SchemaQuery {
|
||||||
source: String,
|
source: String,
|
||||||
@@ -21,7 +20,7 @@ pub struct SchemaQuery {
|
|||||||
hidden: bool,
|
hidden: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub enum ApiValue {
|
pub enum ApiValue {
|
||||||
Binary { binary: bool, mime: String },
|
Binary { binary: bool, mime: String },
|
||||||
@@ -44,9 +43,9 @@ async fn pile_value_to_api(
|
|||||||
PileValue::I64(n) => Ok(ApiValue::Number(n.into())),
|
PileValue::I64(n) => Ok(ApiValue::Number(n.into())),
|
||||||
PileValue::Null => Ok(ApiValue::Null),
|
PileValue::Null => Ok(ApiValue::Null),
|
||||||
|
|
||||||
PileValue::Blob { mime, .. } => Ok(ApiValue::Binary {
|
PileValue::Binary(x) => Ok(ApiValue::Binary {
|
||||||
binary: true,
|
binary: true,
|
||||||
mime: mime.to_string(),
|
mime: x.mime().to_string(),
|
||||||
}),
|
}),
|
||||||
|
|
||||||
PileValue::Array(arr) => {
|
PileValue::Array(arr) => {
|
||||||
@@ -1,18 +1,22 @@
|
|||||||
use axum::{
|
use axum::{
|
||||||
Json,
|
Json,
|
||||||
|
body::Body,
|
||||||
extract::{Path, Query, State},
|
extract::{Path, Query, State},
|
||||||
http::{StatusCode, header},
|
http::{StatusCode, header},
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_dataset::Datasets;
|
||||||
|
use pile_value::{
|
||||||
|
extract::traits::ExtractState,
|
||||||
|
value::{BinaryPileValue, PileValue},
|
||||||
|
};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::{sync::Arc, time::Instant};
|
use std::{sync::Arc, time::Instant};
|
||||||
|
use tokio_util::io::ReaderStream;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use utoipa::IntoParams;
|
use utoipa::IntoParams;
|
||||||
|
|
||||||
use crate::Datasets;
|
|
||||||
|
|
||||||
#[derive(Deserialize, IntoParams)]
|
#[derive(Deserialize, IntoParams)]
|
||||||
pub struct SchemaFieldQuery {
|
pub struct SchemaFieldQuery {
|
||||||
source: String,
|
source: String,
|
||||||
@@ -79,17 +83,24 @@ pub async fn schema_field(
|
|||||||
let mut value = None;
|
let mut value = None;
|
||||||
for path in paths {
|
for path in paths {
|
||||||
match item.query(&extract_state, path).await {
|
match item.query(&extract_state, path).await {
|
||||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
Ok(None) => continue,
|
||||||
|
|
||||||
|
Ok(Some(PileValue::Null)) => {
|
||||||
|
value = Some(PileValue::Null);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Some(v)) => {
|
Ok(Some(v)) => {
|
||||||
value = Some(v);
|
value = Some(v);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let Some(value) = value else {
|
let Some(value) = value else {
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
@@ -125,15 +136,30 @@ pub async fn schema_field(
|
|||||||
s.to_string(),
|
s.to_string(),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
PileValue::Blob { mime, bytes } => (
|
|
||||||
StatusCode::OK,
|
PileValue::Binary(binary) => {
|
||||||
[
|
let mime = binary.mime().to_string();
|
||||||
(header::CONTENT_TYPE, mime.to_string()),
|
let body = match binary {
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
||||||
],
|
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
||||||
bytes.as_ref().clone(),
|
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
||||||
)
|
Err(e) => {
|
||||||
.into_response(),
|
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
||||||
|
.into_response();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
(
|
||||||
|
StatusCode::OK,
|
||||||
|
[
|
||||||
|
(header::CONTENT_TYPE, mime),
|
||||||
|
(header::CONTENT_DISPOSITION, disposition),
|
||||||
|
],
|
||||||
|
body,
|
||||||
|
)
|
||||||
|
.into_response()
|
||||||
|
}
|
||||||
|
|
||||||
_ => match value.to_json(&extract_state).await {
|
_ => match value.to_json(&extract_state).await {
|
||||||
Ok(json) => (
|
Ok(json) => (
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
@@ -21,6 +21,9 @@ toml = { workspace = true }
|
|||||||
smartstring = { workspace = true }
|
smartstring = { workspace = true }
|
||||||
regex = { workspace = true }
|
regex = { workspace = true }
|
||||||
blake3 = { workspace = true }
|
blake3 = { workspace = true }
|
||||||
|
sha2 = { workspace = true }
|
||||||
|
sha1 = { workspace = true }
|
||||||
|
md5 = { workspace = true }
|
||||||
epub = { workspace = true }
|
epub = { workspace = true }
|
||||||
kamadak-exif = { workspace = true }
|
kamadak-exif = { workspace = true }
|
||||||
pdf = { workspace = true }
|
pdf = { workspace = true }
|
||||||
|
|||||||
@@ -6,16 +6,16 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::ExtractState,
|
extract::traits::ExtractState,
|
||||||
value::{Item, PileValue},
|
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubCoverExtractor {
|
pub struct EpubCoverExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<Option<(Mime, Vec<u8>)>>,
|
output: OnceLock<Option<(Mime, Vec<u8>)>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EpubCoverExtractor {
|
impl EpubCoverExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -51,7 +51,7 @@ impl EpubCoverExtractor {
|
|||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
Err(error) => match error.downcast::<std::io::Error>() {
|
||||||
Ok(x) => return Err(x),
|
Ok(x) => return Err(x),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not extract epub cover", ?error, key = ?self.item.key());
|
trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -65,12 +65,11 @@ impl EpubCoverExtractor {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(self
|
Ok(self.get_inner().await?.map(|(mime, bytes)| {
|
||||||
.get_inner()
|
PileValue::Binary(BinaryPileValue::Blob {
|
||||||
.await?
|
|
||||||
.map(|(mime, bytes)| PileValue::Blob {
|
|
||||||
mime: mime.clone(),
|
mime: mime.clone(),
|
||||||
bytes: Arc::new(bytes.clone()),
|
bytes: ArcBytes(Arc::new(bytes.clone())),
|
||||||
}))
|
})
|
||||||
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubMetaExtractor {
|
pub struct EpubMetaExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EpubMetaExtractor {
|
impl EpubMetaExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -57,7 +57,7 @@ impl EpubMetaExtractor {
|
|||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
Err(error) => match error.downcast::<std::io::Error>() {
|
||||||
Ok(x) => return Err(x),
|
Ok(x) => return Err(x),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubTextExtractor {
|
pub struct EpubTextExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EpubTextExtractor {
|
impl EpubTextExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -54,7 +54,7 @@ impl EpubTextExtractor {
|
|||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
Err(error) => match error.downcast::<std::io::Error>() {
|
||||||
Ok(x) => return Err(x),
|
Ok(x) => return Err(x),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -12,7 +12,7 @@ pub use epub_text::*;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubExtractor {
|
pub struct EpubExtractor {
|
||||||
@@ -22,7 +22,7 @@ pub struct EpubExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl EpubExtractor {
|
impl EpubExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
text: Arc::new(EpubTextExtractor::new(item)),
|
text: Arc::new(EpubTextExtractor::new(item)),
|
||||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||||
@@ -87,9 +87,13 @@ impl ObjectExtractor for EpubExtractor {
|
|||||||
|
|
||||||
if k.as_str() == "cover" {
|
if k.as_str() == "cover" {
|
||||||
let summary = match &v {
|
let summary = match &v {
|
||||||
PileValue::Blob { mime, bytes } => {
|
PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||||
format!("<Blob ({}, {} bytes)>", mime, bytes.len())
|
format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
|
||||||
}
|
}
|
||||||
|
PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
|
||||||
|
format!("<File ({mime})>")
|
||||||
|
}
|
||||||
|
|
||||||
PileValue::Null => "<null>".to_owned(),
|
PileValue::Null => "<null>".to_owned(),
|
||||||
_ => "<cover>".to_owned(),
|
_ => "<cover>".to_owned(),
|
||||||
};
|
};
|
||||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct ExifExtractor {
|
pub struct ExifExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExifExtractor {
|
impl ExifExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -53,7 +53,7 @@ impl ExifExtractor {
|
|||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(exif::Error::Io(x)) => return Err(x),
|
Err(exif::Error::Io(x)) => return Err(x),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
trace!(message = "Could not process exif", ?error, item = ?self.item);
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -94,7 +94,7 @@ impl ObjectExtractor for ExifExtractor {
|
|||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
trace!(
|
trace!(
|
||||||
?args,
|
?args,
|
||||||
key = self.item.key().as_str(),
|
item = ?self.item,
|
||||||
"Getting field {name:?} from ExifExtractor",
|
"Getting field {name:?} from ExifExtractor",
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -11,16 +11,16 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct FlacImagesExtractor {
|
pub struct FlacImagesExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
cached_count: OnceLock<usize>,
|
cached_count: OnceLock<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FlacImagesExtractor {
|
impl FlacImagesExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
cached_count: OnceLock::new(),
|
cached_count: OnceLock::new(),
|
||||||
@@ -65,7 +65,7 @@ impl ListExtractor for FlacImagesExtractor {
|
|||||||
mut idx: usize,
|
mut idx: usize,
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
trace!(
|
trace!(
|
||||||
key = self.item.key().as_str(),
|
item = ?self.item,
|
||||||
"Getting index {idx} from FlacImagesExtractor",
|
"Getting index {idx} from FlacImagesExtractor",
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -73,7 +73,7 @@ impl ListExtractor for FlacImagesExtractor {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
let key = self.item.key();
|
let item = self.item.clone();
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let image = tokio::task::spawn_blocking(move || {
|
let image = tokio::task::spawn_blocking(move || {
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
@@ -93,11 +93,7 @@ impl ListExtractor for FlacImagesExtractor {
|
|||||||
|
|
||||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(
|
trace!(message = "Could not parse FLAC images", ?item, ?error);
|
||||||
message = "Could not parse FLAC images",
|
|
||||||
key = key.as_str(),
|
|
||||||
?error
|
|
||||||
);
|
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,9 +105,11 @@ impl ListExtractor for FlacImagesExtractor {
|
|||||||
.await
|
.await
|
||||||
.map_err(std::io::Error::other)??;
|
.map_err(std::io::Error::other)??;
|
||||||
|
|
||||||
Ok(image.map(|(mime, data)| PileValue::Blob {
|
Ok(image.map(|(mime, data)| {
|
||||||
mime,
|
PileValue::Binary(BinaryPileValue::Blob {
|
||||||
bytes: Arc::new(data),
|
mime,
|
||||||
|
bytes: ArcBytes(Arc::new(data)),
|
||||||
|
})
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,13 +128,13 @@ impl ListExtractor for FlacImagesExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct FlacExtractor {
|
pub struct FlacExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
images: PileValue,
|
images: PileValue,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FlacExtractor {
|
impl FlacExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -149,12 +147,9 @@ impl FlacExtractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
trace!(
|
trace!(message = "Reading FLAC tags", item = ?self.item);
|
||||||
message = "Reading FLAC tags",
|
|
||||||
key = self.item.key().as_str()
|
|
||||||
);
|
|
||||||
|
|
||||||
let key = self.item.key();
|
let item = self.item.clone();
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let output = tokio::task::spawn_blocking(move || {
|
let output = tokio::task::spawn_blocking(move || {
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
@@ -176,11 +171,7 @@ impl FlacExtractor {
|
|||||||
|
|
||||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(
|
trace!(message = "Could not parse FLAC metadata", ?item, ?error);
|
||||||
message = "Could not parse FLAC metadata",
|
|
||||||
key = key.as_str(),
|
|
||||||
?error
|
|
||||||
);
|
|
||||||
return Ok(HashMap::new());
|
return Ok(HashMap::new());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1,21 +1,21 @@
|
|||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
path::{Component, PathBuf},
|
path::Component,
|
||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct FsExtractor {
|
pub struct FsExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FsExtractor {
|
impl FsExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -27,7 +27,10 @@ impl FsExtractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = PathBuf::from(self.item.key().as_str());
|
let path = match &self.item {
|
||||||
|
BinaryPileValue::File { path, .. } => path,
|
||||||
|
_ => return Ok(self.output.get_or_init(HashMap::new)),
|
||||||
|
};
|
||||||
|
|
||||||
let mut root = false;
|
let mut root = false;
|
||||||
let components = path
|
let components = path
|
||||||
111
crates/pile-value/src/extract/blob/hash.rs
Normal file
111
crates/pile-value/src/extract/blob/hash.rs
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
use crate::{
|
||||||
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
|
value::{BinaryPileValue, PileValue},
|
||||||
|
};
|
||||||
|
use pile_config::Label;
|
||||||
|
use pile_io::SyncReadBridge;
|
||||||
|
use std::{io::Read, sync::Arc};
|
||||||
|
use tokio::sync::OnceCell;
|
||||||
|
|
||||||
|
fn to_hex(bytes: &[u8]) -> String {
|
||||||
|
bytes.iter().map(|b| format!("{b:02x}")).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! hash_algos {
|
||||||
|
($($name:ident),* $(,)?) => {
|
||||||
|
pub struct HashExtractor {
|
||||||
|
item: BinaryPileValue,
|
||||||
|
$($name: OnceCell<String>,)*
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HashExtractor {
|
||||||
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
$($name: OnceCell::new(),)*
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static LABELS: std::sync::LazyLock<Vec<Label>> = std::sync::LazyLock::new(|| {
|
||||||
|
vec![$(Label::new(stringify!($name)).unwrap()),*]
|
||||||
|
});
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
hash_algos!(blake3, md5, sha1, sha224, sha256, sha384, sha512);
|
||||||
|
|
||||||
|
impl HashExtractor {
|
||||||
|
async fn compute(&self, name: &Label) -> Result<Option<String>, std::io::Error> {
|
||||||
|
let name_str = name.as_ref();
|
||||||
|
|
||||||
|
macro_rules! algo {
|
||||||
|
($cell:ident, $compute:expr) => {
|
||||||
|
if name_str == stringify!($cell) {
|
||||||
|
return Ok(Some(
|
||||||
|
self.$cell
|
||||||
|
.get_or_try_init(|| async {
|
||||||
|
let read = self.item.read().await?;
|
||||||
|
let mut read = SyncReadBridge::new_current(read);
|
||||||
|
tokio::task::spawn_blocking(move || {
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
read.read_to_end(&mut bytes)?;
|
||||||
|
Ok::<String, std::io::Error>($compute(&bytes))
|
||||||
|
})
|
||||||
|
.await?
|
||||||
|
})
|
||||||
|
.await?
|
||||||
|
.clone(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
algo!(blake3, |b: &Vec<u8>| blake3::hash(b).to_hex().to_string());
|
||||||
|
algo!(md5, |b: &Vec<u8>| format!("{:x}", md5::compute(b)));
|
||||||
|
algo!(sha1, |b: &Vec<u8>| {
|
||||||
|
use sha1::Digest;
|
||||||
|
to_hex(sha1::Sha1::digest(b).as_ref())
|
||||||
|
});
|
||||||
|
algo!(sha224, |b: &Vec<u8>| {
|
||||||
|
use sha2::Digest;
|
||||||
|
to_hex(sha2::Sha224::digest(b).as_ref())
|
||||||
|
});
|
||||||
|
algo!(sha256, |b: &Vec<u8>| {
|
||||||
|
use sha2::Digest;
|
||||||
|
to_hex(sha2::Sha256::digest(b).as_ref())
|
||||||
|
});
|
||||||
|
algo!(sha384, |b: &Vec<u8>| {
|
||||||
|
use sha2::Digest;
|
||||||
|
to_hex(sha2::Sha384::digest(b).as_ref())
|
||||||
|
});
|
||||||
|
algo!(sha512, |b: &Vec<u8>| {
|
||||||
|
use sha2::Digest;
|
||||||
|
to_hex(sha2::Sha512::digest(b).as_ref())
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for HashExtractor {
|
||||||
|
async fn field(
|
||||||
|
&self,
|
||||||
|
_state: &ExtractState,
|
||||||
|
name: &Label,
|
||||||
|
args: Option<&str>,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
if args.is_some() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
Ok(self
|
||||||
|
.compute(name)
|
||||||
|
.await?
|
||||||
|
.map(|s| PileValue::String(Arc::new(s.into()))))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(LABELS.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
use id3::Tag;
|
use id3::Tag;
|
||||||
|
use mime::Mime;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
use pile_io::SyncReadBridge;
|
||||||
use std::{
|
use std::{
|
||||||
@@ -10,20 +11,106 @@ use std::{
|
|||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
pub struct Id3ImagesExtractor {
|
||||||
|
item: BinaryPileValue,
|
||||||
|
cached_count: OnceLock<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Id3ImagesExtractor {
|
||||||
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
cached_count: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn read_tag(&self) -> Result<Option<Tag>, std::io::Error> {
|
||||||
|
let item = self.item.clone();
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
tokio::task::spawn_blocking(move || match Tag::read_from2(BufReader::new(reader)) {
|
||||||
|
Ok(tag) => Ok(Some(tag)),
|
||||||
|
Err(id3::Error {
|
||||||
|
kind: id3::ErrorKind::Io(e),
|
||||||
|
..
|
||||||
|
}) => Err(e),
|
||||||
|
Err(error) => {
|
||||||
|
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)?
|
||||||
|
}
|
||||||
|
|
||||||
|
fn mime_ok(&self, state: &ExtractState) -> bool {
|
||||||
|
state.ignore_mime || self.item.mime().essence_str() == "audio/mpeg"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ListExtractor for Id3ImagesExtractor {
|
||||||
|
async fn get(
|
||||||
|
&self,
|
||||||
|
state: &ExtractState,
|
||||||
|
idx: usize,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
if !self.mime_ok(state) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(tag) = self.read_tag().await? else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(picture) = tag.pictures().nth(idx) else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
let mime: Mime = picture
|
||||||
|
.mime_type
|
||||||
|
.parse()
|
||||||
|
.unwrap_or(mime::APPLICATION_OCTET_STREAM);
|
||||||
|
let data = picture.data.clone();
|
||||||
|
|
||||||
|
Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||||
|
mime,
|
||||||
|
bytes: ArcBytes(Arc::new(data)),
|
||||||
|
})))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
||||||
|
if !self.mime_ok(state) {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(x) = self.cached_count.get() {
|
||||||
|
return Ok(*x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let count = match self.read_tag().await? {
|
||||||
|
Some(tag) => tag.pictures().count(),
|
||||||
|
None => 0,
|
||||||
|
};
|
||||||
|
Ok(*self.cached_count.get_or_init(|| count))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Id3Extractor {
|
pub struct Id3Extractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
|
images: PileValue,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Id3Extractor {
|
impl Id3Extractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
|
images: PileValue::ListExtractor(Arc::new(Id3ImagesExtractor::new(item))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,9 +119,9 @@ impl Id3Extractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
trace!(message = "Reading id3 tags", key = self.item.key().as_str());
|
trace!(message = "Reading id3 tags", key = ?self.item);
|
||||||
|
|
||||||
let key = self.item.key();
|
let item = self.item.clone();
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||||
.await
|
.await
|
||||||
@@ -48,11 +135,7 @@ impl Id3Extractor {
|
|||||||
})) => return Err(e),
|
})) => return Err(e),
|
||||||
|
|
||||||
Ok(Err(error)) => {
|
Ok(Err(error)) => {
|
||||||
trace!(
|
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
||||||
message = "Could not parse id3 tags",
|
|
||||||
key = key.as_str(),
|
|
||||||
?error
|
|
||||||
);
|
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -138,10 +221,21 @@ impl ObjectExtractor for Id3Extractor {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if name.as_str() == "images" {
|
||||||
|
return Ok(Some(self.images.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
Ok(self
|
||||||
|
.get_inner()
|
||||||
|
.await?
|
||||||
|
.keys()
|
||||||
|
.cloned()
|
||||||
|
.chain([Label::new("images").unwrap()])
|
||||||
|
.collect::<Vec<_>>())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,63 +1,25 @@
|
|||||||
mod transform;
|
|
||||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
|
||||||
|
|
||||||
use image::ImageFormat;
|
use image::ImageFormat;
|
||||||
use mime::Mime;
|
use mime::Mime;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::AsyncReader;
|
use pile_io::AsyncReader;
|
||||||
use std::{
|
use std::{io::Cursor, str::FromStr, sync::Arc};
|
||||||
io::Cursor,
|
|
||||||
str::FromStr,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
||||||
|
|
||||||
|
mod transform;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ImageSource {
|
|
||||||
Item(Item, OnceLock<Arc<Vec<u8>>>),
|
|
||||||
Blob(Arc<Vec<u8>>, Mime),
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct ImageExtractor {
|
pub struct ImageExtractor {
|
||||||
source: ImageSource,
|
item: BinaryPileValue,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ImageExtractor {
|
impl ImageExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self { item: item.clone() }
|
||||||
source: ImageSource::Item(item.clone(), OnceLock::new()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn from_blob(bytes: Arc<Vec<u8>>, mime: Mime) -> Self {
|
|
||||||
Self {
|
|
||||||
source: ImageSource::Blob(bytes, mime),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mime(&self) -> &Mime {
|
|
||||||
match &self.source {
|
|
||||||
ImageSource::Item(item, _) => item.mime(),
|
|
||||||
ImageSource::Blob(_, mime) => mime,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn read_bytes(&self) -> Result<Arc<Vec<u8>>, std::io::Error> {
|
|
||||||
match &self.source {
|
|
||||||
ImageSource::Blob(bytes, _) => Ok(bytes.clone()),
|
|
||||||
ImageSource::Item(item, cache) => {
|
|
||||||
if let Some(x) = cache.get() {
|
|
||||||
return Ok(x.clone());
|
|
||||||
}
|
|
||||||
let mut reader = item.read().await?;
|
|
||||||
let bytes = reader.read_to_end().await?;
|
|
||||||
Ok(cache.get_or_init(|| Arc::new(bytes)).clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn apply<T: ImageTransformer + Send + 'static>(
|
async fn apply<T: ImageTransformer + Send + 'static>(
|
||||||
@@ -69,11 +31,14 @@ impl ImageExtractor {
|
|||||||
Err(_) => return Ok(None),
|
Err(_) => return Ok(None),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mime = self.mime().clone();
|
let mime = self.item.mime().clone();
|
||||||
let bytes = self.read_bytes().await?;
|
let bytes = self.item.read().await?.read_to_end().await?;
|
||||||
|
|
||||||
let Some(format) = ImageFormat::from_mime_type(&mime) else {
|
let Some(format) = ImageFormat::from_mime_type(&mime) else {
|
||||||
return Ok(Some(PileValue::Blob { mime, bytes }));
|
return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||||
|
mime,
|
||||||
|
bytes: ArcBytes(Arc::new(bytes)),
|
||||||
|
})));
|
||||||
};
|
};
|
||||||
|
|
||||||
let bytes_for_closure = bytes.clone();
|
let bytes_for_closure = bytes.clone();
|
||||||
@@ -91,11 +56,15 @@ impl ImageExtractor {
|
|||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Blob {
|
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||||
mime: out_mime,
|
mime: out_mime,
|
||||||
bytes: Arc::new(out_bytes),
|
bytes: ArcBytes(Arc::new(out_bytes)),
|
||||||
})),
|
}))),
|
||||||
Err(_) => Ok(Some(PileValue::Blob { mime, bytes })),
|
|
||||||
|
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||||
|
mime,
|
||||||
|
bytes: ArcBytes(Arc::new(bytes)),
|
||||||
|
}))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -7,7 +7,7 @@ use std::{
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
fn json_to_pile(value: serde_json::Value) -> PileValue {
|
fn json_to_pile(value: serde_json::Value) -> PileValue {
|
||||||
@@ -24,12 +24,12 @@ fn json_to_pile(value: serde_json::Value) -> PileValue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct JsonExtractor {
|
pub struct JsonExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl JsonExtractor {
|
impl JsonExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -25,31 +25,31 @@ mod toml;
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
pub use toml::*;
|
pub use toml::*;
|
||||||
|
|
||||||
mod group;
|
|
||||||
pub use group::*;
|
|
||||||
|
|
||||||
mod text;
|
mod text;
|
||||||
pub use text::*;
|
pub use text::*;
|
||||||
|
|
||||||
mod image;
|
mod image;
|
||||||
pub use image::*;
|
pub use image::*;
|
||||||
|
|
||||||
|
mod hash;
|
||||||
|
pub use hash::*;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::{
|
extract::{
|
||||||
misc::MapExtractor,
|
misc::MapExtractor,
|
||||||
traits::{ExtractState, ObjectExtractor},
|
traits::{ExtractState, ObjectExtractor},
|
||||||
},
|
},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct ItemExtractor {
|
pub struct BinaryExtractor {
|
||||||
inner: MapExtractor,
|
inner: MapExtractor,
|
||||||
image: Arc<ImageExtractor>,
|
image: Arc<ImageExtractor>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ItemExtractor {
|
impl BinaryExtractor {
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
let inner = MapExtractor {
|
let inner = MapExtractor {
|
||||||
inner: HashMap::from([
|
inner: HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -89,8 +89,8 @@ impl ItemExtractor {
|
|||||||
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("groups").unwrap(),
|
Label::new("hash").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(HashExtractor::new(item))),
|
||||||
),
|
),
|
||||||
]),
|
]),
|
||||||
};
|
};
|
||||||
@@ -103,7 +103,7 @@ impl ItemExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for ItemExtractor {
|
impl ObjectExtractor for BinaryExtractor {
|
||||||
async fn field(
|
async fn field(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
state: &ExtractState,
|
||||||
@@ -15,7 +15,7 @@ pub use pdf_text::*;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfExtractor {
|
pub struct PdfExtractor {
|
||||||
@@ -26,7 +26,7 @@ pub struct PdfExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PdfExtractor {
|
impl PdfExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
text: Arc::new(PdfTextExtractor::new(item)),
|
text: Arc::new(PdfTextExtractor::new(item)),
|
||||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||||
@@ -46,7 +46,7 @@ impl ObjectExtractor for PdfExtractor {
|
|||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
trace!(
|
trace!(
|
||||||
?args,
|
?args,
|
||||||
key = self.text.item.key().as_str(),
|
item = ?self.text.item,
|
||||||
"Getting field {name:?} from PdfExtractor",
|
"Getting field {name:?} from PdfExtractor",
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -9,18 +9,19 @@ use std::{
|
|||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
|
use crate::value::BinaryPileValue;
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::PileValue,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfMetaExtractor {
|
pub struct PdfMetaExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfMetaExtractor {
|
impl PdfMetaExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -82,7 +83,7 @@ impl PdfMetaExtractor {
|
|||||||
let (page_count, raw_meta) = match raw_meta {
|
let (page_count, raw_meta) = match raw_meta {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -9,15 +9,15 @@ use tracing::trace;
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ListExtractor},
|
extract::traits::{ExtractState, ListExtractor},
|
||||||
value::{Item, PileValue},
|
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfPagesExtractor {
|
pub struct PdfPagesExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfPagesExtractor {
|
impl PdfPagesExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self { item: item.clone() }
|
Self { item: item.clone() }
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,7 +41,7 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
idx: usize,
|
idx: usize,
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
trace!(
|
trace!(
|
||||||
key = self.item.key().as_str(),
|
item = ?self.item,
|
||||||
"Getting index {idx} from PdfPagesExtractor",
|
"Getting index {idx} from PdfPagesExtractor",
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -78,12 +78,12 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
|
|
||||||
let value = match png {
|
let value = match png {
|
||||||
Ok(None) => return Ok(None),
|
Ok(None) => return Ok(None),
|
||||||
Ok(Some(bytes)) => PileValue::Blob {
|
Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
|
||||||
mime: mime::IMAGE_PNG,
|
mime: mime::IMAGE_PNG,
|
||||||
bytes: Arc::new(bytes),
|
bytes: ArcBytes(Arc::new(bytes)),
|
||||||
},
|
}),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
|
||||||
PileValue::Null
|
PileValue::Null
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -108,7 +108,7 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
match count {
|
match count {
|
||||||
Ok(n) => Ok(n),
|
Ok(n) => Ok(n),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
|
||||||
Ok(0)
|
Ok(0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -9,18 +9,19 @@ use std::{
|
|||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
|
use crate::value::BinaryPileValue;
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::PileValue,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfTextExtractor {
|
pub struct PdfTextExtractor {
|
||||||
pub(super) item: Item,
|
pub(super) item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfTextExtractor {
|
impl PdfTextExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -86,7 +87,7 @@ impl PdfTextExtractor {
|
|||||||
let raw_text = match raw_text {
|
let raw_text = match raw_text {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -4,16 +4,16 @@ use std::sync::{Arc, OnceLock};
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct TextExtractor {
|
pub struct TextExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<PileValue>,
|
output: OnceLock<PileValue>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TextExtractor {
|
impl TextExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -7,7 +7,7 @@ use std::{
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::{ExtractState, ObjectExtractor},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
fn toml_to_pile(value: toml::Value) -> PileValue {
|
fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||||
@@ -25,12 +25,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct TomlExtractor {
|
pub struct TomlExtractor {
|
||||||
item: Item,
|
item: BinaryPileValue,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TomlExtractor {
|
impl TomlExtractor {
|
||||||
pub fn new(item: &Item) -> Self {
|
pub fn new(item: &BinaryPileValue) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
58
crates/pile-value/src/extract/item.rs
Normal file
58
crates/pile-value/src/extract/item.rs
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
|
use pile_config::Label;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::{
|
||||||
|
misc::MapExtractor,
|
||||||
|
traits::{ExtractState, ObjectExtractor},
|
||||||
|
},
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct ItemExtractor {
|
||||||
|
inner: MapExtractor,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ItemExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
let files = {
|
||||||
|
let Item::File { files, .. } = &item;
|
||||||
|
let mut inner = HashMap::new();
|
||||||
|
for f in files {
|
||||||
|
inner.insert(f.0.clone(), f.1.clone());
|
||||||
|
}
|
||||||
|
PileValue::ObjectExtractor(Arc::new(MapExtractor { inner }))
|
||||||
|
};
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
let inner = MapExtractor {
|
||||||
|
inner: HashMap::from([
|
||||||
|
(Label::new("files").unwrap(), files),
|
||||||
|
(
|
||||||
|
Label::new("key").unwrap(),
|
||||||
|
PileValue::String(Arc::new(item.key())),
|
||||||
|
),
|
||||||
|
]),
|
||||||
|
};
|
||||||
|
|
||||||
|
Self { inner }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for ItemExtractor {
|
||||||
|
async fn field(
|
||||||
|
&self,
|
||||||
|
state: &ExtractState,
|
||||||
|
name: &pile_config::Label,
|
||||||
|
args: Option<&str>,
|
||||||
|
) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
self.inner.field(state, name, args).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
let fields = self.inner.fields().await?;
|
||||||
|
Ok(fields)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use pile_config::Label;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{Item, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct GroupExtractor {
|
|
||||||
item: Item,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GroupExtractor {
|
|
||||||
pub fn new(item: &Item) -> Self {
|
|
||||||
Self { item: item.clone() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for GroupExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
Ok(self
|
|
||||||
.item
|
|
||||||
.group()
|
|
||||||
.get(name)
|
|
||||||
.map(|item| PileValue::ObjectExtractor(Arc::new(super::ItemExtractor::new(item)))))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(self.item.group().keys().cloned().collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn to_json(&self, _state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
||||||
Ok(serde_json::Value::Object(
|
|
||||||
self.item
|
|
||||||
.group()
|
|
||||||
.iter()
|
|
||||||
.map(|(k, v)| {
|
|
||||||
(
|
|
||||||
k.to_string(),
|
|
||||||
serde_json::Value::String(format!("<GroupItem ({})>", v.key())),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect(),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod blob;
|
||||||
pub mod item;
|
pub mod item;
|
||||||
pub mod misc;
|
pub mod misc;
|
||||||
pub mod regex;
|
pub mod regex;
|
||||||
|
|||||||
@@ -1,27 +1,25 @@
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use pile_config::{
|
use pile_config::Label;
|
||||||
Label,
|
use regex::Regex;
|
||||||
pattern::{GroupPattern, GroupSegment},
|
|
||||||
};
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::{
|
use std::{
|
||||||
collections::{BTreeMap, HashMap, HashSet},
|
collections::{BTreeMap, HashMap},
|
||||||
path::PathBuf,
|
path::PathBuf,
|
||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::ExtractState,
|
|
||||||
source::{DataSource, misc::path_ts_latest},
|
source::{DataSource, misc::path_ts_latest},
|
||||||
value::{Item, PileValue},
|
value::{BinaryPileValue, Item, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct DirDataSource {
|
pub struct DirDataSource {
|
||||||
pub name: Label,
|
pub name: Label,
|
||||||
pub dir: PathBuf,
|
pub dir: PathBuf,
|
||||||
pub pattern: GroupPattern,
|
pub base_pattern: Regex,
|
||||||
|
pub files: HashMap<Label, String>,
|
||||||
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -29,21 +27,18 @@ impl DirDataSource {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
name: &Label,
|
name: &Label,
|
||||||
dir: PathBuf,
|
dir: PathBuf,
|
||||||
pattern: GroupPattern,
|
base_pattern: Regex,
|
||||||
|
files: HashMap<Label, String>,
|
||||||
) -> Result<Arc<Self>, std::io::Error> {
|
) -> Result<Arc<Self>, std::io::Error> {
|
||||||
let source = Arc::new(Self {
|
let source = Arc::new(Self {
|
||||||
name: name.clone(),
|
name: name.clone(),
|
||||||
dir,
|
dir,
|
||||||
pattern,
|
base_pattern,
|
||||||
|
files,
|
||||||
index: OnceLock::new(),
|
index: OnceLock::new(),
|
||||||
});
|
});
|
||||||
|
|
||||||
//
|
let mut index = BTreeMap::new();
|
||||||
// MARK: list paths
|
|
||||||
//
|
|
||||||
|
|
||||||
let mut paths_items = HashSet::new();
|
|
||||||
let mut paths_grouped_items = HashSet::new();
|
|
||||||
'entry: for entry in WalkDir::new(&source.dir) {
|
'entry: for entry in WalkDir::new(&source.dir) {
|
||||||
let entry = match entry {
|
let entry = match entry {
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -59,51 +54,52 @@ impl DirDataSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let path = entry.into_path();
|
let path = entry.into_path();
|
||||||
let path_str = match path.to_str() {
|
let rel_path = match path.strip_prefix(&source.dir) {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(_) => continue 'entry,
|
||||||
|
};
|
||||||
|
let path_str = match rel_path.to_str() {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => continue 'entry,
|
None => continue 'entry,
|
||||||
};
|
};
|
||||||
|
|
||||||
let groups = resolve_groups(&source.pattern, path_str).await;
|
let captures = match source.base_pattern.captures(path_str) {
|
||||||
paths_grouped_items.extend(groups.into_values());
|
Some(c) => c,
|
||||||
paths_items.insert(path);
|
None => continue 'entry,
|
||||||
}
|
};
|
||||||
|
let base = match captures.get(1) {
|
||||||
//
|
Some(m) => m.as_str(),
|
||||||
// MARK: resolve groups
|
|
||||||
//
|
|
||||||
|
|
||||||
let mut index = BTreeMap::new();
|
|
||||||
'entry: for path in paths_items.difference(&paths_grouped_items) {
|
|
||||||
let path_str = match path.to_str() {
|
|
||||||
Some(x) => x,
|
|
||||||
None => continue 'entry,
|
None => continue 'entry,
|
||||||
};
|
};
|
||||||
|
|
||||||
let group = resolve_groups(&source.pattern, path_str).await;
|
let key: SmartString<LazyCompact> = base.into();
|
||||||
let group = group
|
if index.contains_key(&key) {
|
||||||
.into_iter()
|
continue 'entry;
|
||||||
.map(|(k, group_path)| {
|
}
|
||||||
(
|
|
||||||
k,
|
let mut item_files = HashMap::new();
|
||||||
Box::new(Item::File {
|
for (label, template) in &source.files {
|
||||||
source: Arc::clone(&source),
|
let file_path = source.dir.join(template.replace("{base}", base));
|
||||||
mime: mime_guess::from_path(&group_path).first_or_octet_stream(),
|
if file_path.exists() {
|
||||||
path: group_path.clone(),
|
let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
|
||||||
group: Arc::new(HashMap::new()),
|
item_files.insert(
|
||||||
|
label.clone(),
|
||||||
|
PileValue::Binary(BinaryPileValue::File {
|
||||||
|
mime,
|
||||||
|
path: file_path,
|
||||||
}),
|
}),
|
||||||
)
|
);
|
||||||
})
|
}
|
||||||
.collect::<HashMap<_, _>>();
|
}
|
||||||
|
|
||||||
let item = Item::File {
|
index.insert(
|
||||||
source: Arc::clone(&source),
|
key.clone(),
|
||||||
mime: mime_guess::from_path(path).first_or_octet_stream(),
|
Item::File {
|
||||||
path: path.into(),
|
key,
|
||||||
group: Arc::new(group),
|
source: Arc::clone(&source),
|
||||||
};
|
files: item_files,
|
||||||
|
},
|
||||||
index.insert(item.key(), item);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
source.index.get_or_init(|| index);
|
source.index.get_or_init(|| index);
|
||||||
@@ -139,43 +135,3 @@ impl DataSource for Arc<DirDataSource> {
|
|||||||
path_ts_latest(&self.dir)
|
path_ts_latest(&self.dir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn resolve_groups(pattern: &GroupPattern, path_str: &str) -> HashMap<Label, PathBuf> {
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
let mut group = HashMap::new();
|
|
||||||
'pattern: for (l, pat) in &pattern.pattern {
|
|
||||||
let item = PileValue::String(Arc::new(path_str.into()));
|
|
||||||
let mut target = String::new();
|
|
||||||
for p in pat {
|
|
||||||
match p {
|
|
||||||
GroupSegment::Literal(x) => target.push_str(x),
|
|
||||||
GroupSegment::Path(op) => {
|
|
||||||
let res = match item.query(&state, op).await {
|
|
||||||
Ok(Some(x)) => x,
|
|
||||||
_ => continue 'pattern,
|
|
||||||
};
|
|
||||||
|
|
||||||
let res = match res.as_str() {
|
|
||||||
Some(x) => x,
|
|
||||||
None => continue 'pattern,
|
|
||||||
};
|
|
||||||
|
|
||||||
target.push_str(res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let group_path: PathBuf = match target.parse() {
|
|
||||||
Ok(x) => x,
|
|
||||||
Err(_) => continue 'pattern,
|
|
||||||
};
|
|
||||||
|
|
||||||
if !group_path.exists() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
group.insert(l.clone(), group_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
return group;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,74 +1,45 @@
|
|||||||
use mime::Mime;
|
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
use crate::{source::DirDataSource, value::ItemReader};
|
use crate::{source::DirDataSource, value::PileValue};
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: item
|
// MARK: item
|
||||||
//
|
//
|
||||||
|
|
||||||
/// A cheaply-cloneable pointer to an item in a dataset
|
/// A cheaply-cloneable pointer to an item in a dataset
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Clone)]
|
||||||
pub enum Item {
|
pub enum Item {
|
||||||
File {
|
File {
|
||||||
|
key: SmartString<LazyCompact>,
|
||||||
source: Arc<DirDataSource>,
|
source: Arc<DirDataSource>,
|
||||||
mime: Mime,
|
files: HashMap<Label, PileValue>,
|
||||||
|
|
||||||
path: PathBuf,
|
|
||||||
group: Arc<HashMap<Label, Box<Item>>>,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Item {
|
impl std::fmt::Debug for Item {
|
||||||
/// Open the item for reading.
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
match self {
|
||||||
Ok(match self {
|
Self::File { key, files, .. } => f
|
||||||
Self::File { path, .. } => ItemReader::File(File::open(path)?),
|
.debug_struct("Item::File")
|
||||||
})
|
.field("key", key)
|
||||||
|
.field("files", &files.keys().collect::<Vec<_>>())
|
||||||
|
.finish(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Item {
|
||||||
pub fn source_name(&self) -> &pile_config::Label {
|
pub fn source_name(&self) -> &pile_config::Label {
|
||||||
match self {
|
match self {
|
||||||
Self::File { source, .. } => &source.name,
|
Self::File { source, .. } => &source.name,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[expect(clippy::expect_used)]
|
|
||||||
pub fn key(&self) -> SmartString<LazyCompact> {
|
pub fn key(&self) -> SmartString<LazyCompact> {
|
||||||
match self {
|
match self {
|
||||||
Self::File { source, path, .. } => path
|
Self::File { key, .. } => key.clone(),
|
||||||
.strip_prefix(&source.dir)
|
|
||||||
.expect("item must be inside source")
|
|
||||||
.to_str()
|
|
||||||
.expect("path is not utf-8")
|
|
||||||
.into(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
|
||||||
let read = self.read().await?;
|
|
||||||
let mut read = SyncReadBridge::new_current(read);
|
|
||||||
let out = tokio::task::spawn_blocking(move || {
|
|
||||||
let mut hasher = blake3::Hasher::new();
|
|
||||||
std::io::copy(&mut read, &mut hasher)?;
|
|
||||||
return Ok::<_, std::io::Error>(hasher.finalize());
|
|
||||||
})
|
|
||||||
.await??;
|
|
||||||
return Ok(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn mime(&self) -> &Mime {
|
|
||||||
match self {
|
|
||||||
Self::File { mime, .. } => mime,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
|
|
||||||
match self {
|
|
||||||
Self::File { group, .. } => group,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,10 @@
|
|||||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
use pile_io::{AsyncReader, AsyncSeekReader};
|
||||||
use std::{fs::File, io::Seek};
|
use std::{
|
||||||
|
fs::File,
|
||||||
|
io::{Cursor, Seek},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::value::ArcBytes;
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: itemreader
|
// MARK: itemreader
|
||||||
@@ -7,12 +12,14 @@ use std::{fs::File, io::Seek};
|
|||||||
|
|
||||||
pub enum ItemReader {
|
pub enum ItemReader {
|
||||||
File(File),
|
File(File),
|
||||||
|
Vec(Cursor<ArcBytes>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AsyncReader for ItemReader {
|
impl AsyncReader for ItemReader {
|
||||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::File(x) => std::io::Read::read(x, buf),
|
Self::File(x) => std::io::Read::read(x, buf),
|
||||||
|
Self::Vec(x) => std::io::Read::read(x, buf),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -21,6 +28,7 @@ impl AsyncSeekReader for ItemReader {
|
|||||||
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::File(x) => x.seek(pos),
|
Self::File(x) => x.seek(pos),
|
||||||
|
Self::Vec(x) => x.seek(pos),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,18 +2,61 @@ use mime::Mime;
|
|||||||
use pile_config::objectpath::{ObjectPath, PathSegment};
|
use pile_config::objectpath::{ObjectPath, PathSegment};
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::sync::Arc;
|
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::{
|
extract::{
|
||||||
item::{ImageExtractor, ItemExtractor},
|
blob::BinaryExtractor,
|
||||||
|
item::ItemExtractor,
|
||||||
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
||||||
string::StringExtractor,
|
string::StringExtractor,
|
||||||
traits::{ExtractState, ListExtractor, ObjectExtractor},
|
traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||||
},
|
},
|
||||||
value::Item,
|
value::{Item, ItemReader},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct ArcBytes(pub Arc<Vec<u8>>);
|
||||||
|
impl Debug for ArcBytes {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("ArcBytes")
|
||||||
|
.field("len()", &self.0.len())
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AsRef<[u8]> for ArcBytes {
|
||||||
|
fn as_ref(&self) -> &[u8] {
|
||||||
|
&self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum BinaryPileValue {
|
||||||
|
/// A binary blob
|
||||||
|
Blob { mime: Mime, bytes: ArcBytes },
|
||||||
|
|
||||||
|
/// An pointer to a file
|
||||||
|
File { mime: Mime, path: PathBuf },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BinaryPileValue {
|
||||||
|
/// Open the item for reading.
|
||||||
|
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||||
|
match self {
|
||||||
|
Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
|
||||||
|
Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mime(&self) -> &Mime {
|
||||||
|
match self {
|
||||||
|
Self::Blob { mime, .. } => mime,
|
||||||
|
Self::File { mime, .. } => mime,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// An immutable, cheaply-cloneable, lazily-computed value.
|
/// An immutable, cheaply-cloneable, lazily-computed value.
|
||||||
/// Very similar to [serde_json::Value].
|
/// Very similar to [serde_json::Value].
|
||||||
pub enum PileValue {
|
pub enum PileValue {
|
||||||
@@ -27,12 +70,6 @@ pub enum PileValue {
|
|||||||
/// An array of values
|
/// An array of values
|
||||||
Array(Arc<Vec<PileValue>>),
|
Array(Arc<Vec<PileValue>>),
|
||||||
|
|
||||||
/// A binary blob
|
|
||||||
Blob {
|
|
||||||
mime: Mime,
|
|
||||||
bytes: Arc<Vec<u8>>,
|
|
||||||
},
|
|
||||||
|
|
||||||
/// A lazily-computed map of {label: value}
|
/// A lazily-computed map of {label: value}
|
||||||
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
||||||
|
|
||||||
@@ -41,6 +78,9 @@ pub enum PileValue {
|
|||||||
|
|
||||||
/// An pointer to an item in this dataset
|
/// An pointer to an item in this dataset
|
||||||
Item(Item),
|
Item(Item),
|
||||||
|
|
||||||
|
/// Binary data
|
||||||
|
Binary(BinaryPileValue),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for PileValue {
|
impl Clone for PileValue {
|
||||||
@@ -53,11 +93,8 @@ impl Clone for PileValue {
|
|||||||
Self::Array(x) => Self::Array(x.clone()),
|
Self::Array(x) => Self::Array(x.clone()),
|
||||||
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
||||||
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
||||||
Self::Blob { mime, bytes } => Self::Blob {
|
|
||||||
mime: mime.clone(),
|
|
||||||
bytes: bytes.clone(),
|
|
||||||
},
|
|
||||||
Self::Item(i) => Self::Item(i.clone()),
|
Self::Item(i) => Self::Item(i.clone()),
|
||||||
|
Self::Binary(b) => Self::Binary(b.clone()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -70,13 +107,10 @@ impl PileValue {
|
|||||||
Self::I64(_) => Arc::new(MapExtractor::default()),
|
Self::I64(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::Array(_) => Arc::new(MapExtractor::default()),
|
Self::Array(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
||||||
Self::Blob { mime, bytes } => {
|
|
||||||
// TODO: make a blobextractor (with pdf, epub, etc; like item)
|
|
||||||
Arc::new(ImageExtractor::from_blob(bytes.clone(), mime.clone()))
|
|
||||||
}
|
|
||||||
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::ObjectExtractor(e) => e.clone(),
|
Self::ObjectExtractor(e) => e.clone(),
|
||||||
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
||||||
|
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,12 +121,12 @@ impl PileValue {
|
|||||||
Self::I64(_) => Arc::new(VecExtractor::default()),
|
Self::I64(_) => Arc::new(VecExtractor::default()),
|
||||||
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
||||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||||
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
|
||||||
Self::ListExtractor(e) => e.clone(),
|
Self::ListExtractor(e) => e.clone(),
|
||||||
Self::ObjectExtractor(e) => e
|
Self::ObjectExtractor(e) => e
|
||||||
.as_list()
|
.as_list()
|
||||||
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
||||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||||
|
Self::Binary(_) => Arc::new(VecExtractor::default()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,14 +231,17 @@ impl PileValue {
|
|||||||
Ok(match self {
|
Ok(match self {
|
||||||
Self::Null => None,
|
Self::Null => None,
|
||||||
|
|
||||||
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
|
Self::U64(_)
|
||||||
Some(Value::Number(1u64.into()))
|
| Self::I64(_)
|
||||||
}
|
| Self::String(_)
|
||||||
|
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
|
||||||
|
|
||||||
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
||||||
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
|
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
|
||||||
|
|
||||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
Self::ObjectExtractor(_)
|
||||||
|
| Self::Item(_)
|
||||||
|
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||||
let e = self.object_extractor();
|
let e = self.object_extractor();
|
||||||
let keys = e.fields().await?;
|
let keys = e.fields().await?;
|
||||||
let mut map = Map::new();
|
let mut map = Map::new();
|
||||||
@@ -241,8 +278,8 @@ impl PileValue {
|
|||||||
Self::String(x) => Value::String(x.to_string()),
|
Self::String(x) => Value::String(x.to_string()),
|
||||||
|
|
||||||
// TODO: replace with something meaningful?
|
// TODO: replace with something meaningful?
|
||||||
Self::Blob { mime, bytes } => {
|
Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
|
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::Array(_) | Self::ListExtractor(_) => {
|
Self::Array(_) | Self::ListExtractor(_) => {
|
||||||
@@ -250,7 +287,9 @@ impl PileValue {
|
|||||||
return e.to_json(state).await;
|
return e.to_json(state).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
Self::ObjectExtractor(_)
|
||||||
|
| Self::Item(_)
|
||||||
|
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||||
let e = self.object_extractor();
|
let e = self.object_extractor();
|
||||||
return e.to_json(state).await;
|
return e.to_json(state).await;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,8 +9,9 @@ workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
pile-toolbox = { workspace = true }
|
pile-toolbox = { workspace = true }
|
||||||
pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
|
pile-dataset = { workspace = true }
|
||||||
pile-value = { workspace = true, features = ["pdfium"] }
|
pile-serve = { workspace = true }
|
||||||
|
pile-value = { workspace = true }
|
||||||
pile-config = { workspace = true }
|
pile-config = { workspace = true }
|
||||||
|
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
@@ -34,3 +35,7 @@ base64 = { workspace = true }
|
|||||||
dotenvy = { workspace = true }
|
dotenvy = { workspace = true }
|
||||||
envy = { workspace = true }
|
envy = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["pdfium"]
|
||||||
|
pdfium = ["pile-dataset/pdfium", "pile-serve/pdfium", "pile-value/pdfium"]
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
|
use pile_config::objectpath::ObjectPath;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||||
@@ -40,6 +41,10 @@ pub struct FieldsCommand {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
max_percent: Option<f64>,
|
max_percent: Option<f64>,
|
||||||
|
|
||||||
|
/// Print counts of non-null schema fields instead of raw fields
|
||||||
|
#[arg(long)]
|
||||||
|
schema: bool,
|
||||||
|
|
||||||
/// Restrict to these sources (all sources if empty)
|
/// Restrict to these sources (all sources if empty)
|
||||||
#[arg(long, short = 's')]
|
#[arg(long, short = 's')]
|
||||||
source: Vec<String>,
|
source: Vec<String>,
|
||||||
@@ -67,6 +72,17 @@ impl CliCmd for FieldsCommand {
|
|||||||
let jobs = self.jobs.max(1);
|
let jobs = self.jobs.max(1);
|
||||||
let state = ExtractState { ignore_mime: false };
|
let state = ExtractState { ignore_mime: false };
|
||||||
|
|
||||||
|
// Pre-collect schema fields for the --schema mode
|
||||||
|
let schema_fields: Vec<(String, Vec<ObjectPath>)> = if self.schema {
|
||||||
|
ds.config
|
||||||
|
.schema
|
||||||
|
.iter()
|
||||||
|
.map(|(name, spec)| (name.to_string(), spec.path.clone()))
|
||||||
|
.collect()
|
||||||
|
} else {
|
||||||
|
Vec::new()
|
||||||
|
};
|
||||||
|
|
||||||
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
||||||
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
||||||
}) {
|
}) {
|
||||||
@@ -98,19 +114,50 @@ impl CliCmd for FieldsCommand {
|
|||||||
let item = item.clone();
|
let item = item.clone();
|
||||||
let name = name.clone();
|
let name = name.clone();
|
||||||
let state = state.clone();
|
let state = state.clone();
|
||||||
join_set.spawn(async move {
|
if self.schema {
|
||||||
let item = PileValue::Item(item);
|
let schema_fields = schema_fields.clone();
|
||||||
let result = item.count_fields(&state).await.with_context(|| {
|
join_set.spawn(async move {
|
||||||
format!("while counting fields in source {name}")
|
let pv = PileValue::Item(item);
|
||||||
})?;
|
let mut counts = Map::new();
|
||||||
Ok(result.and_then(|v| {
|
for (field_name, paths) in &schema_fields {
|
||||||
if let Value::Object(m) = v {
|
let mut present = false;
|
||||||
Some(m)
|
for path in paths {
|
||||||
} else {
|
let v =
|
||||||
None
|
pv.query(&state, path).await.with_context(|| {
|
||||||
|
format!(
|
||||||
|
"while extracting field {field_name} in source {name}"
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
if let Some(v) = v
|
||||||
|
&& !matches!(v, PileValue::Null)
|
||||||
|
{
|
||||||
|
present = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
counts.insert(
|
||||||
|
field_name.clone(),
|
||||||
|
Value::Number((present as u64).into()),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}))
|
Ok(Some(counts))
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
join_set.spawn(async move {
|
||||||
|
let item = PileValue::Item(item);
|
||||||
|
let result =
|
||||||
|
item.count_fields(&state).await.with_context(|| {
|
||||||
|
format!("while counting fields in source {name}")
|
||||||
|
})?;
|
||||||
|
Ok(result.and_then(|v| {
|
||||||
|
if let Value::Object(m) = v {
|
||||||
|
Some(m)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ pub struct ItemCommand {
|
|||||||
|
|
||||||
/// If present, print the schema fields instead of item data
|
/// If present, print the schema fields instead of item data
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
fields: bool,
|
schema: bool,
|
||||||
|
|
||||||
#[arg(long, short = 'x')]
|
#[arg(long, short = 'x')]
|
||||||
exclude: Vec<String>,
|
exclude: Vec<String>,
|
||||||
@@ -58,7 +58,7 @@ impl CliCmd for ItemCommand {
|
|||||||
})?;
|
})?;
|
||||||
let pv = PileValue::Item(item);
|
let pv = PileValue::Item(item);
|
||||||
|
|
||||||
if self.fields {
|
if self.schema {
|
||||||
let mut map = serde_json::Map::new();
|
let mut map = serde_json::Map::new();
|
||||||
for (name, spec) in &ds.config.schema {
|
for (name, spec) in &ds.config.schema {
|
||||||
if self.exclude.contains(&name.to_string()) {
|
if self.exclude.contains(&name.to_string()) {
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
|
use pile_config::Label;
|
||||||
use pile_config::objectpath::ObjectPath;
|
use pile_config::objectpath::ObjectPath;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
@@ -12,7 +13,7 @@ use crate::{CliCmd, GlobalContext};
|
|||||||
|
|
||||||
#[derive(Debug, Args)]
|
#[derive(Debug, Args)]
|
||||||
pub struct ListCommand {
|
pub struct ListCommand {
|
||||||
/// Path to query, e.g. $.flac.artist
|
/// Path to query, e.g. $.flac.artist (or schema field name when --schema is set)
|
||||||
#[clap(default_value = "$")]
|
#[clap(default_value = "$")]
|
||||||
path: String,
|
path: String,
|
||||||
|
|
||||||
@@ -20,6 +21,10 @@ pub struct ListCommand {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
invert: bool,
|
invert: bool,
|
||||||
|
|
||||||
|
/// Treat path as a schema field name and resolve via schema paths
|
||||||
|
#[arg(long)]
|
||||||
|
schema: bool,
|
||||||
|
|
||||||
/// Path to dataset config
|
/// Path to dataset config
|
||||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||||
config: PathBuf,
|
config: PathBuf,
|
||||||
@@ -44,14 +49,24 @@ impl CliCmd for ListCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let path = ObjectPath::from_str(&self.path)
|
|
||||||
.with_context(|| format!("invalid path {:?}", self.path))?;
|
|
||||||
let path = Arc::new(path);
|
|
||||||
|
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let ds = Datasets::open(&self.config, &self.workdir)
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
|
// Resolve path arg: either schema field paths or a single ObjectPath
|
||||||
|
let schema_paths: Arc<Vec<ObjectPath>> = if self.schema {
|
||||||
|
let label = Label::new(&self.path)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("invalid schema field name {:?}", self.path))?;
|
||||||
|
let spec = ds.config.schema.get(&label).ok_or_else(|| {
|
||||||
|
anyhow::anyhow!("schema field {:?} not found in config", self.path)
|
||||||
|
})?;
|
||||||
|
Arc::new(spec.path.clone())
|
||||||
|
} else {
|
||||||
|
let path = ObjectPath::from_str(&self.path)
|
||||||
|
.with_context(|| format!("invalid path {:?}", self.path))?;
|
||||||
|
Arc::new(vec![path])
|
||||||
|
};
|
||||||
|
|
||||||
let jobs = self.jobs.max(1);
|
let jobs = self.jobs.max(1);
|
||||||
let state = ExtractState { ignore_mime: false };
|
let state = ExtractState { ignore_mime: false };
|
||||||
|
|
||||||
@@ -81,16 +96,20 @@ impl CliCmd for ListCommand {
|
|||||||
let item = item.clone();
|
let item = item.clone();
|
||||||
let source_name = name.to_string();
|
let source_name = name.to_string();
|
||||||
let key = item.key().to_string();
|
let key = item.key().to_string();
|
||||||
let path = path.clone();
|
let schema_paths = schema_paths.clone();
|
||||||
let invert = self.invert;
|
let invert = self.invert;
|
||||||
let state = state.clone();
|
let state = state.clone();
|
||||||
|
|
||||||
join_set.spawn(async move {
|
join_set.spawn(async move {
|
||||||
let item = PileValue::Item(item);
|
let pv = PileValue::Item(item);
|
||||||
let value = item.query(&state, &path).await?;
|
let mut is_present = false;
|
||||||
|
for path in schema_paths.as_ref() {
|
||||||
let is_present =
|
let value = pv.query(&state, path).await?;
|
||||||
matches!(value, Some(v) if !matches!(v, PileValue::Null));
|
if matches!(value, Some(v) if !matches!(v, PileValue::Null)) {
|
||||||
|
is_present = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let should_print = if invert { !is_present } else { is_present };
|
let should_print = if invert { !is_present } else { is_present };
|
||||||
|
|
||||||
|
|||||||
@@ -57,8 +57,7 @@ impl CliCmd for ServeCommand {
|
|||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let app = Arc::new(ds)
|
let app = pile_serve::router(Arc::new(ds), true)
|
||||||
.router(true)
|
|
||||||
.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
||||||
|
|
||||||
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
||||||
|
|||||||
@@ -8,10 +8,11 @@ use axum::{
|
|||||||
routing::get,
|
routing::get,
|
||||||
};
|
};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::{DatasetError, Datasets};
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
|
use pile_value::extract::traits::ExtractState;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::{fmt::Debug, path::PathBuf, sync::Arc};
|
use std::{fmt::Debug, path::PathBuf, sync::Arc, time::Duration};
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
use utoipa::{OpenApi, ToSchema};
|
use utoipa::{OpenApi, ToSchema};
|
||||||
use utoipa_swagger_ui::SwaggerUi;
|
use utoipa_swagger_ui::SwaggerUi;
|
||||||
@@ -27,6 +28,18 @@ pub struct ServerCommand {
|
|||||||
/// If provided, do not serve docs
|
/// If provided, do not serve docs
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
no_docs: bool,
|
no_docs: bool,
|
||||||
|
|
||||||
|
/// If provided, never auto-refresh indices
|
||||||
|
#[arg(long)]
|
||||||
|
no_refresh: bool,
|
||||||
|
|
||||||
|
/// Number of threads to use to refresh indices
|
||||||
|
#[arg(long, default_value = "5")]
|
||||||
|
refresh_jobs: usize,
|
||||||
|
|
||||||
|
/// Refresh indices every `n` seconds
|
||||||
|
#[arg(long, default_value = "300")]
|
||||||
|
refresh_delay: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for ServerCommand {
|
impl CliCmd for ServerCommand {
|
||||||
@@ -47,12 +60,57 @@ impl CliCmd for ServerCommand {
|
|||||||
Arc::new(datasets)
|
Arc::new(datasets)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Start auto-refresh task
|
||||||
|
if !self.no_refresh {
|
||||||
|
let datasets = datasets.clone();
|
||||||
|
let jobs = self.refresh_jobs.max(1);
|
||||||
|
let delay = self.refresh_delay.max(1);
|
||||||
|
|
||||||
|
async fn refresh_dataset(ds: &Datasets, jobs: usize) -> Result<(), DatasetError> {
|
||||||
|
if ds.needs_fts().await? {
|
||||||
|
let state = ExtractState { ignore_mime: false };
|
||||||
|
match ds.fts_refresh(&state, jobs, None).await {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(CancelableTaskError::Error(err)) => return Err(err),
|
||||||
|
Err(CancelableTaskError::Cancelled) => unreachable!(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::task::spawn(async move {
|
||||||
|
loop {
|
||||||
|
for ds in datasets.iter() {
|
||||||
|
match refresh_dataset(ds, jobs).await {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(error) => {
|
||||||
|
error!(
|
||||||
|
message = "Error while refreshing dataset",
|
||||||
|
dataset = ds.config.dataset.name.as_str(),
|
||||||
|
?error
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(Duration::from_secs(10)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(Duration::from_secs(delay as u64)).await;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let bearer = BearerToken(ctx.config.api_token.clone().map(Arc::new));
|
let bearer = BearerToken(ctx.config.api_token.clone().map(Arc::new));
|
||||||
|
|
||||||
let mut router = Router::new();
|
let mut router = Router::new();
|
||||||
for d in datasets.iter() {
|
for d in datasets.iter() {
|
||||||
let prefix = format!("/{}", d.config.dataset.name);
|
let prefix = format!("/{}", d.config.dataset.name);
|
||||||
router = router.merge(d.clone().router_prefix(!self.no_docs, Some(&prefix)))
|
router = router.merge(pile_serve::router_prefix(
|
||||||
|
d.clone(),
|
||||||
|
!self.no_docs,
|
||||||
|
Some(&prefix),
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
router = router.merge(
|
router = router.merge(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::{num::NonZeroUsize, path::PathBuf};
|
use std::{num::NonZeroUsize, path::PathBuf};
|
||||||
use tracing::info;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::config::{
|
use crate::config::{
|
||||||
env::load_env,
|
env::load_env,
|
||||||
@@ -89,7 +89,7 @@ impl PileServerConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(message = "Config loaded", ?config);
|
debug!(message = "Config loaded", ?config);
|
||||||
|
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user