Compare commits: ec7326a55e...main (17 commits)

450ea7aa86
3bc66ddc48
251d130987
0281a33f86
d3ab2684f4
4d4e9c93a2
e6e340d082
8b4dfb1a1a
60dc755561
5527b61d39
9967e066bb
336480469c
5807733e62
256af68382
fac300431a
47a0adbaff
80f4ebdbe6
Cargo.lock (generated, 1095 changes): diff suppressed because it is too large.

Cargo.toml (24 changes)
@@ -71,8 +71,9 @@ pile-dataset = { path = "crates/pile-dataset" }
pile-value = { path = "crates/pile-value" }
pile-io = { path = "crates/pile-io" }
pile-client = { path = "crates/pile-client" }
pile-serve = { path = "crates/pile-serve" }

# Clients & servers
# MARK: Clients & servers
tantivy = "0.25.0"
servable = { version = "0.0.7", features = ["image"] }
axum = { version = "0.8.8", features = ["macros", "multipart"] }
@@ -87,15 +88,15 @@ utoipa-swagger-ui = { version = "9.0.2", features = [
"debug-embed",
"vendored",
] }
reqwest = { version = "0.12", features = ["blocking", "json", "stream"] }
tracing-loki = "0.2.6"

# Async & Parallelism
# MARK: Async & Parallelism
tokio = { version = "1.49.0", features = ["full"] }
tokio-stream = "0.1"
async-trait = "0.1"
aws-sdk-s3 = "1"
aws-config = "1"

# CLI & logging
# MARK: CLI & logging
tracing = "0.1.44"
tracing-subscriber = { version = "0.3.22", features = ["env-filter", "json"] }
indicatif = { version = "0.18.4", features = ["improved_unicode"] }
@@ -103,7 +104,7 @@ tracing-indicatif = "0.3.14"
anstyle = "1.0.13"
clap = { version = "4.5.60", features = ["derive"] }

# Serialization & formats
# MARK: Serialization & formats
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.149"
base64 = "0.22.1"
@@ -111,11 +112,13 @@ bytes = "1"
toml = "1.0.3"
toml_edit = "0.25.4"
sha2 = "0.11.0-rc.5"
sha1 = "0.10"
md5 = "0.7"
blake3 = "1.8.3"
chacha20poly1305 = "0.10.0"
binrw = "0.15.1"
dotenvy = "0.15.7"
envy = "0.4.2"

# Extractors
# MARK: Extractors
pdf = "0.10.0"
id3 = "1.16.4"
epub = "1.2.2"
@@ -123,7 +126,7 @@ kamadak-exif = "0.6.1"
pdfium-render = "0.8"
image = { version = "0.25", default-features = false, features = ["png"] }

# Misc helpers
# MARK: Misc helpers
thiserror = "2.0.18"
anyhow = "1.0.102"
itertools = "0.14.0"
@@ -139,3 +142,4 @@ chrono = "0.4.43"
parking_lot = "0.12.5"
rayon = "1.11.0"
percent-encoding = "2"
url = { version = "2.5.8", features = ["serde"] }
@@ -29,6 +29,7 @@ RUN apt-get update && \

COPY --from=build \
/app/rust/target/release/pile \
/app/rust/target/release/libpdfium.so \
/app/bin/

ENV PATH="/app/bin:$PATH"
@@ -8,10 +8,9 @@ edition = { workspace = true }
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
pile-dataset = { workspace = true, features = ["axum"] }
|
||||
pile-serve = { workspace = true }
|
||||
|
||||
reqwest = { version = "0.12", features = ["json", "stream"] }
|
||||
futures-core = "0.3"
|
||||
reqwest = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
bytes = { workspace = true }
|
||||
|
||||
@@ -3,14 +3,15 @@ use axum::{
|
||||
routing::any,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use futures_core::Stream;
|
||||
use reqwest::{Client, StatusCode, header};
|
||||
use serde::Deserialize;
|
||||
use std::pin::Pin;
|
||||
use thiserror::Error;
|
||||
use tracing::{trace, warn};
|
||||
|
||||
pub use pile_dataset::serve::{ItemsResponse, LookupRequest, LookupResponse};
|
||||
pub use pile_serve::{
|
||||
ApiValue, FieldSpec, FieldsResponse, ItemsResponse, LookupRequest, LookupResponse,
|
||||
SchemaResponse,
|
||||
};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ClientError {
|
||||
@@ -117,26 +118,6 @@ impl DatasetClient {
|
||||
check_status(resp).await?.json().await.map_err(Into::into)
|
||||
}
|
||||
|
||||
/// `GET /item` — stream the raw bytes of an item.
|
||||
///
|
||||
/// The returned stream yields chunks as they arrive from the server.
|
||||
pub async fn get_item(
|
||||
&self,
|
||||
source: &str,
|
||||
key: &str,
|
||||
) -> Result<Pin<Box<dyn Stream<Item = Result<Bytes, reqwest::Error>> + Send>>, ClientError> {
|
||||
let url = format!("{}/item", self.base_url);
|
||||
trace!(url, source, key, "GET /item");
|
||||
let resp = self
|
||||
.client
|
||||
.get(url)
|
||||
.query(&[("source", source), ("key", key)])
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
Ok(Box::pin(check_status(resp).await?.bytes_stream()))
|
||||
}
|
||||
|
||||
/// `GET /extract` — extract a field from an item by object path (e.g. `$.flac.title`).
|
||||
pub async fn get_extract(
|
||||
&self,
|
||||
@@ -167,19 +148,19 @@ impl DatasetClient {
|
||||
Ok(FieldResponse { content_type, data })
|
||||
}
|
||||
|
||||
/// `GET /field` — get a field from an item's schema
|
||||
pub async fn get_field(
|
||||
/// `GET /schema/{field}` — get a single schema field value from an item.
|
||||
pub async fn schema_field(
|
||||
&self,
|
||||
source: &str,
|
||||
key: &str,
|
||||
field: &str,
|
||||
) -> Result<FieldResponse, ClientError> {
|
||||
let url = format!("{}/field", self.base_url);
|
||||
trace!(url, source, key, field, "GET /field");
|
||||
let url = format!("{}/schema/{field}", self.base_url);
|
||||
trace!(url, source, key, field, "GET /schema/{field}");
|
||||
let resp = self
|
||||
.client
|
||||
.get(url)
|
||||
.query(&[("source", source), ("key", key), ("field", field)])
|
||||
.query(&[("source", source), ("key", key)])
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
@@ -197,6 +178,33 @@ impl DatasetClient {
|
||||
Ok(FieldResponse { content_type, data })
|
||||
}
|
||||
|
||||
/// `GET /schema` — get all schema field values for a single item.
|
||||
pub async fn schema(
|
||||
&self,
|
||||
source: &str,
|
||||
key: &str,
|
||||
hidden: bool,
|
||||
) -> Result<SchemaResponse, ClientError> {
|
||||
let url = format!("{}/schema", self.base_url);
|
||||
trace!(url, source, key, hidden, "GET /schema");
|
||||
let resp = self
|
||||
.client
|
||||
.get(url)
|
||||
.query(&[("source", source), ("key", key)])
|
||||
.query(&[("hidden", hidden)])
|
||||
.send()
|
||||
.await?;
|
||||
check_status(resp).await?.json().await.map_err(Into::into)
|
||||
}
|
||||
|
||||
/// `GET /config/schema` — retrieve this dataset's schema spec.
|
||||
pub async fn config_schema(&self) -> Result<FieldsResponse, ClientError> {
|
||||
let url = format!("{}/config/schema", self.base_url);
|
||||
trace!(url, "GET /config/schema");
|
||||
let resp = self.client.get(url).send().await?;
|
||||
check_status(resp).await?.json().await.map_err(Into::into)
|
||||
}
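For orientation, here is a hedged usage sketch of the reshaped client API; it is not part of this diff, assumes an already-constructed `DatasetClient` and a running server, and uses made-up source, key, and field values:

```rust
// Sketch only: the source/key/field values below are assumptions.
async fn demo(client: &DatasetClient) -> Result<(), ClientError> {
    // All schema field values for one item, including hidden fields.
    let _schema = client.schema("music", "albums/track01", true).await?;

    // A single field via the new `GET /schema/{field}` route.
    let title = client.schema_field("music", "albums/track01", "title").await?;
    println!("field 'title' is {} bytes", title.data.len());

    // The dataset's schema spec (field name -> FieldSpec).
    let spec = client.config_schema().await?;
    println!("{} fields declared", spec.len());

    Ok(())
}
```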
|
||||
|
||||
/// `GET /items` — paginate over all items in this dataset, ordered by (source, key).
|
||||
pub async fn list_items(
|
||||
&self,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use serde::Deserialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
|
||||
|
||||
use crate::{objectpath::ObjectPath, pattern::GroupPattern};
|
||||
use crate::objectpath::ObjectPath;
|
||||
|
||||
mod misc;
|
||||
pub use misc::*;
|
||||
@@ -15,6 +15,15 @@ fn default_true() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
pub fn default_base() -> String {
|
||||
"(.*)".to_owned()
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn default_files() -> HashMap<Label, String> {
|
||||
[(Label::new("item").unwrap(), "{base}".to_owned())].into()
|
||||
}
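As a quick illustration (a sketch that is not part of this diff, assuming it sits in the same module), the defaults above mean a bare filesystem source treats every file as its own single-file item:

```rust
#[test]
#[expect(clippy::unwrap_used)]
fn default_source_mapping_sketch() {
    // `(.*)` captures the whole relative path as the item key...
    assert_eq!(default_base(), "(.*)");

    // ...and each item holds one file, labelled "item", at `{base}` itself.
    let files = default_files();
    assert_eq!(files.len(), 1);
    assert_eq!(files[&Label::new("item").unwrap()], "{base}");
}
```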
|
||||
|
||||
#[test]
|
||||
#[expect(clippy::expect_used)]
|
||||
fn init_db_toml_valid() {
|
||||
@@ -33,19 +42,10 @@ pub struct DatasetConfig {
|
||||
/// Must be unique
|
||||
pub name: Label,
|
||||
|
||||
/// Root dir for indices
|
||||
pub working_dir: Option<PathBuf>,
|
||||
|
||||
/// Where to find this field
|
||||
pub source: HashMap<Label, Source>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct S3Credentials {
|
||||
pub access_key_id: String,
|
||||
pub secret_access_key: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
@@ -60,33 +60,17 @@ pub enum Source {
|
||||
/// Must be relative.
|
||||
path: PathBuf,
|
||||
|
||||
/// How to group files into items in this source
|
||||
#[serde(default)]
|
||||
pattern: GroupPattern,
|
||||
},
|
||||
/// Regex that extracts an item key from a file path.
|
||||
/// - File paths are relative to `path`.
|
||||
/// - The first group in this regex is the file's item key.
|
||||
#[serde(default = "default_base")]
|
||||
base_pattern: String,
|
||||
|
||||
/// An S3-compatible object store bucket
|
||||
S3 {
|
||||
/// If false, ignore this dataset
|
||||
#[serde(default = "default_true")]
|
||||
enabled: bool,
|
||||
|
||||
bucket: String,
|
||||
prefix: Option<String>,
|
||||
|
||||
/// Custom endpoint URL (for MinIO, etc.)
|
||||
endpoint: Option<String>,
|
||||
|
||||
region: String,
|
||||
|
||||
credentials: S3Credentials,
|
||||
|
||||
/// How to group files into items in this source
|
||||
#[serde(default)]
|
||||
pattern: GroupPattern,
|
||||
|
||||
/// If provided, assume objects are encrypted with this secret key.
|
||||
encryption_key: Option<String>,
|
||||
/// Map of files included in each item.
/// `{base}` is replaced with the string extracted by `base_pattern`.
/// Default is `{ item: "{base}" }`.
#[serde(default = "default_files")]
files: HashMap<Label, String>,
|
||||
},
|
||||
}
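For concreteness, a hedged sketch of how `base_pattern` and `files` combine; the pattern and file names below are illustrative only and do not appear in this diff:

```rust
use regex::Regex;

fn main() {
    // One explicit capture group: the part of the path that identifies the item.
    let base_regex = Regex::new(r"(.*)\.flac").unwrap();
    let caps = base_regex.captures("albums/track01.flac").unwrap();
    let base = &caps[1]; // "albums/track01"

    // With files = { item = "{base}.flac", cover = "{base}.jpg" }, this item
    // would group the audio file together with its cover art.
    assert_eq!(format!("{base}.flac"), "albums/track01.flac");
    assert_eq!(format!("{base}.jpg"), "albums/track01.jpg");
}
```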
|
||||
|
||||
@@ -94,8 +78,14 @@ pub enum Source {
|
||||
// MARK: schema
|
||||
//
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct FieldSpec {
|
||||
/// If true, do not display this field.
/// This attribute has no effect on pile; it
/// is intended for consumers of the data.
#[serde(default)]
pub hidden: bool,
|
||||
|
||||
/// How to find this field in a data entry
|
||||
pub path: Vec<ObjectPath>,
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@ workspace = true
|
||||
pile-config = { workspace = true }
|
||||
pile-toolbox = { workspace = true }
|
||||
pile-value = { workspace = true }
|
||||
pile-io = { workspace = true }
|
||||
|
||||
regex = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
tantivy = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
@@ -20,21 +20,7 @@ chrono = { workspace = true }
|
||||
toml = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tokio-stream = { workspace = true }
|
||||
|
||||
serde = { workspace = true, optional = true }
|
||||
axum = { workspace = true, optional = true }
|
||||
percent-encoding = { workspace = true, optional = true }
|
||||
utoipa = { workspace = true, optional = true }
|
||||
utoipa-swagger-ui = { workspace = true, optional = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
pdfium = ["pile-value/pdfium"]
|
||||
axum = [
|
||||
"dep:axum",
|
||||
"dep:utoipa",
|
||||
"dep:utoipa-swagger-ui",
|
||||
"dep:serde",
|
||||
"dep:percent-encoding",
|
||||
]
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{
|
||||
ConfigToml, DatasetConfig, Label, Source, objectpath::ObjectPath, pattern::GroupPattern,
|
||||
ConfigToml, DatasetConfig, Label, Source, default_base, default_files, objectpath::ObjectPath,
|
||||
};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use pile_value::{
|
||||
extract::traits::ExtractState,
|
||||
source::{DataSource, DirDataSource, S3DataSource, misc::path_ts_earliest, string_to_key},
|
||||
source::{DataSource, DirDataSource, misc::path_ts_earliest},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
use regex::Regex;
|
||||
use serde_json::Value;
|
||||
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
||||
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
||||
@@ -33,31 +34,27 @@ pub enum DatasetError {
|
||||
// MARK: Dataset enum
|
||||
//
|
||||
|
||||
/// An opened data source — either a local filesystem directory or an S3 bucket.
|
||||
/// An opened data source
|
||||
pub enum Dataset {
|
||||
Dir(Arc<DirDataSource>),
|
||||
S3(Arc<S3DataSource>),
|
||||
}
|
||||
|
||||
impl Dataset {
|
||||
pub fn len(&self) -> usize {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.len(),
|
||||
Self::S3(ds) => ds.len(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get(&self, key: &str) -> Option<Item> {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
|
||||
Self::S3(ds) => ds.get(key).await.ok().flatten(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
|
||||
match self {
|
||||
Self::Dir(ds) => Box::new(ds.iter()),
|
||||
Self::S3(ds) => Box::new(ds.iter()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,14 +65,12 @@ impl Dataset {
|
||||
) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
|
||||
match self {
|
||||
Self::Dir(ds) => Box::new(ds.iter_page(offset, limit)),
|
||||
Self::S3(ds) => Box::new(ds.iter_page(offset, limit)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
match self {
|
||||
Self::Dir(ds) => ds.latest_change().await,
|
||||
Self::S3(ds) => ds.latest_change().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -108,14 +103,13 @@ impl Datasets {
|
||||
let config = ConfigToml {
|
||||
dataset: DatasetConfig {
|
||||
name: Label::new("virtual-dataset").unwrap(),
|
||||
working_dir: None,
|
||||
|
||||
source: [(
|
||||
Self::virt_source(),
|
||||
Source::Filesystem {
|
||||
enabled: true,
|
||||
path: path_parent.clone(),
|
||||
pattern: GroupPattern::default(),
|
||||
base_pattern: default_base(),
|
||||
files: default_files(),
|
||||
},
|
||||
)]
|
||||
.into_iter()
|
||||
@@ -133,61 +127,40 @@ impl Datasets {
|
||||
Source::Filesystem {
|
||||
enabled,
|
||||
path,
|
||||
pattern,
|
||||
base_pattern,
|
||||
files,
|
||||
} => {
|
||||
let target = match enabled {
|
||||
true => &mut sources,
|
||||
false => &mut disabled_sources,
|
||||
};
|
||||
|
||||
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
||||
std::io::Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
format!("invalid base_pattern: {e}"),
|
||||
)
|
||||
})?;
|
||||
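// Note: `Regex::captures_len()` includes the implicit whole-match group, so a
// value of 2 means the pattern has exactly one explicit capture group.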
if base_regex.captures_len() != 2 {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"base_pattern must have exactly one capture group",
|
||||
));
|
||||
}
|
||||
|
||||
target.insert(
|
||||
label.clone(),
|
||||
Dataset::Dir(
|
||||
DirDataSource::new(label, path_parent.join(path), pattern.clone())
|
||||
.await?,
|
||||
DirDataSource::new(
|
||||
label,
|
||||
path_parent.join(path),
|
||||
base_regex,
|
||||
files.clone(),
|
||||
)
|
||||
.await?,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
Source::S3 {
|
||||
enabled,
|
||||
bucket,
|
||||
prefix,
|
||||
endpoint,
|
||||
region,
|
||||
credentials,
|
||||
pattern,
|
||||
encryption_key,
|
||||
} => {
|
||||
let target = match enabled {
|
||||
true => &mut sources,
|
||||
false => &mut disabled_sources,
|
||||
};
|
||||
|
||||
let encryption_key = encryption_key.as_ref().map(|x| string_to_key(x));
|
||||
|
||||
match S3DataSource::new(
|
||||
label,
|
||||
bucket,
|
||||
prefix.as_ref().map(|x| x.as_str()),
|
||||
endpoint.as_ref().map(|x| x.as_str()),
|
||||
region,
|
||||
&credentials.access_key_id,
|
||||
&credentials.secret_access_key,
|
||||
10_000_000,
|
||||
pattern.clone(),
|
||||
encryption_key,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(ds) => {
|
||||
target.insert(label.clone(), Dataset::S3(ds));
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Could not open S3 source {label}: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,7 +174,10 @@ impl Datasets {
|
||||
});
|
||||
}
|
||||
|
||||
pub async fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
|
||||
pub async fn open(
|
||||
config: impl Into<PathBuf>,
|
||||
working_dir_root: impl Into<PathBuf>,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let path_config = config.into();
|
||||
let path_parent = path_config
|
||||
.parent()
|
||||
@@ -230,12 +206,7 @@ impl Datasets {
|
||||
}
|
||||
};
|
||||
|
||||
let path_workdir = config
|
||||
.dataset
|
||||
.working_dir
|
||||
.clone()
|
||||
.unwrap_or(path_parent.join(".pile"))
|
||||
.join(config.dataset.name.as_str());
|
||||
let path_workdir = working_dir_root.into().join(config.dataset.name.as_str());
|
||||
|
||||
let mut sources = HashMap::new();
|
||||
let mut disabled_sources = HashMap::new();
|
||||
@@ -244,61 +215,40 @@ impl Datasets {
|
||||
Source::Filesystem {
|
||||
enabled,
|
||||
path,
|
||||
pattern,
|
||||
base_pattern,
|
||||
files,
|
||||
} => {
|
||||
let target = match enabled {
|
||||
true => &mut sources,
|
||||
false => &mut disabled_sources,
|
||||
};
|
||||
|
||||
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
||||
std::io::Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
format!("invalid base_pattern: {e}"),
|
||||
)
|
||||
})?;
|
||||
if base_regex.captures_len() != 2 {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"base_pattern must have exactly one capture group",
|
||||
));
|
||||
}
|
||||
|
||||
target.insert(
|
||||
label.clone(),
|
||||
Dataset::Dir(
|
||||
DirDataSource::new(label, path_parent.join(path), pattern.clone())
|
||||
.await?,
|
||||
DirDataSource::new(
|
||||
label,
|
||||
path_parent.join(path),
|
||||
base_regex,
|
||||
files.clone(),
|
||||
)
|
||||
.await?,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
Source::S3 {
|
||||
enabled,
|
||||
bucket,
|
||||
prefix,
|
||||
endpoint,
|
||||
region,
|
||||
credentials,
|
||||
pattern,
|
||||
encryption_key,
|
||||
} => {
|
||||
let target = match enabled {
|
||||
true => &mut sources,
|
||||
false => &mut disabled_sources,
|
||||
};
|
||||
|
||||
let encryption_key = encryption_key.as_ref().map(|x| string_to_key(x));
|
||||
|
||||
match S3DataSource::new(
|
||||
label,
|
||||
bucket,
|
||||
prefix.as_ref().map(|x| x.as_str()),
|
||||
endpoint.as_ref().map(|x| x.as_str()),
|
||||
region,
|
||||
&credentials.access_key_id,
|
||||
&credentials.secret_access_key,
|
||||
10_000_000,
|
||||
pattern.clone(),
|
||||
encryption_key,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(ds) => {
|
||||
target.insert(label.clone(), Dataset::S3(ds));
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Could not open S3 source {label}: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -352,6 +302,7 @@ impl Datasets {
|
||||
_threads: usize,
|
||||
flag: Option<CancelFlag>,
|
||||
) -> Result<(), CancelableTaskError<DatasetError>> {
|
||||
let start = Instant::now();
|
||||
let workdir = match self.path_workdir.as_ref() {
|
||||
Some(x) => x,
|
||||
None => {
|
||||
@@ -363,6 +314,14 @@ impl Datasets {
|
||||
let fts_tmp_dir = workdir.join(".tmp-fts");
|
||||
let fts_dir = workdir.join("fts");
|
||||
|
||||
debug!(
|
||||
message = "Rebuilding fts index",
|
||||
dataset = self.config.dataset.name.as_str(),
|
||||
?fts_dir,
|
||||
?fts_tmp_dir,
|
||||
?workdir
|
||||
);
|
||||
|
||||
if fts_tmp_dir.is_dir() {
|
||||
warn!("Removing temporary index in {}", fts_dir.display());
|
||||
std::fs::remove_dir_all(&fts_tmp_dir).map_err(DatasetError::from)?;
|
||||
@@ -442,9 +401,18 @@ impl Datasets {
|
||||
return Err(CancelableTaskError::Cancelled);
|
||||
}
|
||||
|
||||
info!("Committing {total} documents");
|
||||
index_writer.commit().map_err(DatasetError::from)?;
|
||||
|
||||
debug!(
|
||||
message = "Rebuilt fts index",
|
||||
dataset = self.config.dataset.name.as_str(),
|
||||
?fts_dir,
|
||||
?fts_tmp_dir,
|
||||
?workdir,
|
||||
n_docs = total,
|
||||
time_ms = start.elapsed().as_millis()
|
||||
);
|
||||
|
||||
if fts_dir.is_dir() {
|
||||
warn!("Removing existing index in {}", fts_dir.display());
|
||||
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
|
||||
|
||||
@@ -245,7 +245,7 @@ async fn val_to_string(
|
||||
PileValue::Null => {}
|
||||
PileValue::ObjectExtractor(_) => {}
|
||||
PileValue::Item(_) => {}
|
||||
PileValue::Blob { .. } => {}
|
||||
PileValue::Binary(_) => {}
|
||||
}
|
||||
|
||||
return Ok(Vec::new());
|
||||
|
||||
@@ -2,6 +2,3 @@ mod dataset;
|
||||
pub use dataset::{Dataset, DatasetError, Datasets};
|
||||
|
||||
pub mod index;
|
||||
|
||||
#[cfg(feature = "axum")]
|
||||
pub mod serve;
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
use axum::{
|
||||
body::Body,
|
||||
extract::{Query, State},
|
||||
http::{HeaderMap, StatusCode, header},
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
||||
use serde::Deserialize;
|
||||
use std::{io::SeekFrom, sync::Arc, time::Instant};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tracing::debug;
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
#[derive(Deserialize, ToSchema)]
|
||||
pub struct ItemQuery {
|
||||
source: String,
|
||||
key: String,
|
||||
|
||||
#[serde(default)]
|
||||
download: bool,
|
||||
name: Option<String>,
|
||||
}
|
||||
|
||||
/// Parse a `Range: bytes=...` header value.
|
||||
/// Returns `(start, end)` where either may be `None` (suffix form has `None` start).
|
||||
fn parse_byte_range(s: &str) -> Option<(Option<u64>, Option<u64>)> {
|
||||
let spec = s.strip_prefix("bytes=")?;
|
||||
if spec.contains(',') {
|
||||
return None; // multiple ranges not supported
|
||||
}
|
||||
if let Some(suffix) = spec.strip_prefix('-') {
|
||||
return Some((None, Some(suffix.parse().ok()?)));
|
||||
}
|
||||
let mut parts = spec.splitn(2, '-');
|
||||
let start: u64 = parts.next()?.parse().ok()?;
|
||||
let end = parts
|
||||
.next()
|
||||
.and_then(|e| if e.is_empty() { None } else { e.parse().ok() });
|
||||
Some((Some(start), end))
|
||||
}
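A hedged illustration of the forms this parser accepts, written as a test that is not part of this diff:

```rust
#[test]
fn parse_byte_range_forms() {
    assert_eq!(parse_byte_range("bytes=0-499"), Some((Some(0), Some(499))));
    assert_eq!(parse_byte_range("bytes=500-"), Some((Some(500), None))); // open-ended
    assert_eq!(parse_byte_range("bytes=-200"), Some((None, Some(200)))); // suffix form
    assert_eq!(parse_byte_range("bytes=0-1,5-9"), None); // multiple ranges unsupported
    assert_eq!(parse_byte_range("items=0-1"), None); // wrong unit
}
```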
|
||||
|
||||
/// Fetch the raw bytes of an item by source and key
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/item",
|
||||
params(
|
||||
("source" = String, Query, description = "Source label"),
|
||||
("key" = String, Query, description = "Item key"),
|
||||
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
|
||||
),
|
||||
responses(
|
||||
(status = 200, description = "Raw item bytes"),
|
||||
(status = 206, description = "Partial content"),
|
||||
(status = 400, description = "Invalid source label"),
|
||||
(status = 404, description = "Item not found"),
|
||||
(status = 416, description = "Range not satisfiable"),
|
||||
(status = 500, description = "Internal server error"),
|
||||
)
|
||||
)]
|
||||
pub async fn item_get(
|
||||
State(state): State<Arc<Datasets>>,
|
||||
Query(params): Query<ItemQuery>,
|
||||
headers: HeaderMap,
|
||||
) -> Response {
|
||||
let start = Instant::now();
|
||||
debug!(
|
||||
message = "Serving /item",
|
||||
source = params.source,
|
||||
key = params.key
|
||||
);
|
||||
|
||||
let label = match Label::try_from(params.source.clone()) {
|
||||
Ok(l) => l,
|
||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
||||
};
|
||||
|
||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
||||
return StatusCode::NOT_FOUND.into_response();
|
||||
};
|
||||
|
||||
let mime = item.mime().to_string();
|
||||
|
||||
let mut reader = match item.read().await {
|
||||
Ok(r) => r,
|
||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
};
|
||||
|
||||
let total = match reader.seek(SeekFrom::End(0)).await {
|
||||
Ok(n) => n,
|
||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
};
|
||||
|
||||
let range = headers
|
||||
.get(header::RANGE)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.and_then(parse_byte_range);
|
||||
|
||||
// Resolve (byte_start, byte_end, content_length, is_range)
|
||||
let (byte_start, byte_end, length, is_range) = match range {
|
||||
Some((Some(s), e)) => {
|
||||
let e = e
|
||||
.unwrap_or(total.saturating_sub(1))
|
||||
.min(total.saturating_sub(1));
|
||||
if s >= total || s > e {
|
||||
return (
|
||||
StatusCode::RANGE_NOT_SATISFIABLE,
|
||||
[(header::CONTENT_RANGE, format!("bytes */{total}"))],
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
(s, e, e - s + 1, true)
|
||||
}
|
||||
Some((None, Some(suffix))) => {
|
||||
let s = total.saturating_sub(suffix);
|
||||
let e = total.saturating_sub(1);
|
||||
(s, e, total.saturating_sub(s), true)
|
||||
}
|
||||
_ => (0, total.saturating_sub(1), total, false),
|
||||
};
|
||||
|
||||
if let Err(e) = reader.seek(SeekFrom::Start(byte_start)).await {
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
|
||||
}
|
||||
|
||||
debug!(
|
||||
message = "Served /item",
|
||||
source = params.source,
|
||||
key = params.key,
|
||||
time_ms = start.elapsed().as_millis()
|
||||
);
|
||||
|
||||
let (tx, rx) = mpsc::channel::<Result<Vec<u8>, std::io::Error>>(8);
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut buf = vec![0u8; 65536];
|
||||
let mut remaining = length;
|
||||
loop {
|
||||
if remaining == 0 {
|
||||
break;
|
||||
}
|
||||
let to_read = (buf.len() as u64).min(remaining) as usize;
|
||||
match reader.read(&mut buf[..to_read]).await {
|
||||
Ok(0) => break,
|
||||
Ok(n) => {
|
||||
remaining -= n as u64;
|
||||
if tx.send(Ok(buf[..n].to_vec())).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = tx.send(Err(e)).await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let body = Body::from_stream(ReceiverStream::new(rx));
|
||||
let status = if is_range {
|
||||
StatusCode::PARTIAL_CONTENT
|
||||
} else {
|
||||
StatusCode::OK
|
||||
};
|
||||
|
||||
let disposition_type = if params.download {
|
||||
"attachment"
|
||||
} else {
|
||||
"inline"
|
||||
};
|
||||
let file_name = params.name.unwrap_or_else(|| {
|
||||
params
|
||||
.key
|
||||
.rsplit('/')
|
||||
.next()
|
||||
.unwrap_or(¶ms.key)
|
||||
.to_owned()
|
||||
});
|
||||
let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
|
||||
|
||||
let mut builder = axum::http::Response::builder()
|
||||
.status(status)
|
||||
.header(header::CONTENT_TYPE, mime)
|
||||
.header(header::ACCEPT_RANGES, "bytes")
|
||||
.header(header::CONTENT_LENGTH, length)
|
||||
.header(header::CONTENT_DISPOSITION, disposition);
|
||||
|
||||
if is_range {
|
||||
builder = builder.header(
|
||||
header::CONTENT_RANGE,
|
||||
format!("bytes {byte_start}-{byte_end}/{total}"),
|
||||
);
|
||||
}
|
||||
|
||||
builder
|
||||
.body(body)
|
||||
.map(IntoResponse::into_response)
|
||||
.unwrap_or_else(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response())
|
||||
}
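To illustrate the range handling above from the client side, a hedged sketch (the URL, source, and key are made-up values, and this code is not part of the diff):

```rust
// Asks for the first KiB of an item; the handler above should reply with
// 206 Partial Content and a `Content-Range: bytes 0-1023/<total>` header.
async fn fetch_first_kib() -> Result<(), reqwest::Error> {
    let resp = reqwest::Client::new()
        .get("http://localhost:3000/item")
        .query(&[("source", "music"), ("key", "albums/track01")])
        .header("Range", "bytes=0-1023")
        .send()
        .await?;
    assert_eq!(resp.status(), reqwest::StatusCode::PARTIAL_CONTENT);
    assert!(resp.bytes().await?.len() <= 1024);
    Ok(())
}
```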
|
||||
@@ -1,77 +0,0 @@
|
||||
use axum::{
|
||||
Router,
|
||||
routing::{get, post},
|
||||
};
|
||||
use std::sync::Arc;
|
||||
use utoipa::OpenApi;
|
||||
use utoipa_swagger_ui::SwaggerUi;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
mod lookup;
|
||||
pub use lookup::*;
|
||||
|
||||
mod item;
|
||||
pub use item::*;
|
||||
|
||||
mod extract;
|
||||
pub use extract::*;
|
||||
|
||||
mod field;
|
||||
pub use field::*;
|
||||
|
||||
mod items;
|
||||
pub use items::*;
|
||||
|
||||
#[derive(OpenApi)]
|
||||
#[openapi(
|
||||
tags(),
|
||||
paths(lookup, item_get, get_extract, items_list, get_field),
|
||||
components(schemas(
|
||||
LookupRequest,
|
||||
LookupResponse,
|
||||
LookupResult,
|
||||
ItemQuery,
|
||||
ExtractQuery,
|
||||
FieldQuery,
|
||||
ItemsQuery,
|
||||
ItemsResponse,
|
||||
ItemRef
|
||||
))
|
||||
)]
|
||||
pub(crate) struct Api;
|
||||
|
||||
impl Datasets {
|
||||
#[inline]
|
||||
pub fn router(self: Arc<Self>, with_docs: bool) -> Router<()> {
|
||||
self.router_prefix(with_docs, None)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn router_prefix(self: Arc<Self>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
|
||||
let mut router = Router::new()
|
||||
.route("/lookup", post(lookup))
|
||||
.route("/item", get(item_get))
|
||||
.route("/extract", get(get_extract))
|
||||
.route("/field", get(get_field))
|
||||
.route("/items", get(items_list))
|
||||
.with_state(self.clone());
|
||||
|
||||
if let Some(prefix) = prefix {
|
||||
router = Router::new().nest(prefix, router);
|
||||
}
|
||||
|
||||
if with_docs {
|
||||
let docs_path = match prefix {
|
||||
None => "/docs".into(),
|
||||
Some(prefix) => format!("{prefix}/docs"),
|
||||
};
|
||||
|
||||
let docs = SwaggerUi::new(docs_path.clone())
|
||||
.url(format!("{}/openapi.json", docs_path), Api::openapi());
|
||||
|
||||
router = router.merge(docs);
|
||||
}
|
||||
router
|
||||
}
|
||||
}
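For context, a hedged sketch of mounting this router under a prefix with docs enabled; the bind address and the `datasets` handle are assumptions, not part of this diff:

```rust
async fn serve(datasets: std::sync::Arc<Datasets>) -> Result<(), std::io::Error> {
    // Everything nests under /api, with Swagger UI served at /api/docs.
    let app = datasets.router_prefix(true, Some("/api"));
    let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?;
    axum::serve(listener, app).await
}
```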
|
||||
@@ -9,7 +9,3 @@ workspace = true
|
||||
|
||||
[dependencies]
|
||||
tokio = { workspace = true }
|
||||
smartstring = { workspace = true }
|
||||
aws-sdk-s3 = { workspace = true }
|
||||
chacha20poly1305 = { workspace = true }
|
||||
binrw = { workspace = true }
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use binrw::{binrw, meta::ReadMagic};
|
||||
|
||||
#[binrw]
|
||||
#[brw(little, magic = b"PileChaChav1")]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ChaChaHeaderv1 {
|
||||
pub config: ChaChaConfigv1,
|
||||
pub plaintext_size: u64,
|
||||
}
|
||||
|
||||
impl ChaChaHeaderv1 {
|
||||
pub const SIZE: usize = ChaChaHeaderv1::MAGIC.len() + std::mem::size_of::<ChaChaConfigv1>() + 8;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chachaheader_size() {
|
||||
assert_eq!(
|
||||
ChaChaHeaderv1::SIZE,
|
||||
std::mem::size_of::<ChaChaHeaderv1>() - ChaChaHeaderv1::MAGIC.len()
|
||||
)
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: config
|
||||
//
|
||||
|
||||
#[binrw]
|
||||
#[brw(little)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ChaChaConfigv1 {
|
||||
pub chunk_size: u64,
|
||||
pub nonce_size: u64,
|
||||
pub tag_size: u64,
|
||||
}
|
||||
|
||||
impl Default for ChaChaConfigv1 {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_size: 64 * 1024,
|
||||
nonce_size: 24,
|
||||
tag_size: 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ChaChaConfigv1 {
|
||||
pub(crate) fn enc_chunk_size(&self) -> u64 {
|
||||
self.chunk_size + self.nonce_size + self.tag_size
|
||||
}
|
||||
}
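A quick hedged check of the on-disk chunk size implied by the defaults above (a sketch assuming the types are in scope; not part of this diff):

```rust
#[test]
fn default_encrypted_chunk_size() {
    // 64 KiB of plaintext plus a 24-byte XChaCha20 nonce and a 16-byte Poly1305 tag.
    let cfg = ChaChaConfigv1::default();
    assert_eq!(cfg.enc_chunk_size(), 65_536 + 24 + 16);
}
```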
|
||||
@@ -1,9 +0,0 @@
|
||||
mod reader;
|
||||
mod reader_async;
|
||||
mod writer;
|
||||
mod writer_async;
|
||||
|
||||
pub use {reader::*, reader_async::*, writer::*, writer_async::*};
|
||||
|
||||
mod format;
|
||||
pub use format::*;
|
||||
@@ -1,151 +0,0 @@
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
|
||||
use crate::{AsyncReader, AsyncSeekReader, chacha::ChaChaHeaderv1};
|
||||
|
||||
//
|
||||
// MARK: reader
|
||||
//
|
||||
|
||||
pub struct ChaChaReaderv1<R: Read + Seek> {
|
||||
inner: R,
|
||||
header: ChaChaHeaderv1,
|
||||
|
||||
data_offset: u64,
|
||||
encryption_key: [u8; 32],
|
||||
cursor: u64,
|
||||
plaintext_size: u64,
|
||||
cached_chunk: Option<(u64, Vec<u8>)>,
|
||||
}
|
||||
|
||||
impl<R: Read + Seek> ChaChaReaderv1<R> {
|
||||
pub fn new(mut inner: R, encryption_key: [u8; 32]) -> Result<Self, std::io::Error> {
|
||||
use binrw::BinReaderExt;
|
||||
|
||||
inner.seek(SeekFrom::Start(0))?;
|
||||
let header: ChaChaHeaderv1 = inner.read_le().map_err(std::io::Error::other)?;
|
||||
let data_offset = inner.stream_position()?;
|
||||
|
||||
Ok(Self {
|
||||
inner,
|
||||
header,
|
||||
data_offset,
|
||||
encryption_key,
|
||||
cursor: 0,
|
||||
plaintext_size: header.plaintext_size,
|
||||
cached_chunk: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn fetch_chunk(&mut self, chunk_index: u64) -> Result<(), std::io::Error> {
|
||||
use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
|
||||
|
||||
let enc_start = self.data_offset + chunk_index * self.header.config.enc_chunk_size();
|
||||
self.inner.seek(SeekFrom::Start(enc_start))?;
|
||||
|
||||
let mut encrypted = vec![0u8; self.header.config.enc_chunk_size() as usize];
|
||||
let n = self.read_exact_or_eof(&mut encrypted)?;
|
||||
encrypted.truncate(n);
|
||||
|
||||
if encrypted.len() < (self.header.config.nonce_size + self.header.config.tag_size) as usize
|
||||
{
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"encrypted chunk too short",
|
||||
));
|
||||
}
|
||||
|
||||
let (nonce_bytes, ciphertext) = encrypted.split_at(self.header.config.nonce_size as usize);
|
||||
let nonce = XNonce::from_slice(nonce_bytes);
|
||||
let key = chacha20poly1305::Key::from_slice(&self.encryption_key);
|
||||
let cipher = XChaCha20Poly1305::new(key);
|
||||
let plaintext = cipher.decrypt(nonce, ciphertext).map_err(|_| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, "decryption failed")
|
||||
})?;
|
||||
|
||||
self.cached_chunk = Some((chunk_index, plaintext));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_exact_or_eof(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let mut total = 0;
|
||||
while total < buf.len() {
|
||||
match self.inner.read(&mut buf[total..])? {
|
||||
0 => break,
|
||||
n => total += n,
|
||||
}
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read + Seek + Send> AsyncReader for ChaChaReaderv1<R> {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let remaining = self.plaintext_size.saturating_sub(self.cursor);
|
||||
if remaining == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let chunk_index = self.cursor / self.header.config.chunk_size;
|
||||
|
||||
let need_fetch = match &self.cached_chunk {
|
||||
None => true,
|
||||
Some((idx, _)) => *idx != chunk_index,
|
||||
};
|
||||
|
||||
if need_fetch {
|
||||
self.fetch_chunk(chunk_index)?;
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let (_, chunk_data) = self.cached_chunk.as_ref().unwrap();
|
||||
|
||||
let offset_in_chunk = (self.cursor % self.header.config.chunk_size) as usize;
|
||||
let available = chunk_data.len() - offset_in_chunk;
|
||||
let to_copy = available.min(buf.len());
|
||||
|
||||
buf[..to_copy].copy_from_slice(&chunk_data[offset_in_chunk..offset_in_chunk + to_copy]);
|
||||
self.cursor += to_copy as u64;
|
||||
Ok(to_copy)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read + Seek + Send> AsyncSeekReader for ChaChaReaderv1<R> {
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.plaintext_size),
|
||||
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= abs;
|
||||
} else {
|
||||
self.cursor += x as u64;
|
||||
}
|
||||
}
|
||||
|
||||
SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.plaintext_size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor = self.plaintext_size - abs;
|
||||
} else {
|
||||
self.cursor = self.plaintext_size + x as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.plaintext_size);
|
||||
Ok(self.cursor)
|
||||
}
|
||||
}
|
||||
@@ -1,165 +0,0 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use crate::{AsyncReader, AsyncSeekReader, chacha::ChaChaHeaderv1};
|
||||
|
||||
pub struct ChaChaReaderv1Async<R: AsyncSeekReader> {
|
||||
inner: R,
|
||||
header: ChaChaHeaderv1,
|
||||
|
||||
data_offset: u64,
|
||||
encryption_key: [u8; 32],
|
||||
cursor: u64,
|
||||
plaintext_size: u64,
|
||||
cached_chunk: Option<(u64, Vec<u8>)>,
|
||||
}
|
||||
|
||||
impl<R: AsyncSeekReader> ChaChaReaderv1Async<R> {
|
||||
pub async fn new(mut inner: R, encryption_key: [u8; 32]) -> Result<Self, std::io::Error> {
|
||||
use binrw::BinReaderExt;
|
||||
use std::io::Cursor;
|
||||
|
||||
inner.seek(SeekFrom::Start(0)).await?;
|
||||
let mut buf = [0u8; ChaChaHeaderv1::SIZE];
|
||||
read_exact(&mut inner, &mut buf).await?;
|
||||
let header: ChaChaHeaderv1 = Cursor::new(&buf[..])
|
||||
.read_le()
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
Ok(Self {
|
||||
inner,
|
||||
header,
|
||||
data_offset: buf.len() as u64,
|
||||
encryption_key,
|
||||
cursor: 0,
|
||||
plaintext_size: header.plaintext_size,
|
||||
cached_chunk: None,
|
||||
})
|
||||
}
|
||||
|
||||
async fn fetch_chunk(&mut self, chunk_index: u64) -> Result<(), std::io::Error> {
|
||||
use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
|
||||
|
||||
let enc_start = self.data_offset + chunk_index * self.header.config.enc_chunk_size();
|
||||
self.inner.seek(SeekFrom::Start(enc_start)).await?;
|
||||
|
||||
let mut encrypted = vec![0u8; self.header.config.enc_chunk_size() as usize];
|
||||
let n = read_exact_or_eof(&mut self.inner, &mut encrypted).await?;
|
||||
encrypted.truncate(n);
|
||||
|
||||
if encrypted.len() < (self.header.config.nonce_size + self.header.config.tag_size) as usize
|
||||
{
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"encrypted chunk too short",
|
||||
));
|
||||
}
|
||||
|
||||
let (nonce_bytes, ciphertext) = encrypted.split_at(self.header.config.nonce_size as usize);
|
||||
let nonce = XNonce::from_slice(nonce_bytes);
|
||||
let key = chacha20poly1305::Key::from_slice(&self.encryption_key);
|
||||
let cipher = XChaCha20Poly1305::new(key);
|
||||
let plaintext = cipher.decrypt(nonce, ciphertext).map_err(|_| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, "decryption failed")
|
||||
})?;
|
||||
|
||||
self.cached_chunk = Some((chunk_index, plaintext));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_exact<R: AsyncReader>(inner: &mut R, buf: &mut [u8]) -> Result<(), std::io::Error> {
|
||||
let n = read_exact_or_eof(inner, buf).await?;
|
||||
if n < buf.len() {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"unexpected EOF reading header",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_exact_or_eof<R: AsyncReader>(
|
||||
inner: &mut R,
|
||||
buf: &mut [u8],
|
||||
) -> Result<usize, std::io::Error> {
|
||||
let mut total = 0;
|
||||
while total < buf.len() {
|
||||
match inner.read(&mut buf[total..]).await? {
|
||||
0 => break,
|
||||
n => total += n,
|
||||
}
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
impl<R: AsyncSeekReader> AsyncReader for ChaChaReaderv1Async<R> {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let remaining = self.plaintext_size.saturating_sub(self.cursor);
|
||||
if remaining == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let chunk_index = self.cursor / self.header.config.chunk_size;
|
||||
|
||||
let need_fetch = match &self.cached_chunk {
|
||||
None => true,
|
||||
Some((idx, _)) => *idx != chunk_index,
|
||||
};
|
||||
|
||||
if need_fetch {
|
||||
self.fetch_chunk(chunk_index).await?;
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let (_, chunk_data) = self.cached_chunk.as_ref().unwrap();
|
||||
|
||||
let offset_in_chunk = (self.cursor % self.header.config.chunk_size) as usize;
|
||||
let available = chunk_data.len() - offset_in_chunk;
|
||||
let to_copy = available.min(buf.len());
|
||||
|
||||
buf[..to_copy].copy_from_slice(&chunk_data[offset_in_chunk..offset_in_chunk + to_copy]);
|
||||
self.cursor += to_copy as u64;
|
||||
Ok(to_copy)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: AsyncSeekReader> AsyncSeekReader for ChaChaReaderv1Async<R> {
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.plaintext_size),
|
||||
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= abs;
|
||||
} else {
|
||||
self.cursor += x as u64;
|
||||
}
|
||||
}
|
||||
|
||||
SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.plaintext_size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor = self.plaintext_size - abs;
|
||||
} else {
|
||||
self.cursor = self.plaintext_size + x as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.plaintext_size);
|
||||
Ok(self.cursor)
|
||||
}
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use tokio::io::{AsyncSeek, AsyncSeekExt, AsyncWrite, AsyncWriteExt};
|
||||
|
||||
use crate::chacha::{ChaChaConfigv1, ChaChaHeaderv1};
|
||||
|
||||
pub struct ChaChaWriterAsync<W: AsyncWrite + AsyncSeek + Unpin + Send> {
|
||||
inner: W,
|
||||
header: ChaChaHeaderv1,
|
||||
|
||||
encryption_key: [u8; 32],
|
||||
buffer: Vec<u8>,
|
||||
plaintext_bytes_written: u64,
|
||||
}
|
||||
|
||||
impl<W: AsyncWrite + AsyncSeek + Unpin + Send> ChaChaWriterAsync<W> {
|
||||
pub async fn new(mut inner: W, encryption_key: [u8; 32]) -> Result<Self, std::io::Error> {
|
||||
let header = ChaChaHeaderv1 {
|
||||
config: ChaChaConfigv1::default(),
|
||||
plaintext_size: 0,
|
||||
};
|
||||
inner.write_all(&serialize_header(header)?).await?;
|
||||
|
||||
Ok(Self {
|
||||
inner,
|
||||
header,
|
||||
encryption_key,
|
||||
buffer: Vec::new(),
|
||||
plaintext_bytes_written: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn write(&mut self, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
self.buffer.extend_from_slice(buf);
|
||||
self.plaintext_bytes_written += buf.len() as u64;
|
||||
|
||||
let chunk_size = self.header.config.chunk_size as usize;
|
||||
while self.buffer.len() >= chunk_size {
|
||||
let encrypted = encrypt_chunk(&self.encryption_key, &self.buffer[..chunk_size])?;
|
||||
self.inner.write_all(&encrypted).await?;
|
||||
self.buffer.drain(..chunk_size);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Encrypt and write any buffered plaintext, patch the header with the
|
||||
/// final `plaintext_size`, then return the inner writer.
|
||||
pub async fn finish(mut self) -> Result<W, std::io::Error> {
|
||||
if !self.buffer.is_empty() {
|
||||
let encrypted = encrypt_chunk(&self.encryption_key, &self.buffer)?;
|
||||
self.inner.write_all(&encrypted).await?;
|
||||
}
|
||||
|
||||
self.inner.seek(SeekFrom::Start(0)).await?;
|
||||
let header_bytes = serialize_header(ChaChaHeaderv1 {
|
||||
config: self.header.config,
|
||||
plaintext_size: self.plaintext_bytes_written,
|
||||
})?;
|
||||
self.inner.write_all(&header_bytes).await?;
|
||||
|
||||
Ok(self.inner)
|
||||
}
|
||||
}
|
||||
|
||||
fn encrypt_chunk(key: &[u8; 32], plaintext: &[u8]) -> Result<Vec<u8>, std::io::Error> {
|
||||
use chacha20poly1305::{
|
||||
XChaCha20Poly1305,
|
||||
aead::{Aead, AeadCore, KeyInit, OsRng},
|
||||
};
|
||||
|
||||
let nonce = XChaCha20Poly1305::generate_nonce(&mut OsRng);
|
||||
let cipher = XChaCha20Poly1305::new(chacha20poly1305::Key::from_slice(key));
|
||||
let ciphertext = cipher
|
||||
.encrypt(&nonce, plaintext)
|
||||
.map_err(|_| std::io::Error::other("encryption failed"))?;
|
||||
|
||||
let mut output = Vec::with_capacity(nonce.len() + ciphertext.len());
|
||||
output.extend_from_slice(&nonce);
|
||||
output.extend_from_slice(&ciphertext);
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn serialize_header(header: ChaChaHeaderv1) -> Result<Vec<u8>, std::io::Error> {
|
||||
use binrw::BinWriterExt;
|
||||
use std::io::Cursor;
|
||||
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
buf.write_le(&header).map_err(std::io::Error::other)?;
|
||||
Ok(buf.into_inner())
|
||||
}
|
||||
@@ -1,260 +0,0 @@
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
|
||||
use crate::chacha::{ChaChaConfigv1, ChaChaHeaderv1};
|
||||
|
||||
/// Generate a random 32-byte encryption key suitable for use with [`ChaChaWriter`].
|
||||
pub fn generate_key() -> [u8; 32] {
|
||||
use chacha20poly1305::aead::OsRng;
|
||||
use chacha20poly1305::{KeyInit, XChaCha20Poly1305};
|
||||
XChaCha20Poly1305::generate_key(&mut OsRng).into()
|
||||
}
|
||||
|
||||
pub struct ChaChaWriterv1<W: Write + Seek> {
|
||||
inner: W,
|
||||
header: ChaChaHeaderv1,
|
||||
|
||||
encryption_key: [u8; 32],
|
||||
buffer: Vec<u8>,
|
||||
plaintext_bytes_written: u64,
|
||||
}
|
||||
|
||||
impl<W: Write + Seek> ChaChaWriterv1<W> {
|
||||
pub fn new(mut inner: W, encryption_key: [u8; 32]) -> Result<Self, std::io::Error> {
|
||||
use binrw::BinWriterExt;
|
||||
|
||||
let header = ChaChaHeaderv1 {
|
||||
config: ChaChaConfigv1::default(),
|
||||
plaintext_size: 0,
|
||||
};
|
||||
inner.write_le(&header).map_err(std::io::Error::other)?;
|
||||
|
||||
Ok(Self {
|
||||
inner,
|
||||
header,
|
||||
encryption_key,
|
||||
buffer: Vec::new(),
|
||||
plaintext_bytes_written: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Encrypt and write any buffered plaintext, patch the header with the
|
||||
/// final `plaintext_size`, then return the inner writer.
|
||||
pub fn finish(mut self) -> Result<W, std::io::Error> {
|
||||
use binrw::BinWriterExt;
|
||||
|
||||
self.flush_buffer()?;
|
||||
|
||||
self.inner.seek(SeekFrom::Start(0))?;
|
||||
let header = ChaChaHeaderv1 {
|
||||
config: self.header.config,
|
||||
plaintext_size: self.plaintext_bytes_written,
|
||||
};
|
||||
self.inner
|
||||
.write_le(&header)
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
Ok(self.inner)
|
||||
}
|
||||
|
||||
fn encrypt_chunk(&self, plaintext: &[u8]) -> Result<Vec<u8>, std::io::Error> {
|
||||
use chacha20poly1305::{
|
||||
XChaCha20Poly1305,
|
||||
aead::{Aead, AeadCore, KeyInit, OsRng},
|
||||
};
|
||||
|
||||
let nonce = XChaCha20Poly1305::generate_nonce(&mut OsRng);
|
||||
let key = chacha20poly1305::Key::from_slice(&self.encryption_key);
|
||||
let cipher = XChaCha20Poly1305::new(key);
|
||||
let ciphertext = cipher
|
||||
.encrypt(&nonce, plaintext)
|
||||
.map_err(|_| std::io::Error::other("encryption failed"))?;
|
||||
|
||||
let mut output = Vec::with_capacity(nonce.len() + ciphertext.len());
|
||||
output.extend_from_slice(&nonce);
|
||||
output.extend_from_slice(&ciphertext);
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn flush_buffer(&mut self) -> Result<(), std::io::Error> {
|
||||
if !self.buffer.is_empty() {
|
||||
let encrypted = self.encrypt_chunk(&self.buffer)?;
|
||||
self.inner.write_all(&encrypted)?;
|
||||
self.buffer.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write + Seek> Write for ChaChaWriterv1<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
self.buffer.extend_from_slice(buf);
|
||||
self.plaintext_bytes_written += buf.len() as u64;
|
||||
|
||||
let chunk_size = self.header.config.chunk_size as usize;
|
||||
while self.buffer.len() >= chunk_size {
|
||||
let encrypted = self.encrypt_chunk(&self.buffer[..chunk_size])?;
|
||||
self.inner.write_all(&encrypted)?;
|
||||
self.buffer.drain(..chunk_size);
|
||||
}
|
||||
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
/// Encrypts and flushes any buffered plaintext as a partial chunk.
|
||||
///
|
||||
/// Prefer [`finish`](Self::finish) to retrieve the inner writer after
|
||||
/// all data has been written. Calling `flush` multiple times will produce
|
||||
/// multiple small encrypted chunks for the same partial data.
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
self.flush_buffer()?;
|
||||
self.inner.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[expect(clippy::unwrap_used)]
|
||||
mod tests {
|
||||
use std::io::{Cursor, SeekFrom, Write};
|
||||
|
||||
use super::ChaChaWriterv1;
|
||||
use crate::{AsyncReader, AsyncSeekReader, chacha::ChaChaReaderv1};
|
||||
|
||||
const KEY: [u8; 32] = [42u8; 32];
|
||||
|
||||
fn encrypt(data: &[u8]) -> Cursor<Vec<u8>> {
|
||||
let mut writer = ChaChaWriterv1::new(Cursor::new(Vec::new()), KEY).unwrap();
|
||||
writer.write_all(data).unwrap();
|
||||
let mut buf = writer.finish().unwrap();
|
||||
buf.set_position(0);
|
||||
buf
|
||||
}
|
||||
|
||||
async fn decrypt_all(buf: Cursor<Vec<u8>>) -> Vec<u8> {
|
||||
let mut reader = ChaChaReaderv1::new(buf, KEY).unwrap();
|
||||
reader.read_to_end().await.unwrap()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn roundtrip_empty() {
|
||||
let buf = encrypt(&[]);
|
||||
// Header present but no chunks
|
||||
assert!(!buf.get_ref().is_empty());
|
||||
assert!(decrypt_all(buf).await.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn roundtrip_small() {
|
||||
let data = b"hello, world!";
|
||||
assert_eq!(decrypt_all(encrypt(data)).await, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn roundtrip_exact_chunk() {
|
||||
let data = vec![0xABu8; 65536];
|
||||
assert_eq!(decrypt_all(encrypt(&data)).await, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn roundtrip_multi_chunk() {
|
||||
// 2.5 chunks
|
||||
let data: Vec<u8> = (0u8..=255).cycle().take(65536 * 2 + 1000).collect();
|
||||
assert_eq!(decrypt_all(encrypt(&data)).await, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn roundtrip_incremental_writes() {
|
||||
// Write one byte at a time
|
||||
let data: Vec<u8> = (0u8..200).collect();
|
||||
let mut writer = ChaChaWriterv1::new(Cursor::new(Vec::new()), KEY).unwrap();
|
||||
for byte in &data {
|
||||
writer.write_all(&[*byte]).unwrap();
|
||||
}
|
||||
let mut buf = writer.finish().unwrap();
|
||||
buf.set_position(0);
|
||||
assert_eq!(decrypt_all(buf).await, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn wrong_key_fails() {
|
||||
let buf = encrypt(b"secret data");
|
||||
let mut reader = ChaChaReaderv1::new(buf, [0u8; 32]).unwrap();
|
||||
assert!(reader.read_to_end().await.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn header_magic_checked() {
|
||||
// Corrupt the magic bytes — reader should fail
|
||||
let mut buf = encrypt(b"data");
|
||||
buf.get_mut()[0] = 0xFF;
|
||||
buf.set_position(0);
|
||||
assert!(ChaChaReaderv1::new(buf, KEY).is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seek_from_start() {
|
||||
let data: Vec<u8> = (0u8..100).collect();
|
||||
let mut reader = ChaChaReaderv1::new(encrypt(&data), KEY).unwrap();
|
||||
|
||||
reader.seek(SeekFrom::Start(50)).await.unwrap();
|
||||
let mut buf = [0u8; 10];
|
||||
let mut read = 0;
|
||||
while read < buf.len() {
|
||||
read += reader.read(&mut buf[read..]).await.unwrap();
|
||||
}
|
||||
assert_eq!(buf, data[50..60]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seek_from_end() {
|
||||
let data: Vec<u8> = (0u8..100).collect();
|
||||
let mut reader = ChaChaReaderv1::new(encrypt(&data), KEY).unwrap();
|
||||
|
||||
reader.seek(SeekFrom::End(-10)).await.unwrap();
|
||||
assert_eq!(reader.read_to_end().await.unwrap(), &data[90..]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seek_across_chunk_boundary() {
|
||||
// Seek to 6 bytes before the end of chunk 0, read 12 bytes spanning into chunk 1
|
||||
let data: Vec<u8> = (0u8..=255).cycle().take(65536 + 500).collect();
|
||||
let mut reader = ChaChaReaderv1::new(encrypt(&data), KEY).unwrap();
|
||||
|
||||
reader.seek(SeekFrom::Start(65530)).await.unwrap();
|
||||
let mut buf = vec![0u8; 12];
|
||||
let mut read = 0;
|
||||
while read < buf.len() {
|
||||
read += reader.read(&mut buf[read..]).await.unwrap();
|
||||
}
|
||||
assert_eq!(buf, data[65530..65542]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seek_current() {
|
||||
let data: Vec<u8> = (0u8..=255).cycle().take(200).collect();
|
||||
let mut reader = ChaChaReaderv1::new(encrypt(&data), KEY).unwrap();
|
||||
|
||||
// Read 10, seek back 5, read 5 — should get bytes 5..10
|
||||
let mut first = [0u8; 10];
|
||||
let mut n = 0;
|
||||
while n < first.len() {
|
||||
n += reader.read(&mut first[n..]).await.unwrap();
|
||||
}
|
||||
reader.seek(SeekFrom::Current(-5)).await.unwrap();
|
||||
let mut second = [0u8; 5];
|
||||
n = 0;
|
||||
while n < second.len() {
|
||||
n += reader.read(&mut second[n..]).await.unwrap();
|
||||
}
|
||||
assert_eq!(second, data[5..10]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seek_past_end_clamps() {
|
||||
let data = b"hello";
|
||||
let mut reader = ChaChaReaderv1::new(encrypt(data), KEY).unwrap();
|
||||
|
||||
let pos = reader.seek(SeekFrom::Start(9999)).await.unwrap();
|
||||
assert_eq!(pos, data.len() as u64);
|
||||
assert_eq!(reader.read_to_end().await.unwrap(), b"");
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,2 @@
|
||||
mod asyncreader;
|
||||
pub use asyncreader::*;
|
||||
|
||||
mod s3reader;
|
||||
pub use s3reader::*;
|
||||
|
||||
pub mod chacha;
|
||||
|
||||
@@ -1,181 +0,0 @@
|
||||
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{fmt::Debug, io::SeekFrom, sync::Arc};
|
||||
|
||||
use crate::{AsyncReader, AsyncSeekReader};
|
||||
|
||||
//
|
||||
// MARK: client
|
||||
//
|
||||
|
||||
/// An interface to an S3 bucket.
|
||||
///
|
||||
/// TODO: S3 is slow and expensive. Ideally, we'll have this struct cache data
|
||||
/// so we don't have to download anything twice. This is, however, complicated,
|
||||
/// and doesn't fully solve the "expensive" problem.
|
||||
pub struct S3Client {
|
||||
pub client: aws_sdk_s3::Client,
|
||||
bucket: SmartString<LazyCompact>,
|
||||
|
||||
/// maximum number of bytes to use for cached data
|
||||
cache_limit_bytes: usize,
|
||||
}
|
||||
|
||||
impl Debug for S3Client {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Client")
|
||||
.field("bucket", &self.bucket)
|
||||
.field("cache_limit_bytes", &self.cache_limit_bytes)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl S3Client {
|
||||
pub async fn new(
|
||||
bucket: &str,
|
||||
endpoint: Option<&str>,
|
||||
region: &str,
|
||||
access_key_id: &str,
|
||||
secret_access_key: &str,
|
||||
cache_limit_bytes: usize,
|
||||
) -> Arc<Self> {
|
||||
let client = {
|
||||
let mut s3_config = aws_sdk_s3::config::Builder::new()
|
||||
.behavior_version(BehaviorVersion::latest())
|
||||
.region(Region::new(region.to_owned()))
|
||||
.credentials_provider(Credentials::new(
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
None,
|
||||
None,
|
||||
"pile",
|
||||
));
|
||||
|
||||
if let Some(ep) = endpoint {
|
||||
s3_config = s3_config.endpoint_url(ep).force_path_style(true);
|
||||
}
|
||||
|
||||
aws_sdk_s3::Client::from_conf(s3_config.build())
|
||||
};
|
||||
|
||||
return Arc::new(Self {
|
||||
bucket: bucket.into(),
|
||||
client,
|
||||
cache_limit_bytes,
|
||||
});
|
||||
}
|
||||
|
||||
pub fn bucket(&self) -> &str {
|
||||
&self.bucket
|
||||
}
|
||||
|
||||
pub async fn get(self: &Arc<Self>, key: &str) -> Result<S3Reader, std::io::Error> {
|
||||
let head = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let size = head.content_length().unwrap_or(0) as u64;
|
||||
|
||||
Ok(S3Reader {
|
||||
client: self.clone(),
|
||||
bucket: self.bucket.clone(),
|
||||
key: key.into(),
|
||||
cursor: 0,
|
||||
size,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: reader
|
||||
//
|
||||
|
||||
pub struct S3Reader {
|
||||
pub client: Arc<S3Client>,
|
||||
pub bucket: SmartString<LazyCompact>,
|
||||
pub key: SmartString<LazyCompact>,
|
||||
pub cursor: u64,
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
impl AsyncReader for S3Reader {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let len_left = self.size.saturating_sub(self.cursor);
|
||||
if len_left == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let start_byte = self.cursor;
|
||||
let len_to_read = (buf.len() as u64).min(len_left);
|
||||
let end_byte = start_byte + len_to_read - 1;
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(self.key.as_str())
|
||||
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||
.send()
|
||||
.await
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let bytes = resp
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.map(|x| x.into_bytes())
|
||||
.map_err(std::io::Error::other)?;
|
||||
|
||||
let n = bytes.len().min(buf.len());
|
||||
buf[..n].copy_from_slice(&bytes[..n]);
|
||||
self.cursor += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncSeekReader for S3Reader {
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.size),
|
||||
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= abs;
|
||||
} else {
|
||||
self.cursor += x as u64;
|
||||
}
|
||||
}
|
||||
|
||||
std::io::SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
let abs = x.unsigned_abs();
|
||||
if abs > self.size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor = self.size - abs;
|
||||
} else {
|
||||
self.cursor = self.size + x as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.size);
|
||||
Ok(self.cursor)
|
||||
}
|
||||
}
|
||||
28
crates/pile-serve/Cargo.toml
Normal file
@@ -0,0 +1,28 @@
|
||||
[package]
|
||||
name = "pile-serve"
|
||||
version = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
pile-config = { workspace = true }
|
||||
pile-value = { workspace = true }
|
||||
pile-dataset = { workspace = true }
|
||||
|
||||
serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
|
||||
serde = { workspace = true }
|
||||
axum = { workspace = true }
|
||||
percent-encoding = { workspace = true }
|
||||
utoipa = { workspace = true }
|
||||
utoipa-swagger-ui = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
pdfium = ["pile-value/pdfium"]
|
||||
30
crates/pile-serve/src/config_schema.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
use axum::{
|
||||
Json,
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_dataset::Datasets;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
pub use pile_config::FieldSpec;
|
||||
|
||||
pub type FieldsResponse = HashMap<String, FieldSpec>;
|
||||
|
||||
/// Retrieve this dataset's schema.
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/config/schema",
|
||||
responses(
|
||||
(status = 200, description = "This dataset's schema"),
|
||||
)
|
||||
)]
|
||||
pub async fn config_schema(State(state): State<Arc<Datasets>>) -> Response {
|
||||
let fields: FieldsResponse = state
|
||||
.config
|
||||
.schema
|
||||
.iter()
|
||||
.map(|(k, v)| (k.as_str().to_owned(), v.clone()))
|
||||
.collect();
|
||||
(StatusCode::OK, Json(fields)).into_response()
|
||||
}
|
||||
@@ -1,19 +1,23 @@
|
||||
use axum::{
|
||||
Json,
|
||||
body::Body,
|
||||
extract::{Query, RawQuery, State},
|
||||
http::{StatusCode, header},
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use percent_encoding::percent_decode_str;
|
||||
use pile_config::{Label, objectpath::ObjectPath};
|
||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||
use pile_dataset::Datasets;
|
||||
use pile_value::{
|
||||
extract::traits::ExtractState,
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use std::{sync::Arc, time::Instant};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
#[derive(Deserialize, ToSchema)]
|
||||
pub struct ExtractQuery {
|
||||
source: String,
|
||||
@@ -96,17 +100,24 @@ pub async fn get_extract(
|
||||
let mut value = None;
|
||||
for path in &paths {
|
||||
match item.query(&extract_state, path).await {
|
||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
||||
Ok(None) => continue,
|
||||
|
||||
Ok(Some(PileValue::Null)) => {
|
||||
value = Some(PileValue::Null);
|
||||
continue;
|
||||
}
|
||||
|
||||
Ok(Some(v)) => {
|
||||
value = Some(v);
|
||||
break;
|
||||
}
|
||||
|
||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
let Some(value) = value else {
|
||||
return StatusCode::NOT_FOUND.into_response();
|
||||
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
||||
};
|
||||
|
||||
debug!(
|
||||
@@ -141,15 +152,30 @@ pub async fn get_extract(
|
||||
s.to_string(),
|
||||
)
|
||||
.into_response(),
|
||||
PileValue::Blob { mime, bytes } => (
|
||||
StatusCode::OK,
|
||||
[
|
||||
(header::CONTENT_TYPE, mime.to_string()),
|
||||
(header::CONTENT_DISPOSITION, disposition),
|
||||
],
|
||||
bytes.as_ref().clone(),
|
||||
)
|
||||
.into_response(),
|
||||
|
||||
PileValue::Binary(binary) => {
|
||||
let mime = binary.mime().to_string();
|
||||
let body = match binary {
|
||||
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
||||
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
||||
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
||||
Err(e) => {
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
||||
.into_response();
|
||||
}
|
||||
},
|
||||
};
|
||||
(
|
||||
StatusCode::OK,
|
||||
[
|
||||
(header::CONTENT_TYPE, mime),
|
||||
(header::CONTENT_DISPOSITION, disposition),
|
||||
],
|
||||
body,
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
_ => match value.to_json(&extract_state).await {
|
||||
Ok(json) => (
|
||||
StatusCode::OK,
|
||||
@@ -157,6 +183,7 @@ pub async fn get_extract(
|
||||
Json(json),
|
||||
)
|
||||
.into_response(),
|
||||
|
||||
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
},
|
||||
}
|
||||
@@ -4,13 +4,12 @@ use axum::{
|
||||
http::StatusCode,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_dataset::Datasets;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
use tracing::debug;
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
#[derive(Deserialize, ToSchema)]
|
||||
pub struct ItemsQuery {
|
||||
#[serde(default)]
|
||||
89
crates/pile-serve/src/lib.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use axum::{
|
||||
Router,
|
||||
routing::{get, post},
|
||||
};
|
||||
use pile_dataset::Datasets;
|
||||
use std::sync::Arc;
|
||||
use utoipa::OpenApi;
|
||||
use utoipa_swagger_ui::SwaggerUi;
|
||||
|
||||
mod lookup;
|
||||
pub use lookup::*;
|
||||
|
||||
mod extract;
|
||||
pub use extract::*;
|
||||
|
||||
mod items;
|
||||
pub use items::*;
|
||||
|
||||
mod config_schema;
|
||||
pub use config_schema::*;
|
||||
|
||||
mod schema_field;
|
||||
pub use schema_field::*;
|
||||
|
||||
mod schema;
|
||||
pub use schema::*;
|
||||
|
||||
#[derive(OpenApi)]
|
||||
#[openapi(
|
||||
tags(),
|
||||
paths(
|
||||
lookup,
|
||||
get_extract,
|
||||
items_list,
|
||||
config_schema,
|
||||
schema_field,
|
||||
schema_all
|
||||
),
|
||||
components(schemas(
|
||||
LookupRequest,
|
||||
LookupResponse,
|
||||
LookupResult,
|
||||
ExtractQuery,
|
||||
ItemsQuery,
|
||||
ItemsResponse,
|
||||
ItemRef
|
||||
))
|
||||
)]
|
||||
pub(crate) struct Api;
|
||||
|
||||
#[inline]
|
||||
pub fn router(ds: Arc<Datasets>, with_docs: bool) -> Router<()> {
|
||||
router_prefix(ds, with_docs, None)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn router_prefix(ds: Arc<Datasets>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
|
||||
let mut router = Router::new()
|
||||
.route("/lookup", post(lookup))
|
||||
.route("/extract", get(get_extract))
|
||||
.route("/items", get(items_list))
|
||||
.route("/config/schema", get(config_schema))
|
||||
.route("/schema", get(schema_all))
|
||||
.route("/schema/{field}", get(schema_field))
|
||||
.with_state(ds.clone());
|
||||
|
||||
if let Some(prefix) = prefix {
|
||||
router = Router::new().nest(prefix, router);
|
||||
}
|
||||
|
||||
if with_docs {
|
||||
let docs_path = match prefix {
|
||||
None => "/docs".into(),
|
||||
Some(prefix) => format!("{prefix}/docs"),
|
||||
};
|
||||
|
||||
let api = Api::openapi();
|
||||
let api = match prefix {
|
||||
None => api,
|
||||
Some(prefix) => utoipa::openapi::OpenApi::default().nest(prefix, api),
|
||||
};
|
||||
|
||||
let docs =
|
||||
SwaggerUi::new(docs_path.clone()).url(format!("{}/openapi.json", docs_path), api);
|
||||
|
||||
router = router.merge(docs);
|
||||
}
|
||||
router
|
||||
}
|
||||
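For orientation, here is a minimal sketch of how the new `pile-serve` router could be mounted in a binary. The bind address, the `/api` prefix, and the way the `Datasets` value is obtained are assumptions for illustration, not taken from this change.

```rust
use std::sync::Arc;
use pile_dataset::Datasets;

// Hypothetical wiring; only `router_prefix` itself comes from this diff.
async fn serve(datasets: Arc<Datasets>) -> std::io::Result<()> {
    // Nest every route under `/api` and expose Swagger UI at `/api/docs`.
    let app = pile_serve::router_prefix(datasets, true, Some("/api"));

    let listener = tokio::net::TcpListener::bind("0.0.0.0:8080").await?;
    axum::serve(listener, app).await
}
```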
@@ -4,13 +4,12 @@ use axum::{
|
||||
http::StatusCode,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_dataset::Datasets;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{sync::Arc, time::Instant};
|
||||
use tracing::debug;
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
#[derive(Serialize, Deserialize, ToSchema, Debug)]
|
||||
pub struct LookupRequest {
|
||||
pub query: String,
|
||||
129
crates/pile-serve/src/schema.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
use axum::{
|
||||
Json,
|
||||
extract::{Query, State},
|
||||
http::StatusCode,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utoipa::IntoParams;
|
||||
|
||||
#[derive(Deserialize, IntoParams)]
|
||||
pub struct SchemaQuery {
|
||||
source: String,
|
||||
key: String,
|
||||
|
||||
#[serde(default)]
|
||||
hidden: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum ApiValue {
|
||||
Binary { binary: bool, mime: String },
|
||||
Object { object: bool },
|
||||
Array(Vec<ApiValue>),
|
||||
String(String),
|
||||
Number(serde_json::Number),
|
||||
Null,
|
||||
}
|
||||
|
||||
pub type SchemaResponse = HashMap<String, ApiValue>;
|
||||
|
||||
async fn pile_value_to_api(
|
||||
state: &ExtractState,
|
||||
value: PileValue,
|
||||
) -> Result<ApiValue, std::io::Error> {
|
||||
match value {
|
||||
PileValue::String(s) => Ok(ApiValue::String(s.to_string())),
|
||||
PileValue::U64(n) => Ok(ApiValue::Number(n.into())),
|
||||
PileValue::I64(n) => Ok(ApiValue::Number(n.into())),
|
||||
PileValue::Null => Ok(ApiValue::Null),
|
||||
|
||||
PileValue::Binary(x) => Ok(ApiValue::Binary {
|
||||
binary: true,
|
||||
mime: x.mime().to_string(),
|
||||
}),
|
||||
|
||||
PileValue::Array(arr) => {
|
||||
let mut out = Vec::with_capacity(arr.len());
|
||||
for item in arr.iter() {
|
||||
out.push(Box::pin(pile_value_to_api(state, item.clone())).await?);
|
||||
}
|
||||
Ok(ApiValue::Array(out))
|
||||
}
|
||||
|
||||
PileValue::ObjectExtractor(_) | PileValue::ListExtractor(_) | PileValue::Item(_) => {
|
||||
Ok(ApiValue::Object { object: true })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all schema field values for a single item.
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/schema",
|
||||
params(
|
||||
("source" = String, Query, description = "Source label"),
|
||||
("key" = String, Query, description = "Item key"),
|
||||
("hidden" = bool, Query, description = "Include hidden fields (default: false)"),
|
||||
),
|
||||
responses(
|
||||
(status = 200, description = "Schema field values as a map of label to value"),
|
||||
(status = 400, description = "Invalid source label"),
|
||||
(status = 404, description = "Item not found"),
|
||||
(status = 500, description = "Internal server error"),
|
||||
)
|
||||
)]
|
||||
pub async fn schema_all(
|
||||
State(state): State<Arc<Datasets>>,
|
||||
Query(params): Query<SchemaQuery>,
|
||||
) -> Response {
|
||||
let label = match Label::try_from(params.source.clone()) {
|
||||
Ok(l) => l,
|
||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
||||
};
|
||||
|
||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
||||
return StatusCode::NOT_FOUND.into_response();
|
||||
};
|
||||
|
||||
let extract_state = ExtractState { ignore_mime: false };
|
||||
let item = PileValue::Item(item);
|
||||
|
||||
let mut result: SchemaResponse = HashMap::new();
|
||||
|
||||
for (field_label, field_spec) in &state.config.schema {
|
||||
if field_spec.hidden && !params.hidden {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut value = None;
|
||||
for path in &field_spec.path {
|
||||
match item.query(&extract_state, path).await {
|
||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
||||
Ok(Some(v)) => {
|
||||
value = Some(v);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let Some(v) = value else { continue };
|
||||
|
||||
let api_value = match pile_value_to_api(&extract_state, v).await {
|
||||
Ok(v) => v,
|
||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
};
|
||||
|
||||
result.insert(field_label.as_str().to_owned(), api_value);
|
||||
}
|
||||
|
||||
(StatusCode::OK, Json(result)).into_response()
|
||||
}
|
||||
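A hedged example of calling the new `/schema` endpoint from the client side with `reqwest` (already a workspace dependency); the base URL, source label, and item key are placeholders, not values from this diff.

```rust
use std::collections::HashMap;

// Placeholder URL and query values; the response shape mirrors
// `SchemaResponse` (a map of field label to value) defined above.
async fn fetch_schema() -> Result<HashMap<String, serde_json::Value>, reqwest::Error> {
    reqwest::Client::new()
        .get("http://localhost:8080/schema")
        .query(&[("source", "library"), ("key", "some-item"), ("hidden", "false")])
        .send()
        .await?
        .error_for_status()?
        .json()
        .await
}
```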
@@ -1,37 +1,40 @@
|
||||
use axum::{
|
||||
Json,
|
||||
extract::{Query, State},
|
||||
body::Body,
|
||||
extract::{Path, Query, State},
|
||||
http::{StatusCode, header},
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||
use pile_dataset::Datasets;
|
||||
use pile_value::{
|
||||
extract::traits::ExtractState,
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use std::{sync::Arc, time::Instant};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
use utoipa::{IntoParams, ToSchema};
|
||||
use utoipa::IntoParams;
|
||||
|
||||
use crate::Datasets;
|
||||
|
||||
#[derive(Deserialize, ToSchema, IntoParams)]
|
||||
pub struct FieldQuery {
|
||||
#[derive(Deserialize, IntoParams)]
|
||||
pub struct SchemaFieldQuery {
|
||||
source: String,
|
||||
key: String,
|
||||
field: String,
|
||||
|
||||
#[serde(default)]
|
||||
download: bool,
|
||||
name: Option<String>,
|
||||
}
|
||||
|
||||
/// Extract a specific field from an item's metadata.
|
||||
/// Extract a specific schema field from an item's metadata.
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/field",
|
||||
path = "/schema/{field}",
|
||||
params(
|
||||
("field" = String, Path, description = "Schema field"),
|
||||
("source" = String, Query, description = "Source label"),
|
||||
("key" = String, Query, description = "Item key"),
|
||||
("field" = String, Query, description = "Schema field"),
|
||||
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
|
||||
),
|
||||
responses(
|
||||
@@ -41,9 +44,10 @@ pub struct FieldQuery {
|
||||
(status = 500, description = "Internal server error"),
|
||||
)
|
||||
)]
|
||||
pub async fn get_field(
|
||||
pub async fn schema_field(
|
||||
State(state): State<Arc<Datasets>>,
|
||||
Query(params): Query<FieldQuery>,
|
||||
Path(field): Path<String>,
|
||||
Query(params): Query<SchemaFieldQuery>,
|
||||
) -> Response {
|
||||
let start = Instant::now();
|
||||
|
||||
@@ -53,22 +57,22 @@ pub async fn get_field(
|
||||
};
|
||||
|
||||
debug!(
|
||||
message = "Serving /field",
|
||||
message = "Serving /schema/{field}",
|
||||
source = params.source,
|
||||
key = params.key,
|
||||
field = params.field,
|
||||
field = field,
|
||||
);
|
||||
|
||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
||||
return StatusCode::NOT_FOUND.into_response();
|
||||
};
|
||||
|
||||
let field = match Label::new(¶ms.field) {
|
||||
let field_label = match Label::new(&field) {
|
||||
Some(x) => x,
|
||||
None => return StatusCode::NOT_FOUND.into_response(),
|
||||
};
|
||||
|
||||
let paths = match state.config.schema.get(&field) {
|
||||
let paths = match state.config.schema.get(&field_label) {
|
||||
Some(x) => &x.path,
|
||||
None => return StatusCode::NOT_FOUND.into_response(),
|
||||
};
|
||||
@@ -79,24 +83,31 @@ pub async fn get_field(
|
||||
let mut value = None;
|
||||
for path in paths {
|
||||
match item.query(&extract_state, path).await {
|
||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
||||
Ok(None) => continue,
|
||||
|
||||
Ok(Some(PileValue::Null)) => {
|
||||
value = Some(PileValue::Null);
|
||||
continue;
|
||||
}
|
||||
|
||||
Ok(Some(v)) => {
|
||||
value = Some(v);
|
||||
break;
|
||||
}
|
||||
|
||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
let Some(value) = value else {
|
||||
return StatusCode::NOT_FOUND.into_response();
|
||||
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
||||
};
|
||||
|
||||
debug!(
|
||||
message = "Served /field",
|
||||
message = "Served /schema/{field}",
|
||||
source = params.source,
|
||||
key = params.key,
|
||||
field = params.field,
|
||||
field = field,
|
||||
time_ms = start.elapsed().as_millis()
|
||||
);
|
||||
|
||||
@@ -125,15 +136,30 @@ pub async fn get_field(
|
||||
s.to_string(),
|
||||
)
|
||||
.into_response(),
|
||||
PileValue::Blob { mime, bytes } => (
|
||||
StatusCode::OK,
|
||||
[
|
||||
(header::CONTENT_TYPE, mime.to_string()),
|
||||
(header::CONTENT_DISPOSITION, disposition),
|
||||
],
|
||||
bytes.as_ref().clone(),
|
||||
)
|
||||
.into_response(),
|
||||
|
||||
PileValue::Binary(binary) => {
|
||||
let mime = binary.mime().to_string();
|
||||
let body = match binary {
|
||||
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
||||
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
||||
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
||||
Err(e) => {
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
||||
.into_response();
|
||||
}
|
||||
},
|
||||
};
|
||||
(
|
||||
StatusCode::OK,
|
||||
[
|
||||
(header::CONTENT_TYPE, mime),
|
||||
(header::CONTENT_DISPOSITION, disposition),
|
||||
],
|
||||
body,
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
_ => match value.to_json(&extract_state).await {
|
||||
Ok(json) => (
|
||||
StatusCode::OK,
|
||||
@@ -21,8 +21,9 @@ toml = { workspace = true }
|
||||
smartstring = { workspace = true }
|
||||
regex = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
chacha20poly1305 = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
sha2 = { workspace = true }
|
||||
sha1 = { workspace = true }
|
||||
md5 = { workspace = true }
|
||||
epub = { workspace = true }
|
||||
kamadak-exif = { workspace = true }
|
||||
pdf = { workspace = true }
|
||||
@@ -36,6 +37,9 @@ mime_guess = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
strum = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
reqwest = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
pdfium = ["dep:pdfium-render"]
|
||||
|
||||
@@ -69,11 +69,14 @@ fn main() {
|
||||
|
||||
eprintln!("cargo:warning=Downloading PDFium from {url}");
|
||||
|
||||
let status = std::process::Command::new("curl")
|
||||
.args(["-L", "--fail", "-o", tgz_path.to_str().unwrap(), &url])
|
||||
.status()
|
||||
.expect("failed to run curl");
|
||||
assert!(status.success(), "curl failed to download PDFium");
|
||||
let response = reqwest::blocking::get(&url).expect("failed to download PDFium");
|
||||
assert!(
|
||||
response.status().is_success(),
|
||||
"failed to download PDFium: {}",
|
||||
response.status()
|
||||
);
|
||||
let bytes = response.bytes().expect("failed to read PDFium response");
|
||||
std::fs::write(&tgz_path, &bytes).expect("failed to write pdfium.tgz");
|
||||
|
||||
let status = std::process::Command::new("tar")
|
||||
.args([
|
||||
|
||||
@@ -6,16 +6,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ExtractState,
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubCoverExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<Option<(Mime, Vec<u8>)>>,
|
||||
}
|
||||
|
||||
impl EpubCoverExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -51,7 +51,7 @@ impl EpubCoverExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not extract epub cover", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
|
||||
None
|
||||
}
|
||||
},
|
||||
@@ -65,12 +65,11 @@ impl EpubCoverExtractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.get_inner()
|
||||
.await?
|
||||
.map(|(mime, bytes)| PileValue::Blob {
|
||||
Ok(self.get_inner().await?.map(|(mime, bytes)| {
|
||||
PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: mime.clone(),
|
||||
bytes: Arc::new(bytes.clone()),
|
||||
}))
|
||||
bytes: ArcBytes(Arc::new(bytes.clone())),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubMetaExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -57,7 +57,7 @@ impl EpubMetaExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubTextExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl EpubTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -54,7 +54,7 @@ impl EpubTextExtractor {
|
||||
Err(error) => match error.downcast::<std::io::Error>() {
|
||||
Ok(x) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
},
|
||||
@@ -12,7 +12,7 @@ pub use epub_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct EpubExtractor {
|
||||
@@ -22,7 +22,7 @@ pub struct EpubExtractor {
|
||||
}
|
||||
|
||||
impl EpubExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
text: Arc::new(EpubTextExtractor::new(item)),
|
||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||
@@ -87,9 +87,13 @@ impl ObjectExtractor for EpubExtractor {
|
||||
|
||||
if k.as_str() == "cover" {
|
||||
let summary = match &v {
|
||||
PileValue::Blob { mime, bytes } => {
|
||||
format!("<Blob ({}, {} bytes)>", mime, bytes.len())
|
||||
PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||
format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
|
||||
}
|
||||
PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
|
||||
format!("<File ({mime})>")
|
||||
}
|
||||
|
||||
PileValue::Null => "<null>".to_owned(),
|
||||
_ => "<cover>".to_owned(),
|
||||
};
|
||||
@@ -9,16 +9,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct ExifExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl ExifExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -53,7 +53,7 @@ impl ExifExtractor {
|
||||
Ok(x) => x,
|
||||
Err(exif::Error::Io(x)) => return Err(x),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process exif", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -94,7 +94,7 @@ impl ObjectExtractor for ExifExtractor {
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting field {name:?} from ExifExtractor",
|
||||
);
|
||||
|
||||
@@ -11,16 +11,16 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct FlacImagesExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
cached_count: OnceLock<usize>,
|
||||
}
|
||||
|
||||
impl FlacImagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
cached_count: OnceLock::new(),
|
||||
@@ -65,7 +65,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
mut idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting index {idx} from FlacImagesExtractor",
|
||||
);
|
||||
|
||||
@@ -73,7 +73,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let image = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
@@ -93,11 +93,7 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC images",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse FLAC images", ?item, ?error);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
@@ -109,9 +105,11 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
.await
|
||||
.map_err(std::io::Error::other)??;
|
||||
|
||||
Ok(image.map(|(mime, data)| PileValue::Blob {
|
||||
mime,
|
||||
bytes: Arc::new(data),
|
||||
Ok(image.map(|(mime, data)| {
|
||||
PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(data)),
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -130,13 +128,13 @@ impl ListExtractor for FlacImagesExtractor {
|
||||
}
|
||||
|
||||
pub struct FlacExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
images: PileValue,
|
||||
}
|
||||
|
||||
impl FlacExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -149,12 +147,9 @@ impl FlacExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
trace!(
|
||||
message = "Reading FLAC tags",
|
||||
key = self.item.key().as_str()
|
||||
);
|
||||
trace!(message = "Reading FLAC tags", item = ?self.item);
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let output = tokio::task::spawn_blocking(move || {
|
||||
let reader = FlacReader::new(BufReader::new(reader));
|
||||
@@ -176,11 +171,7 @@ impl FlacExtractor {
|
||||
|
||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
||||
Err(error) => {
|
||||
trace!(
|
||||
message = "Could not parse FLAC metadata",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse FLAC metadata", ?item, ?error);
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::{Component, PathBuf},
|
||||
path::Component,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
pub struct FsExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl FsExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -27,7 +27,10 @@ impl FsExtractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
let path = PathBuf::from(self.item.key().as_str());
|
||||
let path = match &self.item {
|
||||
BinaryPileValue::File { path, .. } => path,
|
||||
_ => return Ok(self.output.get_or_init(HashMap::new)),
|
||||
};
|
||||
|
||||
let mut root = false;
|
||||
let components = path
|
||||
111
crates/pile-value/src/extract/blob/hash.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use pile_io::SyncReadBridge;
|
||||
use std::{io::Read, sync::Arc};
|
||||
use tokio::sync::OnceCell;
|
||||
|
||||
fn to_hex(bytes: &[u8]) -> String {
|
||||
bytes.iter().map(|b| format!("{b:02x}")).collect()
|
||||
}
|
||||
|
||||
macro_rules! hash_algos {
|
||||
($($name:ident),* $(,)?) => {
|
||||
pub struct HashExtractor {
|
||||
item: BinaryPileValue,
|
||||
$($name: OnceCell<String>,)*
|
||||
}
|
||||
|
||||
impl HashExtractor {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
$($name: OnceCell::new(),)*
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static LABELS: std::sync::LazyLock<Vec<Label>> = std::sync::LazyLock::new(|| {
|
||||
vec![$(Label::new(stringify!($name)).unwrap()),*]
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
hash_algos!(blake3, md5, sha1, sha224, sha256, sha384, sha512);
|
||||
|
||||
impl HashExtractor {
|
||||
async fn compute(&self, name: &Label) -> Result<Option<String>, std::io::Error> {
|
||||
let name_str = name.as_ref();
|
||||
|
||||
macro_rules! algo {
|
||||
($cell:ident, $compute:expr) => {
|
||||
if name_str == stringify!($cell) {
|
||||
return Ok(Some(
|
||||
self.$cell
|
||||
.get_or_try_init(|| async {
|
||||
let read = self.item.read().await?;
|
||||
let mut read = SyncReadBridge::new_current(read);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut bytes = Vec::new();
|
||||
read.read_to_end(&mut bytes)?;
|
||||
Ok::<String, std::io::Error>($compute(&bytes))
|
||||
})
|
||||
.await?
|
||||
})
|
||||
.await?
|
||||
.clone(),
|
||||
));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
algo!(blake3, |b: &Vec<u8>| blake3::hash(b).to_hex().to_string());
|
||||
algo!(md5, |b: &Vec<u8>| format!("{:x}", md5::compute(b)));
|
||||
algo!(sha1, |b: &Vec<u8>| {
|
||||
use sha1::Digest;
|
||||
to_hex(sha1::Sha1::digest(b).as_ref())
|
||||
});
|
||||
algo!(sha224, |b: &Vec<u8>| {
|
||||
use sha2::Digest;
|
||||
to_hex(sha2::Sha224::digest(b).as_ref())
|
||||
});
|
||||
algo!(sha256, |b: &Vec<u8>| {
|
||||
use sha2::Digest;
|
||||
to_hex(sha2::Sha256::digest(b).as_ref())
|
||||
});
|
||||
algo!(sha384, |b: &Vec<u8>| {
|
||||
use sha2::Digest;
|
||||
to_hex(sha2::Sha384::digest(b).as_ref())
|
||||
});
|
||||
algo!(sha512, |b: &Vec<u8>| {
|
||||
use sha2::Digest;
|
||||
to_hex(sha2::Sha512::digest(b).as_ref())
|
||||
});
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for HashExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
_state: &ExtractState,
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if args.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(self
|
||||
.compute(name)
|
||||
.await?
|
||||
.map(|s| PileValue::String(Arc::new(s.into()))))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(LABELS.clone())
|
||||
}
|
||||
}
|
||||
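A small usage sketch for the new `HashExtractor`: field names correspond to the algorithms listed in the `hash_algos!` invocation above. The module path and the way the `BinaryPileValue` is obtained are assumptions.

```rust
use pile_config::Label;
use pile_value::{
    extract::{
        blob::HashExtractor, // assumed re-export path
        traits::{ExtractState, ObjectExtractor},
    },
    value::{BinaryPileValue, PileValue},
};

// Computes the item's sha256 on first use; repeated calls hit the OnceCell cache.
async fn sha256_of(file: &BinaryPileValue) -> Result<Option<String>, std::io::Error> {
    let state = ExtractState { ignore_mime: false };
    let value = HashExtractor::new(file)
        .field(&state, &Label::new("sha256").expect("valid label"), None)
        .await?;
    Ok(match value {
        Some(PileValue::String(s)) => Some(s.to_string()),
        _ => None,
    })
}
```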
@@ -1,4 +1,5 @@
|
||||
use id3::Tag;
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_io::SyncReadBridge;
|
||||
use std::{
|
||||
@@ -10,20 +11,106 @@ use std::{
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct Id3ImagesExtractor {
|
||||
item: BinaryPileValue,
|
||||
cached_count: OnceLock<usize>,
|
||||
}
|
||||
|
||||
impl Id3ImagesExtractor {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
cached_count: OnceLock::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_tag(&self) -> Result<Option<Tag>, std::io::Error> {
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
tokio::task::spawn_blocking(move || match Tag::read_from2(BufReader::new(reader)) {
|
||||
Ok(tag) => Ok(Some(tag)),
|
||||
Err(id3::Error {
|
||||
kind: id3::ErrorKind::Io(e),
|
||||
..
|
||||
}) => Err(e),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(std::io::Error::other)?
|
||||
}
|
||||
|
||||
fn mime_ok(&self, state: &ExtractState) -> bool {
|
||||
state.ignore_mime || self.item.mime().essence_str() == "audio/mpeg"
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ListExtractor for Id3ImagesExtractor {
|
||||
async fn get(
|
||||
&self,
|
||||
state: &ExtractState,
|
||||
idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if !self.mime_ok(state) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let Some(tag) = self.read_tag().await? else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let Some(picture) = tag.pictures().nth(idx) else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let mime: Mime = picture
|
||||
.mime_type
|
||||
.parse()
|
||||
.unwrap_or(mime::APPLICATION_OCTET_STREAM);
|
||||
let data = picture.data.clone();
|
||||
|
||||
Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(data)),
|
||||
})))
|
||||
}
|
||||
|
||||
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
||||
if !self.mime_ok(state) {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
if let Some(x) = self.cached_count.get() {
|
||||
return Ok(*x);
|
||||
}
|
||||
|
||||
let count = match self.read_tag().await? {
|
||||
Some(tag) => tag.pictures().count(),
|
||||
None => 0,
|
||||
};
|
||||
Ok(*self.cached_count.get_or_init(|| count))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Id3Extractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
images: PileValue,
|
||||
}
|
||||
|
||||
impl Id3Extractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
images: PileValue::ListExtractor(Arc::new(Id3ImagesExtractor::new(item))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,9 +119,9 @@ impl Id3Extractor {
|
||||
return Ok(x);
|
||||
}
|
||||
|
||||
trace!(message = "Reading id3 tags", key = self.item.key().as_str());
|
||||
trace!(message = "Reading id3 tags", key = ?self.item);
|
||||
|
||||
let key = self.item.key();
|
||||
let item = self.item.clone();
|
||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||
.await
|
||||
@@ -48,11 +135,7 @@ impl Id3Extractor {
|
||||
})) => return Err(e),
|
||||
|
||||
Ok(Err(error)) => {
|
||||
trace!(
|
||||
message = "Could not parse id3 tags",
|
||||
key = key.as_str(),
|
||||
?error
|
||||
);
|
||||
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -138,10 +221,21 @@ impl ObjectExtractor for Id3Extractor {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if name.as_str() == "images" {
|
||||
return Ok(Some(self.images.clone()));
|
||||
}
|
||||
|
||||
Ok(self.get_inner().await?.get(name).cloned())
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||
Ok(self
|
||||
.get_inner()
|
||||
.await?
|
||||
.keys()
|
||||
.cloned()
|
||||
.chain([Label::new("images").unwrap()])
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
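A short illustration of the new `images` list on MP3 items, driving the `Id3ImagesExtractor` through its `ListExtractor` implementation; the module path is assumed.

```rust
use pile_value::{
    extract::{
        blob::Id3ImagesExtractor, // assumed re-export path
        traits::{ExtractState, ListExtractor},
    },
    value::{BinaryPileValue, PileValue},
};

// Returns `Some(PileValue::Binary(..))` for the first embedded picture, or
// `None` when the item is not `audio/mpeg` or has no picture at that index.
async fn first_cover(mp3: &BinaryPileValue) -> Result<Option<PileValue>, std::io::Error> {
    let state = ExtractState { ignore_mime: false };
    Id3ImagesExtractor::new(mp3).get(&state, 0).await
}
```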
@@ -1,63 +1,25 @@
|
||||
mod transform;
|
||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
||||
|
||||
use image::ImageFormat;
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_io::AsyncReader;
|
||||
use std::{
|
||||
io::Cursor,
|
||||
str::FromStr,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use std::{io::Cursor, str::FromStr, sync::Arc};
|
||||
use tracing::trace;
|
||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
||||
|
||||
mod transform;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
enum ImageSource {
|
||||
Item(Item, OnceLock<Arc<Vec<u8>>>),
|
||||
Blob(Arc<Vec<u8>>, Mime),
|
||||
}
|
||||
|
||||
pub struct ImageExtractor {
|
||||
source: ImageSource,
|
||||
item: BinaryPileValue,
|
||||
}
|
||||
|
||||
impl ImageExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self {
|
||||
source: ImageSource::Item(item.clone(), OnceLock::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_blob(bytes: Arc<Vec<u8>>, mime: Mime) -> Self {
|
||||
Self {
|
||||
source: ImageSource::Blob(bytes, mime),
|
||||
}
|
||||
}
|
||||
|
||||
fn mime(&self) -> &Mime {
|
||||
match &self.source {
|
||||
ImageSource::Item(item, _) => item.mime(),
|
||||
ImageSource::Blob(_, mime) => mime,
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_bytes(&self) -> Result<Arc<Vec<u8>>, std::io::Error> {
|
||||
match &self.source {
|
||||
ImageSource::Blob(bytes, _) => Ok(bytes.clone()),
|
||||
ImageSource::Item(item, cache) => {
|
||||
if let Some(x) = cache.get() {
|
||||
return Ok(x.clone());
|
||||
}
|
||||
let mut reader = item.read().await?;
|
||||
let bytes = reader.read_to_end().await?;
|
||||
Ok(cache.get_or_init(|| Arc::new(bytes)).clone())
|
||||
}
|
||||
}
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
async fn apply<T: ImageTransformer + Send + 'static>(
|
||||
@@ -69,11 +31,14 @@ impl ImageExtractor {
|
||||
Err(_) => return Ok(None),
|
||||
};
|
||||
|
||||
let mime = self.mime().clone();
|
||||
let bytes = self.read_bytes().await?;
|
||||
let mime = self.item.mime().clone();
|
||||
let bytes = self.item.read().await?.read_to_end().await?;
|
||||
|
||||
let Some(format) = ImageFormat::from_mime_type(&mime) else {
|
||||
return Ok(Some(PileValue::Blob { mime, bytes }));
|
||||
return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
})));
|
||||
};
|
||||
|
||||
let bytes_for_closure = bytes.clone();
|
||||
@@ -91,11 +56,15 @@ impl ImageExtractor {
|
||||
.await?;
|
||||
|
||||
match result {
|
||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Blob {
|
||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: out_mime,
|
||||
bytes: Arc::new(out_bytes),
|
||||
})),
|
||||
Err(_) => Ok(Some(PileValue::Blob { mime, bytes })),
|
||||
bytes: ArcBytes(Arc::new(out_bytes)),
|
||||
}))),
|
||||
|
||||
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime,
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
}))),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
fn json_to_pile(value: serde_json::Value) -> PileValue {
|
||||
@@ -24,12 +24,12 @@ fn json_to_pile(value: serde_json::Value) -> PileValue {
|
||||
}
|
||||
|
||||
pub struct JsonExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl JsonExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -25,31 +25,31 @@ mod toml;
|
||||
use pile_config::Label;
|
||||
pub use toml::*;
|
||||
|
||||
mod group;
|
||||
pub use group::*;
|
||||
|
||||
mod text;
|
||||
pub use text::*;
|
||||
|
||||
mod image;
|
||||
pub use image::*;
|
||||
|
||||
mod hash;
|
||||
pub use hash::*;
|
||||
|
||||
use crate::{
|
||||
extract::{
|
||||
misc::MapExtractor,
|
||||
traits::{ExtractState, ObjectExtractor},
|
||||
},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct ItemExtractor {
|
||||
pub struct BinaryExtractor {
|
||||
inner: MapExtractor,
|
||||
image: Arc<ImageExtractor>,
|
||||
}
|
||||
|
||||
impl ItemExtractor {
|
||||
impl BinaryExtractor {
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(
|
||||
@@ -89,8 +89,8 @@ impl ItemExtractor {
|
||||
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
||||
),
|
||||
(
|
||||
Label::new("groups").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(GroupExtractor::new(item))),
|
||||
Label::new("hash").unwrap(),
|
||||
PileValue::ObjectExtractor(Arc::new(HashExtractor::new(item))),
|
||||
),
|
||||
]),
|
||||
};
|
||||
@@ -103,7 +103,7 @@ impl ItemExtractor {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ItemExtractor {
|
||||
impl ObjectExtractor for BinaryExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
state: &ExtractState,
|
||||
@@ -15,7 +15,7 @@ pub use pdf_text::*;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct PdfExtractor {
|
||||
@@ -26,7 +26,7 @@ pub struct PdfExtractor {
|
||||
}
|
||||
|
||||
impl PdfExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
text: Arc::new(PdfTextExtractor::new(item)),
|
||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||
@@ -46,7 +46,7 @@ impl ObjectExtractor for PdfExtractor {
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
?args,
|
||||
key = self.text.item.key().as_str(),
|
||||
item = ?self.text.item,
|
||||
"Getting field {name:?} from PdfExtractor",
|
||||
);
|
||||
|
||||
@@ -9,18 +9,19 @@ use std::{
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::value::BinaryPileValue;
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
pub struct PdfMetaExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfMetaExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -82,7 +83,7 @@ impl PdfMetaExtractor {
|
||||
let (page_count, raw_meta) = match raw_meta {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -9,15 +9,15 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ListExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct PdfPagesExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
}
|
||||
|
||||
impl PdfPagesExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
|
||||
@@ -41,7 +41,7 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
idx: usize,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
trace!(
|
||||
key = self.item.key().as_str(),
|
||||
item = ?self.item,
|
||||
"Getting index {idx} from PdfPagesExtractor",
|
||||
);
|
||||
|
||||
@@ -78,12 +78,12 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
|
||||
let value = match png {
|
||||
Ok(None) => return Ok(None),
|
||||
Ok(Some(bytes)) => PileValue::Blob {
|
||||
Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
|
||||
mime: mime::IMAGE_PNG,
|
||||
bytes: Arc::new(bytes),
|
||||
},
|
||||
bytes: ArcBytes(Arc::new(bytes)),
|
||||
}),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||
trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
|
||||
PileValue::Null
|
||||
}
|
||||
};
|
||||
@@ -108,7 +108,7 @@ impl ListExtractor for PdfPagesExtractor {
|
||||
match count {
|
||||
Ok(n) => Ok(n),
|
||||
Err(error) => {
|
||||
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
@@ -9,18 +9,19 @@ use std::{
|
||||
};
|
||||
use tracing::trace;
|
||||
|
||||
use crate::value::BinaryPileValue;
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::PileValue,
|
||||
};
|
||||
|
||||
pub struct PdfTextExtractor {
|
||||
pub(super) item: Item,
|
||||
pub(super) item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl PdfTextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -86,7 +87,7 @@ impl PdfTextExtractor {
|
||||
let raw_text = match raw_text {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
||||
return Ok(self.output.get_or_init(HashMap::new));
|
||||
}
|
||||
};
|
||||
@@ -4,16 +4,16 @@ use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
pub struct TextExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<PileValue>,
|
||||
}
|
||||
|
||||
impl TextExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, PileValue},
|
||||
};
|
||||
|
||||
fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||
@@ -25,12 +25,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||
}
|
||||
|
||||
pub struct TomlExtractor {
|
||||
item: Item,
|
||||
item: BinaryPileValue,
|
||||
output: OnceLock<HashMap<Label, PileValue>>,
|
||||
}
|
||||
|
||||
impl TomlExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
pub fn new(item: &BinaryPileValue) -> Self {
|
||||
Self {
|
||||
item: item.clone(),
|
||||
output: OnceLock::new(),
|
||||
58
crates/pile-value/src/extract/item.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use pile_config::Label;
|
||||
|
||||
use crate::{
|
||||
extract::{
|
||||
misc::MapExtractor,
|
||||
traits::{ExtractState, ObjectExtractor},
|
||||
},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct ItemExtractor {
|
||||
inner: MapExtractor,
|
||||
}
|
||||
|
||||
impl ItemExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
let files = {
|
||||
let Item::File { files, .. } = &item;
|
||||
let mut inner = HashMap::new();
|
||||
for f in files {
|
||||
inner.insert(f.0.clone(), f.1.clone());
|
||||
}
|
||||
PileValue::ObjectExtractor(Arc::new(MapExtractor { inner }))
|
||||
};
|
||||
|
||||
#[expect(clippy::unwrap_used)]
|
||||
let inner = MapExtractor {
|
||||
inner: HashMap::from([
|
||||
(Label::new("files").unwrap(), files),
|
||||
(
|
||||
Label::new("key").unwrap(),
|
||||
PileValue::String(Arc::new(item.key())),
|
||||
),
|
||||
]),
|
||||
};
|
||||
|
||||
Self { inner }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for ItemExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
state: &ExtractState,
|
||||
name: &pile_config::Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
self.inner.field(state, name, args).await
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
let fields = self.inner.fields().await?;
|
||||
Ok(fields)
|
||||
}
|
||||
}
|
||||
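For context, a sketch of reading one of the two fields the new `ItemExtractor` exposes (`key` and `files`); everything beyond those field names is hypothetical.

```rust
use pile_config::Label;
use pile_value::{
    extract::{
        item::ItemExtractor,
        traits::{ExtractState, ObjectExtractor},
    },
    value::{Item, PileValue},
};

// Resolves the item's key through the extractor; `files` would be queried the
// same way and comes back as a nested ObjectExtractor.
async fn item_key(item: &Item) -> Result<Option<String>, std::io::Error> {
    let state = ExtractState { ignore_mime: false };
    let value = ItemExtractor::new(item)
        .field(&state, &Label::new("key").expect("valid label"), None)
        .await?;
    Ok(value.and_then(|v| match v {
        PileValue::String(s) => Some(s.to_string()),
        _ => None,
    }))
}
```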
@@ -1,56 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pile_config::Label;
|
||||
|
||||
use crate::{
|
||||
extract::traits::{ExtractState, ObjectExtractor},
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
pub struct GroupExtractor {
|
||||
item: Item,
|
||||
}
|
||||
|
||||
impl GroupExtractor {
|
||||
pub fn new(item: &Item) -> Self {
|
||||
Self { item: item.clone() }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ObjectExtractor for GroupExtractor {
|
||||
async fn field(
|
||||
&self,
|
||||
_state: &ExtractState,
|
||||
name: &Label,
|
||||
args: Option<&str>,
|
||||
) -> Result<Option<PileValue>, std::io::Error> {
|
||||
if args.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(self
|
||||
.item
|
||||
.group()
|
||||
.get(name)
|
||||
.map(|item| PileValue::ObjectExtractor(Arc::new(super::ItemExtractor::new(item)))))
|
||||
}
|
||||
|
||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||
Ok(self.item.group().keys().cloned().collect())
|
||||
}
|
||||
|
||||
async fn to_json(&self, _state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
||||
Ok(serde_json::Value::Object(
|
||||
self.item
|
||||
.group()
|
||||
.iter()
|
||||
.map(|(k, v)| {
|
||||
(
|
||||
k.to_string(),
|
||||
serde_json::Value::String(format!("<GroupItem ({})>", v.key())),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
pub mod blob;
|
||||
pub mod item;
|
||||
pub mod misc;
|
||||
pub mod regex;
|
||||
|
||||
@@ -1,27 +1,25 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{
|
||||
Label,
|
||||
pattern::{GroupPattern, GroupSegment},
|
||||
};
|
||||
use pile_config::Label;
|
||||
use regex::Regex;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
collections::{BTreeMap, HashMap},
|
||||
path::PathBuf,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::{
|
||||
extract::traits::ExtractState,
|
||||
source::{DataSource, misc::path_ts_latest},
|
||||
value::{Item, PileValue},
|
||||
value::{BinaryPileValue, Item, PileValue},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DirDataSource {
|
||||
pub name: Label,
|
||||
pub dir: PathBuf,
|
||||
pub pattern: GroupPattern,
|
||||
pub base_pattern: Regex,
|
||||
pub files: HashMap<Label, String>,
|
||||
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
||||
}
|
||||
|
||||
@@ -29,21 +27,18 @@ impl DirDataSource {
|
||||
pub async fn new(
|
||||
name: &Label,
|
||||
dir: PathBuf,
|
||||
pattern: GroupPattern,
|
||||
base_pattern: Regex,
|
||||
files: HashMap<Label, String>,
|
||||
) -> Result<Arc<Self>, std::io::Error> {
|
||||
let source = Arc::new(Self {
|
||||
name: name.clone(),
|
||||
dir,
|
||||
pattern,
|
||||
base_pattern,
|
||||
files,
|
||||
index: OnceLock::new(),
|
||||
});
|
||||
|
||||
//
|
||||
// MARK: list paths
|
||||
//
|
||||
|
||||
let mut paths_items = HashSet::new();
|
||||
let mut paths_grouped_items = HashSet::new();
|
||||
let mut index = BTreeMap::new();
|
||||
'entry: for entry in WalkDir::new(&source.dir) {
|
||||
let entry = match entry {
|
||||
Err(e) => {
|
||||
@@ -59,51 +54,52 @@ impl DirDataSource {
|
||||
}
|
||||
|
||||
let path = entry.into_path();
|
||||
let path_str = match path.to_str() {
|
||||
let rel_path = match path.strip_prefix(&source.dir) {
|
||||
Ok(p) => p,
|
||||
Err(_) => continue 'entry,
|
||||
};
|
||||
let path_str = match rel_path.to_str() {
|
||||
Some(x) => x,
|
||||
None => continue 'entry,
|
||||
};
|
||||
|
||||
let groups = resolve_groups(&source.pattern, path_str).await;
|
||||
paths_grouped_items.extend(groups.into_values());
|
||||
paths_items.insert(path);
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: resolve groups
|
||||
//
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
'entry: for path in paths_items.difference(&paths_grouped_items) {
|
||||
let path_str = match path.to_str() {
|
||||
Some(x) => x,
|
||||
let captures = match source.base_pattern.captures(path_str) {
|
||||
Some(c) => c,
|
||||
None => continue 'entry,
|
||||
};
|
||||
let base = match captures.get(1) {
|
||||
Some(m) => m.as_str(),
|
||||
None => continue 'entry,
|
||||
};
|
||||
|
||||
let group = resolve_groups(&source.pattern, path_str).await;
|
||||
let group = group
|
||||
.into_iter()
|
||||
.map(|(k, group_path)| {
|
||||
(
|
||||
k,
|
||||
Box::new(Item::File {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(&group_path).first_or_octet_stream(),
|
||||
path: group_path.clone(),
|
||||
group: Arc::new(HashMap::new()),
|
||||
let key: SmartString<LazyCompact> = base.into();
|
||||
if index.contains_key(&key) {
|
||||
continue 'entry;
|
||||
}
|
||||
|
||||
let mut item_files = HashMap::new();
|
||||
for (label, template) in &source.files {
|
||||
let file_path = source.dir.join(template.replace("{base}", base));
|
||||
if file_path.exists() {
|
||||
let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
|
||||
item_files.insert(
|
||||
label.clone(),
|
||||
PileValue::Binary(BinaryPileValue::File {
|
||||
mime,
|
||||
path: file_path,
|
||||
}),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let item = Item::File {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(path).first_or_octet_stream(),
|
||||
path: path.into(),
|
||||
group: Arc::new(group),
|
||||
};
|
||||
|
||||
index.insert(item.key(), item);
|
||||
index.insert(
|
||||
key.clone(),
|
||||
Item::File {
|
||||
key,
|
||||
source: Arc::clone(&source),
|
||||
files: item_files,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
source.index.get_or_init(|| index);
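// Hedged sketch of the template expansion above (hypothetical labels and
// patterns, not taken from any real pile.toml): with base_pattern
// `^(.*)\.flac$` and files = { cover = "{base}.jpg" }, the relative path
// "albums/x.flac" captures base "albums/x", which expands to "albums/x.jpg".
fn expand_file_templates(base: &str, files: &[(&str, &str)]) -> Vec<(String, String)> {
    files
        .iter()
        .map(|(label, template)| ((*label).to_string(), template.replace("{base}", base)))
        .collect()
}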
|
||||
@@ -139,43 +135,3 @@ impl DataSource for Arc<DirDataSource> {
|
||||
path_ts_latest(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
async fn resolve_groups(pattern: &GroupPattern, path_str: &str) -> HashMap<Label, PathBuf> {
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
let mut group = HashMap::new();
|
||||
'pattern: for (l, pat) in &pattern.pattern {
|
||||
let item = PileValue::String(Arc::new(path_str.into()));
|
||||
let mut target = String::new();
|
||||
for p in pat {
|
||||
match p {
|
||||
GroupSegment::Literal(x) => target.push_str(x),
|
||||
GroupSegment::Path(op) => {
|
||||
let res = match item.query(&state, op).await {
|
||||
Ok(Some(x)) => x,
|
||||
_ => continue 'pattern,
|
||||
};
|
||||
|
||||
let res = match res.as_str() {
|
||||
Some(x) => x,
|
||||
None => continue 'pattern,
|
||||
};
|
||||
|
||||
target.push_str(res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let group_path: PathBuf = match target.parse() {
|
||||
Ok(x) => x,
|
||||
Err(_) => continue 'pattern,
|
||||
};
|
||||
|
||||
if !group_path.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
group.insert(l.clone(), group_path);
|
||||
}
|
||||
|
||||
return group;
|
||||
}
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
mod dir;
|
||||
pub use dir::*;
|
||||
|
||||
mod s3;
|
||||
pub use s3::*;
|
||||
|
||||
pub mod misc;
|
||||
|
||||
/// A read-only set of [Item]s.
|
||||
|
||||
@@ -1,322 +0,0 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use pile_config::{
|
||||
Label,
|
||||
pattern::{GroupPattern, GroupSegment},
|
||||
};
|
||||
use pile_io::S3Client;
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::traits::ExtractState,
|
||||
source::DataSource,
|
||||
value::{Item, PileValue},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3DataSource {
|
||||
pub name: Label,
|
||||
pub client: Arc<S3Client>,
|
||||
|
||||
pub prefix: Option<SmartString<LazyCompact>>,
|
||||
pub pattern: GroupPattern,
|
||||
pub encryption_key: Option<[u8; 32]>,
|
||||
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
||||
}
|
||||
|
||||
impl S3DataSource {
|
||||
pub async fn new(
|
||||
name: &Label,
|
||||
bucket: &str,
|
||||
prefix: Option<&str>,
|
||||
endpoint: Option<&str>,
|
||||
region: &str,
|
||||
access_key_id: &str,
|
||||
secret_access_key: &str,
|
||||
cache_limit_bytes: usize,
|
||||
pattern: GroupPattern,
|
||||
encryption_key: Option<[u8; 32]>,
|
||||
) -> Result<Arc<Self>, std::io::Error> {
|
||||
let client = S3Client::new(
|
||||
bucket,
|
||||
endpoint,
|
||||
region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
cache_limit_bytes,
|
||||
)
|
||||
.await;
|
||||
|
||||
let source = Arc::new(Self {
|
||||
name: name.clone(),
|
||||
client,
|
||||
prefix: prefix.map(|x| x.into()),
|
||||
pattern,
|
||||
encryption_key,
|
||||
index: OnceLock::new(),
|
||||
});
|
||||
|
||||
//
|
||||
// MARK: list keys
|
||||
//
|
||||
|
||||
let mut all_keys: HashSet<SmartString<LazyCompact>> = HashSet::new();
|
||||
let mut continuation_token: Option<String> = None;
|
||||
|
||||
loop {
|
||||
let mut req = source
|
||||
.client
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(source.client.bucket());
|
||||
|
||||
if let Some(prefix) = &source.prefix {
|
||||
req = req.prefix(prefix.as_str());
|
||||
}
|
||||
|
||||
if let Some(token) = continuation_token {
|
||||
req = req.continuation_token(token);
|
||||
}
|
||||
|
||||
let resp = req.send().await.map_err(std::io::Error::other)?;
|
||||
|
||||
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||
|
||||
for obj in resp.contents() {
|
||||
let Some(full_key) = obj.key() else { continue };
|
||||
let raw_key = strip_prefix(full_key, source.prefix.as_deref());
|
||||
let key = match &source.encryption_key {
|
||||
None => raw_key.into(),
|
||||
Some(enc_key) => match decrypt_path(enc_key, raw_key) {
|
||||
Some(decrypted) => decrypted.into(),
|
||||
None => continue,
|
||||
},
|
||||
};
|
||||
all_keys.insert(key);
|
||||
}
|
||||
|
||||
if !is_truncated {
|
||||
break;
|
||||
}
|
||||
continuation_token = next_token;
|
||||
}
|
||||
|
||||
//
|
||||
// MARK: resolve groups
|
||||
//
|
||||
|
||||
let mut keys_grouped: HashSet<SmartString<LazyCompact>> = HashSet::new();
|
||||
for key in &all_keys {
|
||||
let groups = resolve_groups(&source.pattern, key).await;
|
||||
for group_key in groups.into_values() {
|
||||
if all_keys.contains(&group_key) {
|
||||
keys_grouped.insert(group_key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
for key in all_keys.difference(&keys_grouped) {
|
||||
let groups = resolve_groups(&source.pattern, key).await;
|
||||
let group = groups
|
||||
.into_iter()
|
||||
.filter(|(_, gk)| all_keys.contains(gk))
|
||||
.map(|(label, gk)| {
|
||||
(
|
||||
label,
|
||||
Box::new(Item::S3 {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(gk.as_str()).first_or_octet_stream(),
|
||||
key: gk,
|
||||
group: Arc::new(HashMap::new()),
|
||||
}),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let item = Item::S3 {
|
||||
source: Arc::clone(&source),
|
||||
mime: mime_guess::from_path(key.as_str()).first_or_octet_stream(),
|
||||
key: key.clone(),
|
||||
group: Arc::new(group),
|
||||
};
|
||||
|
||||
index.insert(item.key(), item);
|
||||
}
|
||||
|
||||
source.index.get_or_init(|| index);
|
||||
Ok(source)
|
||||
}
|
||||
}
|
||||
|
||||
impl DataSource for Arc<S3DataSource> {
|
||||
#[expect(clippy::expect_used)]
|
||||
fn len(&self) -> usize {
|
||||
self.index.get().expect("index should be initialized").len()
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
||||
return Ok(self
|
||||
.index
|
||||
.get()
|
||||
.expect("index should be initialized")
|
||||
.get(key)
|
||||
.cloned());
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
fn iter(&self) -> impl Iterator<Item = &Item> {
|
||||
self.index
|
||||
.get()
|
||||
.expect("index should be initialized")
|
||||
.values()
|
||||
}
|
||||
|
||||
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||
let mut ts: Option<DateTime<Utc>> = None;
|
||||
let mut continuation_token: Option<String> = None;
|
||||
|
||||
loop {
|
||||
let mut req = self
|
||||
.client
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.client.bucket());
|
||||
|
||||
if let Some(prefix) = &self.prefix {
|
||||
req = req.prefix(prefix.as_str());
|
||||
}
|
||||
|
||||
if let Some(token) = continuation_token {
|
||||
req = req.continuation_token(token);
|
||||
}
|
||||
|
||||
let resp = match req.send().await {
|
||||
Err(_) => return Ok(None),
|
||||
Ok(resp) => resp,
|
||||
};
|
||||
|
||||
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||
|
||||
for obj in resp.contents() {
|
||||
if let Some(last_modified) = obj.last_modified() {
|
||||
let dt = DateTime::from_timestamp(
|
||||
last_modified.secs(),
|
||||
last_modified.subsec_nanos(),
|
||||
);
|
||||
if let Some(dt) = dt {
|
||||
ts = Some(match ts {
|
||||
None => dt,
|
||||
Some(prev) => prev.max(dt),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !is_truncated {
|
||||
break;
|
||||
}
|
||||
continuation_token = next_token;
|
||||
}
|
||||
|
||||
Ok(ts)
|
||||
}
|
||||
}
|
||||
|
||||
/// Derive an encryption key from a password
|
||||
pub fn string_to_key(password: &str) -> [u8; 32] {
|
||||
blake3::derive_key("pile s3 encryption", password.as_bytes())
|
||||
}
|
||||
|
||||
/// Encrypt a logical path to a base64 S3 key using a deterministic nonce.
|
||||
pub fn encrypt_path(enc_key: &[u8; 32], path: &str) -> String {
|
||||
use base64::Engine;
|
||||
use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
|
||||
|
||||
let hash = blake3::keyed_hash(enc_key, path.as_bytes());
|
||||
let nonce_bytes = &hash.as_bytes()[..24];
|
||||
let nonce = XNonce::from_slice(nonce_bytes);
|
||||
let key = chacha20poly1305::Key::from_slice(enc_key);
|
||||
let cipher = XChaCha20Poly1305::new(key);
|
||||
#[expect(clippy::expect_used)]
|
||||
let ciphertext = cipher
|
||||
.encrypt(nonce, path.as_bytes())
|
||||
.expect("path encryption should not fail");
|
||||
|
||||
let mut result = nonce_bytes.to_vec();
|
||||
result.extend_from_slice(&ciphertext);
|
||||
base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(result)
|
||||
}
|
||||
|
||||
/// Decrypt a base64 S3 key back to its logical path.
|
||||
fn decrypt_path(enc_key: &[u8; 32], encrypted: &str) -> Option<String> {
|
||||
use base64::Engine;
|
||||
use chacha20poly1305::{KeyInit, XChaCha20Poly1305, XNonce, aead::Aead};
|
||||
|
||||
let bytes = base64::engine::general_purpose::URL_SAFE_NO_PAD
|
||||
.decode(encrypted)
|
||||
.ok()?;
|
||||
if bytes.len() < 24 + 16 {
|
||||
return None;
|
||||
}
|
||||
let (nonce_bytes, ciphertext) = bytes.split_at(24);
|
||||
let nonce = XNonce::from_slice(nonce_bytes);
|
||||
let key = chacha20poly1305::Key::from_slice(enc_key);
|
||||
let cipher = XChaCha20Poly1305::new(key);
|
||||
let plaintext = cipher.decrypt(nonce, ciphertext).ok()?;
|
||||
String::from_utf8(plaintext).ok()
|
||||
}
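// Hedged sketch (in-module, since decrypt_path is private): round-tripping a
// logical path through the helpers above. The nonce comes from a keyed BLAKE3
// hash of the path, so the same (key, path) pair always maps to the same S3 key.
fn path_roundtrip_example() {
    let key = string_to_key("correct horse battery staple");
    let encrypted = encrypt_path(&key, "albums/track01.flac");
    // Deterministic: re-encrypting the same path yields the same S3 key
    assert_eq!(encrypt_path(&key, "albums/track01.flac"), encrypted);
    // Decryption recovers the original logical path
    assert_eq!(
        decrypt_path(&key, &encrypted).as_deref(),
        Some("albums/track01.flac")
    );
}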
|
||||
|
||||
fn strip_prefix<'a>(key: &'a str, prefix: Option<&str>) -> &'a str {
|
||||
match prefix {
|
||||
None => key,
|
||||
Some(p) => {
|
||||
let with_slash = if p.ends_with('/') {
|
||||
key.strip_prefix(p)
|
||||
} else {
|
||||
key.strip_prefix(&format!("{p}/"))
|
||||
};
|
||||
with_slash.unwrap_or(key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn resolve_groups(
|
||||
pattern: &GroupPattern,
|
||||
key: &str,
|
||||
) -> HashMap<Label, SmartString<LazyCompact>> {
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
let mut group = HashMap::new();
|
||||
'pattern: for (l, pat) in &pattern.pattern {
|
||||
let item = PileValue::String(Arc::new(key.into()));
|
||||
let mut target = String::new();
|
||||
for p in pat {
|
||||
match p {
|
||||
GroupSegment::Literal(x) => target.push_str(x),
|
||||
GroupSegment::Path(op) => {
|
||||
let res = match item.query(&state, op).await {
|
||||
Ok(Some(x)) => x,
|
||||
_ => continue 'pattern,
|
||||
};
|
||||
|
||||
let res = match res.as_str() {
|
||||
Some(x) => x,
|
||||
None => continue 'pattern,
|
||||
};
|
||||
|
||||
target.push_str(res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
group.insert(l.clone(), target.into());
|
||||
}
|
||||
|
||||
return group;
|
||||
}
|
||||
@@ -1,158 +0,0 @@
|
||||
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
|
||||
use mime::Mime;
|
||||
use std::io::{Error as IoError, Seek, SeekFrom, Write};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::S3Client;
|
||||
use crate::retry;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[expect(clippy::large_enum_variant)]
|
||||
pub enum S3ReaderError {
|
||||
#[error("sdk error")]
|
||||
SdkError(#[from] SdkError<GetObjectError>),
|
||||
|
||||
#[error("byte stream error")]
|
||||
ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
|
||||
|
||||
#[error("i/o error")]
|
||||
IoError(#[from] IoError),
|
||||
}
|
||||
|
||||
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
|
||||
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
|
||||
///
|
||||
/// Also implements [`std::io::Seek`]
|
||||
pub struct S3Reader {
|
||||
pub(super) client: S3Client,
|
||||
pub(super) bucket: String,
|
||||
pub(super) key: String,
|
||||
|
||||
pub(super) cursor: u64,
|
||||
pub(super) size: u64,
|
||||
pub(super) mime: Mime,
|
||||
}
|
||||
|
||||
impl S3Reader {
|
||||
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
|
||||
let len_left = self.size - self.cursor;
|
||||
if len_left == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)] // TODO: assumes the cursor fits in usize (may not hold on 32-bit targets)
|
||||
let start_byte = usize::try_from(self.cursor).unwrap();
|
||||
|
||||
#[expect(clippy::unwrap_used)] // usize fits in u64
|
||||
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
|
||||
|
||||
#[expect(clippy::unwrap_used)] // must fit, we called min()
|
||||
let len_to_read = usize::try_from(len_to_read).unwrap();
|
||||
|
||||
let end_byte = start_byte + len_to_read - 1;
|
||||
|
||||
let b = retry!(
|
||||
self.client.retries,
|
||||
self.client
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(self.key.as_str())
|
||||
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||
.send()
|
||||
.await
|
||||
)?;
|
||||
|
||||
// Looks like `bytes 31000000-31999999/33921176`
|
||||
// println!("{:?}", b.content_range);
|
||||
|
||||
let mut bytes = b.body.collect().await?.into_bytes();
|
||||
bytes.truncate(len_to_read);
|
||||
let l = bytes.len();
|
||||
|
||||
// Memory to memory writes are infallible
|
||||
#[expect(clippy::unwrap_used)]
|
||||
buf.write_all(&bytes).unwrap();
|
||||
|
||||
// Cannot fail, usize should always fit into u64
|
||||
#[expect(clippy::unwrap_used)]
|
||||
{
|
||||
self.cursor += u64::try_from(l).unwrap();
|
||||
}
|
||||
|
||||
return Ok(len_to_read);
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
return self.cursor == self.size;
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
&self.mime
|
||||
}
|
||||
|
||||
/// Write the entire contents of this reader to `r`.
|
||||
///
|
||||
/// This method downloads everything from the current cursor to the end of the object,
|
||||
/// and restores `self.cursor` afterwards.
|
||||
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
|
||||
let pos = self.stream_position()?;
|
||||
|
||||
const BUF_LEN: usize = 10_000_000;
|
||||
#[expect(clippy::unwrap_used)] // Cannot fail
|
||||
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
|
||||
|
||||
while !self.is_done() {
|
||||
let b = self.read(&mut buf[..]).await?;
|
||||
r.write_all(&buf[0..b])?;
|
||||
}
|
||||
|
||||
self.seek(SeekFrom::Start(pos))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for S3Reader {
|
||||
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.size - 1),
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
self.cursor += u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
// Cannot fail, is abs
|
||||
self.cursor = self.size - u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
// Cannot fail, is positive
|
||||
self.cursor = self.size + u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.size - 1);
|
||||
return Ok(self.cursor);
|
||||
}
|
||||
}
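// Hedged usage sketch: fetch only the tail of an object by seeking from the
// end and issuing one ranged read. `S3Client::get` returning an `S3Reader`
// is an assumption based on how this type is used elsewhere in the workspace.
async fn tail_1k(client: &S3Client, key: &str) -> Result<Vec<u8>, S3ReaderError> {
    let mut reader = client.get(key).await?;
    reader.seek(SeekFrom::End(-1024))?;
    let mut buf = vec![0u8; 1024];
    let n = reader.read(&mut buf).await?;
    buf.truncate(n);
    Ok(buf)
}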
|
||||
@@ -1,119 +1,45 @@
|
||||
use mime::Mime;
|
||||
use pile_config::Label;
|
||||
use pile_io::{SyncReadBridge, chacha::ChaChaReaderv1Async};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
source::{DirDataSource, S3DataSource, encrypt_path},
|
||||
value::ItemReader,
|
||||
};
|
||||
use crate::{source::DirDataSource, value::PileValue};
|
||||
|
||||
//
|
||||
// MARK: item
|
||||
//
|
||||
|
||||
/// A cheaply-cloneable pointer to an item in a dataset
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Clone)]
|
||||
pub enum Item {
|
||||
File {
|
||||
source: Arc<DirDataSource>,
|
||||
mime: Mime,
|
||||
|
||||
path: PathBuf,
|
||||
group: Arc<HashMap<Label, Box<Item>>>,
|
||||
},
|
||||
|
||||
S3 {
|
||||
source: Arc<S3DataSource>,
|
||||
mime: Mime,
|
||||
|
||||
key: SmartString<LazyCompact>,
|
||||
group: Arc<HashMap<Label, Box<Item>>>,
|
||||
source: Arc<DirDataSource>,
|
||||
files: HashMap<Label, PileValue>,
|
||||
},
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Item {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::File { key, files, .. } => f
|
||||
.debug_struct("Item::File")
|
||||
.field("key", key)
|
||||
.field("files", &files.keys().collect::<Vec<_>>())
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Item {
|
||||
/// Open the item for reading. For S3, performs a HEAD request to determine
|
||||
/// the object size.
|
||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||
Ok(match self {
|
||||
Self::File { path, .. } => ItemReader::File(File::open(path)?),
|
||||
|
||||
Self::S3 { source, key, .. } => {
|
||||
let logical_key = key.as_str();
|
||||
|
||||
let s3_key_part: SmartString<LazyCompact> = match &source.encryption_key {
|
||||
None => logical_key.into(),
|
||||
Some(enc_key) => encrypt_path(enc_key, logical_key).into(),
|
||||
};
|
||||
|
||||
let full_key: SmartString<LazyCompact> = match &source.prefix {
|
||||
None => s3_key_part,
|
||||
Some(p) => {
|
||||
if p.ends_with('/') {
|
||||
format!("{p}{s3_key_part}").into()
|
||||
} else {
|
||||
format!("{p}/{s3_key_part}").into()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let reader = source.client.get(&full_key).await?;
|
||||
|
||||
match source.encryption_key {
|
||||
None => ItemReader::S3(reader),
|
||||
Some(enc_key) => {
|
||||
ItemReader::EncryptedS3(ChaChaReaderv1Async::new(reader, enc_key).await?)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn source_name(&self) -> &pile_config::Label {
|
||||
match self {
|
||||
Self::File { source, .. } => &source.name,
|
||||
Self::S3 { source, .. } => &source.name,
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::expect_used)]
|
||||
pub fn key(&self) -> SmartString<LazyCompact> {
|
||||
match self {
|
||||
Self::File { source, path, .. } => path
|
||||
.strip_prefix(&source.dir)
|
||||
.expect("item must be inside source")
|
||||
.to_str()
|
||||
.expect("path is not utf-8")
|
||||
.into(),
|
||||
Self::S3 { key, .. } => key.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||
let read = self.read().await?;
|
||||
let mut read = SyncReadBridge::new_current(read);
|
||||
let out = tokio::task::spawn_blocking(move || {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
std::io::copy(&mut read, &mut hasher)?;
|
||||
return Ok::<_, std::io::Error>(hasher.finalize());
|
||||
})
|
||||
.await??;
|
||||
return Ok(out);
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
match self {
|
||||
Self::File { mime, .. } => mime,
|
||||
Self::S3 { mime, .. } => mime,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn group(&self) -> &HashMap<Label, Box<Self>> {
|
||||
match self {
|
||||
Self::File { group, .. } => group,
|
||||
Self::S3 { group, .. } => group,
|
||||
Self::File { key, .. } => key.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
use pile_io::{AsyncReader, AsyncSeekReader, S3Reader, chacha::ChaChaReaderv1Async};
|
||||
use std::{fs::File, io::Seek};
|
||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{Cursor, Seek},
|
||||
};
|
||||
|
||||
use crate::value::ArcBytes;
|
||||
|
||||
//
|
||||
// MARK: itemreader
|
||||
@@ -7,16 +12,14 @@ use std::{fs::File, io::Seek};
|
||||
|
||||
pub enum ItemReader {
|
||||
File(File),
|
||||
S3(S3Reader),
|
||||
EncryptedS3(ChaChaReaderv1Async<S3Reader>),
|
||||
Vec(Cursor<ArcBytes>),
|
||||
}
|
||||
|
||||
impl AsyncReader for ItemReader {
|
||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
match self {
|
||||
Self::File(x) => std::io::Read::read(x, buf),
|
||||
Self::S3(x) => x.read(buf).await,
|
||||
Self::EncryptedS3(x) => x.read(buf).await,
|
||||
Self::Vec(x) => std::io::Read::read(x, buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -25,8 +28,7 @@ impl AsyncSeekReader for ItemReader {
|
||||
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
||||
match self {
|
||||
Self::File(x) => x.seek(pos),
|
||||
Self::S3(x) => x.seek(pos).await,
|
||||
Self::EncryptedS3(x) => x.seek(pos).await,
|
||||
Self::Vec(x) => x.seek(pos),
|
||||
}
|
||||
}
|
||||
}
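// Hedged helper sketch: drain an ItemReader into memory by looping over the
// AsyncReader trait implemented above (a read of 0 bytes signals end of data
// for the File and Vec variants, which delegate to std::io::Read).
async fn read_to_vec(reader: &mut ItemReader) -> Result<Vec<u8>, std::io::Error> {
    let mut out = Vec::new();
    let mut buf = [0u8; 8192];
    loop {
        let n = reader.read(&mut buf).await?;
        if n == 0 {
            break;
        }
        out.extend_from_slice(&buf[..n]);
    }
    Ok(out)
}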
|
||||
|
||||
@@ -2,18 +2,61 @@ use mime::Mime;
|
||||
use pile_config::objectpath::{ObjectPath, PathSegment};
|
||||
use serde_json::{Map, Value};
|
||||
use smartstring::{LazyCompact, SmartString};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
extract::{
|
||||
item::{ImageExtractor, ItemExtractor},
|
||||
blob::BinaryExtractor,
|
||||
item::ItemExtractor,
|
||||
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
||||
string::StringExtractor,
|
||||
traits::{ExtractState, ListExtractor, ObjectExtractor},
|
||||
},
|
||||
value::Item,
|
||||
value::{Item, ItemReader},
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ArcBytes(pub Arc<Vec<u8>>);
|
||||
impl Debug for ArcBytes {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ArcBytes")
|
||||
.field("len()", &self.0.len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ArcBytes {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum BinaryPileValue {
|
||||
/// A binary blob
|
||||
Blob { mime: Mime, bytes: ArcBytes },
|
||||
|
||||
/// A pointer to a file
|
||||
File { mime: Mime, path: PathBuf },
|
||||
}
|
||||
|
||||
impl BinaryPileValue {
|
||||
/// Open the item for reading.
|
||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||
match self {
|
||||
Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
|
||||
Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
match self {
|
||||
Self::Blob { mime, .. } => mime,
|
||||
Self::File { mime, .. } => mime,
|
||||
}
|
||||
}
|
||||
}
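// Hedged sketch: wrap in-memory bytes as a BinaryPileValue::Blob and open it
// for reading; the blob is served through an ItemReader::Vec cursor, so no
// file or network access is involved.
async fn blob_example() -> Result<(), std::io::Error> {
    let value = BinaryPileValue::Blob {
        mime: mime::TEXT_PLAIN,
        bytes: ArcBytes(Arc::new(b"hello".to_vec())),
    };
    assert_eq!(value.mime(), &mime::TEXT_PLAIN);
    let _reader = value.read().await?;
    Ok(())
}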
|
||||
|
||||
/// An immutable, cheaply-cloneable, lazily-computed value.
|
||||
/// Very similar to [serde_json::Value].
|
||||
pub enum PileValue {
|
||||
@@ -27,12 +70,6 @@ pub enum PileValue {
|
||||
/// An array of values
|
||||
Array(Arc<Vec<PileValue>>),
|
||||
|
||||
/// A binary blob
|
||||
Blob {
|
||||
mime: Mime,
|
||||
bytes: Arc<Vec<u8>>,
|
||||
},
|
||||
|
||||
/// A lazily-computed map of {label: value}
|
||||
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
||||
|
||||
@@ -41,6 +78,9 @@ pub enum PileValue {
|
||||
|
||||
/// A pointer to an item in this dataset
|
||||
Item(Item),
|
||||
|
||||
/// Binary data
|
||||
Binary(BinaryPileValue),
|
||||
}
|
||||
|
||||
impl Clone for PileValue {
|
||||
@@ -53,11 +93,8 @@ impl Clone for PileValue {
|
||||
Self::Array(x) => Self::Array(x.clone()),
|
||||
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
||||
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
||||
Self::Blob { mime, bytes } => Self::Blob {
|
||||
mime: mime.clone(),
|
||||
bytes: bytes.clone(),
|
||||
},
|
||||
Self::Item(i) => Self::Item(i.clone()),
|
||||
Self::Binary(b) => Self::Binary(b.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -70,13 +107,10 @@ impl PileValue {
|
||||
Self::I64(_) => Arc::new(MapExtractor::default()),
|
||||
Self::Array(_) => Arc::new(MapExtractor::default()),
|
||||
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
||||
Self::Blob { mime, bytes } => {
|
||||
// TODO: make a blobextractor (with pdf, epub, etc; like item)
|
||||
Arc::new(ImageExtractor::from_blob(bytes.clone(), mime.clone()))
|
||||
}
|
||||
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
||||
Self::ObjectExtractor(e) => e.clone(),
|
||||
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
||||
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,12 +121,12 @@ impl PileValue {
|
||||
Self::I64(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
||||
Self::ListExtractor(e) => e.clone(),
|
||||
Self::ObjectExtractor(e) => e
|
||||
.as_list()
|
||||
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||
Self::Binary(_) => Arc::new(VecExtractor::default()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,14 +231,17 @@ impl PileValue {
|
||||
Ok(match self {
|
||||
Self::Null => None,
|
||||
|
||||
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
|
||||
Some(Value::Number(1u64.into()))
|
||||
}
|
||||
Self::U64(_)
|
||||
| Self::I64(_)
|
||||
| Self::String(_)
|
||||
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
|
||||
|
||||
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
||||
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
|
||||
|
||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||
Self::ObjectExtractor(_)
|
||||
| Self::Item(_)
|
||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||
let e = self.object_extractor();
|
||||
let keys = e.fields().await?;
|
||||
let mut map = Map::new();
|
||||
@@ -241,8 +278,8 @@ impl PileValue {
|
||||
Self::String(x) => Value::String(x.to_string()),
|
||||
|
||||
// TODO: replace with something meaningful?
|
||||
Self::Blob { mime, bytes } => {
|
||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
|
||||
Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
|
||||
}
|
||||
|
||||
Self::Array(_) | Self::ListExtractor(_) => {
|
||||
@@ -250,7 +287,9 @@ impl PileValue {
|
||||
return e.to_json(state).await;
|
||||
}
|
||||
|
||||
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||
Self::ObjectExtractor(_)
|
||||
| Self::Item(_)
|
||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
||||
let e = self.object_extractor();
|
||||
return e.to_json(state).await;
|
||||
}
|
||||
|
||||
@@ -9,13 +9,11 @@ workspace = true
|
||||
|
||||
[dependencies]
|
||||
pile-toolbox = { workspace = true }
|
||||
pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
|
||||
pile-value = { workspace = true, features = ["pdfium"] }
|
||||
pile-dataset = { workspace = true }
|
||||
pile-serve = { workspace = true }
|
||||
pile-value = { workspace = true }
|
||||
pile-config = { workspace = true }
|
||||
pile-io = { workspace = true }
|
||||
|
||||
aws-sdk-s3 = { workspace = true }
|
||||
bytes = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
@@ -31,3 +29,13 @@ serde_json = { workspace = true }
|
||||
axum = { workspace = true }
|
||||
utoipa = { workspace = true }
|
||||
utoipa-swagger-ui = { workspace = true }
|
||||
url = { workspace = true }
|
||||
tracing-loki = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
dotenvy = { workspace = true }
|
||||
envy = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = ["pdfium"]
|
||||
pdfium = ["pile-dataset/pdfium", "pile-serve/pdfium", "pile-value/pdfium"]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use anstyle::{AnsiColor, Color, Style};
|
||||
use indicatif::ProgressStyle;
|
||||
|
||||
pub fn clap_styles() -> clap::builder::Styles {
|
||||
clap::builder::Styles::styled()
|
||||
@@ -37,6 +36,7 @@ pub fn clap_styles() -> clap::builder::Styles {
|
||||
.placeholder(Style::new().fg_color(Some(Color::Ansi(AnsiColor::White))))
|
||||
}
|
||||
|
||||
/*
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn progress_big() -> ProgressStyle {
|
||||
return ProgressStyle::default_bar()
|
||||
@@ -50,7 +50,6 @@ pub fn progress_big() -> ProgressStyle {
|
||||
]);
|
||||
}
|
||||
|
||||
/*
|
||||
#[expect(clippy::unwrap_used)]
|
||||
pub fn spinner_small() -> ProgressStyle {
|
||||
return ProgressStyle::default_bar()
|
||||
|
||||
@@ -13,6 +13,10 @@ pub struct CheckCommand {
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for CheckCommand {
|
||||
@@ -43,7 +47,7 @@ impl CliCmd for CheckCommand {
|
||||
}
|
||||
}
|
||||
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
|
||||
@@ -1,75 +0,0 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_io::AsyncReader;
|
||||
use pile_io::chacha::{ChaChaReaderv1, ChaChaWriterv1};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use pile_value::source::string_to_key;
|
||||
use std::io::{Cursor, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{CliCmd, GlobalContext};
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct EncryptCommand {
|
||||
/// File to encrypt
|
||||
path: PathBuf,
|
||||
|
||||
/// Encryption password
|
||||
password: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct DecryptCommand {
|
||||
/// File to decrypt
|
||||
path: PathBuf,
|
||||
|
||||
/// Encryption password
|
||||
password: String,
|
||||
}
|
||||
|
||||
impl CliCmd for EncryptCommand {
|
||||
async fn run(
|
||||
self,
|
||||
_ctx: GlobalContext,
|
||||
_flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let key = string_to_key(&self.password);
|
||||
let plaintext = tokio::fs::read(&self.path)
|
||||
.await
|
||||
.with_context(|| format!("while reading '{}'", self.path.display()))?;
|
||||
|
||||
let mut writer = ChaChaWriterv1::new(Cursor::new(Vec::new()), key)
|
||||
.context("while initializing encryptor")?;
|
||||
writer.write_all(&plaintext).context("while encrypting")?;
|
||||
let buf = writer.finish().context("while finalizing encryptor")?;
|
||||
|
||||
std::io::stdout()
|
||||
.write_all(buf.get_ref())
|
||||
.context("while writing to stdout")?;
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl CliCmd for DecryptCommand {
|
||||
async fn run(
|
||||
self,
|
||||
_ctx: GlobalContext,
|
||||
_flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let key = string_to_key(&self.password);
|
||||
let ciphertext = tokio::fs::read(&self.path)
|
||||
.await
|
||||
.with_context(|| format!("while reading '{}'", self.path.display()))?;
|
||||
|
||||
let mut reader = ChaChaReaderv1::new(Cursor::new(ciphertext), key)
|
||||
.context("while initializing decryptor")?;
|
||||
let plaintext = reader.read_to_end().await.context("while decrypting")?;
|
||||
|
||||
std::io::stdout()
|
||||
.write_all(&plaintext)
|
||||
.context("while writing to stdout")?;
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_config::objectpath::ObjectPath;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
||||
@@ -40,9 +41,17 @@ pub struct FieldsCommand {
|
||||
#[arg(long)]
|
||||
max_percent: Option<f64>,
|
||||
|
||||
/// Print counts of non-null schema fields instead of raw fields
|
||||
#[arg(long)]
|
||||
schema: bool,
|
||||
|
||||
/// Restrict to these sources (all sources if empty)
|
||||
#[arg(long, short = 's')]
|
||||
source: Vec<String>,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
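// Example invocations (hypothetical source names): count raw fields across all
// sources, or count non-null schema fields for a single source.
//
//   pile fields
//   pile fields --schema --source flac --workdir ./.pile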
|
||||
|
||||
impl CliCmd for FieldsCommand {
|
||||
@@ -53,7 +62,7 @@ impl CliCmd for FieldsCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
@@ -63,6 +72,17 @@ impl CliCmd for FieldsCommand {
|
||||
let jobs = self.jobs.max(1);
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
|
||||
// Pre-collect schema fields for the --schema mode
|
||||
let schema_fields: Vec<(String, Vec<ObjectPath>)> = if self.schema {
|
||||
ds.config
|
||||
.schema
|
||||
.iter()
|
||||
.map(|(name, spec)| (name.to_string(), spec.path.clone()))
|
||||
.collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
||||
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
||||
}) {
|
||||
@@ -94,19 +114,50 @@ impl CliCmd for FieldsCommand {
|
||||
let item = item.clone();
|
||||
let name = name.clone();
|
||||
let state = state.clone();
|
||||
join_set.spawn(async move {
|
||||
let item = PileValue::Item(item);
|
||||
let result = item.count_fields(&state).await.with_context(|| {
|
||||
format!("while counting fields in source {name}")
|
||||
})?;
|
||||
Ok(result.and_then(|v| {
|
||||
if let Value::Object(m) = v {
|
||||
Some(m)
|
||||
} else {
|
||||
None
|
||||
if self.schema {
|
||||
let schema_fields = schema_fields.clone();
|
||||
join_set.spawn(async move {
|
||||
let pv = PileValue::Item(item);
|
||||
let mut counts = Map::new();
|
||||
for (field_name, paths) in &schema_fields {
|
||||
let mut present = false;
|
||||
for path in paths {
|
||||
let v =
|
||||
pv.query(&state, path).await.with_context(|| {
|
||||
format!(
|
||||
"while extracting field {field_name} in source {name}"
|
||||
)
|
||||
})?;
|
||||
if let Some(v) = v
|
||||
&& !matches!(v, PileValue::Null)
|
||||
{
|
||||
present = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
counts.insert(
|
||||
field_name.clone(),
|
||||
Value::Number((present as u64).into()),
|
||||
);
|
||||
}
|
||||
}))
|
||||
});
|
||||
Ok(Some(counts))
|
||||
});
|
||||
} else {
|
||||
join_set.spawn(async move {
|
||||
let item = PileValue::Item(item);
|
||||
let result =
|
||||
item.count_fields(&state).await.with_context(|| {
|
||||
format!("while counting fields in source {name}")
|
||||
})?;
|
||||
Ok(result.and_then(|v| {
|
||||
if let Value::Object(m) = v {
|
||||
Some(m)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}))
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,10 @@ pub struct IndexCommand {
|
||||
/// Number of threads to use for indexing
|
||||
#[arg(long, short = 'j', default_value = "3")]
|
||||
jobs: usize,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for IndexCommand {
|
||||
@@ -24,7 +28,7 @@ impl CliCmd for IndexCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ pub struct ItemCommand {
|
||||
|
||||
/// If present, print the schema fields instead of item data
|
||||
#[arg(long)]
|
||||
fields: bool,
|
||||
schema: bool,
|
||||
|
||||
#[arg(long, short = 'x')]
|
||||
exclude: Vec<String>,
|
||||
@@ -30,6 +30,10 @@ pub struct ItemCommand {
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for ItemCommand {
|
||||
@@ -43,7 +47,7 @@ impl CliCmd for ItemCommand {
|
||||
let source = Label::new(&self.source)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
|
||||
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
@@ -54,7 +58,7 @@ impl CliCmd for ItemCommand {
|
||||
})?;
|
||||
let pv = PileValue::Item(item);
|
||||
|
||||
if self.fields {
|
||||
if self.schema {
|
||||
let mut map = serde_json::Map::new();
|
||||
for (name, spec) in &ds.config.schema {
|
||||
if self.exclude.contains(&name.to_string()) {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Args;
|
||||
use pile_config::Label;
|
||||
use pile_config::objectpath::ObjectPath;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
@@ -12,7 +13,7 @@ use crate::{CliCmd, GlobalContext};
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct ListCommand {
|
||||
/// Path to query, e.g. $.flac.artist
|
||||
/// Path to query, e.g. $.flac.artist (or schema field name when --schema is set)
|
||||
#[clap(default_value = "$")]
|
||||
path: String,
|
||||
|
||||
@@ -20,6 +21,10 @@ pub struct ListCommand {
|
||||
#[arg(long)]
|
||||
invert: bool,
|
||||
|
||||
/// Treat path as a schema field name and resolve via schema paths
|
||||
#[arg(long)]
|
||||
schema: bool,
|
||||
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
@@ -31,6 +36,10 @@ pub struct ListCommand {
|
||||
/// Restrict to these sources (all sources if empty)
|
||||
#[arg(long, short = 's')]
|
||||
source: Vec<String>,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
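// Example invocations (hypothetical path and field names): list items that
// have a value at a raw path, or invert the match against a schema field.
//
//   pile list '$.flac.artist' --source flac
//   pile list artist --schema --invert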
|
||||
|
||||
impl CliCmd for ListCommand {
|
||||
@@ -40,14 +49,24 @@ impl CliCmd for ListCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let path = ObjectPath::from_str(&self.path)
|
||||
.with_context(|| format!("invalid path {:?}", self.path))?;
|
||||
let path = Arc::new(path);
|
||||
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
// Resolve path arg: either schema field paths or a single ObjectPath
|
||||
let schema_paths: Arc<Vec<ObjectPath>> = if self.schema {
|
||||
let label = Label::new(&self.path)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid schema field name {:?}", self.path))?;
|
||||
let spec = ds.config.schema.get(&label).ok_or_else(|| {
|
||||
anyhow::anyhow!("schema field {:?} not found in config", self.path)
|
||||
})?;
|
||||
Arc::new(spec.path.clone())
|
||||
} else {
|
||||
let path = ObjectPath::from_str(&self.path)
|
||||
.with_context(|| format!("invalid path {:?}", self.path))?;
|
||||
Arc::new(vec![path])
|
||||
};
|
||||
|
||||
let jobs = self.jobs.max(1);
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
|
||||
@@ -77,16 +96,20 @@ impl CliCmd for ListCommand {
|
||||
let item = item.clone();
|
||||
let source_name = name.to_string();
|
||||
let key = item.key().to_string();
|
||||
let path = path.clone();
|
||||
let schema_paths = schema_paths.clone();
|
||||
let invert = self.invert;
|
||||
let state = state.clone();
|
||||
|
||||
join_set.spawn(async move {
|
||||
let item = PileValue::Item(item);
|
||||
let value = item.query(&state, &path).await?;
|
||||
|
||||
let is_present =
|
||||
matches!(value, Some(v) if !matches!(v, PileValue::Null));
|
||||
let pv = PileValue::Item(item);
|
||||
let mut is_present = false;
|
||||
for path in schema_paths.as_ref() {
|
||||
let value = pv.query(&state, path).await?;
|
||||
if matches!(value, Some(v) if !matches!(v, PileValue::Null)) {
|
||||
is_present = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let should_print = if invert { !is_present } else { is_present };
|
||||
|
||||
|
||||
@@ -31,6 +31,10 @@ pub struct LookupCommand {
|
||||
/// Number of threads to use for indexing
|
||||
#[arg(long, short = 'j', default_value = "3")]
|
||||
jobs: usize,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for LookupCommand {
|
||||
@@ -40,7 +44,7 @@ impl CliCmd for LookupCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ use pile_toolbox::cancelabletask::{
|
||||
};
|
||||
|
||||
mod check;
|
||||
mod encrypt;
|
||||
mod fields;
|
||||
mod index;
|
||||
mod init;
|
||||
@@ -15,7 +14,6 @@ mod lookup;
|
||||
mod probe;
|
||||
mod serve;
|
||||
mod server;
|
||||
mod upload;
|
||||
|
||||
use crate::{Cli, GlobalContext};
|
||||
|
||||
@@ -85,24 +83,6 @@ pub enum SubCommand {
|
||||
#[command(flatten)]
|
||||
cmd: server::ServerCommand,
|
||||
},
|
||||
|
||||
/// Upload a filesystem source to an S3 source
|
||||
Upload {
|
||||
#[command(flatten)]
|
||||
cmd: upload::UploadCommand,
|
||||
},
|
||||
|
||||
/// Encrypt a file to stdout
|
||||
Encrypt {
|
||||
#[command(flatten)]
|
||||
cmd: encrypt::EncryptCommand,
|
||||
},
|
||||
|
||||
/// Decrypt a file to stdout
|
||||
Decrypt {
|
||||
#[command(flatten)]
|
||||
cmd: encrypt::DecryptCommand,
|
||||
},
|
||||
}
|
||||
|
||||
impl CliCmdDispatch for SubCommand {
|
||||
@@ -118,9 +98,6 @@ impl CliCmdDispatch for SubCommand {
|
||||
Self::Item { cmd } => cmd.start(ctx),
|
||||
Self::Serve { cmd } => cmd.start(ctx),
|
||||
Self::Server { cmd } => cmd.start(ctx),
|
||||
Self::Upload { cmd } => cmd.start(ctx),
|
||||
Self::Encrypt { cmd } => cmd.start(ctx),
|
||||
Self::Decrypt { cmd } => cmd.start(ctx),
|
||||
|
||||
Self::Docs {} => {
|
||||
print_help_recursively(&mut Cli::command(), None);
|
||||
|
||||
@@ -25,6 +25,10 @@ pub struct ServeCommand {
|
||||
/// Number of threads to use for indexing
|
||||
#[arg(long, short = 'j', default_value = "3")]
|
||||
jobs: usize,
|
||||
|
||||
/// Working directory root
|
||||
#[arg(long, default_value = "./.pile")]
|
||||
workdir: PathBuf,
|
||||
}
|
||||
|
||||
impl CliCmd for ServeCommand {
|
||||
@@ -33,7 +37,7 @@ impl CliCmd for ServeCommand {
|
||||
_ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Datasets::open(&self.config)
|
||||
let ds = Datasets::open(&self.config, &self.workdir)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
@@ -53,8 +57,7 @@ impl CliCmd for ServeCommand {
|
||||
})?;
|
||||
}
|
||||
|
||||
let app = Arc::new(ds)
|
||||
.router(true)
|
||||
let app = pile_serve::router(Arc::new(ds), true)
|
||||
.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
||||
|
||||
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
||||
|
||||
@@ -8,10 +8,11 @@ use axum::{
|
||||
routing::get,
|
||||
};
|
||||
use clap::Args;
|
||||
use pile_dataset::Datasets;
|
||||
use pile_dataset::{DatasetError, Datasets};
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use pile_value::extract::traits::ExtractState;
|
||||
use serde::Serialize;
|
||||
use std::{fmt::Debug, path::PathBuf, sync::Arc};
|
||||
use std::{fmt::Debug, path::PathBuf, sync::Arc, time::Duration};
|
||||
use tracing::{error, info};
|
||||
use utoipa::{OpenApi, ToSchema};
|
||||
use utoipa_swagger_ui::SwaggerUi;
|
||||
@@ -20,10 +21,6 @@ use crate::{CliCmd, GlobalContext};
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct ServerCommand {
|
||||
/// Address to bind to
|
||||
#[arg(default_value = "0.0.0.0:9000")]
|
||||
addr: String,
|
||||
|
||||
/// The datasets we should serve. Can be repeated.
|
||||
#[arg(long, short = 'c')]
|
||||
config: Vec<PathBuf>,
|
||||
@@ -32,21 +29,29 @@ pub struct ServerCommand {
|
||||
#[arg(long)]
|
||||
no_docs: bool,
|
||||
|
||||
/// If provided, require this bearer token for all requests
|
||||
/// If provided, never auto-refresh indices
|
||||
#[arg(long)]
|
||||
token: Option<String>,
|
||||
no_refresh: bool,
|
||||
|
||||
/// Number of threads to use to refresh indices
|
||||
#[arg(long, default_value = "5")]
|
||||
refresh_jobs: usize,
|
||||
|
||||
/// Refresh indices every `n` seconds
|
||||
#[arg(long, default_value = "300")]
|
||||
refresh_delay: usize,
|
||||
}
|
||||
|
||||
impl CliCmd for ServerCommand {
|
||||
async fn run(
|
||||
self,
|
||||
_ctx: GlobalContext,
|
||||
ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let datasets = {
|
||||
let mut datasets = Vec::new();
|
||||
for c in &self.config {
|
||||
let ds = Datasets::open(&c)
|
||||
let ds = Datasets::open(&c, &ctx.config.workdir_root)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", c.display()))?;
|
||||
datasets.push(Arc::new(ds));
|
||||
@@ -55,12 +60,57 @@ impl CliCmd for ServerCommand {
|
||||
Arc::new(datasets)
|
||||
};
|
||||
|
||||
let bearer = BearerToken(self.token.map(Arc::new));
|
||||
// Start auto-refresh task
|
||||
if !self.no_refresh {
|
||||
let datasets = datasets.clone();
|
||||
let jobs = self.refresh_jobs.max(1);
|
||||
let delay = self.refresh_delay.max(1);
|
||||
|
||||
async fn refresh_dataset(ds: &Datasets, jobs: usize) -> Result<(), DatasetError> {
|
||||
if ds.needs_fts().await? {
|
||||
let state = ExtractState { ignore_mime: false };
|
||||
match ds.fts_refresh(&state, jobs, None).await {
|
||||
Ok(()) => {}
|
||||
Err(CancelableTaskError::Error(err)) => return Err(err),
|
||||
Err(CancelableTaskError::Cancelled) => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
tokio::task::spawn(async move {
|
||||
loop {
|
||||
for ds in datasets.iter() {
|
||||
match refresh_dataset(ds, jobs).await {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
error!(
|
||||
message = "Error while refreshing dataset",
|
||||
dataset = ds.config.dataset.name.as_str(),
|
||||
?error
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
tokio::time::sleep(Duration::from_secs(10)).await;
|
||||
}
|
||||
|
||||
tokio::time::sleep(Duration::from_secs(delay as u64)).await;
|
||||
}
|
||||
});
|
||||
}
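// Example invocations (hypothetical config paths): serve two datasets with a
// ten-minute refresh interval and two refresh workers, or disable background
// refreshing entirely.
//
//   pile server -c music.toml -c photos.toml --refresh-delay 600 --refresh-jobs 2
//   pile server -c music.toml --no-refresh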
|
||||
|
||||
let bearer = BearerToken(ctx.config.api_token.clone().map(Arc::new));
|
||||
|
||||
let mut router = Router::new();
|
||||
for d in datasets.iter() {
|
||||
let prefix = format!("/{}", d.config.dataset.name);
|
||||
router = router.merge(d.clone().router_prefix(!self.no_docs, Some(&prefix)))
|
||||
router = router.merge(pile_serve::router_prefix(
|
||||
d.clone(),
|
||||
!self.no_docs,
|
||||
Some(&prefix),
|
||||
))
|
||||
}
|
||||
|
||||
router = router.merge(
|
||||
@@ -81,14 +131,14 @@ impl CliCmd for ServerCommand {
|
||||
|
||||
let app = router.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
||||
|
||||
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
||||
let listener = match tokio::net::TcpListener::bind(ctx.config.server_addr.clone()).await {
|
||||
Ok(x) => x,
|
||||
Err(error) => {
|
||||
match error.kind() {
|
||||
std::io::ErrorKind::AddrInUse => {
|
||||
error!(
|
||||
message = "Cannot bind to address, already in use",
|
||||
addr = self.addr
|
||||
addr = ctx.config.server_addr
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
|
||||
@@ -1,284 +0,0 @@
|
||||
use anyhow::{Context, Result};
|
||||
use aws_sdk_s3::primitives::ByteStream;
|
||||
use clap::Args;
|
||||
use indicatif::ProgressBar;
|
||||
use pile_config::Label;
|
||||
use pile_dataset::{Dataset, Datasets};
|
||||
use pile_io::chacha::ChaChaWriterv1;
|
||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||
use pile_value::source::{DataSource, DirDataSource, S3DataSource, encrypt_path};
|
||||
use std::{
|
||||
io::{Cursor, Write},
|
||||
path::PathBuf,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{CliCmd, GlobalContext, cli::progress_big};
|
||||
|
||||
#[derive(Debug, Args)]
|
||||
pub struct UploadCommand {
|
||||
/// Name of the filesystem source to upload from
|
||||
dir_source: String,
|
||||
|
||||
/// Name of the S3 source to upload to
|
||||
s3_source: String,
|
||||
|
||||
/// Prefix path under the S3 source to upload files to
|
||||
prefix: String,
|
||||
|
||||
/// Path to dataset config
|
||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||
config: PathBuf,
|
||||
|
||||
/// Override the S3 bucket from pile.toml
|
||||
#[arg(long)]
|
||||
bucket: Option<String>,
|
||||
|
||||
/// Allow overwriting files that already exist at the target prefix
|
||||
#[arg(long)]
|
||||
overwrite: bool,
|
||||
|
||||
/// Delete all files at the target prefix before uploading
|
||||
#[arg(long)]
|
||||
delete_existing_forever: bool,
|
||||
|
||||
/// Number of parallel upload jobs
|
||||
#[arg(long, short = 'j', default_value = "5")]
|
||||
jobs: usize,
|
||||
}
|
||||
|
||||
impl CliCmd for UploadCommand {
|
||||
async fn run(
|
||||
self,
|
||||
ctx: GlobalContext,
|
||||
flag: CancelFlag,
|
||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||
let ds = Datasets::open(&self.config)
|
||||
.await
|
||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||
|
||||
let dir_label = Label::new(&self.dir_source)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid source name: {}", self.dir_source))?;
|
||||
let s3_label = Label::new(&self.s3_source)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid source name: {}", self.s3_source))?;
|
||||
|
||||
let dir_ds: Arc<DirDataSource> = get_dir_source(&ds, &dir_label, &self.dir_source)?;
|
||||
let s3_ds: Arc<S3DataSource> = get_s3_source(&ds, &s3_label, &self.s3_source)?;
|
||||
|
||||
let bucket = self
|
||||
.bucket
|
||||
.as_deref()
|
||||
.unwrap_or(s3_ds.client.bucket())
|
||||
.to_owned();
|
||||
let full_prefix = self.prefix.trim_matches('/').to_owned();
|
||||
|
||||
// Check for existing objects at the target prefix
|
||||
let existing_keys = list_prefix(&s3_ds.client.client, &bucket, &full_prefix)
|
||||
.await
|
||||
.context("while checking for existing objects at target prefix")?;
|
||||
|
||||
if !existing_keys.is_empty() {
|
||||
if self.delete_existing_forever {
|
||||
info!(
|
||||
"Deleting {} existing object(s) at '{}'",
|
||||
existing_keys.len(),
|
||||
full_prefix
|
||||
);
|
||||
for key in &existing_keys {
|
||||
s3_ds
|
||||
.client
|
||||
.client
|
||||
.delete_object()
|
||||
.bucket(&bucket)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("while deleting existing object '{key}'"))?;
|
||||
}
|
||||
} else if !self.overwrite {
|
||||
return Err(anyhow::anyhow!(
|
||||
"{} file(s) already exist at '{}'. \
|
||||
Pass --overwrite to allow overwriting, \
|
||||
or --delete-existing-forever to delete them first.",
|
||||
existing_keys.len(),
|
||||
full_prefix
|
||||
)
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
// Count total files before uploading so we can show accurate progress
|
||||
        let total = dir_ds.iter().count() as u64;

        // Walk filesystem source and upload files in parallel
        let jobs = self.jobs.max(1);
        let mut uploaded: u64 = 0;
        let mut stream = dir_ds.iter();
        let mut join_set: JoinSet<Result<String, anyhow::Error>> = JoinSet::new();

        let pb = ctx.mp.add(ProgressBar::new(total));
        pb.set_style(progress_big());
        pb.enable_steady_tick(Duration::from_millis(100));
        pb.set_message(full_prefix.clone());

        loop {
            // Drain completed tasks before checking for cancellation or new work
            while join_set.len() >= jobs {
                match join_set.join_next().await {
                    Some(Ok(Ok(key))) => {
                        info!("Uploaded {key}");
                        pb.set_message(key);
                        pb.inc(1);
                        uploaded += 1;
                    }
                    Some(Ok(Err(e))) => return Err(e.into()),
                    Some(Err(e)) => return Err(anyhow::anyhow!("upload task panicked: {e}").into()),
                    None => break,
                }
            }

            if flag.is_cancelled() {
                join_set.abort_all();
                return Err(CancelableTaskError::Cancelled);
            }

            let item = match stream.next() {
                None => break,
                Some(item) => item.clone(),
            };

            let relative_str = item.key().to_string();
            let item_path = dir_ds.dir.join(&relative_str);

            let enc_key_part = match s3_ds.encryption_key {
                None => relative_str.clone(),
                Some(ref enc_key) => encrypt_path(enc_key, &relative_str),
            };
            let key = format!("{full_prefix}/{enc_key_part}");
            let mime = item.mime().to_string();
            let client = Arc::clone(&s3_ds.client);
            let bucket = bucket.clone();
            let encryption_key = s3_ds.encryption_key;

            join_set.spawn(async move {
                let body = if let Some(enc_key) = encryption_key {
                    let path = item_path.clone();
                    let encrypted =
                        tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<u8>> {
                            let plaintext = std::fs::read(&path)
                                .with_context(|| format!("while opening '{}'", path.display()))?;
                            let mut writer = ChaChaWriterv1::new(Cursor::new(Vec::new()), enc_key)
                                .context("while initializing encryptor")?;
                            writer.write_all(&plaintext).context("while encrypting")?;
                            Ok(writer.finish().context("while finalizing")?.into_inner())
                        })
                        .await
                        .context("encryptor task panicked")??;
                    ByteStream::from(bytes::Bytes::from(encrypted))
                } else {
                    ByteStream::from_path(&item_path)
                        .await
                        .with_context(|| format!("while opening '{}'", item_path.display()))?
                };

                client
                    .client
                    .put_object()
                    .bucket(&bucket)
                    .key(&key)
                    .content_type(&mime)
                    .body(body)
                    .send()
                    .await
                    .with_context(|| {
                        format!("while uploading '{}' to '{key}'", item_path.display())
                    })?;

                Ok(key)
            });
        }

        // Drain remaining tasks
        while let Some(result) = join_set.join_next().await {
            match result {
                Ok(Ok(key)) => {
                    info!("Uploaded {key}");
                    pb.set_message(key);
                    pb.inc(1);
                    uploaded += 1;
                }
                Ok(Err(e)) => return Err(e.into()),
                Err(e) => return Err(anyhow::anyhow!("upload task panicked: {e}").into()),
            }
        }

        pb.finish_and_clear();
        info!("Done: uploaded {uploaded} file(s) to '{full_prefix}'");
        Ok(0)
    }
}

fn get_dir_source(
    ds: &Datasets,
    label: &Label,
    name: &str,
) -> Result<Arc<DirDataSource>, anyhow::Error> {
    match ds.sources.get(label).or(ds.disabled_sources.get(label)) {
        Some(Dataset::Dir(d)) => Ok(Arc::clone(d)),
        Some(_) => Err(anyhow::anyhow!(
            "source '{name}' is not a filesystem source"
        )),
        None => Err(anyhow::anyhow!(
            "filesystem source '{name}' not found in config"
        )),
    }
}

fn get_s3_source(
    ds: &Datasets,
    label: &Label,
    name: &str,
) -> Result<Arc<S3DataSource>, anyhow::Error> {
    match ds.sources.get(label).or(ds.disabled_sources.get(label)) {
        Some(Dataset::S3(s)) => Ok(Arc::clone(s)),
        Some(_) => Err(anyhow::anyhow!("source '{name}' is not an S3 source")),
        None => Err(anyhow::anyhow!("s3 source '{name}' not found in config")),
    }
}

/// List all S3 object keys under the given prefix.
async fn list_prefix(
    client: &aws_sdk_s3::Client,
    bucket: &str,
    prefix: &str,
) -> Result<Vec<String>> {
    let mut keys = Vec::new();
    let mut continuation_token: Option<String> = None;

    loop {
        let mut req = client.list_objects_v2().bucket(bucket).prefix(prefix);

        if let Some(token) = continuation_token {
            req = req.continuation_token(token);
        }

        let resp = req.send().await.context("list_objects_v2 failed")?;

        for obj in resp.contents() {
            if let Some(k) = obj.key() {
                keys.push(k.to_owned());
            }
        }

        if !resp.is_truncated().unwrap_or(false) {
            break;
        }

        continuation_token = resp.next_continuation_token().map(ToOwned::to_owned);
    }

    Ok(keys)
}
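
// Illustrative usage (not part of this diff): `list_prefix` follows
// `list_objects_v2` continuation tokens until the listing is no longer
// truncated, so a hypothetical caller collects every key in one Vec:
//
//     let keys = list_prefix(&s3_ds.client.client, &bucket, &full_prefix).await?;
//     info!("found {} object(s) under '{full_prefix}'", keys.len());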

109
crates/pile/src/config/config.rs
Normal file
@@ -0,0 +1,109 @@
use serde::Deserialize;
use std::{num::NonZeroUsize, path::PathBuf};
use tracing::debug;

use crate::config::{
    env::load_env,
    logging::{LoggingFormat, LoggingInitializer, LoggingPreset, LoggingTarget, LokiConfig},
};

/// Note that the fields of this struct are not capitalized.
/// Envy is case-insensitive, and expects Rust fields to be snake_case.
#[derive(Debug, Deserialize, Clone)]
pub struct PileServerConfig {
    #[serde(flatten)]
    pub loki: Option<LokiConfig>,

    /// The logging level to run with
    #[serde(default)]
    pub loglevel: LoggingPreset,

    #[serde(default)]
    pub logformat: LoggingFormat,

    /// How many worker threads to use
    pub threads: Option<NonZeroUsize>,

    /// IP and port to bind to
    /// Should look like `127.0.0.1:3030`
    pub server_addr: String,

    pub api_token: Option<String>,
    pub workdir_root: PathBuf,
}
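
// Illustrative only (not introduced by this change): envy matches environment
// variables to these snake_case fields case-insensitively, so a deployment like
// the one in docker-compose.yml configures the server with e.g.
//
//     SERVER_ADDR=0.0.0.0:7100
//     WORKDIR_ROOT=/workdir
//     API_TOKEN=pile_token
//     THREADS=8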

impl Default for PileServerConfig {
    fn default() -> Self {
        Self {
            loki: None,
            loglevel: LoggingPreset::Debug,
            logformat: LoggingFormat::Ansi,
            threads: None,
            server_addr: "0.0.0.0:3000".into(),
            api_token: None,
            workdir_root: "./.pile".into(),
        }
    }
}

impl PileServerConfig {
    pub fn load(with_env: bool, cli_log_level: LoggingPreset) -> Self {
        let config = match with_env {
            false => Self::default(),
            true => {
                let env = match load_env::<Self>() {
                    Ok(x) => x,

                    #[expect(clippy::print_stdout)]
                    Err(err) => {
                        println!("Error while loading .env: {err}");
                        std::process::exit(1);
                    }
                };
                env.get_config().clone()
            }
        };

        {
            let res = LoggingInitializer {
                app_name: "pile-server",
                loki: config.loki.clone(),
                preset: if with_env {
                    config.loglevel
                } else {
                    cli_log_level
                },
                target: LoggingTarget::Stderr {
                    format: config.logformat,
                },
            }
            .initialize();

            if let Err(e) = res {
                #[expect(clippy::print_stderr)]
                for e in e.chain() {
                    eprintln!("{e}");
                }

                std::process::exit(1);
            }
        }

        debug!(message = "Config loaded", ?config);

        return config;
    }

    pub fn make_runtime(&self) -> tokio::runtime::Runtime {
        let mut rt = tokio::runtime::Builder::new_multi_thread();
        rt.enable_all();
        if let Some(threads) = self.threads {
            rt.worker_threads(threads.into());
        }

        #[expect(clippy::unwrap_used)]
        let rt = rt.build().unwrap();

        return rt;
    }
}

108
crates/pile/src/config/env.rs
Normal file
@@ -0,0 +1,108 @@
#![expect(dead_code)]

use serde::de::DeserializeOwned;
use std::{
    collections::HashMap,
    env::VarError,
    io::ErrorKind,
    path::{Path, PathBuf},
};
use thiserror::Error;

/// An error we might encounter when loading an env
#[derive(Debug, Error)]
pub enum EnvLoadError {
    #[error("i/o error")]
    IOError(#[from] std::io::Error),

    #[error("varerror")]
    VarError(#[from] VarError),

    #[error("line parse error: `{on_line}` at char {at_char}")]
    LineParse { on_line: String, at_char: usize },

    #[error("other dotenvy error")]
    Other(#[from] dotenvy::Error),

    #[error("missing value {0}")]
    MissingValue(String),

    #[error("parse error: {0}")]
    OtherParseError(String),
}

pub enum LoadedEnv<T> {
    /// We loaded config from `.env` and env vars
    FoundFile { config: T, path: PathBuf },

    /// We could not find `.env` and only loaded env vars
    OnlyVars(T),
}

impl<T> LoadedEnv<T> {
    pub fn get_config(&self) -> &T {
        match self {
            Self::FoundFile { config, .. } => config,
            Self::OnlyVars(config) => config,
        }
    }
}

/// Load the configuration type `T` from the current environment,
/// including the `.env` if it exists.
#[expect(clippy::wildcard_enum_match_arm)]
pub fn load_env<T: DeserializeOwned>() -> Result<LoadedEnv<T>, EnvLoadError> {
    let env_path = match dotenvy::dotenv() {
        Ok(path) => Some(path),

        Err(dotenvy::Error::Io(err)) => match err.kind() {
            ErrorKind::NotFound => None,
            _ => return Err(EnvLoadError::IOError(err)),
        },

        Err(dotenvy::Error::EnvVar(err)) => {
            return Err(EnvLoadError::VarError(err));
        }

        Err(dotenvy::Error::LineParse(on_line, at_char)) => {
            return Err(EnvLoadError::LineParse { on_line, at_char });
        }

        Err(err) => {
            return Err(EnvLoadError::Other(err));
        }
    };

    match envy::from_env::<T>() {
        Ok(config) => {
            if let Some(path) = env_path {
                return Ok(LoadedEnv::FoundFile { path, config });
            } else {
                return Ok(LoadedEnv::OnlyVars(config));
            }
        }

        Err(envy::Error::MissingValue(value)) => {
            return Err(EnvLoadError::MissingValue(value.into()));
        }

        Err(envy::Error::Custom(message)) => {
            return Err(EnvLoadError::OtherParseError(message));
        }
    };
}
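
// Illustrative only (not part of this diff): callers such as
// `PileServerConfig::load` deserialize their config struct and read it the same
// way whether or not a `.env` file was found:
//
//     let loaded = load_env::<PileServerConfig>()?;
//     let config = loaded.get_config().clone();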

/// Load an .env file to a hashmap.
///
/// This function does not read the current env,
/// only parsing vars explicitly declared in the given file.
pub fn load_env_dict(p: impl AsRef<Path>) -> Result<HashMap<String, String>, EnvLoadError> {
    let mut out = HashMap::new();

    for item in dotenvy::from_filename_iter(p)? {
        let (key, val) = item?;
        out.insert(key, val);
    }

    return Ok(out);
}

@@ -1,7 +1,13 @@
use anyhow::Result;
use clap::ValueEnum;
use indicatif::MultiProgress;
use serde::Deserialize;
use std::{fmt::Display, str::FromStr};
use tracing_subscriber::EnvFilter;
use tracing_indicatif::IndicatifWriter;
use tracing_subscriber::{
    EnvFilter, Layer, fmt::MakeWriter, layer::SubscriberExt, util::SubscriberInitExt,
};
use url::Url;

#[derive(Debug, Default)]
pub enum LogLevel {
@@ -32,6 +38,7 @@ pub enum LoggingPreset {
    Info,
    Debug,
    Trace,
    Loki,
}

pub struct LoggingConfig {
@@ -138,6 +145,203 @@ impl LoggingPreset {
                pile_dataset: LogLevel::Trace,
                pile_toolbox: LogLevel::Trace,
            },

            Self::Loki => LoggingConfig {
                other: LogLevel::Warn,
                extractor: LogLevel::Error,

                pile: LogLevel::Trace,
                pile_flac: LogLevel::Trace,
                pile_config: LogLevel::Trace,
                pile_dataset: LogLevel::Trace,
                pile_toolbox: LogLevel::Trace,
            },
        }
    }
}

//
// MARK: initializer
//

#[derive(Debug, Deserialize, Clone)]
pub struct LokiConfig {
    pub loki_host: Url,
    pub loki_user: String,
    pub loki_pass: String,
    pub loki_node_name: String,
}
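
// Note (illustrative, not part of this diff): `PileServerConfig` embeds this
// struct via `#[serde(flatten)]`, so these fields are read from environment
// variables such as the commented-out LOKI_HOST / LOKI_USER / LOKI_PASS /
// LOKI_NODE_NAME entries in docker-compose.yml.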

/// Where to print logs
#[expect(dead_code)]
pub enum LoggingTarget {
    /// Send logs to stdout
    Stdout { format: LoggingFormat },

    /// Send logs to stderr
    Stderr { format: LoggingFormat },

    /// Send logs to an IndicatifWriter.
    ///
    /// This is the same as `Stderr { format: LoggingFormat::Ansi }`,
    /// but uses an IndicatifWriter with the given MultiProgress.
    Indicatif(MultiProgress),
}

/// How to print logs
#[derive(Debug, Clone, Copy, Deserialize, Default)]
pub enum LoggingFormat {
    #[default]
    Ansi,
    AnsiNoColor,
    Json,
}

pub struct LoggingInitializer {
    pub app_name: &'static str,

    /// If `Some`, send logs to the given loki server
    pub loki: Option<LokiConfig>,

    /// Log filter for printed logs
    pub preset: LoggingPreset,

    /// Where to print logs
    pub target: LoggingTarget,
}

impl LoggingInitializer {
    pub fn initialize(self) -> Result<()> {
        let mut stderr_ansi_layer = None;
        let mut stderr_json_layer = None;
        let mut stdout_ansi_layer = None;
        let mut stdout_json_layer = None;
        let mut indicatif_layer = None;
        match self.target {
            LoggingTarget::Stderr {
                format: LoggingFormat::Ansi,
            } => {
                stderr_ansi_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(true)
                        .with_writer(std::io::stderr)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Stderr {
                format: LoggingFormat::AnsiNoColor,
            } => {
                stderr_ansi_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(false)
                        .with_writer(std::io::stderr)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Stderr {
                format: LoggingFormat::Json,
            } => {
                stderr_json_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(false)
                        .json()
                        .flatten_event(true)
                        .with_writer(std::io::stderr)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Stdout {
                format: LoggingFormat::Ansi,
            } => {
                stdout_ansi_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(true)
                        .with_writer(std::io::stdout)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Stdout {
                format: LoggingFormat::AnsiNoColor,
            } => {
                stdout_ansi_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(false)
                        .with_writer(std::io::stdout)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Stdout {
                format: LoggingFormat::Json,
            } => {
                stdout_json_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(false)
                        .json()
                        .flatten_event(true)
                        .with_writer(std::io::stdout)
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }

            LoggingTarget::Indicatif(mp) => {
                let writer: IndicatifWriter<tracing_indicatif::writer::Stderr> =
                    IndicatifWriter::new(mp);

                indicatif_layer = Some(
                    tracing_subscriber::fmt::Layer::default()
                        .without_time()
                        .with_ansi(true)
                        .with_writer(writer.make_writer())
                        .with_filter::<EnvFilter>(self.preset.get_config().into()),
                )
            }
        }

        let loki_layer = {
            if let Some(cfg) = self.loki {
                use anyhow::Context;
                use base64::{Engine, prelude::BASE64_STANDARD};

                let basic_auth = format!("{}:{}", cfg.loki_user, cfg.loki_pass);
                let encoded_basic_auth = BASE64_STANDARD.encode(basic_auth.as_bytes());

                let (layer, task) = tracing_loki::builder()
                    .label("node_name", cfg.loki_node_name)
                    .context("while building loki node_name label")?
                    .label("app", self.app_name)
                    .context("while building loki app label")?
                    .http_header("Authorization", format!("Basic {encoded_basic_auth}"))
                    .context("while building loki header")?
                    .build_url(cfg.loki_host)
                    .context("while building loki layer")?;

                tokio::spawn(task);
                Some(layer.with_filter::<EnvFilter>(LoggingPreset::Loki.get_config().into()))
            } else {
                None
            }
        };

        tracing_subscriber::registry()
            .with(loki_layer)
            .with(stdout_ansi_layer)
            .with(stdout_json_layer)
            .with(stderr_ansi_layer)
            .with(stderr_json_layer)
            .with(indicatif_layer)
            .init();

        Ok(())
    }
}

@@ -1,2 +1,6 @@
mod logging;
pub use logging::*;
pub mod env;
pub mod logging;

#[expect(clippy::module_inception)]
mod config;
pub use config::*;

@@ -1,15 +1,13 @@
use anyhow::{Context, Result};
use clap::Parser;
use config::LoggingPreset;
use indicatif::MultiProgress;
use pile_toolbox::cancelabletask::CancelableTaskResult;
use std::process::ExitCode;
use tracing::{error, warn};
use tracing_indicatif::{IndicatifWriter, writer::Stderr};
use tracing_subscriber::fmt::MakeWriter;

use crate::{
    command::{CliCmd, CliCmdDispatch, SubCommand},
    config::{PileServerConfig, logging::LoggingPreset},
    signal::start_signal_task,
};

@@ -36,17 +34,11 @@ struct Cli {
#[derive(Clone)]
pub struct GlobalContext {
    pub mp: MultiProgress,
    pub config: PileServerConfig,
}

fn main() -> ExitCode {
    #[expect(clippy::unwrap_used)]
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .worker_threads(10)
        .build()
        .unwrap();

    match rt.block_on(main_inner()) {
    match main_inner() {
        Ok(code) => {
            std::process::exit(code);
        }
@@ -59,7 +51,7 @@ fn main() -> ExitCode {
    }
}

async fn main_inner() -> Result<i32> {
fn main_inner() -> Result<i32> {
    let cli = Cli::parse();

    let level_i: i16 = cli.v as i16 - cli.q as i16;
@@ -80,36 +72,32 @@ async fn main_inner() -> Result<i32> {
    }

    let mp = MultiProgress::new();
    let writer: IndicatifWriter<Stderr> = IndicatifWriter::new(mp.clone());
    let config = PileServerConfig::load(matches!(cli.cmd, SubCommand::Server { .. }), level);
    let rt = config.make_runtime();

    tracing_subscriber::fmt()
        .with_env_filter(level.get_config())
        .without_time()
        .with_ansi(true)
        .with_writer(writer.make_writer())
        .init();
    let ctx = GlobalContext { mp, config };

    let ctx = GlobalContext { mp };
    rt.block_on(async {
        let task = cli.cmd.start(ctx).context("while starting task")?;
        let signal_task = start_signal_task(task.flag().clone());

    let task = cli.cmd.start(ctx).context("while starting task")?;
    let signal_task = start_signal_task(task.flag().clone());
        match task.join().await {
            Ok(CancelableTaskResult::Finished(Ok(code))) => Ok(code),
            Ok(CancelableTaskResult::Cancelled) => {
                signal_task.abort();
                warn!("Task cancelled successfully");
                Ok(1)
            }

    match task.join().await {
        Ok(CancelableTaskResult::Finished(Ok(code))) => Ok(code),
        Ok(CancelableTaskResult::Cancelled) => {
            signal_task.abort();
            warn!("Task cancelled successfully");
            Ok(1)
            Err(err) => {
                signal_task.abort();
                Err(err).context("while joining task")
            }

            Ok(CancelableTaskResult::Finished(Err(err))) => {
                signal_task.abort();
                Err(err).context("while running task")
            }
        }

        Err(err) => {
            signal_task.abort();
            Err(err).context("while joining task")
        }

        Ok(CancelableTaskResult::Finished(Err(err))) => {
            signal_task.abort();
            Err(err).context("while running task")
        }
    }
    })
}

24
docker-compose.yml
Normal file
@@ -0,0 +1,24 @@
services:
  pile:
    #image: git.betalupi.com/mark/pile:latest
    image: pile
    container_name: pile
    restart: unless-stopped

    ports:
      - 7100:7100
    volumes:
      - "./x.ignore/books:/data/books:ro"
      - "./pile:/workdir"

    environment:
      SERVER_ADDR: "0.0.0.0:7100"
      WORKDIR_ROOT: "/workdir"
      API_TOKEN: "pile_token"
      THREADS: 8
      #LOKI_HOST: "http://loki:3100"
      #LOKI_USER: "user"
      #LOKI_PASS: "pass"
      #LOKI_NODE_NAME: "pile"

    command: "pile server -c /data/books/pile.toml"
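
    # Illustrative notes (not part of this diff): the environment variables above
    # map case-insensitively onto PileServerConfig fields (server_addr, workdir_root,
    # api_token, threads), and uncommenting the LOKI_* block enables the optional
    # Loki logging layer. A typical local run would be something like
    # `docker compose up -d`.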