Compare commits
6 Commits
main
...
82dcdbaa27
| Author | SHA1 | Date | |
|---|---|---|---|
| 82dcdbaa27 | |||
| 42f186d77f | |||
| b36b62150c | |||
| c03fac0e37 | |||
| c9d99e8719 | |||
| 4546a85bd3 |
@@ -1,30 +0,0 @@
|
|||||||
name: Docker
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
build-and-push:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Log in to Gitea container registry
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
registry: git.betalupi.com
|
|
||||||
username: ${{ gitea.actor }}
|
|
||||||
password: ${{ secrets.DEPLOY_TOKEN }}
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
|
|
||||||
- name: Build and push
|
|
||||||
uses: docker/build-push-action@v6
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: true
|
|
||||||
tags: git.betalupi.com/mark/pile:latest
|
|
||||||
cache-from: type=registry,ref=git.betalupi.com/mark/pile:cache
|
|
||||||
cache-to: type=registry,ref=git.betalupi.com/mark/pile:cache,mode=max
|
|
||||||
1212
Cargo.lock
generated
1212
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
29
Cargo.toml
29
Cargo.toml
@@ -57,7 +57,6 @@ unimplemented = "deny"
|
|||||||
unwrap_used = "warn"
|
unwrap_used = "warn"
|
||||||
expect_used = "warn"
|
expect_used = "warn"
|
||||||
type_complexity = "allow"
|
type_complexity = "allow"
|
||||||
len_without_is_empty = "allow"
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# MARK: dependencies
|
# MARK: dependencies
|
||||||
@@ -69,13 +68,9 @@ pile-config = { path = "crates/pile-config" }
|
|||||||
pile-flac = { path = "crates/pile-flac" }
|
pile-flac = { path = "crates/pile-flac" }
|
||||||
pile-dataset = { path = "crates/pile-dataset" }
|
pile-dataset = { path = "crates/pile-dataset" }
|
||||||
pile-value = { path = "crates/pile-value" }
|
pile-value = { path = "crates/pile-value" }
|
||||||
pile-io = { path = "crates/pile-io" }
|
|
||||||
pile-client = { path = "crates/pile-client" }
|
|
||||||
pile-serve = { path = "crates/pile-serve" }
|
|
||||||
|
|
||||||
# MARK: Clients & servers
|
# Clients & servers
|
||||||
tantivy = "0.25.0"
|
tantivy = "0.25.0"
|
||||||
servable = { version = "0.0.7", features = ["image"] }
|
|
||||||
axum = { version = "0.8.8", features = ["macros", "multipart"] }
|
axum = { version = "0.8.8", features = ["macros", "multipart"] }
|
||||||
utoipa = { version = "5.4.0", features = [
|
utoipa = { version = "5.4.0", features = [
|
||||||
"axum_extras",
|
"axum_extras",
|
||||||
@@ -88,15 +83,15 @@ utoipa-swagger-ui = { version = "9.0.2", features = [
|
|||||||
"debug-embed",
|
"debug-embed",
|
||||||
"vendored",
|
"vendored",
|
||||||
] }
|
] }
|
||||||
reqwest = { version = "0.12", features = ["blocking", "json", "stream"] }
|
|
||||||
tracing-loki = "0.2.6"
|
|
||||||
|
|
||||||
# MARK: Async & Parallelism
|
# Async & Parallelism
|
||||||
tokio = { version = "1.49.0", features = ["full"] }
|
tokio = { version = "1.49.0", features = ["full"] }
|
||||||
tokio-stream = "0.1"
|
tokio-stream = "0.1"
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
|
aws-sdk-s3 = "1"
|
||||||
|
aws-config = "1"
|
||||||
|
|
||||||
# MARK: CLI & logging
|
# CLI & logging
|
||||||
tracing = "0.1.44"
|
tracing = "0.1.44"
|
||||||
tracing-subscriber = { version = "0.3.22", features = ["env-filter", "json"] }
|
tracing-subscriber = { version = "0.3.22", features = ["env-filter", "json"] }
|
||||||
indicatif = { version = "0.18.4", features = ["improved_unicode"] }
|
indicatif = { version = "0.18.4", features = ["improved_unicode"] }
|
||||||
@@ -104,21 +99,16 @@ tracing-indicatif = "0.3.14"
|
|||||||
anstyle = "1.0.13"
|
anstyle = "1.0.13"
|
||||||
clap = { version = "4.5.60", features = ["derive"] }
|
clap = { version = "4.5.60", features = ["derive"] }
|
||||||
|
|
||||||
# MARK: Serialization & formats
|
# Serialization & formats
|
||||||
serde = { version = "1.0.228", features = ["derive"] }
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
serde_json = "1.0.149"
|
serde_json = "1.0.149"
|
||||||
base64 = "0.22.1"
|
base64 = "0.22.1"
|
||||||
bytes = "1"
|
|
||||||
toml = "1.0.3"
|
toml = "1.0.3"
|
||||||
toml_edit = "0.25.4"
|
toml_edit = "0.25.4"
|
||||||
sha2 = "0.11.0-rc.5"
|
sha2 = "0.11.0-rc.5"
|
||||||
sha1 = "0.10"
|
|
||||||
md5 = "0.7"
|
|
||||||
blake3 = "1.8.3"
|
blake3 = "1.8.3"
|
||||||
dotenvy = "0.15.7"
|
|
||||||
envy = "0.4.2"
|
|
||||||
|
|
||||||
# MARK: Extractors
|
# Extractors
|
||||||
pdf = "0.10.0"
|
pdf = "0.10.0"
|
||||||
id3 = "1.16.4"
|
id3 = "1.16.4"
|
||||||
epub = "1.2.2"
|
epub = "1.2.2"
|
||||||
@@ -126,7 +116,7 @@ kamadak-exif = "0.6.1"
|
|||||||
pdfium-render = "0.8"
|
pdfium-render = "0.8"
|
||||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||||
|
|
||||||
# MARK: Misc helpers
|
# Misc helpers
|
||||||
thiserror = "2.0.18"
|
thiserror = "2.0.18"
|
||||||
anyhow = "1.0.102"
|
anyhow = "1.0.102"
|
||||||
itertools = "0.14.0"
|
itertools = "0.14.0"
|
||||||
@@ -137,9 +127,6 @@ mime = "0.3.17"
|
|||||||
mime_guess = "2.0.5"
|
mime_guess = "2.0.5"
|
||||||
paste = "1.0.15"
|
paste = "1.0.15"
|
||||||
smartstring = "1.0.1"
|
smartstring = "1.0.1"
|
||||||
regex = "1"
|
|
||||||
chrono = "0.4.43"
|
chrono = "0.4.43"
|
||||||
parking_lot = "0.12.5"
|
parking_lot = "0.12.5"
|
||||||
rayon = "1.11.0"
|
rayon = "1.11.0"
|
||||||
percent-encoding = "2"
|
|
||||||
url = { version = "2.5.8", features = ["serde"] }
|
|
||||||
|
|||||||
38
Dockerfile
38
Dockerfile
@@ -1,38 +0,0 @@
|
|||||||
FROM rust:1.94-bookworm AS base
|
|
||||||
|
|
||||||
#
|
|
||||||
# MARK: Build
|
|
||||||
#
|
|
||||||
FROM base AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends \
|
|
||||||
ca-certificates wget unzip \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /app/rust
|
|
||||||
COPY . .
|
|
||||||
RUN cargo build --release --workspace
|
|
||||||
RUN cargo test --release --workspace
|
|
||||||
|
|
||||||
#
|
|
||||||
# MARK: Release
|
|
||||||
#
|
|
||||||
FROM debian:bookworm AS deploy
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends \
|
|
||||||
ca-certificates \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
COPY --from=build \
|
|
||||||
/app/rust/target/release/pile \
|
|
||||||
/app/rust/target/release/libpdfium.so \
|
|
||||||
/app/bin/
|
|
||||||
|
|
||||||
ENV PATH="/app/bin:$PATH"
|
|
||||||
ENV RUST_BACKTRACE=full
|
|
||||||
|
|
||||||
ENTRYPOINT [""]
|
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "pile-client"
|
|
||||||
version = { workspace = true }
|
|
||||||
rust-version = { workspace = true }
|
|
||||||
edition = { workspace = true }
|
|
||||||
|
|
||||||
[lints]
|
|
||||||
workspace = true
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
pile-serve = { workspace = true }
|
|
||||||
|
|
||||||
reqwest = { workspace = true }
|
|
||||||
serde = { workspace = true }
|
|
||||||
thiserror = { workspace = true }
|
|
||||||
bytes = { workspace = true }
|
|
||||||
axum = { workspace = true }
|
|
||||||
tracing = { workspace = true }
|
|
||||||
@@ -1,339 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Router, body::Body as AxumBody, extract::State, response::Response as AxumResponse,
|
|
||||||
routing::any,
|
|
||||||
};
|
|
||||||
use bytes::Bytes;
|
|
||||||
use reqwest::{Client, StatusCode, header};
|
|
||||||
use serde::Deserialize;
|
|
||||||
use thiserror::Error;
|
|
||||||
use tracing::{trace, warn};
|
|
||||||
|
|
||||||
pub use pile_serve::{
|
|
||||||
ApiValue, FieldSpec, FieldsResponse, ItemsResponse, LookupRequest, LookupResponse,
|
|
||||||
SchemaResponse,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
|
||||||
pub enum ClientError {
|
|
||||||
#[error("invalid bearer token")]
|
|
||||||
InvalidToken,
|
|
||||||
|
|
||||||
#[error("HTTP {status}: {body}")]
|
|
||||||
Http { status: StatusCode, body: String },
|
|
||||||
|
|
||||||
#[error(transparent)]
|
|
||||||
Reqwest(#[from] reqwest::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
pub struct DatasetInfo {
|
|
||||||
pub name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Raw field response: the content-type and body bytes as returned by the server.
|
|
||||||
pub struct FieldResponse {
|
|
||||||
pub content_type: String,
|
|
||||||
pub data: Bytes,
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: PileClient
|
|
||||||
//
|
|
||||||
|
|
||||||
/// A client for a pile server. Use [`PileClient::dataset`] to get a dataset-scoped client.
|
|
||||||
pub struct PileClient {
|
|
||||||
base_url: String,
|
|
||||||
client: Client,
|
|
||||||
token: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PileClient {
|
|
||||||
pub fn new(base_url: impl Into<String>, token: Option<&str>) -> Result<Self, ClientError> {
|
|
||||||
let mut headers = header::HeaderMap::new();
|
|
||||||
|
|
||||||
if let Some(token) = token {
|
|
||||||
let value = header::HeaderValue::from_str(&format!("Bearer {token}"))
|
|
||||||
.map_err(|_| ClientError::InvalidToken)?;
|
|
||||||
headers.insert(header::AUTHORIZATION, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
let client = Client::builder()
|
|
||||||
.default_headers(headers)
|
|
||||||
.build()
|
|
||||||
.map_err(ClientError::Reqwest)?;
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
base_url: base_url.into(),
|
|
||||||
client,
|
|
||||||
token: token.map(str::to_owned),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a client scoped to a specific dataset (i.e. `/{name}/...`).
|
|
||||||
pub fn dataset(&self, name: &str) -> DatasetClient {
|
|
||||||
DatasetClient {
|
|
||||||
base_url: format!("{}/{name}", self.base_url),
|
|
||||||
client: self.client.clone(),
|
|
||||||
token: self.token.clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /datasets` — list all datasets served by this server.
|
|
||||||
pub async fn list_datasets(&self) -> Result<Vec<DatasetInfo>, ClientError> {
|
|
||||||
let url = format!("{}/datasets", self.base_url);
|
|
||||||
trace!(url, "GET /datasets");
|
|
||||||
let resp = self.client.get(url).send().await?;
|
|
||||||
|
|
||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: DatasetClient
|
|
||||||
//
|
|
||||||
|
|
||||||
/// A client scoped to a single dataset on the server.
|
|
||||||
pub struct DatasetClient {
|
|
||||||
base_url: String,
|
|
||||||
client: Client,
|
|
||||||
token: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DatasetClient {
|
|
||||||
/// `POST /lookup` — full-text search within this dataset.
|
|
||||||
pub async fn lookup(
|
|
||||||
&self,
|
|
||||||
query: impl Into<String>,
|
|
||||||
limit: Option<usize>,
|
|
||||||
) -> Result<LookupResponse, ClientError> {
|
|
||||||
let body = LookupRequest {
|
|
||||||
query: query.into(),
|
|
||||||
limit,
|
|
||||||
};
|
|
||||||
|
|
||||||
let url = format!("{}/lookup", self.base_url);
|
|
||||||
trace!(url, "POST /lookup");
|
|
||||||
let resp = self.client.post(url).json(&body).send().await?;
|
|
||||||
|
|
||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /extract` — extract a field from an item by object path (e.g. `$.flac.title`).
|
|
||||||
pub async fn get_extract(
|
|
||||||
&self,
|
|
||||||
source: &str,
|
|
||||||
key: &str,
|
|
||||||
path: &str,
|
|
||||||
) -> Result<FieldResponse, ClientError> {
|
|
||||||
let url = format!("{}/extract", self.base_url);
|
|
||||||
trace!(url, source, key, path, "GET /extract");
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.get(url)
|
|
||||||
.query(&[("source", source), ("key", key), ("path", path)])
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let resp = check_status(resp).await?;
|
|
||||||
|
|
||||||
let content_type = resp
|
|
||||||
.headers()
|
|
||||||
.get(header::CONTENT_TYPE)
|
|
||||||
.and_then(|v| v.to_str().ok())
|
|
||||||
.unwrap_or("application/octet-stream")
|
|
||||||
.to_owned();
|
|
||||||
|
|
||||||
let data = resp.bytes().await?;
|
|
||||||
|
|
||||||
Ok(FieldResponse { content_type, data })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /schema/{field}` — get a single schema field value from an item.
|
|
||||||
pub async fn schema_field(
|
|
||||||
&self,
|
|
||||||
source: &str,
|
|
||||||
key: &str,
|
|
||||||
field: &str,
|
|
||||||
) -> Result<FieldResponse, ClientError> {
|
|
||||||
let url = format!("{}/schema/{field}", self.base_url);
|
|
||||||
trace!(url, source, key, field, "GET /schema/{field}");
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.get(url)
|
|
||||||
.query(&[("source", source), ("key", key)])
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let resp = check_status(resp).await?;
|
|
||||||
|
|
||||||
let content_type = resp
|
|
||||||
.headers()
|
|
||||||
.get(header::CONTENT_TYPE)
|
|
||||||
.and_then(|v| v.to_str().ok())
|
|
||||||
.unwrap_or("application/octet-stream")
|
|
||||||
.to_owned();
|
|
||||||
|
|
||||||
let data = resp.bytes().await?;
|
|
||||||
|
|
||||||
Ok(FieldResponse { content_type, data })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /schema` — get all schema field values for a single item.
|
|
||||||
pub async fn schema(
|
|
||||||
&self,
|
|
||||||
source: &str,
|
|
||||||
key: &str,
|
|
||||||
hidden: bool,
|
|
||||||
) -> Result<SchemaResponse, ClientError> {
|
|
||||||
let url = format!("{}/schema", self.base_url);
|
|
||||||
trace!(url, source, key, hidden, "GET /schema");
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.get(url)
|
|
||||||
.query(&[("source", source), ("key", key)])
|
|
||||||
.query(&[("hidden", hidden)])
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /config/schema` — retrieve this dataset's schema spec.
|
|
||||||
pub async fn config_schema(&self) -> Result<FieldsResponse, ClientError> {
|
|
||||||
let url = format!("{}/config/schema", self.base_url);
|
|
||||||
trace!(url, "GET /config/schema");
|
|
||||||
let resp = self.client.get(url).send().await?;
|
|
||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `GET /items` — paginate over all items in this dataset, ordered by (source, key).
|
|
||||||
pub async fn list_items(
|
|
||||||
&self,
|
|
||||||
offset: usize,
|
|
||||||
limit: usize,
|
|
||||||
) -> Result<ItemsResponse, ClientError> {
|
|
||||||
let url = format!("{}/items", self.base_url);
|
|
||||||
trace!(url, offset, limit, "GET /items");
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.get(url)
|
|
||||||
.query(&[("offset", offset), ("limit", limit)])
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
check_status(resp).await?.json().await.map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns an axum [`Router`] that proxies all requests to this dataset's
|
|
||||||
/// endpoints on the remote pile server, streaming responses without buffering.
|
|
||||||
/// All headers are forwarded; hop-by-hop headers are stripped.
|
|
||||||
pub fn proxy_router(&self) -> Router {
|
|
||||||
let state = ProxyState {
|
|
||||||
base_url: self.base_url.clone(),
|
|
||||||
client: self.client.clone(),
|
|
||||||
token: self.token.clone(),
|
|
||||||
};
|
|
||||||
Router::new()
|
|
||||||
.route("/", any(proxy_handler))
|
|
||||||
.route("/{*path}", any(proxy_handler))
|
|
||||||
.with_state(state)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: Proxy
|
|
||||||
//
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct ProxyState {
|
|
||||||
base_url: String,
|
|
||||||
client: Client,
|
|
||||||
token: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn proxy_handler(
|
|
||||||
State(state): State<ProxyState>,
|
|
||||||
req: axum::extract::Request,
|
|
||||||
) -> AxumResponse {
|
|
||||||
let path = req.uri().path().to_owned();
|
|
||||||
let query_str = req
|
|
||||||
.uri()
|
|
||||||
.query()
|
|
||||||
.map(|q| format!("?{q}"))
|
|
||||||
.unwrap_or_default();
|
|
||||||
let method = req.method().clone();
|
|
||||||
|
|
||||||
let url = format!("{}{}{}", state.base_url, path, query_str);
|
|
||||||
trace!(method = %method, url, "proxying request");
|
|
||||||
let mut req_builder = state.client.request(method, &url);
|
|
||||||
|
|
||||||
// Forward all request headers except hop-by-hop and Host.
|
|
||||||
// Authorization is skipped so the client's default bearer token is used.
|
|
||||||
for (name, value) in req.headers() {
|
|
||||||
if !is_hop_by_hop(name) && name != header::HOST && name != header::AUTHORIZATION {
|
|
||||||
req_builder = req_builder.header(name, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Attach bearer token if present (overrides client default for clarity).
|
|
||||||
if let Some(ref token) = state.token
|
|
||||||
&& let Ok(value) = header::HeaderValue::from_str(&format!("Bearer {token}"))
|
|
||||||
{
|
|
||||||
req_builder = req_builder.header(header::AUTHORIZATION, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stream the request body upstream.
|
|
||||||
let body_stream = req.into_body().into_data_stream();
|
|
||||||
req_builder = req_builder.body(reqwest::Body::wrap_stream(body_stream));
|
|
||||||
|
|
||||||
let upstream = match req_builder.send().await {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
warn!(error = %e, "upstream request failed");
|
|
||||||
return AxumResponse::builder()
|
|
||||||
.status(StatusCode::BAD_GATEWAY.as_u16())
|
|
||||||
.body(AxumBody::from(e.to_string()))
|
|
||||||
.unwrap_or_else(|_| AxumResponse::new(AxumBody::empty()));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let status = upstream.status().as_u16();
|
|
||||||
trace!(status, "upstream response");
|
|
||||||
let resp_headers = upstream.headers().clone();
|
|
||||||
|
|
||||||
let mut builder = AxumResponse::builder().status(status);
|
|
||||||
for (name, value) in &resp_headers {
|
|
||||||
if !is_hop_by_hop(name) {
|
|
||||||
builder = builder.header(name, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stream the response body without buffering.
|
|
||||||
builder
|
|
||||||
.body(AxumBody::from_stream(upstream.bytes_stream()))
|
|
||||||
.unwrap_or_else(|_| AxumResponse::new(AxumBody::empty()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_hop_by_hop(name: &header::HeaderName) -> bool {
|
|
||||||
name == header::CONNECTION
|
|
||||||
|| name == header::TRANSFER_ENCODING
|
|
||||||
|| name == header::TE
|
|
||||||
|| name == header::UPGRADE
|
|
||||||
|| name == header::PROXY_AUTHORIZATION
|
|
||||||
|| name == header::PROXY_AUTHENTICATE
|
|
||||||
|| name.as_str() == "keep-alive"
|
|
||||||
|| name.as_str() == "trailers"
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: helpers
|
|
||||||
//
|
|
||||||
|
|
||||||
async fn check_status(resp: reqwest::Response) -> Result<reqwest::Response, ClientError> {
|
|
||||||
let status = resp.status();
|
|
||||||
if status.is_success() {
|
|
||||||
return Ok(resp);
|
|
||||||
}
|
|
||||||
|
|
||||||
let body = resp.text().await.unwrap_or_default();
|
|
||||||
Err(ClientError::Http { status, body })
|
|
||||||
}
|
|
||||||
@@ -9,7 +9,7 @@ name = "dataset"
|
|||||||
# working_dir = ".pile"
|
# working_dir = ".pile"
|
||||||
|
|
||||||
# Data sources available in this dataset
|
# Data sources available in this dataset
|
||||||
source."music" = { type = "filesystem", path = "library" }
|
source."music" = { type = "filesystem", path = "music" }
|
||||||
|
|
||||||
# This dataset's schema.
|
# This dataset's schema.
|
||||||
# Defines normalized fields that are extracted from source entries on-demand.
|
# Defines normalized fields that are extracted from source entries on-demand.
|
||||||
@@ -21,18 +21,18 @@ source."music" = { type = "filesystem", path = "library" }
|
|||||||
# # only text is supported in this version.
|
# # only text is supported in this version.
|
||||||
# type = "text",
|
# type = "text",
|
||||||
#
|
#
|
||||||
# # How to extract this field from each source entry.
|
# # An array of jsonpaths (rfc9535) used to extract this field from each source entry.
|
||||||
# # These are evaluated in order, the first non-null value is used.
|
# # These are evaluated in order, the first non-null value is used.
|
||||||
# path = [ "$.json.path" ]
|
# # A single string is equivalent to an array with one element.
|
||||||
|
# path = "$.json.path"
|
||||||
# }
|
# }
|
||||||
[schema]
|
[schema]
|
||||||
album = { type = "text", path = ["$.flac.album"] }
|
album = { type = "text", path = "$.Album" }
|
||||||
isrc = { type = "text", path = ["$.flac.isrc"] }
|
isrc = { type = "text", path = "$.Isrc" }
|
||||||
artist = { type = "text", path = ["$.flac.artist", "$.flac.trackartist"] }
|
artist = { type = "text", path = ["$.Artist", "$.TrackArtist"] }
|
||||||
lyrics = { type = "text", path = ["$.flac.lyrics"] }
|
lyrics = { type = "text", path = "$.Lyrics" }
|
||||||
genre = { type = "text", path = ["$.flac.genre"] }
|
genre = { type = "text", path = "$.Genre" }
|
||||||
title = { type = "text", path = ["$.flac.tracktitle", "$.flac.title"] }
|
title = { type = "text", path = ["$.Title", "$.TrackTitle"] }
|
||||||
|
|
||||||
|
|
||||||
# Fts configuration.
|
# Fts configuration.
|
||||||
# Determines which fields (defined in `schema`) are included in the fts index.
|
# Determines which fields (defined in `schema`) are included in the fts index.
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::Deserialize;
|
||||||
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
|
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
|
||||||
|
|
||||||
use crate::objectpath::ObjectPath;
|
|
||||||
|
|
||||||
mod misc;
|
mod misc;
|
||||||
pub use misc::*;
|
pub use misc::*;
|
||||||
|
|
||||||
|
use crate::objectpath::ObjectPath;
|
||||||
|
|
||||||
pub mod objectpath;
|
pub mod objectpath;
|
||||||
pub mod pattern;
|
|
||||||
|
|
||||||
pub static INIT_DB_TOML: &str = include_str!("./config.toml");
|
pub static INIT_DB_TOML: &str = include_str!("./config.toml");
|
||||||
|
|
||||||
@@ -15,15 +14,6 @@ fn default_true() -> bool {
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn default_base() -> String {
|
|
||||||
"(.*)".to_owned()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
pub fn default_files() -> HashMap<Label, String> {
|
|
||||||
[(Label::new("item").unwrap(), "{base}".to_owned())].into()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[expect(clippy::expect_used)]
|
#[expect(clippy::expect_used)]
|
||||||
fn init_db_toml_valid() {
|
fn init_db_toml_valid() {
|
||||||
@@ -42,35 +32,53 @@ pub struct DatasetConfig {
|
|||||||
/// Must be unique
|
/// Must be unique
|
||||||
pub name: Label,
|
pub name: Label,
|
||||||
|
|
||||||
|
/// Root dir for indices
|
||||||
|
pub working_dir: Option<PathBuf>,
|
||||||
|
|
||||||
/// Where to find this field
|
/// Where to find this field
|
||||||
pub source: HashMap<Label, Source>,
|
pub source: HashMap<Label, Source>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
|
pub struct S3Credentials {
|
||||||
|
pub access_key_id: String,
|
||||||
|
pub secret_access_key: String,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
#[serde(tag = "type")]
|
#[serde(tag = "type")]
|
||||||
#[serde(rename_all = "lowercase")]
|
#[serde(rename_all = "lowercase")]
|
||||||
pub enum Source {
|
pub enum Source {
|
||||||
/// A directory of files
|
/// A directory of files
|
||||||
Filesystem {
|
Filesystem {
|
||||||
/// If false, ignore this dataset
|
|
||||||
#[serde(default = "default_true")]
|
|
||||||
enabled: bool,
|
|
||||||
|
|
||||||
/// The directories to scan.
|
/// The directories to scan.
|
||||||
/// Must be relative.
|
/// Must be relative.
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
|
|
||||||
/// Regex that extracts an item key from a file path.
|
/// If true, all toml files are ignored.
|
||||||
/// - File paths are relative to `path`.
|
/// Metadata can be added to any file using a {filename}.toml.
|
||||||
/// - The first group in this regex is the file's item key.
|
///
|
||||||
#[serde(default = "default_base")]
|
/// If false, toml files are treated as regular files
|
||||||
base_pattern: String,
|
/// and sidecar metadata is disabled.
|
||||||
|
#[serde(default = "default_true")]
|
||||||
|
sidecars: bool,
|
||||||
|
},
|
||||||
|
|
||||||
/// Map of files included in each item.'
|
/// An S3-compatible object store bucket
|
||||||
/// `{base}` is replaced with the string extracted by base_pattern.
|
S3 {
|
||||||
/// Default is `{ item: "{base}" }`
|
bucket: String,
|
||||||
#[serde(default = "default_files")]
|
prefix: Option<String>,
|
||||||
files: HashMap<Label, String>,
|
|
||||||
|
/// Custom endpoint URL (for MinIO, etc.)
|
||||||
|
endpoint: Option<String>,
|
||||||
|
|
||||||
|
region: String,
|
||||||
|
|
||||||
|
credentials: S3Credentials,
|
||||||
|
|
||||||
|
/// If true, all .toml objects are treated as sidecar metadata files.
|
||||||
|
#[serde(default = "default_true")]
|
||||||
|
sidecars: bool,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,18 +86,21 @@ pub enum Source {
|
|||||||
// MARK: schema
|
// MARK: schema
|
||||||
//
|
//
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
pub struct FieldSpec {
|
pub struct FieldSpec {
|
||||||
/// If true, do not display this field.
|
/// The type of this field
|
||||||
/// This attribute has no effect on pile, it
|
pub r#type: FieldType,
|
||||||
/// is intended for consumers of data.
|
|
||||||
#[serde(default)]
|
|
||||||
pub hidden: bool,
|
|
||||||
|
|
||||||
/// How to find this field in a data entry
|
/// How to find this field in a data entry
|
||||||
pub path: Vec<ObjectPath>,
|
pub path: Vec<ObjectPath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum FieldType {
|
||||||
|
Text,
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: fts
|
// MARK: fts
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::{fmt, str::FromStr};
|
use std::{fmt, str::FromStr};
|
||||||
|
|
||||||
use serde::{
|
use serde::{
|
||||||
Deserialize, Deserializer, Serialize, Serializer,
|
Deserialize, Deserializer,
|
||||||
de::{self, Visitor},
|
de::{self, Visitor},
|
||||||
};
|
};
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
@@ -41,21 +41,11 @@ pub enum PathSegment {
|
|||||||
/// Go to root node (`$` identifier)
|
/// Go to root node (`$` identifier)
|
||||||
Root,
|
Root,
|
||||||
|
|
||||||
/// Go to a child of the current object.
|
/// Go to a child of the current object
|
||||||
Field {
|
Field(Label),
|
||||||
name: Label,
|
|
||||||
args: Option<SmartString<LazyCompact>>,
|
|
||||||
},
|
|
||||||
|
|
||||||
/// Go to an element of the current list
|
/// Go to an element of the current list
|
||||||
Index(i64),
|
Index(i64),
|
||||||
|
|
||||||
/// Go to a slice of the current list
|
|
||||||
Range {
|
|
||||||
start: i64,
|
|
||||||
end: i64,
|
|
||||||
inclusive: bool,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A path to aPathSegment::Field inside a nested object,
|
/// A path to aPathSegment::Field inside a nested object,
|
||||||
@@ -65,44 +55,11 @@ pub enum PathSegment {
|
|||||||
/// - `$` refers to the root object
|
/// - `$` refers to the root object
|
||||||
/// - `.<name>` selects aPathSegment::Field of an object
|
/// - `.<name>` selects aPathSegment::Field of an object
|
||||||
/// - `[n]` selects an item of an array
|
/// - `[n]` selects an item of an array
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct ObjectPath {
|
pub struct ObjectPath {
|
||||||
pub segments: Vec<PathSegment>,
|
pub segments: Vec<PathSegment>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for ObjectPath {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
for seg in &self.segments {
|
|
||||||
match seg {
|
|
||||||
PathSegment::Root => write!(f, "$")?,
|
|
||||||
PathSegment::Field { name, args: None } => write!(f, ".{name}")?,
|
|
||||||
PathSegment::Field {
|
|
||||||
name,
|
|
||||||
args: Some(a),
|
|
||||||
} => write!(f, ".{name}({a})")?,
|
|
||||||
PathSegment::Index(i) => write!(f, "[{i}]")?,
|
|
||||||
PathSegment::Range {
|
|
||||||
start,
|
|
||||||
end,
|
|
||||||
inclusive: false,
|
|
||||||
} => write!(f, "[{start}..{end}]")?,
|
|
||||||
PathSegment::Range {
|
|
||||||
start,
|
|
||||||
end,
|
|
||||||
inclusive: true,
|
|
||||||
} => write!(f, "[{start}..={end}]")?,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Serialize for ObjectPath {
|
|
||||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
|
||||||
serializer.serialize_str(&self.to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> Deserialize<'de> for ObjectPath {
|
impl<'de> Deserialize<'de> for ObjectPath {
|
||||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
|
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
|
||||||
struct PathVisitor;
|
struct PathVisitor;
|
||||||
|
|||||||
@@ -1,80 +1,10 @@
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Label,
|
Label,
|
||||||
objectpath::{PathParseError, PathSegment, tokenizer::Token},
|
objectpath::{PathParseError, PathSegment, tokenizer::Token},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Parse an ident token into a `PathSegment::Field`, handling optional args of
|
|
||||||
/// the form `name(args)`. Parens inside args may be nested; `\(` and `\)` are
|
|
||||||
/// escaped and do not affect depth counting.
|
|
||||||
fn parse_field(ident: &str, position: usize) -> Result<PathSegment, PathParseError> {
|
|
||||||
let bytes = ident.as_bytes();
|
|
||||||
let mut i = 0;
|
|
||||||
|
|
||||||
// Find the first unescaped '(' — everything before it is the name.
|
|
||||||
let open_paren: Option<usize> = loop {
|
|
||||||
if i >= bytes.len() {
|
|
||||||
break None;
|
|
||||||
}
|
|
||||||
match bytes[i] {
|
|
||||||
b'\\' => i += 2, // skip escaped character
|
|
||||||
b'(' => break Some(i),
|
|
||||||
_ => i += 1,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let name_str = &ident[..open_paren.unwrap_or(bytes.len())];
|
|
||||||
let name = Label::new(name_str).ok_or_else(|| PathParseError::InvalidField {
|
|
||||||
position,
|
|
||||||
str: name_str.into(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let Some(open_pos) = open_paren else {
|
|
||||||
return Ok(PathSegment::Field { name, args: None });
|
|
||||||
};
|
|
||||||
|
|
||||||
// Scan args, tracking paren depth.
|
|
||||||
let args_start = open_pos + 1;
|
|
||||||
let mut depth: usize = 1;
|
|
||||||
let mut j = args_start;
|
|
||||||
|
|
||||||
while j < bytes.len() {
|
|
||||||
match bytes[j] {
|
|
||||||
b'\\' => j += 2, // skip escaped character
|
|
||||||
b'(' => {
|
|
||||||
depth += 1;
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
b')' => {
|
|
||||||
depth -= 1;
|
|
||||||
if depth == 0 {
|
|
||||||
// Closing paren must be the last character.
|
|
||||||
if j + 1 != bytes.len() {
|
|
||||||
return Err(PathParseError::Syntax {
|
|
||||||
position: position + j + 1,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
let args: SmartString<LazyCompact> = ident[args_start..j].into();
|
|
||||||
return Ok(PathSegment::Field {
|
|
||||||
name,
|
|
||||||
args: Some(args),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
_ => j += 1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reached end of ident without finding the matching ')'.
|
|
||||||
Err(PathParseError::Syntax {
|
|
||||||
position: position + ident.len(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
enum State {
|
enum State {
|
||||||
Start,
|
Start,
|
||||||
|
|
||||||
@@ -87,15 +17,6 @@ enum State {
|
|||||||
/// We are indexing an array, waiting for a number
|
/// We are indexing an array, waiting for a number
|
||||||
Index,
|
Index,
|
||||||
|
|
||||||
/// We parsed the start index, waiting for `]` or the first `.` of `..`
|
|
||||||
IndexAfterStart(i64),
|
|
||||||
|
|
||||||
/// We saw one `.` after the start index, waiting for the second `.`
|
|
||||||
IndexRangeDot1(i64),
|
|
||||||
|
|
||||||
/// We saw `..`, waiting for the end index (optionally prefixed with `=`)
|
|
||||||
IndexRangeDot2(i64),
|
|
||||||
|
|
||||||
/// We are indexing an array, waiting for a close-bracket
|
/// We are indexing an array, waiting for a close-bracket
|
||||||
IndexClose,
|
IndexClose,
|
||||||
}
|
}
|
||||||
@@ -151,7 +72,14 @@ impl Parser {
|
|||||||
// MARK: dot
|
// MARK: dot
|
||||||
//
|
//
|
||||||
(State::Dot, (p, Token::Ident(ident))) => {
|
(State::Dot, (p, Token::Ident(ident))) => {
|
||||||
self.segments.push(parse_field(ident, *p)?);
|
self.segments
|
||||||
|
.push(PathSegment::Field(Label::new(*ident).ok_or_else(|| {
|
||||||
|
PathParseError::InvalidField {
|
||||||
|
position: *p,
|
||||||
|
str: (*ident).into(),
|
||||||
|
}
|
||||||
|
})?));
|
||||||
|
|
||||||
self.state = State::Selected;
|
self.state = State::Selected;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -173,7 +101,8 @@ impl Parser {
|
|||||||
}
|
}
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
self.state = State::IndexAfterStart(idx);
|
self.segments.push(PathSegment::Index(idx));
|
||||||
|
self.state = State::IndexClose;
|
||||||
}
|
}
|
||||||
|
|
||||||
(State::Index, (p, Token::Root))
|
(State::Index, (p, Token::Root))
|
||||||
@@ -183,49 +112,6 @@ impl Parser {
|
|||||||
return Err(PathParseError::Syntax { position: *p });
|
return Err(PathParseError::Syntax { position: *p });
|
||||||
}
|
}
|
||||||
|
|
||||||
(State::IndexAfterStart(idx), (_, Token::SqbClose)) => {
|
|
||||||
self.segments.push(PathSegment::Index(idx));
|
|
||||||
self.state = State::Selected;
|
|
||||||
}
|
|
||||||
(State::IndexAfterStart(idx), (_, Token::Dot)) => {
|
|
||||||
self.state = State::IndexRangeDot1(idx);
|
|
||||||
}
|
|
||||||
(State::IndexAfterStart(_), (p, _)) => {
|
|
||||||
return Err(PathParseError::Syntax { position: *p });
|
|
||||||
}
|
|
||||||
|
|
||||||
(State::IndexRangeDot1(idx), (_, Token::Dot)) => {
|
|
||||||
self.state = State::IndexRangeDot2(idx);
|
|
||||||
}
|
|
||||||
(State::IndexRangeDot1(_), (p, _)) => {
|
|
||||||
return Err(PathParseError::Syntax { position: *p });
|
|
||||||
}
|
|
||||||
|
|
||||||
(State::IndexRangeDot2(start), (p, Token::Ident(ident))) => {
|
|
||||||
let (end_str, inclusive) = if let Some(stripped) = ident.strip_prefix('=') {
|
|
||||||
(stripped, true)
|
|
||||||
} else {
|
|
||||||
(*ident, false)
|
|
||||||
};
|
|
||||||
|
|
||||||
let end: i64 = i64::from_str(end_str).map_err(|_err| {
|
|
||||||
PathParseError::InvalidIndexString {
|
|
||||||
position: *p,
|
|
||||||
str: (*ident).into(),
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
|
|
||||||
self.segments.push(PathSegment::Range {
|
|
||||||
start,
|
|
||||||
end,
|
|
||||||
inclusive,
|
|
||||||
});
|
|
||||||
self.state = State::IndexClose;
|
|
||||||
}
|
|
||||||
(State::IndexRangeDot2(_), (p, _)) => {
|
|
||||||
return Err(PathParseError::Syntax { position: *p });
|
|
||||||
}
|
|
||||||
|
|
||||||
(State::IndexClose, (_, Token::SqbClose)) => self.state = State::Selected,
|
(State::IndexClose, (_, Token::SqbClose)) => self.state = State::Selected,
|
||||||
(State::IndexClose, (p, _)) => {
|
(State::IndexClose, (p, _)) => {
|
||||||
return Err(PathParseError::Syntax { position: *p });
|
return Err(PathParseError::Syntax { position: *p });
|
||||||
@@ -238,9 +124,6 @@ impl Parser {
|
|||||||
State::Start => Err(PathParseError::Syntax { position: 0 }),
|
State::Start => Err(PathParseError::Syntax { position: 0 }),
|
||||||
State::Dot => Err(PathParseError::Syntax { position }),
|
State::Dot => Err(PathParseError::Syntax { position }),
|
||||||
State::Index => Err(PathParseError::Syntax { position }),
|
State::Index => Err(PathParseError::Syntax { position }),
|
||||||
State::IndexAfterStart(_) => Err(PathParseError::Syntax { position }),
|
|
||||||
State::IndexRangeDot1(_) => Err(PathParseError::Syntax { position }),
|
|
||||||
State::IndexRangeDot2(_) => Err(PathParseError::Syntax { position }),
|
|
||||||
State::IndexClose => Err(PathParseError::Syntax { position }),
|
State::IndexClose => Err(PathParseError::Syntax { position }),
|
||||||
State::Selected => Ok(()),
|
State::Selected => Ok(()),
|
||||||
}?;
|
}?;
|
||||||
@@ -278,30 +161,27 @@ mod tests {
|
|||||||
parse_test("$", Ok(&[PathSegment::Root]));
|
parse_test("$", Ok(&[PathSegment::Root]));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn field(name: &str) -> PathSegment {
|
|
||||||
PathSegment::Field {
|
|
||||||
name: Label::new(name).unwrap(),
|
|
||||||
args: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn field_args(name: &str, args: &str) -> PathSegment {
|
|
||||||
PathSegment::Field {
|
|
||||||
name: Label::new(name).unwrap(),
|
|
||||||
args: Some(args.into()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn single_field() {
|
fn single_field() {
|
||||||
parse_test("$.foo", Ok(&[PathSegment::Root, field("foo")]));
|
parse_test(
|
||||||
|
"$.foo",
|
||||||
|
Ok(&[
|
||||||
|
PathSegment::Root,
|
||||||
|
PathSegment::Field(Label::new("foo").unwrap()),
|
||||||
|
]),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn nested_fields() {
|
fn nested_fields() {
|
||||||
parse_test(
|
parse_test(
|
||||||
"$.foo.bar.baz",
|
"$.foo.bar.baz",
|
||||||
Ok(&[PathSegment::Root, field("foo"), field("bar"), field("baz")]),
|
Ok(&[
|
||||||
|
PathSegment::Root,
|
||||||
|
PathSegment::Field(Label::new("foo").unwrap()),
|
||||||
|
PathSegment::Field(Label::new("bar").unwrap()),
|
||||||
|
PathSegment::Field(Label::new("baz").unwrap()),
|
||||||
|
]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -309,7 +189,11 @@ mod tests {
|
|||||||
fn array_index() {
|
fn array_index() {
|
||||||
parse_test(
|
parse_test(
|
||||||
"$.items[0]",
|
"$.items[0]",
|
||||||
Ok(&[PathSegment::Root, field("items"), PathSegment::Index(0)]),
|
Ok(&[
|
||||||
|
PathSegment::Root,
|
||||||
|
PathSegment::Field(Label::new("items").unwrap()),
|
||||||
|
PathSegment::Index(0),
|
||||||
|
]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -319,7 +203,7 @@ mod tests {
|
|||||||
"$.a[1][2]",
|
"$.a[1][2]",
|
||||||
Ok(&[
|
Ok(&[
|
||||||
PathSegment::Root,
|
PathSegment::Root,
|
||||||
field("a"),
|
PathSegment::Field(Label::new("a").unwrap()),
|
||||||
PathSegment::Index(1),
|
PathSegment::Index(1),
|
||||||
PathSegment::Index(2),
|
PathSegment::Index(2),
|
||||||
]),
|
]),
|
||||||
@@ -332,9 +216,9 @@ mod tests {
|
|||||||
"$.a[0].b",
|
"$.a[0].b",
|
||||||
Ok(&[
|
Ok(&[
|
||||||
PathSegment::Root,
|
PathSegment::Root,
|
||||||
field("a"),
|
PathSegment::Field(Label::new("a").unwrap()),
|
||||||
PathSegment::Index(0),
|
PathSegment::Index(0),
|
||||||
field("b"),
|
PathSegment::Field(Label::new("b").unwrap()),
|
||||||
]),
|
]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -343,94 +227,14 @@ mod tests {
|
|||||||
fn negative_index() {
|
fn negative_index() {
|
||||||
parse_test(
|
parse_test(
|
||||||
"$.a[-1]",
|
"$.a[-1]",
|
||||||
Ok(&[PathSegment::Root, field("a"), PathSegment::Index(-1)]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// MARK: args
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_simple_args() {
|
|
||||||
parse_test(
|
|
||||||
"$.foo(bar)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", "bar")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_empty_args() {
|
|
||||||
parse_test("$.foo()", Ok(&[PathSegment::Root, field_args("foo", "")]));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_nested_parens_in_args() {
|
|
||||||
parse_test(
|
|
||||||
"$.foo(a(b)c)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", "a(b)c")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_deeply_nested_parens_in_args() {
|
|
||||||
parse_test(
|
|
||||||
"$.foo(a(b(c))d)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", "a(b(c))d")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_escaped_open_paren_in_args() {
|
|
||||||
// "$.foo(a\(b)" — '\(' is escaped, so depth never rises above 1; ')' closes it
|
|
||||||
parse_test(
|
|
||||||
r"$.foo(a\(b)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", r"a\(b")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_escaped_close_paren_in_args() {
|
|
||||||
// "$.foo(a\)b)" — '\)' is escaped, the second ')' closes at depth 0
|
|
||||||
parse_test(
|
|
||||||
r"$.foo(a\)b)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", r"a\)b")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_with_both_escaped_parens_in_args() {
|
|
||||||
parse_test(
|
|
||||||
r"$.foo(a\(b\)c)",
|
|
||||||
Ok(&[PathSegment::Root, field_args("foo", r"a\(b\)c")]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_args_with_multiple_segments() {
|
|
||||||
parse_test(
|
|
||||||
"$.foo(x).bar(y)",
|
|
||||||
Ok(&[
|
Ok(&[
|
||||||
PathSegment::Root,
|
PathSegment::Root,
|
||||||
field_args("foo", "x"),
|
PathSegment::Field(Label::new("a").unwrap()),
|
||||||
field_args("bar", "y"),
|
PathSegment::Index(-1),
|
||||||
]),
|
]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_args_unclosed_paren_error() {
|
|
||||||
// Missing closing ')' → Syntax error at end of source
|
|
||||||
parse_test("$.foo(bar", Err(PathParseError::Syntax { position: 9 }));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn field_args_trailing_chars_after_close_error() {
|
|
||||||
// Closing ')' is not the last char → Syntax error at the trailing char
|
|
||||||
parse_test(
|
|
||||||
"$.foo(bar)baz",
|
|
||||||
Err(PathParseError::Syntax { position: 10 }),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn non_ascii_error() {
|
fn non_ascii_error() {
|
||||||
parse_test(
|
parse_test(
|
||||||
@@ -441,46 +245,4 @@ mod tests {
|
|||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: range
|
|
||||||
|
|
||||||
fn range(start: i64, end: i64, inclusive: bool) -> PathSegment {
|
|
||||||
PathSegment::Range {
|
|
||||||
start,
|
|
||||||
end,
|
|
||||||
inclusive,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn exclusive_range() {
|
|
||||||
parse_test(
|
|
||||||
"$.a[0..5]",
|
|
||||||
Ok(&[PathSegment::Root, field("a"), range(0, 5, false)]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn inclusive_range() {
|
|
||||||
parse_test(
|
|
||||||
"$.a[1..=2]",
|
|
||||||
Ok(&[PathSegment::Root, field("a"), range(1, 2, true)]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn range_with_negative_end() {
|
|
||||||
parse_test(
|
|
||||||
"$.a[0..-1]",
|
|
||||||
Ok(&[PathSegment::Root, field("a"), range(0, -1, false)]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn range_with_negative_start() {
|
|
||||||
parse_test(
|
|
||||||
"$.a[-3..-1]",
|
|
||||||
Ok(&[PathSegment::Root, field("a"), range(-3, -1, false)]),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,52 +21,7 @@ impl Tokenizer {
|
|||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
|
|
||||||
let mut window_start = None;
|
let mut window_start = None;
|
||||||
// Paren depth: while > 0, `.` / `[` / `]` / `$` are part of the ident.
|
|
||||||
let mut paren_depth: usize = 0;
|
|
||||||
// When true, the current char is escaped by a preceding `\` and is
|
|
||||||
// treated as a plain ident character with no special meaning.
|
|
||||||
let mut skip_next = false;
|
|
||||||
|
|
||||||
for (i, c) in source.char_indices() {
|
for (i, c) in source.char_indices() {
|
||||||
if skip_next {
|
|
||||||
skip_next = false;
|
|
||||||
// Escaped char: just extend the ident window (already opened by `\`).
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if c == '\\' {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
skip_next = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if paren_depth > 0 {
|
|
||||||
// Inside parens: only track depth changes, everything else is ident.
|
|
||||||
match c {
|
|
||||||
'(' => {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
paren_depth += 1;
|
|
||||||
}
|
|
||||||
')' => {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
paren_depth -= 1;
|
|
||||||
}
|
|
||||||
x if x.is_ascii() => {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
match c {
|
match c {
|
||||||
'$' => {
|
'$' => {
|
||||||
if let Some(s) = window_start.take() {
|
if let Some(s) = window_start.take() {
|
||||||
@@ -96,26 +51,10 @@ impl Tokenizer {
|
|||||||
tokens.push((i, Token::SqbClose));
|
tokens.push((i, Token::SqbClose));
|
||||||
}
|
}
|
||||||
|
|
||||||
'(' => {
|
x if x.is_ascii() => match window_start {
|
||||||
if window_start.is_none() {
|
None => window_start = Some(i),
|
||||||
window_start = Some(i);
|
Some(_) => continue,
|
||||||
}
|
},
|
||||||
paren_depth += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
')' => {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
// paren_depth is 0 here — stray `)` is an ident char and
|
|
||||||
// parse_field will surface the error later.
|
|
||||||
}
|
|
||||||
|
|
||||||
x if x.is_ascii() => {
|
|
||||||
if window_start.is_none() {
|
|
||||||
window_start = Some(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
use serde::{Deserialize, Deserializer, de};
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
|
||||||
use thiserror::Error;
|
|
||||||
|
|
||||||
use crate::{Label, objectpath::PathParseError as ObjectPathError};
|
|
||||||
|
|
||||||
mod parser;
|
|
||||||
pub use parser::GroupSegment;
|
|
||||||
|
|
||||||
#[derive(Debug, Error, PartialEq)]
|
|
||||||
pub enum GroupPatternParseError {
|
|
||||||
/// A `{` or `}` appeared in an invalid position, or a `{` was never closed.
|
|
||||||
#[error("syntax error at index {position}")]
|
|
||||||
Syntax { position: usize },
|
|
||||||
|
|
||||||
/// The contents of a `{...}` block could not be parsed as an object path.
|
|
||||||
#[error("invalid object path {path:?}: {source}")]
|
|
||||||
InvalidObjectPath {
|
|
||||||
start: usize,
|
|
||||||
end: usize,
|
|
||||||
path: SmartString<LazyCompact>,
|
|
||||||
source: ObjectPathError,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
|
||||||
pub struct GroupPattern {
|
|
||||||
pub pattern: HashMap<Label, Vec<GroupSegment>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de> Deserialize<'de> for GroupPattern {
|
|
||||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
|
|
||||||
let raw = HashMap::<String, String>::deserialize(deserializer)?;
|
|
||||||
let mut parts = HashMap::with_capacity(raw.len());
|
|
||||||
for (key, value) in raw {
|
|
||||||
let label = Label::try_from(key.as_str()).map_err(de::Error::custom)?;
|
|
||||||
let segments = parser::Parser::new()
|
|
||||||
.parse(&value)
|
|
||||||
.map_err(de::Error::custom)?
|
|
||||||
.into_iter()
|
|
||||||
.map(|(_, seg)| seg)
|
|
||||||
.collect();
|
|
||||||
parts.insert(label, segments);
|
|
||||||
}
|
|
||||||
Ok(GroupPattern { pattern: parts })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,203 +0,0 @@
|
|||||||
use smartstring::{LazyCompact, SmartString};
|
|
||||||
|
|
||||||
use crate::{objectpath::ObjectPath, pattern::GroupPatternParseError};
|
|
||||||
|
|
||||||
#[cfg_attr(test, derive(PartialEq))]
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub enum GroupSegment {
|
|
||||||
Path(ObjectPath),
|
|
||||||
Literal(SmartString<LazyCompact>),
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Parser {}
|
|
||||||
|
|
||||||
impl Parser {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse a pattern string of the form `{path}.literal{path}...`.
|
|
||||||
///
|
|
||||||
/// - `{...}` delimiters are parsed as [`ObjectPath`] expressions.
|
|
||||||
/// Nested `{}` inside a path are allowed; depth is tracked to find the
|
|
||||||
/// matching closing brace.
|
|
||||||
/// - Everything outside `{...}` is a `Literal` segment.
|
|
||||||
/// - A bare `}` in literal position (depth == 0) is a syntax error.
|
|
||||||
/// - An unclosed `{` is a syntax error.
|
|
||||||
pub fn parse(self, source: &str) -> Result<Vec<(usize, GroupSegment)>, GroupPatternParseError> {
|
|
||||||
let mut tokens = Vec::new();
|
|
||||||
|
|
||||||
// `depth` > 0 means we are currently inside a `{...}` path expression.
|
|
||||||
let mut depth: usize = 0;
|
|
||||||
// Start of the current segment (literal text or path content).
|
|
||||||
let mut window_start: usize = 0;
|
|
||||||
// Source position of the opening `{` for the current path (used for error reporting).
|
|
||||||
let mut open_brace: usize = 0;
|
|
||||||
|
|
||||||
for (i, c) in source.char_indices() {
|
|
||||||
match c {
|
|
||||||
'{' => {
|
|
||||||
if depth == 0 {
|
|
||||||
// Emit any accumulated literal.
|
|
||||||
if i > window_start {
|
|
||||||
tokens.push((
|
|
||||||
window_start,
|
|
||||||
GroupSegment::Literal(source[window_start..i].into()),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
open_brace = i;
|
|
||||||
// Path content starts after the opening brace.
|
|
||||||
window_start = i + 1;
|
|
||||||
depth = 1;
|
|
||||||
} else {
|
|
||||||
// Nested brace inside a path — keep counting.
|
|
||||||
depth += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
'}' => {
|
|
||||||
if depth == 0 {
|
|
||||||
// Unmatched `}` outside any path.
|
|
||||||
return Err(GroupPatternParseError::Syntax { position: i });
|
|
||||||
}
|
|
||||||
depth -= 1;
|
|
||||||
if depth == 0 {
|
|
||||||
// Closing brace of the outermost path expression — parse as ObjectPath.
|
|
||||||
let path_str = &source[window_start..i];
|
|
||||||
let path = path_str.parse::<ObjectPath>().map_err(|e| {
|
|
||||||
GroupPatternParseError::InvalidObjectPath {
|
|
||||||
start: open_brace,
|
|
||||||
end: i + 1,
|
|
||||||
path: path_str.into(),
|
|
||||||
source: e,
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
tokens.push((open_brace, GroupSegment::Path(path)));
|
|
||||||
// Literal content (if any) starts after this `}`.
|
|
||||||
window_start = i + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Unclosed `{`.
|
|
||||||
if depth > 0 {
|
|
||||||
return Err(GroupPatternParseError::Syntax {
|
|
||||||
position: open_brace,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Emit any trailing literal.
|
|
||||||
if window_start < source.len() {
|
|
||||||
tokens.push((
|
|
||||||
window_start,
|
|
||||||
GroupSegment::Literal(source[window_start..].into()),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(tokens)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: tests
|
|
||||||
//
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
fn parse(source: &str) -> Result<Vec<(usize, GroupSegment)>, GroupPatternParseError> {
|
|
||||||
Parser::new().parse(source)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn path(s: &str) -> GroupSegment {
|
|
||||||
GroupSegment::Path(s.parse().unwrap())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn lit(s: &str) -> GroupSegment {
|
|
||||||
GroupSegment::Literal(s.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn regex() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("{$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]}").unwrap(),
|
|
||||||
vec![(0, path("$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]"))]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn single_path() {
|
|
||||||
assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn single_literal() {
|
|
||||||
assert_eq!(parse("hello").unwrap(), vec![(0, lit("hello"))]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn path_then_literal() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("{$.foo}.txt").unwrap(),
|
|
||||||
vec![(0, path("$.foo")), (7, lit(".txt"))]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn literal_then_path() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("prefix/{$.foo}").unwrap(),
|
|
||||||
vec![(0, lit("prefix/")), (7, path("$.foo"))]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn interleaved() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("{$.a}.sep.{$.b}").unwrap(),
|
|
||||||
vec![(0, path("$.a")), (5, lit(".sep.")), (10, path("$.b")),]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn unmatched_open_brace_error() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("{$.foo"),
|
|
||||||
Err(GroupPatternParseError::Syntax { position: 0 })
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn unmatched_close_brace_in_literal_error() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("foo}bar"),
|
|
||||||
Err(GroupPatternParseError::Syntax { position: 3 })
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn invalid_path_error() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("{not-a-path}"),
|
|
||||||
Err(GroupPatternParseError::InvalidObjectPath {
|
|
||||||
start: 0,
|
|
||||||
end: 12,
|
|
||||||
path: "not-a-path".into(),
|
|
||||||
source: crate::objectpath::PathParseError::MustStartWithRoot { position: 0 },
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn literal_between_paths() {
|
|
||||||
assert_eq!(
|
|
||||||
parse("foo{$.x}bar").unwrap(),
|
|
||||||
vec![(0, lit("foo")), (3, path("$.x")), (8, lit("bar")),]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -12,7 +12,6 @@ pile-config = { workspace = true }
|
|||||||
pile-toolbox = { workspace = true }
|
pile-toolbox = { workspace = true }
|
||||||
pile-value = { workspace = true }
|
pile-value = { workspace = true }
|
||||||
|
|
||||||
regex = { workspace = true }
|
|
||||||
serde_json = { workspace = true }
|
serde_json = { workspace = true }
|
||||||
tantivy = { workspace = true }
|
tantivy = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
@@ -20,7 +19,14 @@ chrono = { workspace = true }
|
|||||||
toml = { workspace = true }
|
toml = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
tokio-stream = { workspace = true }
|
||||||
|
|
||||||
|
serde = { workspace = true, optional = true }
|
||||||
|
axum = { workspace = true, optional = true }
|
||||||
|
utoipa = { workspace = true, optional = true }
|
||||||
|
utoipa-swagger-ui = { workspace = true, optional = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
pdfium = ["pile-value/pdfium"]
|
pdfium = ["pile-value/pdfium"]
|
||||||
|
axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui", "dep:serde"]
|
||||||
|
|||||||
@@ -1,19 +1,16 @@
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use pile_config::{
|
use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath};
|
||||||
ConfigToml, DatasetConfig, Label, Source, default_base, default_files, objectpath::ObjectPath,
|
|
||||||
};
|
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{
|
use pile_value::{
|
||||||
extract::traits::ExtractState,
|
source::{DataSource, DirDataSource, S3DataSource, misc::path_ts_earliest},
|
||||||
source::{DataSource, DirDataSource, misc::path_ts_earliest},
|
|
||||||
value::{Item, PileValue},
|
value::{Item, PileValue},
|
||||||
};
|
};
|
||||||
use regex::Regex;
|
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
|
||||||
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::task::JoinSet;
|
use tokio::task::JoinSet;
|
||||||
|
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
|
||||||
use tracing::{debug, info, trace, warn};
|
use tracing::{debug, info, trace, warn};
|
||||||
|
|
||||||
use crate::index::{DbFtsIndex, FtsLookupResult};
|
use crate::index::{DbFtsIndex, FtsLookupResult};
|
||||||
@@ -34,43 +31,31 @@ pub enum DatasetError {
|
|||||||
// MARK: Dataset enum
|
// MARK: Dataset enum
|
||||||
//
|
//
|
||||||
|
|
||||||
/// An opened data source
|
/// An opened data source — either a local filesystem directory or an S3 bucket.
|
||||||
pub enum Dataset {
|
pub enum Dataset {
|
||||||
Dir(Arc<DirDataSource>),
|
Dir(Arc<DirDataSource>),
|
||||||
|
S3(Arc<S3DataSource>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Dataset {
|
impl Dataset {
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
match self {
|
|
||||||
Self::Dir(ds) => ds.len(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn get(&self, key: &str) -> Option<Item> {
|
pub async fn get(&self, key: &str) -> Option<Item> {
|
||||||
match self {
|
match self {
|
||||||
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
|
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
|
||||||
|
Self::S3(ds) => ds.get(key).await.ok().flatten(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter(&self) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
|
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||||
match self {
|
match self {
|
||||||
Self::Dir(ds) => Box::new(ds.iter()),
|
Self::Dir(ds) => ds.iter(),
|
||||||
}
|
Self::S3(ds) => ds.iter(),
|
||||||
}
|
|
||||||
|
|
||||||
pub fn iter_page(
|
|
||||||
&self,
|
|
||||||
offset: usize,
|
|
||||||
limit: usize,
|
|
||||||
) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
|
|
||||||
match self {
|
|
||||||
Self::Dir(ds) => Box::new(ds.iter_page(offset, limit)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::Dir(ds) => ds.latest_change().await,
|
Self::Dir(ds) => ds.latest_change().await,
|
||||||
|
Self::S3(ds) => ds.latest_change().await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -81,103 +66,16 @@ impl Dataset {
|
|||||||
|
|
||||||
/// An opened dataset: config, working directory, and all opened sources.
|
/// An opened dataset: config, working directory, and all opened sources.
|
||||||
pub struct Datasets {
|
pub struct Datasets {
|
||||||
pub path_config: Option<PathBuf>,
|
pub path_config: PathBuf,
|
||||||
pub path_parent: PathBuf,
|
pub path_parent: PathBuf,
|
||||||
pub path_workdir: Option<PathBuf>,
|
pub path_workdir: PathBuf,
|
||||||
|
|
||||||
pub config: ConfigToml,
|
pub config: ConfigToml,
|
||||||
pub sources: HashMap<Label, Dataset>,
|
pub sources: HashMap<Label, Dataset>,
|
||||||
pub disabled_sources: HashMap<Label, Dataset>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Datasets {
|
impl Datasets {
|
||||||
#[expect(clippy::unwrap_used)]
|
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
|
||||||
pub fn virt_source() -> Label {
|
|
||||||
Label::new("virtual-source").unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
pub async fn virt(parent: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
|
|
||||||
let path_parent = parent.into();
|
|
||||||
|
|
||||||
let config = ConfigToml {
|
|
||||||
dataset: DatasetConfig {
|
|
||||||
name: Label::new("virtual-dataset").unwrap(),
|
|
||||||
source: [(
|
|
||||||
Self::virt_source(),
|
|
||||||
Source::Filesystem {
|
|
||||||
enabled: true,
|
|
||||||
path: path_parent.clone(),
|
|
||||||
base_pattern: default_base(),
|
|
||||||
files: default_files(),
|
|
||||||
},
|
|
||||||
)]
|
|
||||||
.into_iter()
|
|
||||||
.collect(),
|
|
||||||
},
|
|
||||||
schema: HashMap::new(),
|
|
||||||
fts: None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut sources = HashMap::new();
|
|
||||||
let mut disabled_sources = HashMap::new();
|
|
||||||
|
|
||||||
for (label, source) in &config.dataset.source {
|
|
||||||
match source {
|
|
||||||
Source::Filesystem {
|
|
||||||
enabled,
|
|
||||||
path,
|
|
||||||
base_pattern,
|
|
||||||
files,
|
|
||||||
} => {
|
|
||||||
let target = match enabled {
|
|
||||||
true => &mut sources,
|
|
||||||
false => &mut disabled_sources,
|
|
||||||
};
|
|
||||||
|
|
||||||
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
|
||||||
std::io::Error::new(
|
|
||||||
ErrorKind::InvalidInput,
|
|
||||||
format!("invalid base_pattern: {e}"),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
if base_regex.captures_len() != 2 {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
ErrorKind::InvalidInput,
|
|
||||||
"base_pattern must have exactly one capture group",
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
target.insert(
|
|
||||||
label.clone(),
|
|
||||||
Dataset::Dir(
|
|
||||||
DirDataSource::new(
|
|
||||||
label,
|
|
||||||
path_parent.join(path),
|
|
||||||
base_regex,
|
|
||||||
files.clone(),
|
|
||||||
)
|
|
||||||
.await?,
|
|
||||||
),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(Self {
|
|
||||||
path_config: None,
|
|
||||||
path_workdir: None,
|
|
||||||
path_parent,
|
|
||||||
config,
|
|
||||||
sources,
|
|
||||||
disabled_sources,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn open(
|
|
||||||
config: impl Into<PathBuf>,
|
|
||||||
working_dir_root: impl Into<PathBuf>,
|
|
||||||
) -> Result<Self, std::io::Error> {
|
|
||||||
let path_config = config.into();
|
let path_config = config.into();
|
||||||
let path_parent = path_config
|
let path_parent = path_config
|
||||||
.parent()
|
.parent()
|
||||||
@@ -206,59 +104,61 @@ impl Datasets {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let path_workdir = working_dir_root.into().join(config.dataset.name.as_str());
|
let path_workdir = config
|
||||||
|
.dataset
|
||||||
|
.working_dir
|
||||||
|
.clone()
|
||||||
|
.unwrap_or(path_parent.join(".pile"))
|
||||||
|
.join(config.dataset.name.as_str());
|
||||||
|
|
||||||
let mut sources = HashMap::new();
|
let mut sources = HashMap::new();
|
||||||
let mut disabled_sources = HashMap::new();
|
|
||||||
for (label, source) in &config.dataset.source {
|
for (label, source) in &config.dataset.source {
|
||||||
match source {
|
match source {
|
||||||
Source::Filesystem {
|
Source::Filesystem { path, sidecars } => {
|
||||||
enabled,
|
sources.insert(
|
||||||
path,
|
|
||||||
base_pattern,
|
|
||||||
files,
|
|
||||||
} => {
|
|
||||||
let target = match enabled {
|
|
||||||
true => &mut sources,
|
|
||||||
false => &mut disabled_sources,
|
|
||||||
};
|
|
||||||
|
|
||||||
let base_regex = Regex::new(base_pattern).map_err(|e| {
|
|
||||||
std::io::Error::new(
|
|
||||||
ErrorKind::InvalidInput,
|
|
||||||
format!("invalid base_pattern: {e}"),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
if base_regex.captures_len() != 2 {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
ErrorKind::InvalidInput,
|
|
||||||
"base_pattern must have exactly one capture group",
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
target.insert(
|
|
||||||
label.clone(),
|
label.clone(),
|
||||||
Dataset::Dir(
|
Dataset::Dir(Arc::new(DirDataSource::new(
|
||||||
DirDataSource::new(
|
label,
|
||||||
label,
|
path_parent.join(path),
|
||||||
path_parent.join(path),
|
*sidecars,
|
||||||
base_regex,
|
))),
|
||||||
files.clone(),
|
|
||||||
)
|
|
||||||
.await?,
|
|
||||||
),
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Source::S3 {
|
||||||
|
bucket,
|
||||||
|
prefix,
|
||||||
|
endpoint,
|
||||||
|
region,
|
||||||
|
credentials,
|
||||||
|
sidecars,
|
||||||
|
} => {
|
||||||
|
match S3DataSource::new(
|
||||||
|
label,
|
||||||
|
bucket.clone(),
|
||||||
|
prefix.clone(),
|
||||||
|
endpoint.clone(),
|
||||||
|
region.clone(),
|
||||||
|
credentials,
|
||||||
|
*sidecars,
|
||||||
|
) {
|
||||||
|
Ok(ds) => {
|
||||||
|
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!("Could not open S3 source {label}: {err}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Ok(Self {
|
return Ok(Self {
|
||||||
path_config: Some(path_config),
|
path_config,
|
||||||
path_workdir: Some(path_workdir),
|
|
||||||
path_parent,
|
path_parent,
|
||||||
|
path_workdir,
|
||||||
config,
|
config,
|
||||||
sources,
|
sources,
|
||||||
disabled_sources,
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -274,7 +174,6 @@ impl Datasets {
|
|||||||
/// Returns `None` if the item or field is not found.
|
/// Returns `None` if the item or field is not found.
|
||||||
pub async fn get_field(
|
pub async fn get_field(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
|
||||||
source: &Label,
|
source: &Label,
|
||||||
key: &str,
|
key: &str,
|
||||||
path: &ObjectPath,
|
path: &ObjectPath,
|
||||||
@@ -284,11 +183,11 @@ impl Datasets {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let item = PileValue::Item(item);
|
let item = PileValue::Item(item);
|
||||||
let Some(value) = item.query(state, path).await? else {
|
let Some(value) = item.query(path).await? else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Some(value.to_json(state).await?))
|
Ok(Some(value.to_json().await?))
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -298,29 +197,11 @@ impl Datasets {
|
|||||||
/// Refresh this dataset's fts index.
|
/// Refresh this dataset's fts index.
|
||||||
pub async fn fts_refresh(
|
pub async fn fts_refresh(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
|
||||||
_threads: usize,
|
_threads: usize,
|
||||||
flag: Option<CancelFlag>,
|
flag: Option<CancelFlag>,
|
||||||
) -> Result<(), CancelableTaskError<DatasetError>> {
|
) -> Result<(), CancelableTaskError<DatasetError>> {
|
||||||
let start = Instant::now();
|
let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
|
||||||
let workdir = match self.path_workdir.as_ref() {
|
let fts_dir = self.path_workdir.join("fts");
|
||||||
Some(x) => x,
|
|
||||||
None => {
|
|
||||||
warn!("Skipping fts_refresh, no workdir");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let fts_tmp_dir = workdir.join(".tmp-fts");
|
|
||||||
let fts_dir = workdir.join("fts");
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Rebuilding fts index",
|
|
||||||
dataset = self.config.dataset.name.as_str(),
|
|
||||||
?fts_dir,
|
|
||||||
?fts_tmp_dir,
|
|
||||||
?workdir
|
|
||||||
);
|
|
||||||
|
|
||||||
if fts_tmp_dir.is_dir() {
|
if fts_tmp_dir.is_dir() {
|
||||||
warn!("Removing temporary index in {}", fts_dir.display());
|
warn!("Removing temporary index in {}", fts_dir.display());
|
||||||
@@ -348,7 +229,7 @@ impl Datasets {
|
|||||||
index_writer.add_document(doc).map_err(DatasetError::from)?;
|
index_writer.add_document(doc).map_err(DatasetError::from)?;
|
||||||
total += 1;
|
total += 1;
|
||||||
if logged_at.elapsed().as_secs() >= 5 {
|
if logged_at.elapsed().as_secs() >= 5 {
|
||||||
debug!("Indexed {total} documents");
|
debug!("Indexed {total} documents so far");
|
||||||
logged_at = Instant::now();
|
logged_at = Instant::now();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -361,20 +242,19 @@ impl Datasets {
|
|||||||
for (name, dataset) in &self.sources {
|
for (name, dataset) in &self.sources {
|
||||||
info!("Loading source {name}");
|
info!("Loading source {name}");
|
||||||
|
|
||||||
let stream = dataset.iter();
|
let mut stream = dataset.iter();
|
||||||
for item in stream {
|
while let Some(item_result) = stream.next().await {
|
||||||
if let Some(flag) = &flag
|
if let Some(flag) = &flag
|
||||||
&& flag.is_cancelled()
|
&& flag.is_cancelled()
|
||||||
{
|
{
|
||||||
return Err(CancelableTaskError::Cancelled);
|
return Err(CancelableTaskError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let item = item_result.map_err(DatasetError::from)?;
|
||||||
let db = Arc::clone(&db_index);
|
let db = Arc::clone(&db_index);
|
||||||
let state = state.clone();
|
|
||||||
let item = item.clone();
|
|
||||||
join_set.spawn(async move {
|
join_set.spawn(async move {
|
||||||
let key = item.key();
|
let key = item.key();
|
||||||
let result = db.entry_to_document(&state, &item).await;
|
let result = db.entry_to_document(&item).await;
|
||||||
(key, result)
|
(key, result)
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -401,18 +281,9 @@ impl Datasets {
|
|||||||
return Err(CancelableTaskError::Cancelled);
|
return Err(CancelableTaskError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info!("Committing {total} documents");
|
||||||
index_writer.commit().map_err(DatasetError::from)?;
|
index_writer.commit().map_err(DatasetError::from)?;
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Rebuilt fts index",
|
|
||||||
dataset = self.config.dataset.name.as_str(),
|
|
||||||
?fts_dir,
|
|
||||||
?fts_tmp_dir,
|
|
||||||
?workdir,
|
|
||||||
n_docs = total,
|
|
||||||
time_ms = start.elapsed().as_millis()
|
|
||||||
);
|
|
||||||
|
|
||||||
if fts_dir.is_dir() {
|
if fts_dir.is_dir() {
|
||||||
warn!("Removing existing index in {}", fts_dir.display());
|
warn!("Removing existing index in {}", fts_dir.display());
|
||||||
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
|
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
|
||||||
@@ -427,15 +298,7 @@ impl Datasets {
|
|||||||
query: &str,
|
query: &str,
|
||||||
top_n: usize,
|
top_n: usize,
|
||||||
) -> Result<Vec<FtsLookupResult>, DatasetError> {
|
) -> Result<Vec<FtsLookupResult>, DatasetError> {
|
||||||
let workdir = match self.path_workdir.as_ref() {
|
let fts_dir = self.path_workdir.join("fts");
|
||||||
Some(x) => x,
|
|
||||||
None => {
|
|
||||||
warn!("Skipping fts_lookup, no workdir");
|
|
||||||
return Ok(Vec::new());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let fts_dir = workdir.join("fts");
|
|
||||||
|
|
||||||
if !fts_dir.exists() {
|
if !fts_dir.exists() {
|
||||||
return Err(DatasetError::NoFtsIndex);
|
return Err(DatasetError::NoFtsIndex);
|
||||||
@@ -455,12 +318,7 @@ impl Datasets {
|
|||||||
|
|
||||||
/// Time at which fts was created
|
/// Time at which fts was created
|
||||||
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||||
let workdir = match self.path_workdir.as_ref() {
|
let fts_dir = self.path_workdir.join("fts");
|
||||||
Some(x) => x,
|
|
||||||
None => return Ok(None),
|
|
||||||
};
|
|
||||||
|
|
||||||
let fts_dir = workdir.join("fts");
|
|
||||||
|
|
||||||
if !fts_dir.exists() {
|
if !fts_dir.exists() {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
|
|||||||
@@ -1,8 +1,5 @@
|
|||||||
use pile_config::{ConfigToml, DatasetFts, Label, objectpath::ObjectPath};
|
use pile_config::{ConfigToml, DatasetFts, Label};
|
||||||
use pile_value::{
|
use pile_value::value::{Item, PileValue};
|
||||||
extract::traits::ExtractState,
|
|
||||||
value::{Item, PileValue},
|
|
||||||
};
|
|
||||||
use std::{path::PathBuf, sync::LazyLock};
|
use std::{path::PathBuf, sync::LazyLock};
|
||||||
use tantivy::{
|
use tantivy::{
|
||||||
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
|
||||||
@@ -10,7 +7,7 @@ use tantivy::{
|
|||||||
query::QueryParser,
|
query::QueryParser,
|
||||||
schema::{self, Schema, Value as TantivyValue},
|
schema::{self, Schema, Value as TantivyValue},
|
||||||
};
|
};
|
||||||
use tracing::warn;
|
use tracing::{debug, trace, warn};
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct FtsLookupResult {
|
pub struct FtsLookupResult {
|
||||||
@@ -66,7 +63,6 @@ impl DbFtsIndex {
|
|||||||
/// Turn an entry into a tantivy document
|
/// Turn an entry into a tantivy document
|
||||||
pub async fn entry_to_document(
|
pub async fn entry_to_document(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
|
||||||
item: &Item,
|
item: &Item,
|
||||||
) -> Result<Option<TantivyDocument>, TantivyError> {
|
) -> Result<Option<TantivyDocument>, TantivyError> {
|
||||||
let mut doc = TantivyDocument::default();
|
let mut doc = TantivyDocument::default();
|
||||||
@@ -79,12 +75,18 @@ impl DbFtsIndex {
|
|||||||
|
|
||||||
let mut empty = true;
|
let mut empty = true;
|
||||||
for name in self.fts_cfg().fields.keys() {
|
for name in self.fts_cfg().fields.keys() {
|
||||||
let vals = self.get_field(state, &item, name).await?;
|
let x = self.get_field(&item, name).await?;
|
||||||
let field = self.schema.get_field(name)?;
|
|
||||||
|
|
||||||
for v in vals {
|
let val = match x {
|
||||||
empty = false;
|
Some(x) => x,
|
||||||
doc.add_text(field, v);
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
empty = false;
|
||||||
|
|
||||||
|
let field = self.schema.get_field(name);
|
||||||
|
if let Ok(field) = field {
|
||||||
|
doc.add_text(field, val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,32 +99,112 @@ impl DbFtsIndex {
|
|||||||
|
|
||||||
pub async fn get_field(
|
pub async fn get_field(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
|
||||||
extractor: &PileValue,
|
extractor: &PileValue,
|
||||||
field_name: &Label,
|
field_name: &Label,
|
||||||
) -> Result<Vec<String>, std::io::Error> {
|
) -> Result<Option<String>, std::io::Error> {
|
||||||
let field = match self.cfg.schema.get(field_name) {
|
let field = match self.cfg.schema.get(field_name) {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => {
|
None => {
|
||||||
warn!("Unknown field {field_name:?}");
|
warn!("Unknown field {field_name:?}");
|
||||||
return Ok(Vec::new());
|
return Ok(None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Try paths in order, using the first value we find
|
// Try paths in order, using the first value we find
|
||||||
for path in field.path.as_slice() {
|
'outer: for path in field.path.as_slice() {
|
||||||
let val = match extractor.query(state, path).await? {
|
let val = match extractor.query(path).await? {
|
||||||
Some(PileValue::Null) | None => continue,
|
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
|
None => return Ok(None),
|
||||||
};
|
};
|
||||||
|
|
||||||
let val = val_to_string(state, &val, path, field_name).await?;
|
let mut val = match val {
|
||||||
if !val.is_empty() {
|
PileValue::Null => {
|
||||||
return Ok(val);
|
trace!(
|
||||||
|
message = "Skipping field, is null",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
// value = ?val
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
x => x.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
loop {
|
||||||
|
val = match val {
|
||||||
|
PileValue::String(x) => return Ok(Some(x.to_string())),
|
||||||
|
PileValue::U64(x) => return Ok(Some(x.to_string())),
|
||||||
|
PileValue::I64(x) => return Ok(Some(x.to_string())),
|
||||||
|
|
||||||
|
PileValue::Array(x) => {
|
||||||
|
if x.len() == 1 {
|
||||||
|
x[0].clone()
|
||||||
|
} else if x.len() > 1 {
|
||||||
|
debug!(
|
||||||
|
message = "Skipping field, is array with more than one element",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
} else {
|
||||||
|
debug!(
|
||||||
|
message = "Skipping field, is empty array",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PileValue::Null => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is null",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
PileValue::ObjectExtractor(_) => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is object",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
PileValue::Item(_) => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is item",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
PileValue::ListExtractor(_) => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is ListExtractor",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
PileValue::Blob { .. } => {
|
||||||
|
trace!(
|
||||||
|
message = "Skipping field, is blob",
|
||||||
|
field = field_name.to_string(),
|
||||||
|
?path,
|
||||||
|
);
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Ok(Vec::new());
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the given query on this table's ftx index.
|
/// Run the given query on this table's ftx index.
|
||||||
@@ -211,42 +293,3 @@ impl DbFtsIndex {
|
|||||||
return Ok(out);
|
return Ok(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn val_to_string(
|
|
||||||
state: &ExtractState,
|
|
||||||
val: &PileValue,
|
|
||||||
path: &ObjectPath,
|
|
||||||
field_name: &str,
|
|
||||||
) -> Result<Vec<String>, std::io::Error> {
|
|
||||||
match val {
|
|
||||||
PileValue::String(x) => return Ok(vec![x.to_string()]),
|
|
||||||
PileValue::U64(x) => return Ok(vec![x.to_string()]),
|
|
||||||
PileValue::I64(x) => return Ok(vec![x.to_string()]),
|
|
||||||
|
|
||||||
PileValue::Array(x) => {
|
|
||||||
let mut out = Vec::new();
|
|
||||||
for x in x.iter() {
|
|
||||||
out.extend(Box::pin(val_to_string(state, x, path, field_name)).await?);
|
|
||||||
}
|
|
||||||
return Ok(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
PileValue::ListExtractor(x) => {
|
|
||||||
let mut out = Vec::new();
|
|
||||||
let len = x.len(state).await?;
|
|
||||||
for i in 0..len {
|
|
||||||
let v = x.get(state, i).await?;
|
|
||||||
out.extend(Box::pin(val_to_string(state, &v.unwrap(), path, field_name)).await?);
|
|
||||||
}
|
|
||||||
return Ok(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
PileValue::Null => {}
|
|
||||||
PileValue::ObjectExtractor(_) => {}
|
|
||||||
PileValue::Item(_) => {}
|
|
||||||
PileValue::Binary(_) => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(Vec::new());
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -2,3 +2,6 @@ mod dataset;
|
|||||||
pub use dataset::{Dataset, DatasetError, Datasets};
|
pub use dataset::{Dataset, DatasetError, Datasets};
|
||||||
|
|
||||||
pub mod index;
|
pub mod index;
|
||||||
|
|
||||||
|
#[cfg(feature = "axum")]
|
||||||
|
pub mod serve;
|
||||||
|
|||||||
98
crates/pile-dataset/src/serve/field.rs
Normal file
98
crates/pile-dataset/src/serve/field.rs
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
use axum::{
|
||||||
|
Json,
|
||||||
|
extract::{Query, State},
|
||||||
|
http::{StatusCode, header},
|
||||||
|
response::{IntoResponse, Response},
|
||||||
|
};
|
||||||
|
use pile_config::{Label, objectpath::ObjectPath};
|
||||||
|
use pile_value::value::PileValue;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::{sync::Arc, time::Instant};
|
||||||
|
use tracing::debug;
|
||||||
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
|
use crate::Datasets;
|
||||||
|
|
||||||
|
#[derive(Deserialize, ToSchema)]
|
||||||
|
pub struct FieldQuery {
|
||||||
|
source: String,
|
||||||
|
key: String,
|
||||||
|
path: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract a specific field from an item's metadata
|
||||||
|
#[utoipa::path(
|
||||||
|
get,
|
||||||
|
path = "/field",
|
||||||
|
params(
|
||||||
|
("source" = String, Query, description = "Source label"),
|
||||||
|
("key" = String, Query, description = "Item key"),
|
||||||
|
("path" = String, Query, description = "Object path (e.g. $.flac.title)"),
|
||||||
|
),
|
||||||
|
responses(
|
||||||
|
(status = 200, description = "Field value as JSON"),
|
||||||
|
(status = 400, description = "Invalid source label or path"),
|
||||||
|
(status = 404, description = "Item or field not found"),
|
||||||
|
(status = 500, description = "Internal server error"),
|
||||||
|
)
|
||||||
|
)]
|
||||||
|
pub async fn get_field(
|
||||||
|
State(state): State<Arc<Datasets>>,
|
||||||
|
Query(params): Query<FieldQuery>,
|
||||||
|
) -> Response {
|
||||||
|
let start = Instant::now();
|
||||||
|
debug!(
|
||||||
|
message = "Serving /field",
|
||||||
|
source = params.source,
|
||||||
|
key = params.key,
|
||||||
|
path = params.path,
|
||||||
|
);
|
||||||
|
|
||||||
|
let label = match Label::try_from(params.source.clone()) {
|
||||||
|
Ok(l) => l,
|
||||||
|
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let path: ObjectPath = match params.path.parse() {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(item) = state.get(&label, &params.key).await else {
|
||||||
|
return StatusCode::NOT_FOUND.into_response();
|
||||||
|
};
|
||||||
|
|
||||||
|
let item = PileValue::Item(item);
|
||||||
|
let value = match item.query(&path).await {
|
||||||
|
Ok(Some(v)) => v,
|
||||||
|
Ok(None) => return StatusCode::NOT_FOUND.into_response(),
|
||||||
|
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
message = "Served /field",
|
||||||
|
source = params.source,
|
||||||
|
key = params.key,
|
||||||
|
path = params.path,
|
||||||
|
time_ms = start.elapsed().as_millis()
|
||||||
|
);
|
||||||
|
|
||||||
|
match value {
|
||||||
|
PileValue::String(s) => (
|
||||||
|
StatusCode::OK,
|
||||||
|
[(header::CONTENT_TYPE, "text/plain")],
|
||||||
|
s.to_string(),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
PileValue::Blob { mime, bytes } => (
|
||||||
|
StatusCode::OK,
|
||||||
|
[(header::CONTENT_TYPE, mime.to_string())],
|
||||||
|
bytes.as_ref().clone(),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
_ => match value.to_json().await {
|
||||||
|
Ok(json) => (StatusCode::OK, Json(json)).into_response(),
|
||||||
|
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
181
crates/pile-dataset/src/serve/item.rs
Normal file
181
crates/pile-dataset/src/serve/item.rs
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
use axum::{
|
||||||
|
body::Body,
|
||||||
|
extract::{Query, State},
|
||||||
|
http::{HeaderMap, StatusCode, header},
|
||||||
|
response::{IntoResponse, Response},
|
||||||
|
};
|
||||||
|
use pile_config::Label;
|
||||||
|
use pile_value::value::{AsyncReader, AsyncSeekReader};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::{io::SeekFrom, sync::Arc, time::Instant};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
|
use tracing::debug;
|
||||||
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
|
use crate::Datasets;
|
||||||
|
|
||||||
|
#[derive(Deserialize, ToSchema)]
|
||||||
|
pub struct ItemQuery {
|
||||||
|
source: String,
|
||||||
|
key: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a `Range: bytes=...` header value.
|
||||||
|
/// Returns `(start, end)` where either may be `None` (suffix form has `None` start).
|
||||||
|
fn parse_byte_range(s: &str) -> Option<(Option<u64>, Option<u64>)> {
|
||||||
|
let spec = s.strip_prefix("bytes=")?;
|
||||||
|
if spec.contains(',') {
|
||||||
|
return None; // multiple ranges not supported
|
||||||
|
}
|
||||||
|
if let Some(suffix) = spec.strip_prefix('-') {
|
||||||
|
return Some((None, Some(suffix.parse().ok()?)));
|
||||||
|
}
|
||||||
|
let mut parts = spec.splitn(2, '-');
|
||||||
|
let start: u64 = parts.next()?.parse().ok()?;
|
||||||
|
let end = parts
|
||||||
|
.next()
|
||||||
|
.and_then(|e| if e.is_empty() { None } else { e.parse().ok() });
|
||||||
|
Some((Some(start), end))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch the raw bytes of an item by source and key
|
||||||
|
#[utoipa::path(
|
||||||
|
get,
|
||||||
|
path = "/item",
|
||||||
|
params(
|
||||||
|
("source" = String, Query, description = "Source label"),
|
||||||
|
("key" = String, Query, description = "Item key"),
|
||||||
|
),
|
||||||
|
responses(
|
||||||
|
(status = 200, description = "Raw item bytes"),
|
||||||
|
(status = 206, description = "Partial content"),
|
||||||
|
(status = 400, description = "Invalid source label"),
|
||||||
|
(status = 404, description = "Item not found"),
|
||||||
|
(status = 416, description = "Range not satisfiable"),
|
||||||
|
(status = 500, description = "Internal server error"),
|
||||||
|
)
|
||||||
|
)]
|
||||||
|
pub async fn item_get(
|
||||||
|
State(state): State<Arc<Datasets>>,
|
||||||
|
Query(params): Query<ItemQuery>,
|
||||||
|
headers: HeaderMap,
|
||||||
|
) -> Response {
|
||||||
|
let start = Instant::now();
|
||||||
|
debug!(
|
||||||
|
message = "Serving /item",
|
||||||
|
source = params.source,
|
||||||
|
key = params.key
|
||||||
|
);
|
||||||
|
|
||||||
|
let label = match Label::try_from(params.source.clone()) {
|
||||||
|
Ok(l) => l,
|
||||||
|
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(item) = state.get(&label, &params.key).await else {
|
||||||
|
return StatusCode::NOT_FOUND.into_response();
|
||||||
|
};
|
||||||
|
|
||||||
|
let mime = item.mime().to_string();
|
||||||
|
|
||||||
|
let mut reader = match item.read().await {
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let total = match reader.seek(SeekFrom::End(0)).await {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let range = headers
|
||||||
|
.get(header::RANGE)
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(parse_byte_range);
|
||||||
|
|
||||||
|
// Resolve (byte_start, byte_end, content_length, is_range)
|
||||||
|
let (byte_start, byte_end, length, is_range) = match range {
|
||||||
|
Some((Some(s), e)) => {
|
||||||
|
let e = e
|
||||||
|
.unwrap_or(total.saturating_sub(1))
|
||||||
|
.min(total.saturating_sub(1));
|
||||||
|
if s >= total || s > e {
|
||||||
|
return (
|
||||||
|
StatusCode::RANGE_NOT_SATISFIABLE,
|
||||||
|
[(header::CONTENT_RANGE, format!("bytes */{total}"))],
|
||||||
|
)
|
||||||
|
.into_response();
|
||||||
|
}
|
||||||
|
(s, e, e - s + 1, true)
|
||||||
|
}
|
||||||
|
Some((None, Some(suffix))) => {
|
||||||
|
let s = total.saturating_sub(suffix);
|
||||||
|
let e = total.saturating_sub(1);
|
||||||
|
(s, e, total.saturating_sub(s), true)
|
||||||
|
}
|
||||||
|
_ => (0, total.saturating_sub(1), total, false),
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(e) = reader.seek(SeekFrom::Start(byte_start)).await {
|
||||||
|
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
message = "Served /item",
|
||||||
|
source = params.source,
|
||||||
|
key = params.key,
|
||||||
|
time_ms = start.elapsed().as_millis()
|
||||||
|
);
|
||||||
|
|
||||||
|
let (tx, rx) = mpsc::channel::<Result<Vec<u8>, std::io::Error>>(8);
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut buf = vec![0u8; 65536];
|
||||||
|
let mut remaining = length;
|
||||||
|
loop {
|
||||||
|
if remaining == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let to_read = (buf.len() as u64).min(remaining) as usize;
|
||||||
|
match reader.read(&mut buf[..to_read]).await {
|
||||||
|
Ok(0) => break,
|
||||||
|
Ok(n) => {
|
||||||
|
remaining -= n as u64;
|
||||||
|
if tx.send(Ok(buf[..n].to_vec())).await.is_err() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
let _ = tx.send(Err(e)).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let body = Body::from_stream(ReceiverStream::new(rx));
|
||||||
|
let status = if is_range {
|
||||||
|
StatusCode::PARTIAL_CONTENT
|
||||||
|
} else {
|
||||||
|
StatusCode::OK
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut builder = axum::http::Response::builder()
|
||||||
|
.status(status)
|
||||||
|
.header(header::CONTENT_TYPE, mime)
|
||||||
|
.header(header::ACCEPT_RANGES, "bytes")
|
||||||
|
.header(header::CONTENT_LENGTH, length);
|
||||||
|
|
||||||
|
if is_range {
|
||||||
|
builder = builder.header(
|
||||||
|
header::CONTENT_RANGE,
|
||||||
|
format!("bytes {byte_start}-{byte_end}/{total}"),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
builder
|
||||||
|
.body(body)
|
||||||
|
.map(IntoResponse::into_response)
|
||||||
|
.unwrap_or_else(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response())
|
||||||
|
}
|
||||||
@@ -4,12 +4,13 @@ use axum::{
|
|||||||
http::StatusCode,
|
http::StatusCode,
|
||||||
response::{IntoResponse, Response},
|
response::{IntoResponse, Response},
|
||||||
};
|
};
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{sync::Arc, time::Instant};
|
use std::{sync::Arc, time::Instant};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use utoipa::ToSchema;
|
use utoipa::ToSchema;
|
||||||
|
|
||||||
|
use crate::Datasets;
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, ToSchema, Debug)]
|
#[derive(Serialize, Deserialize, ToSchema, Debug)]
|
||||||
pub struct LookupRequest {
|
pub struct LookupRequest {
|
||||||
pub query: String,
|
pub query: String,
|
||||||
@@ -21,7 +22,6 @@ pub struct LookupRequest {
|
|||||||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
||||||
pub struct LookupResponse {
|
pub struct LookupResponse {
|
||||||
pub results: Vec<LookupResult>,
|
pub results: Vec<LookupResult>,
|
||||||
pub total: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
||||||
@@ -48,10 +48,13 @@ pub async fn lookup(
|
|||||||
Json(body): Json<LookupRequest>,
|
Json(body): Json<LookupRequest>,
|
||||||
) -> Response {
|
) -> Response {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let limit = body.limit.unwrap_or(128).min(1024);
|
debug!(
|
||||||
debug!(message = "Serving /lookup", query = body.query, limit);
|
message = "Serving /lookup",
|
||||||
|
query = body.query,
|
||||||
|
limit = body.limit.unwrap_or(10)
|
||||||
|
);
|
||||||
|
|
||||||
let results: Vec<LookupResult> = match state.fts_lookup(&body.query, limit) {
|
let results: Vec<LookupResult> = match state.fts_lookup(&body.query, body.limit.unwrap_or(10)) {
|
||||||
Ok(x) => x
|
Ok(x) => x
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|x| LookupResult {
|
.map(|x| LookupResult {
|
||||||
@@ -66,8 +69,6 @@ pub async fn lookup(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let total: u64 = state.sources.iter().map(|x| x.1.len() as u64).sum();
|
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
message = "Served /lookup",
|
message = "Served /lookup",
|
||||||
query = body.query,
|
query = body.query,
|
||||||
@@ -75,5 +76,5 @@ pub async fn lookup(
|
|||||||
time_ms = start.elapsed().as_millis()
|
time_ms = start.elapsed().as_millis()
|
||||||
);
|
);
|
||||||
|
|
||||||
return (StatusCode::OK, Json(LookupResponse { results, total })).into_response();
|
return (StatusCode::OK, Json(LookupResponse { results })).into_response();
|
||||||
}
|
}
|
||||||
47
crates/pile-dataset/src/serve/mod.rs
Normal file
47
crates/pile-dataset/src/serve/mod.rs
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
use axum::{
|
||||||
|
Router,
|
||||||
|
routing::{get, post},
|
||||||
|
};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use utoipa::OpenApi;
|
||||||
|
use utoipa_swagger_ui::SwaggerUi;
|
||||||
|
|
||||||
|
use crate::Datasets;
|
||||||
|
|
||||||
|
mod lookup;
|
||||||
|
pub use lookup::*;
|
||||||
|
|
||||||
|
mod item;
|
||||||
|
pub use item::*;
|
||||||
|
|
||||||
|
mod field;
|
||||||
|
pub use field::*;
|
||||||
|
|
||||||
|
#[derive(OpenApi)]
|
||||||
|
#[openapi(
|
||||||
|
tags(),
|
||||||
|
paths(lookup, item_get, get_field),
|
||||||
|
components(schemas(LookupRequest, LookupResponse, LookupResult, ItemQuery, FieldQuery))
|
||||||
|
)]
|
||||||
|
pub(crate) struct Api;
|
||||||
|
|
||||||
|
impl Datasets {
|
||||||
|
#[inline]
|
||||||
|
pub fn router(self: Arc<Self>, with_docs: bool) -> Router<()> {
|
||||||
|
let mut router = Router::new()
|
||||||
|
.route("/lookup", post(lookup))
|
||||||
|
.route("/item", get(item_get))
|
||||||
|
.route("/field", get(get_field))
|
||||||
|
.with_state(self.clone());
|
||||||
|
|
||||||
|
if with_docs {
|
||||||
|
let docs_path = "/docs";
|
||||||
|
let docs = SwaggerUi::new(docs_path)
|
||||||
|
.url(format!("{}/openapi.json", docs_path), Api::openapi());
|
||||||
|
|
||||||
|
router = router.merge(docs);
|
||||||
|
}
|
||||||
|
|
||||||
|
router
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,20 +1,10 @@
|
|||||||
use std::io::{ErrorKind, Read, Seek, SeekFrom};
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
FlacBlock, FlacDecodeError,
|
FlacBlock, FlacDecodeError,
|
||||||
blocks::{FlacAudioFrame, FlacMetablockHeader, FlacMetablockType},
|
blocks::{FlacAudioFrame, FlacMetablockHeader, FlacMetablockType},
|
||||||
};
|
};
|
||||||
|
|
||||||
fn read_exact_flac<R: Read>(reader: &mut R, buf: &mut [u8]) -> Result<(), FlacDecodeError> {
|
|
||||||
reader.read_exact(buf).map_err(|e| {
|
|
||||||
if e.kind() == ErrorKind::UnexpectedEof {
|
|
||||||
FlacDecodeError::MalformedBlock
|
|
||||||
} else {
|
|
||||||
e.into()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: quickly skip blocks we do not need
|
// TODO: quickly skip blocks we do not need
|
||||||
|
|
||||||
/// The next block we expect to read
|
/// The next block we expect to read
|
||||||
@@ -52,9 +42,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
|
|||||||
|
|
||||||
ReaderState::MagicBits => {
|
ReaderState::MagicBits => {
|
||||||
let mut data = [0u8; 4];
|
let mut data = [0u8; 4];
|
||||||
if let Err(e) = read_exact_flac(&mut self.inner, &mut data[..4]) {
|
if let Err(e) = self.inner.read_exact(&mut data[..4]) {
|
||||||
self.state = ReaderState::Done;
|
self.state = ReaderState::Done;
|
||||||
return Some(Err(e));
|
return Some(Err(e.into()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if data != [0x66, 0x4C, 0x61, 0x43] {
|
if data != [0x66, 0x4C, 0x61, 0x43] {
|
||||||
@@ -67,9 +57,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
|
|||||||
|
|
||||||
ReaderState::MetablockHeader { is_first } => {
|
ReaderState::MetablockHeader { is_first } => {
|
||||||
let mut data = [0u8; 4];
|
let mut data = [0u8; 4];
|
||||||
if let Err(e) = read_exact_flac(&mut self.inner, &mut data[..]) {
|
if let Err(e) = self.inner.read_exact(&mut data[..]) {
|
||||||
self.state = ReaderState::Done;
|
self.state = ReaderState::Done;
|
||||||
return Some(Err(e));
|
return Some(Err(e.into()));
|
||||||
}
|
}
|
||||||
|
|
||||||
let header = match FlacMetablockHeader::decode(&data) {
|
let header = match FlacMetablockHeader::decode(&data) {
|
||||||
@@ -90,9 +80,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
|
|||||||
|
|
||||||
ReaderState::MetaBlock { header } => {
|
ReaderState::MetaBlock { header } => {
|
||||||
let mut data = vec![0u8; header.length as usize];
|
let mut data = vec![0u8; header.length as usize];
|
||||||
if let Err(e) = read_exact_flac(&mut self.inner, &mut data) {
|
if let Err(e) = self.inner.read_exact(&mut data) {
|
||||||
self.state = ReaderState::Done;
|
self.state = ReaderState::Done;
|
||||||
return Some(Err(e));
|
return Some(Err(e.into()));
|
||||||
}
|
}
|
||||||
|
|
||||||
let block = match FlacBlock::decode(header.block_type, &data) {
|
let block = match FlacBlock::decode(header.block_type, &data) {
|
||||||
|
|||||||
@@ -1,11 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "pile-io"
|
|
||||||
version = { workspace = true }
|
|
||||||
rust-version = { workspace = true }
|
|
||||||
edition = { workspace = true }
|
|
||||||
|
|
||||||
[lints]
|
|
||||||
workspace = true
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
tokio = { workspace = true }
|
|
||||||
@@ -1,75 +0,0 @@
|
|||||||
use std::io::{Read, Seek, SeekFrom};
|
|
||||||
use tokio::runtime::Handle;
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: asyncreader
|
|
||||||
//
|
|
||||||
|
|
||||||
/// Asynchronous counterpart of [std::io::Read].
pub trait AsyncReader: Send {
    /// Read a chunk of bytes into `buf`.
    ///
    /// [AsyncReader::read_to_end] treats a result of `0` as end of input.
    fn read(
        &mut self,
        buf: &mut [u8],
    ) -> impl Future<Output = Result<usize, std::io::Error>> + Send;

    /// Read all remaining bytes into a `Vec`.
    ///
    /// Reads in 64 KiB chunks until [AsyncReader::read] yields `0`;
    /// the first error aborts the read and is returned.
    fn read_to_end(&mut self) -> impl Future<Output = Result<Vec<u8>, std::io::Error>> + Send {
        async {
            let mut collected = Vec::new();
            let mut scratch = vec![0u8; 65536];

            loop {
                match self.read(&mut scratch).await? {
                    0 => break,
                    len => collected.extend_from_slice(&scratch[..len]),
                }
            }

            Ok(collected)
        }
    }
}
|
|
||||||
|
|
||||||
/// An `async` equivalent of [std::io::Read] + [std::io::Seek].
|
|
||||||
pub trait AsyncSeekReader: AsyncReader {
|
|
||||||
fn seek(&mut self, pos: SeekFrom) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: sync bridge
|
|
||||||
//
|
|
||||||
|
|
||||||
/// Turn an async [Reader] into a sync [Read] + [Seek].
|
|
||||||
///
|
|
||||||
/// Never use this outside of [tokio::task::spawn_blocking],
|
|
||||||
/// the async runtime will deadlock if this struct blocks
|
|
||||||
/// the runtime.
|
|
||||||
pub struct SyncReadBridge<R: AsyncReader> {
|
|
||||||
inner: R,
|
|
||||||
handle: Handle,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: AsyncReader> SyncReadBridge<R> {
|
|
||||||
/// Creates a new adapter using a handle to the current runtime.
|
|
||||||
/// Panics if called outside of a tokio context.
|
|
||||||
pub fn new_current(inner: R) -> Self {
|
|
||||||
Self::new(inner, Handle::current())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a new adapter using a handle to an existing runtime.
|
|
||||||
pub fn new(inner: R, handle: Handle) -> Self {
|
|
||||||
Self { inner, handle }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: AsyncReader> Read for SyncReadBridge<R> {
    /// Block the calling thread on the runtime handle until the async read resolves.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        let pending = self.inner.read(buf);
        self.handle.block_on(pending)
    }
}
|
|
||||||
|
|
||||||
impl<R: AsyncReader + AsyncSeekReader> Seek for SyncReadBridge<R> {
|
|
||||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
|
||||||
self.handle.block_on(self.inner.seek(pos))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
mod asyncreader;
|
|
||||||
pub use asyncreader::*;
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "pile-serve"
|
|
||||||
version = { workspace = true }
|
|
||||||
rust-version = { workspace = true }
|
|
||||||
edition = { workspace = true }
|
|
||||||
|
|
||||||
[lints]
|
|
||||||
workspace = true
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
pile-config = { workspace = true }
|
|
||||||
pile-value = { workspace = true }
|
|
||||||
pile-dataset = { workspace = true }
|
|
||||||
|
|
||||||
serde_json = { workspace = true }
|
|
||||||
tracing = { workspace = true }
|
|
||||||
tokio = { workspace = true }
|
|
||||||
tokio-util = { version = "0.7", features = ["io"] }
|
|
||||||
|
|
||||||
serde = { workspace = true }
|
|
||||||
axum = { workspace = true }
|
|
||||||
percent-encoding = { workspace = true }
|
|
||||||
utoipa = { workspace = true }
|
|
||||||
utoipa-swagger-ui = { workspace = true }
|
|
||||||
|
|
||||||
[features]
|
|
||||||
default = []
|
|
||||||
pdfium = ["pile-value/pdfium"]
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Json,
|
|
||||||
extract::State,
|
|
||||||
http::StatusCode,
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use std::{collections::HashMap, sync::Arc};
|
|
||||||
|
|
||||||
pub use pile_config::FieldSpec;
|
|
||||||
|
|
||||||
pub type FieldsResponse = HashMap<String, FieldSpec>;
|
|
||||||
|
|
||||||
/// Retrieve this dataset's schema.
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/config/schema",
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "This dataset's schema"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn config_schema(State(state): State<Arc<Datasets>>) -> Response {
|
|
||||||
let fields: FieldsResponse = state
|
|
||||||
.config
|
|
||||||
.schema
|
|
||||||
.iter()
|
|
||||||
.map(|(k, v)| (k.as_str().to_owned(), v.clone()))
|
|
||||||
.collect();
|
|
||||||
(StatusCode::OK, Json(fields)).into_response()
|
|
||||||
}
|
|
||||||
@@ -1,190 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Json,
|
|
||||||
body::Body,
|
|
||||||
extract::{Query, RawQuery, State},
|
|
||||||
http::{StatusCode, header},
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use percent_encoding::percent_decode_str;
|
|
||||||
use pile_config::{Label, objectpath::ObjectPath};
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use pile_value::{
|
|
||||||
extract::traits::ExtractState,
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
use serde::Deserialize;
|
|
||||||
use std::{sync::Arc, time::Instant};
|
|
||||||
use tokio_util::io::ReaderStream;
|
|
||||||
use tracing::debug;
|
|
||||||
use utoipa::ToSchema;
|
|
||||||
|
|
||||||
#[derive(Deserialize, ToSchema)]
|
|
||||||
pub struct ExtractQuery {
|
|
||||||
source: String,
|
|
||||||
key: String,
|
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
download: bool,
|
|
||||||
name: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract a specific field from an item's metadata.
|
|
||||||
/// Multiple `path` parameters may be provided; the first non-null result is returned.
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/extract",
|
|
||||||
params(
|
|
||||||
("source" = String, Query, description = "Source label"),
|
|
||||||
("key" = String, Query, description = "Item key"),
|
|
||||||
("path" = String, Query, description = "Object path (e.g. $.flac.title); repeat for fallbacks"),
|
|
||||||
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
|
|
||||||
),
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "Field value as JSON"),
|
|
||||||
(status = 400, description = "Invalid source label or path"),
|
|
||||||
(status = 404, description = "Item or field not found"),
|
|
||||||
(status = 500, description = "Internal server error"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn get_extract(
|
|
||||||
State(state): State<Arc<Datasets>>,
|
|
||||||
Query(params): Query<ExtractQuery>,
|
|
||||||
RawQuery(raw_query): RawQuery,
|
|
||||||
) -> Response {
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let label = match Label::try_from(params.source.clone()) {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
// Collect all `path` query params in order (supports repeated ?path=...&path=...)
|
|
||||||
let raw = raw_query.as_deref().unwrap_or("");
|
|
||||||
let paths: Vec<ObjectPath> = {
|
|
||||||
let mut result = Vec::new();
|
|
||||||
for part in raw.split('&') {
|
|
||||||
if let Some((k, v)) = part.split_once('=')
|
|
||||||
&& k == "path"
|
|
||||||
{
|
|
||||||
let v = percent_decode_str(v).decode_utf8_lossy();
|
|
||||||
match v.parse::<ObjectPath>() {
|
|
||||||
Ok(p) => result.push(p),
|
|
||||||
Err(e) => {
|
|
||||||
return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result
|
|
||||||
};
|
|
||||||
|
|
||||||
if paths.is_empty() {
|
|
||||||
return (StatusCode::BAD_REQUEST, "Missing `path` query parameter").into_response();
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Serving /extract",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key,
|
|
||||||
paths = paths.len(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
let extract_state = ExtractState { ignore_mime: false };
|
|
||||||
let item = PileValue::Item(item);
|
|
||||||
|
|
||||||
// Try each path in order, returning the first non-null result
|
|
||||||
let mut value = None;
|
|
||||||
for path in &paths {
|
|
||||||
match item.query(&extract_state, path).await {
|
|
||||||
Ok(None) => continue,
|
|
||||||
|
|
||||||
Ok(Some(PileValue::Null)) => {
|
|
||||||
value = Some(PileValue::Null);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Some(v)) => {
|
|
||||||
value = Some(v);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(value) = value else {
|
|
||||||
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Served /extract",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key,
|
|
||||||
time_ms = start.elapsed().as_millis()
|
|
||||||
);
|
|
||||||
|
|
||||||
let disposition_type = if params.download {
|
|
||||||
"attachment"
|
|
||||||
} else {
|
|
||||||
"inline"
|
|
||||||
};
|
|
||||||
let file_name = params.name.unwrap_or_else(|| {
|
|
||||||
params
|
|
||||||
.key
|
|
||||||
.rsplit('/')
|
|
||||||
.next()
|
|
||||||
.unwrap_or(¶ms.key)
|
|
||||||
.to_owned()
|
|
||||||
});
|
|
||||||
let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
|
|
||||||
|
|
||||||
match value {
|
|
||||||
PileValue::String(s) => (
|
|
||||||
StatusCode::OK,
|
|
||||||
[
|
|
||||||
(header::CONTENT_TYPE, "text/plain".to_owned()),
|
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
|
||||||
],
|
|
||||||
s.to_string(),
|
|
||||||
)
|
|
||||||
.into_response(),
|
|
||||||
|
|
||||||
PileValue::Binary(binary) => {
|
|
||||||
let mime = binary.mime().to_string();
|
|
||||||
let body = match binary {
|
|
||||||
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
|
||||||
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
|
||||||
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
|
||||||
Err(e) => {
|
|
||||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
|
||||||
.into_response();
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
|
||||||
(
|
|
||||||
StatusCode::OK,
|
|
||||||
[
|
|
||||||
(header::CONTENT_TYPE, mime),
|
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
|
||||||
],
|
|
||||||
body,
|
|
||||||
)
|
|
||||||
.into_response()
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => match value.to_json(&extract_state).await {
|
|
||||||
Ok(json) => (
|
|
||||||
StatusCode::OK,
|
|
||||||
[(header::CONTENT_DISPOSITION, disposition)],
|
|
||||||
Json(json),
|
|
||||||
)
|
|
||||||
.into_response(),
|
|
||||||
|
|
||||||
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,103 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Json,
|
|
||||||
extract::{Query, State},
|
|
||||||
http::StatusCode,
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::sync::Arc;
|
|
||||||
use tracing::debug;
|
|
||||||
use utoipa::ToSchema;
|
|
||||||
|
|
||||||
#[derive(Deserialize, ToSchema)]
|
|
||||||
pub struct ItemsQuery {
|
|
||||||
#[serde(default)]
|
|
||||||
offset: usize,
|
|
||||||
#[serde(default = "default_limit")]
|
|
||||||
limit: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Page size used when a request does not specify `limit`.
fn default_limit() -> usize {
    const DEFAULT_PAGE_SIZE: usize = 100;
    DEFAULT_PAGE_SIZE
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
|
||||||
pub struct ItemsResponse {
|
|
||||||
pub items: Vec<ItemRef>,
|
|
||||||
pub total: usize,
|
|
||||||
pub offset: usize,
|
|
||||||
pub limit: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
|
||||||
pub struct ItemRef {
|
|
||||||
pub source: String,
|
|
||||||
pub key: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// List all items across all sources with consistent ordering, paginated by offset and limit
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/items",
|
|
||||||
params(
|
|
||||||
("offset" = usize, Query, description = "Number of items to skip"),
|
|
||||||
("limit" = usize, Query, description = "Maximum number of items to return (max 1000)"),
|
|
||||||
),
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "Paginated list of items", body = ItemsResponse),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn items_list(
|
|
||||||
State(state): State<Arc<Datasets>>,
|
|
||||||
Query(params): Query<ItemsQuery>,
|
|
||||||
) -> Response {
|
|
||||||
let limit = params.limit.min(1000);
|
|
||||||
let offset = params.offset;
|
|
||||||
|
|
||||||
debug!(message = "Serving /items", offset, limit);
|
|
||||||
|
|
||||||
// Sort sources by label for a consistent global order: (source, key)
|
|
||||||
let mut source_labels: Vec<_> = state.sources.keys().collect();
|
|
||||||
source_labels.sort();
|
|
||||||
|
|
||||||
let mut items: Vec<ItemRef> = Vec::with_capacity(limit);
|
|
||||||
let mut total = 0usize;
|
|
||||||
let mut remaining_offset = offset;
|
|
||||||
|
|
||||||
for label in source_labels {
|
|
||||||
let dataset = &state.sources[label];
|
|
||||||
let source_len = dataset.len();
|
|
||||||
|
|
||||||
if remaining_offset >= source_len {
|
|
||||||
// This entire source is before our window; skip it efficiently
|
|
||||||
remaining_offset -= source_len;
|
|
||||||
total += source_len;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let want = (limit - items.len()).min(source_len - remaining_offset);
|
|
||||||
let source_str = label.as_str().to_owned();
|
|
||||||
for item in dataset.iter_page(remaining_offset, want) {
|
|
||||||
items.push(ItemRef {
|
|
||||||
source: source_str.clone(),
|
|
||||||
key: item.key().to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
remaining_offset = 0;
|
|
||||||
total += source_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(message = "Served /items", offset, limit, total);
|
|
||||||
|
|
||||||
(
|
|
||||||
StatusCode::OK,
|
|
||||||
Json(ItemsResponse {
|
|
||||||
items,
|
|
||||||
total,
|
|
||||||
offset,
|
|
||||||
limit,
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
.into_response()
|
|
||||||
}
|
|
||||||
@@ -1,89 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Router,
|
|
||||||
routing::{get, post},
|
|
||||||
};
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use std::sync::Arc;
|
|
||||||
use utoipa::OpenApi;
|
|
||||||
use utoipa_swagger_ui::SwaggerUi;
|
|
||||||
|
|
||||||
mod lookup;
|
|
||||||
pub use lookup::*;
|
|
||||||
|
|
||||||
mod extract;
|
|
||||||
pub use extract::*;
|
|
||||||
|
|
||||||
mod items;
|
|
||||||
pub use items::*;
|
|
||||||
|
|
||||||
mod config_schema;
|
|
||||||
pub use config_schema::*;
|
|
||||||
|
|
||||||
mod schema_field;
|
|
||||||
pub use schema_field::*;
|
|
||||||
|
|
||||||
mod schema;
|
|
||||||
pub use schema::*;
|
|
||||||
|
|
||||||
#[derive(OpenApi)]
|
|
||||||
#[openapi(
|
|
||||||
tags(),
|
|
||||||
paths(
|
|
||||||
lookup,
|
|
||||||
get_extract,
|
|
||||||
items_list,
|
|
||||||
config_schema,
|
|
||||||
schema_field,
|
|
||||||
schema_all
|
|
||||||
),
|
|
||||||
components(schemas(
|
|
||||||
LookupRequest,
|
|
||||||
LookupResponse,
|
|
||||||
LookupResult,
|
|
||||||
ExtractQuery,
|
|
||||||
ItemsQuery,
|
|
||||||
ItemsResponse,
|
|
||||||
ItemRef
|
|
||||||
))
|
|
||||||
)]
|
|
||||||
pub(crate) struct Api;
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn router(ds: Arc<Datasets>, with_docs: bool) -> Router<()> {
|
|
||||||
router_prefix(ds, with_docs, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn router_prefix(ds: Arc<Datasets>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
|
|
||||||
let mut router = Router::new()
|
|
||||||
.route("/lookup", post(lookup))
|
|
||||||
.route("/extract", get(get_extract))
|
|
||||||
.route("/items", get(items_list))
|
|
||||||
.route("/config/schema", get(config_schema))
|
|
||||||
.route("/schema", get(schema_all))
|
|
||||||
.route("/schema/{field}", get(schema_field))
|
|
||||||
.with_state(ds.clone());
|
|
||||||
|
|
||||||
if let Some(prefix) = prefix {
|
|
||||||
router = Router::new().nest(prefix, router);
|
|
||||||
}
|
|
||||||
|
|
||||||
if with_docs {
|
|
||||||
let docs_path = match prefix {
|
|
||||||
None => "/docs".into(),
|
|
||||||
Some(prefix) => format!("{prefix}/docs"),
|
|
||||||
};
|
|
||||||
|
|
||||||
let api = Api::openapi();
|
|
||||||
let api = match prefix {
|
|
||||||
None => api,
|
|
||||||
Some(prefix) => utoipa::openapi::OpenApi::default().nest(prefix, api),
|
|
||||||
};
|
|
||||||
|
|
||||||
let docs =
|
|
||||||
SwaggerUi::new(docs_path.clone()).url(format!("{}/openapi.json", docs_path), api);
|
|
||||||
|
|
||||||
router = router.merge(docs);
|
|
||||||
}
|
|
||||||
router
|
|
||||||
}
|
|
||||||
@@ -1,129 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Json,
|
|
||||||
extract::{Query, State},
|
|
||||||
http::StatusCode,
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::{collections::HashMap, sync::Arc};
|
|
||||||
use utoipa::IntoParams;
|
|
||||||
|
|
||||||
#[derive(Deserialize, IntoParams)]
|
|
||||||
pub struct SchemaQuery {
|
|
||||||
source: String,
|
|
||||||
key: String,
|
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
hidden: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
#[serde(untagged)]
|
|
||||||
pub enum ApiValue {
|
|
||||||
Binary { binary: bool, mime: String },
|
|
||||||
Object { object: bool },
|
|
||||||
Array(Vec<ApiValue>),
|
|
||||||
String(String),
|
|
||||||
Number(serde_json::Number),
|
|
||||||
Null,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub type SchemaResponse = HashMap<String, ApiValue>;
|
|
||||||
|
|
||||||
async fn pile_value_to_api(
|
|
||||||
state: &ExtractState,
|
|
||||||
value: PileValue,
|
|
||||||
) -> Result<ApiValue, std::io::Error> {
|
|
||||||
match value {
|
|
||||||
PileValue::String(s) => Ok(ApiValue::String(s.to_string())),
|
|
||||||
PileValue::U64(n) => Ok(ApiValue::Number(n.into())),
|
|
||||||
PileValue::I64(n) => Ok(ApiValue::Number(n.into())),
|
|
||||||
PileValue::Null => Ok(ApiValue::Null),
|
|
||||||
|
|
||||||
PileValue::Binary(x) => Ok(ApiValue::Binary {
|
|
||||||
binary: true,
|
|
||||||
mime: x.mime().to_string(),
|
|
||||||
}),
|
|
||||||
|
|
||||||
PileValue::Array(arr) => {
|
|
||||||
let mut out = Vec::with_capacity(arr.len());
|
|
||||||
for item in arr.iter() {
|
|
||||||
out.push(Box::pin(pile_value_to_api(state, item.clone())).await?);
|
|
||||||
}
|
|
||||||
Ok(ApiValue::Array(out))
|
|
||||||
}
|
|
||||||
|
|
||||||
PileValue::ObjectExtractor(_) | PileValue::ListExtractor(_) | PileValue::Item(_) => {
|
|
||||||
Ok(ApiValue::Object { object: true })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get all schema field values for a single item.
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/schema",
|
|
||||||
params(
|
|
||||||
("source" = String, Query, description = "Source label"),
|
|
||||||
("key" = String, Query, description = "Item key"),
|
|
||||||
("hidden" = bool, Query, description = "Include hidden fields (default: false)"),
|
|
||||||
),
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "Schema field values as a map of label to value"),
|
|
||||||
(status = 400, description = "Invalid source label"),
|
|
||||||
(status = 404, description = "Item not found"),
|
|
||||||
(status = 500, description = "Internal server error"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn schema_all(
|
|
||||||
State(state): State<Arc<Datasets>>,
|
|
||||||
Query(params): Query<SchemaQuery>,
|
|
||||||
) -> Response {
|
|
||||||
let label = match Label::try_from(params.source.clone()) {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
let extract_state = ExtractState { ignore_mime: false };
|
|
||||||
let item = PileValue::Item(item);
|
|
||||||
|
|
||||||
let mut result: SchemaResponse = HashMap::new();
|
|
||||||
|
|
||||||
for (field_label, field_spec) in &state.config.schema {
|
|
||||||
if field_spec.hidden && !params.hidden {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut value = None;
|
|
||||||
for path in &field_spec.path {
|
|
||||||
match item.query(&extract_state, path).await {
|
|
||||||
Ok(Some(PileValue::Null)) | Ok(None) => continue,
|
|
||||||
Ok(Some(v)) => {
|
|
||||||
value = Some(v);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(v) = value else { continue };
|
|
||||||
|
|
||||||
let api_value = match pile_value_to_api(&extract_state, v).await {
|
|
||||||
Ok(v) => v,
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
result.insert(field_label.as_str().to_owned(), api_value);
|
|
||||||
}
|
|
||||||
|
|
||||||
(StatusCode::OK, Json(result)).into_response()
|
|
||||||
}
|
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
use axum::{
|
|
||||||
Json,
|
|
||||||
body::Body,
|
|
||||||
extract::{Path, Query, State},
|
|
||||||
http::{StatusCode, header},
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
};
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use pile_value::{
|
|
||||||
extract::traits::ExtractState,
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
use serde::Deserialize;
|
|
||||||
use std::{sync::Arc, time::Instant};
|
|
||||||
use tokio_util::io::ReaderStream;
|
|
||||||
use tracing::debug;
|
|
||||||
use utoipa::IntoParams;
|
|
||||||
|
|
||||||
#[derive(Deserialize, IntoParams)]
|
|
||||||
pub struct SchemaFieldQuery {
|
|
||||||
source: String,
|
|
||||||
key: String,
|
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
download: bool,
|
|
||||||
name: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract a specific schema field from an item's metadata.
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/schema/{field}",
|
|
||||||
params(
|
|
||||||
("field" = String, Path, description = "Schema field"),
|
|
||||||
("source" = String, Query, description = "Source label"),
|
|
||||||
("key" = String, Query, description = "Item key"),
|
|
||||||
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
|
|
||||||
),
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "Field value as JSON"),
|
|
||||||
(status = 400, description = "Invalid source label or path"),
|
|
||||||
(status = 404, description = "Item or field not found"),
|
|
||||||
(status = 500, description = "Internal server error"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn schema_field(
|
|
||||||
State(state): State<Arc<Datasets>>,
|
|
||||||
Path(field): Path<String>,
|
|
||||||
Query(params): Query<SchemaFieldQuery>,
|
|
||||||
) -> Response {
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let label = match Label::try_from(params.source.clone()) {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Serving /schema/{field}",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key,
|
|
||||||
field = field,
|
|
||||||
);
|
|
||||||
|
|
||||||
let Some(item) = state.get(&label, ¶ms.key).await else {
|
|
||||||
return StatusCode::NOT_FOUND.into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
let field_label = match Label::new(&field) {
|
|
||||||
Some(x) => x,
|
|
||||||
None => return StatusCode::NOT_FOUND.into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let paths = match state.config.schema.get(&field_label) {
|
|
||||||
Some(x) => &x.path,
|
|
||||||
None => return StatusCode::NOT_FOUND.into_response(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let extract_state = ExtractState { ignore_mime: false };
|
|
||||||
let item = PileValue::Item(item);
|
|
||||||
|
|
||||||
let mut value = None;
|
|
||||||
for path in paths {
|
|
||||||
match item.query(&extract_state, path).await {
|
|
||||||
Ok(None) => continue,
|
|
||||||
|
|
||||||
Ok(Some(PileValue::Null)) => {
|
|
||||||
value = Some(PileValue::Null);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Some(v)) => {
|
|
||||||
value = Some(v);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(value) = value else {
|
|
||||||
return (StatusCode::BAD_REQUEST, "no value").into_response();
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
message = "Served /schema/{field}",
|
|
||||||
source = params.source,
|
|
||||||
key = params.key,
|
|
||||||
field = field,
|
|
||||||
time_ms = start.elapsed().as_millis()
|
|
||||||
);
|
|
||||||
|
|
||||||
let disposition_type = if params.download {
|
|
||||||
"attachment"
|
|
||||||
} else {
|
|
||||||
"inline"
|
|
||||||
};
|
|
||||||
let file_name = params.name.unwrap_or_else(|| {
|
|
||||||
params
|
|
||||||
.key
|
|
||||||
.rsplit('/')
|
|
||||||
.next()
|
|
||||||
.unwrap_or(¶ms.key)
|
|
||||||
.to_owned()
|
|
||||||
});
|
|
||||||
let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
|
|
||||||
|
|
||||||
match value {
|
|
||||||
PileValue::String(s) => (
|
|
||||||
StatusCode::OK,
|
|
||||||
[
|
|
||||||
(header::CONTENT_TYPE, "text/plain".to_owned()),
|
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
|
||||||
],
|
|
||||||
s.to_string(),
|
|
||||||
)
|
|
||||||
.into_response(),
|
|
||||||
|
|
||||||
PileValue::Binary(binary) => {
|
|
||||||
let mime = binary.mime().to_string();
|
|
||||||
let body = match binary {
|
|
||||||
BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
|
|
||||||
BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
|
|
||||||
Ok(file) => Body::from_stream(ReaderStream::new(file)),
|
|
||||||
Err(e) => {
|
|
||||||
return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
|
|
||||||
.into_response();
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
|
||||||
(
|
|
||||||
StatusCode::OK,
|
|
||||||
[
|
|
||||||
(header::CONTENT_TYPE, mime),
|
|
||||||
(header::CONTENT_DISPOSITION, disposition),
|
|
||||||
],
|
|
||||||
body,
|
|
||||||
)
|
|
||||||
.into_response()
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => match value.to_json(&extract_state).await {
|
|
||||||
Ok(json) => (
|
|
||||||
StatusCode::OK,
|
|
||||||
[(header::CONTENT_DISPOSITION, disposition)],
|
|
||||||
Json(json),
|
|
||||||
)
|
|
||||||
.into_response(),
|
|
||||||
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -80,12 +80,10 @@ impl CancelFlag {
|
|||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub async fn await_cancel(&self) {
|
pub async fn await_cancel(&self) {
|
||||||
let notified = self.notify.notified();
|
if self.is_cancelled() {
|
||||||
tokio::pin!(notified);
|
return;
|
||||||
notified.as_mut().enable();
|
|
||||||
if !self.is_cancelled() {
|
|
||||||
notified.await;
|
|
||||||
}
|
}
|
||||||
|
self.notify.notified().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
|||||||
@@ -8,38 +8,29 @@ edition = { workspace = true }
|
|||||||
workspace = true
|
workspace = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
pile-io = { workspace = true }
|
|
||||||
pile-config = { workspace = true }
|
pile-config = { workspace = true }
|
||||||
pile-flac = { workspace = true }
|
pile-flac = { workspace = true }
|
||||||
|
|
||||||
anyhow = { workspace = true }
|
|
||||||
serde_json = { workspace = true }
|
serde_json = { workspace = true }
|
||||||
walkdir = { workspace = true }
|
walkdir = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
toml = { workspace = true }
|
toml = { workspace = true }
|
||||||
smartstring = { workspace = true }
|
smartstring = { workspace = true }
|
||||||
regex = { workspace = true }
|
|
||||||
blake3 = { workspace = true }
|
blake3 = { workspace = true }
|
||||||
sha2 = { workspace = true }
|
|
||||||
sha1 = { workspace = true }
|
|
||||||
md5 = { workspace = true }
|
|
||||||
epub = { workspace = true }
|
epub = { workspace = true }
|
||||||
kamadak-exif = { workspace = true }
|
kamadak-exif = { workspace = true }
|
||||||
pdf = { workspace = true }
|
pdf = { workspace = true }
|
||||||
pdfium-render = { workspace = true, optional = true }
|
pdfium-render = { workspace = true, optional = true }
|
||||||
image = { workspace = true }
|
image = { workspace = true, optional = true }
|
||||||
id3 = { workspace = true }
|
id3 = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
tokio-stream = { workspace = true }
|
||||||
async-trait = { workspace = true }
|
async-trait = { workspace = true }
|
||||||
|
aws-sdk-s3 = { workspace = true }
|
||||||
mime = { workspace = true }
|
mime = { workspace = true }
|
||||||
mime_guess = { workspace = true }
|
mime_guess = { workspace = true }
|
||||||
serde = { workspace = true }
|
|
||||||
strum = { workspace = true }
|
|
||||||
|
|
||||||
[build-dependencies]
|
|
||||||
reqwest = { workspace = true }
|
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
pdfium = ["dep:pdfium-render"]
|
pdfium = ["dep:pdfium-render", "dep:image"]
|
||||||
|
|||||||
@@ -1,34 +1,7 @@
|
|||||||
use std::env;
|
use std::env;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
const PDFIUM_VERSION: &str = "chromium%2F7725";
|
const PDFIUM_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7725/pdfium-linux-x64.tgz";
|
||||||
|
|
||||||
fn pdfium_url(os: &str, arch: &str) -> String {
|
|
||||||
let platform = match (os, arch) {
|
|
||||||
("linux", "x86_64") => "linux-x64",
|
|
||||||
("linux", "aarch64") => "linux-arm64",
|
|
||||||
("macos", "x86_64") => "mac-x64",
|
|
||||||
("macos", "aarch64") => "mac-arm64",
|
|
||||||
_ => panic!("unsupported platform: {os}-{arch}"),
|
|
||||||
};
|
|
||||||
format!(
|
|
||||||
"https://github.com/bblanchon/pdfium-binaries/releases/download/{PDFIUM_VERSION}/pdfium-{platform}.tgz"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn lib_name(os: &str) -> &'static str {
|
|
||||||
match os {
|
|
||||||
"macos" => "libpdfium.dylib",
|
|
||||||
_ => "libpdfium.so",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn rpath_flag(os: &str) -> &'static str {
|
|
||||||
match os {
|
|
||||||
"macos" => "-Wl,-rpath,@loader_path",
|
|
||||||
_ => "-Wl,-rpath,$ORIGIN",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::expect_used)]
|
#[expect(clippy::expect_used)]
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
@@ -40,9 +13,6 @@ fn main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
|
|
||||||
let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
|
|
||||||
|
|
||||||
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||||
|
|
||||||
// OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
|
// OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
|
||||||
@@ -60,23 +30,18 @@ fn main() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let lib_file = lib_name(&os);
|
let lib_path = profile_dir.join("libpdfium.so");
|
||||||
let lib_path = profile_dir.join(lib_file);
|
|
||||||
|
|
||||||
if !lib_path.exists() {
|
if !lib_path.exists() {
|
||||||
let url = pdfium_url(&os, &arch);
|
|
||||||
let tgz_path = out_dir.join("pdfium.tgz");
|
let tgz_path = out_dir.join("pdfium.tgz");
|
||||||
|
|
||||||
eprintln!("cargo:warning=Downloading PDFium from {url}");
|
eprintln!("cargo:warning=Downloading PDFium from {PDFIUM_URL}");
|
||||||
|
|
||||||
let response = reqwest::blocking::get(&url).expect("failed to download PDFium");
|
let status = std::process::Command::new("curl")
|
||||||
assert!(
|
.args(["-L", "--fail", "-o", tgz_path.to_str().unwrap(), PDFIUM_URL])
|
||||||
response.status().is_success(),
|
.status()
|
||||||
"failed to download PDFium: {}",
|
.expect("failed to run curl");
|
||||||
response.status()
|
assert!(status.success(), "curl failed to download PDFium");
|
||||||
);
|
|
||||||
let bytes = response.bytes().expect("failed to read PDFium response");
|
|
||||||
std::fs::write(&tgz_path, &bytes).expect("failed to write pdfium.tgz");
|
|
||||||
|
|
||||||
let status = std::process::Command::new("tar")
|
let status = std::process::Command::new("tar")
|
||||||
.args([
|
.args([
|
||||||
@@ -89,11 +54,11 @@ fn main() {
|
|||||||
.expect("failed to run tar");
|
.expect("failed to run tar");
|
||||||
assert!(status.success(), "tar failed to extract PDFium");
|
assert!(status.success(), "tar failed to extract PDFium");
|
||||||
|
|
||||||
std::fs::copy(out_dir.join("lib").join(lib_file), &lib_path)
|
std::fs::copy(out_dir.join("lib").join("libpdfium.so"), &lib_path)
|
||||||
.expect("failed to copy pdfium library");
|
.expect("failed to copy libpdfium.so");
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("cargo:rustc-link-search=native={}", profile_dir.display());
|
println!("cargo:rustc-link-search=native={}", profile_dir.display());
|
||||||
println!("cargo:rustc-link-lib=dylib=pdfium");
|
println!("cargo:rustc-link-lib=dylib=pdfium");
|
||||||
println!("cargo:rustc-link-arg={}", rpath_flag(&os));
|
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,75 +0,0 @@
|
|||||||
use epub::doc::EpubDoc;
|
|
||||||
use mime::Mime;
|
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::sync::{Arc, OnceLock};
|
|
||||||
use tracing::trace;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::ExtractState,
|
|
||||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct EpubCoverExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<Option<(Mime, Vec<u8>)>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl EpubCoverExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_inner(&self) -> Result<Option<&(Mime, Vec<u8>)>, std::io::Error> {
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(x.as_ref());
|
|
||||||
}
|
|
||||||
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
let result = tokio::task::spawn_blocking(move || {
|
|
||||||
let mut doc = EpubDoc::from_reader(reader)?;
|
|
||||||
let cover_id = match doc.get_cover_id() {
|
|
||||||
Ok(id) => id,
|
|
||||||
Err(_) => return Ok::<_, anyhow::Error>(None),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mime: Mime = doc
|
|
||||||
.resources
|
|
||||||
.get(&cover_id)
|
|
||||||
.and_then(|(_, mime_str)| mime_str.parse().ok())
|
|
||||||
.unwrap_or(mime::IMAGE_JPEG);
|
|
||||||
|
|
||||||
let bytes = doc.get_cover()?;
|
|
||||||
Ok(Some((mime, bytes)))
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let result = match result {
|
|
||||||
Ok(x) => x,
|
|
||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
|
||||||
Ok(x) => return Err(x),
|
|
||||||
Err(error) => {
|
|
||||||
trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
|
|
||||||
None
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(self.output.get_or_init(|| result).as_ref())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn get(&self, state: &ExtractState) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.map(|(mime, bytes)| {
|
|
||||||
PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime: mime.clone(),
|
|
||||||
bytes: ArcBytes(Arc::new(bytes.clone())),
|
|
||||||
})
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
use pile_config::Label;
|
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
mod epub_cover;
|
|
||||||
pub use epub_cover::*;
|
|
||||||
|
|
||||||
mod epub_meta;
|
|
||||||
pub use epub_meta::*;
|
|
||||||
|
|
||||||
mod epub_text;
|
|
||||||
pub use epub_text::*;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct EpubExtractor {
|
|
||||||
text: Arc<EpubTextExtractor>,
|
|
||||||
meta: Arc<EpubMetaExtractor>,
|
|
||||||
cover: Arc<EpubCoverExtractor>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl EpubExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
text: Arc::new(EpubTextExtractor::new(item)),
|
|
||||||
meta: Arc::new(EpubMetaExtractor::new(item)),
|
|
||||||
cover: Arc::new(EpubCoverExtractor::new(item)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for EpubExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &pile_config::Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
match (name.as_str(), args) {
|
|
||||||
("text", args) => Ok(Some(
|
|
||||||
self.text
|
|
||||||
.field(state, name, args)
|
|
||||||
.await
|
|
||||||
.map(|x| x.unwrap_or(PileValue::Null))?,
|
|
||||||
)),
|
|
||||||
|
|
||||||
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
|
||||||
("cover", None) => self.cover.get(state).await,
|
|
||||||
_ => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(vec![
|
|
||||||
Label::new("text").unwrap(),
|
|
||||||
Label::new("meta").unwrap(),
|
|
||||||
Label::new("cover").unwrap(),
|
|
||||||
])
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
||||||
let keys = self.fields().await?;
|
|
||||||
let mut map = serde_json::Map::new();
|
|
||||||
for k in &keys {
|
|
||||||
let v = match self.field(state, k, None).await? {
|
|
||||||
Some(x) => x,
|
|
||||||
None => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
if k.as_str() == "text" {
|
|
||||||
map.insert(
|
|
||||||
k.to_string(),
|
|
||||||
serde_json::Value::String(format!(
|
|
||||||
"<String ({} bytes)",
|
|
||||||
match v {
|
|
||||||
PileValue::String(x) => x.len(),
|
|
||||||
_ => 0,
|
|
||||||
}
|
|
||||||
)),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if k.as_str() == "cover" {
|
|
||||||
let summary = match &v {
|
|
||||||
PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
|
||||||
format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
|
|
||||||
}
|
|
||||||
PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
|
|
||||||
format!("<File ({mime})>")
|
|
||||||
}
|
|
||||||
|
|
||||||
PileValue::Null => "<null>".to_owned(),
|
|
||||||
_ => "<cover>".to_owned(),
|
|
||||||
};
|
|
||||||
map.insert(k.to_string(), serde_json::Value::String(summary));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(serde_json::Value::Object(map))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,236 +0,0 @@
|
|||||||
use mime::Mime;
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_flac::{FlacBlock, FlacDecodeError, FlacReader};
|
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
|
||||||
collections::HashMap,
|
|
||||||
io::BufReader,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
use tracing::trace;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
|
||||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct FlacImagesExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
cached_count: OnceLock<usize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FlacImagesExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
cached_count: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_count(&self) -> Result<usize, std::io::Error> {
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
let count = tokio::task::spawn_blocking(move || {
|
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
|
||||||
let mut count = 0usize;
|
|
||||||
for block in reader {
|
|
||||||
match block {
|
|
||||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
|
||||||
Ok(FlacBlock::Picture(_)) => count += 1,
|
|
||||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
|
||||||
Err(_) => return Ok(0),
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok::<_, std::io::Error>(count)
|
|
||||||
})
|
|
||||||
.await??;
|
|
||||||
|
|
||||||
return Ok(count);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mime_ok(&self, state: &ExtractState) -> bool {
|
|
||||||
if state.ignore_mime {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
let essence = self.item.mime().essence_str();
|
|
||||||
essence == "audio/flac" || essence == "audio/x-flac"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ListExtractor for FlacImagesExtractor {
|
|
||||||
async fn get(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
mut idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
trace!(
|
|
||||||
item = ?self.item,
|
|
||||||
"Getting index {idx} from FlacImagesExtractor",
|
|
||||||
);
|
|
||||||
|
|
||||||
if !self.mime_ok(state) {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
let item = self.item.clone();
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
let image = tokio::task::spawn_blocking(move || {
|
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
|
||||||
let mut out: Option<(Mime, Vec<u8>)> = None;
|
|
||||||
'blocks: for block in reader {
|
|
||||||
match block {
|
|
||||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
|
||||||
Ok(FlacBlock::Picture(picture)) => {
|
|
||||||
if idx > 0 {
|
|
||||||
idx -= 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
out = Some((picture.mime, picture.img_data));
|
|
||||||
break 'blocks;
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
|
||||||
Err(error) => {
|
|
||||||
trace!(message = "Could not parse FLAC images", ?item, ?error);
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok::<_, std::io::Error>(out)
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.map_err(std::io::Error::other)??;
|
|
||||||
|
|
||||||
Ok(image.map(|(mime, data)| {
|
|
||||||
PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime,
|
|
||||||
bytes: ArcBytes(Arc::new(data)),
|
|
||||||
})
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
|
||||||
if !self.mime_ok(state) {
|
|
||||||
return Ok(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(x) = self.cached_count.get() {
|
|
||||||
return Ok(*x);
|
|
||||||
}
|
|
||||||
|
|
||||||
let count = self.get_count().await?;
|
|
||||||
return Ok(*self.cached_count.get_or_init(|| count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct FlacExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
|
||||||
images: PileValue,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FlacExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
images: PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(x);
|
|
||||||
}
|
|
||||||
|
|
||||||
trace!(message = "Reading FLAC tags", item = ?self.item);
|
|
||||||
|
|
||||||
let item = self.item.clone();
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
let output = tokio::task::spawn_blocking(move || {
|
|
||||||
let reader = FlacReader::new(BufReader::new(reader));
|
|
||||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
|
||||||
|
|
||||||
for block in reader {
|
|
||||||
match block {
|
|
||||||
Ok(FlacBlock::AudioFrame(_)) => break,
|
|
||||||
Ok(FlacBlock::VorbisComment(comment)) => {
|
|
||||||
for (k, v) in comment.comment.comments {
|
|
||||||
if let Some(label) = Label::new(k.to_string().to_lowercase()) {
|
|
||||||
output
|
|
||||||
.entry(label)
|
|
||||||
.or_default()
|
|
||||||
.push(PileValue::String(Arc::new(v)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(FlacDecodeError::IoError(err)) => return Err(err),
|
|
||||||
Err(error) => {
|
|
||||||
trace!(message = "Could not parse FLAC metadata", ?item, ?error);
|
|
||||||
return Ok(HashMap::new());
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let output: HashMap<Label, PileValue> = output
|
|
||||||
.into_iter()
|
|
||||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
Ok::<HashMap<Label, PileValue>, std::io::Error>(output)
|
|
||||||
})
|
|
||||||
.await??;
|
|
||||||
|
|
||||||
return Ok(self.output.get_or_init(|| output));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mime_ok(&self, state: &ExtractState) -> bool {
|
|
||||||
if state.ignore_mime {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
let essence = self.item.mime().essence_str();
|
|
||||||
essence == "audio/flac" || essence == "audio/x-flac"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for FlacExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !self.mime_ok(state) {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if name.as_str() == "images" {
|
|
||||||
return Ok(Some(self.images.clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(self
|
|
||||||
.get_inner()
|
|
||||||
.await?
|
|
||||||
.keys()
|
|
||||||
.cloned()
|
|
||||||
.chain([Label::new("images").unwrap()])
|
|
||||||
.collect::<Vec<_>>())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,113 +0,0 @@
|
|||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
use pile_config::Label;
|
|
||||||
use std::{
|
|
||||||
collections::HashMap,
|
|
||||||
path::Component,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct FsExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FsExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(x);
|
|
||||||
}
|
|
||||||
|
|
||||||
let path = match &self.item {
|
|
||||||
BinaryPileValue::File { path, .. } => path,
|
|
||||||
_ => return Ok(self.output.get_or_init(HashMap::new)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut root = false;
|
|
||||||
let components = path
|
|
||||||
.components()
|
|
||||||
.filter_map(|x| match x {
|
|
||||||
Component::CurDir => None,
|
|
||||||
Component::Normal(x) => Some(x.to_str().map(|x| x.to_owned())),
|
|
||||||
Component::ParentDir => Some(Some("..".to_owned())),
|
|
||||||
Component::RootDir => {
|
|
||||||
root = true;
|
|
||||||
Some(None)
|
|
||||||
}
|
|
||||||
Component::Prefix(x) => Some(x.as_os_str().to_str().map(|x| x.to_owned())),
|
|
||||||
})
|
|
||||||
.collect::<Option<Vec<_>>>();
|
|
||||||
|
|
||||||
let mut path_str = components.as_ref().map(|x| x.join("/"));
|
|
||||||
if root {
|
|
||||||
path_str = path_str.map(|x| format!("/{x}"));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
let output = HashMap::from([
|
|
||||||
(
|
|
||||||
Label::new("extension").unwrap(),
|
|
||||||
path.extension()
|
|
||||||
.and_then(|x| x.to_str())
|
|
||||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
|
||||||
.unwrap_or(PileValue::Null),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
Label::new("path").unwrap(),
|
|
||||||
path_str
|
|
||||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
|
||||||
.unwrap_or(PileValue::Null),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
Label::new("segments").unwrap(),
|
|
||||||
components
|
|
||||||
.clone()
|
|
||||||
.map(|x| {
|
|
||||||
PileValue::Array(Arc::new(
|
|
||||||
x.iter()
|
|
||||||
.map(|x| PileValue::String(Arc::new(x.into())))
|
|
||||||
.collect(),
|
|
||||||
))
|
|
||||||
})
|
|
||||||
.unwrap_or(PileValue::Null),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
Label::new("name").unwrap(),
|
|
||||||
components
|
|
||||||
.and_then(|x| x.last().map(|x| PileValue::String(Arc::new(x.into()))))
|
|
||||||
.unwrap_or(PileValue::Null),
|
|
||||||
),
|
|
||||||
]);
|
|
||||||
|
|
||||||
return Ok(self.output.get_or_init(|| output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for FsExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner()?.get(name).cloned())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(self.get_inner()?.keys().cloned().collect())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,111 +0,0 @@
|
|||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{io::Read, sync::Arc};
|
|
||||||
use tokio::sync::OnceCell;
|
|
||||||
|
|
||||||
fn to_hex(bytes: &[u8]) -> String {
|
|
||||||
bytes.iter().map(|b| format!("{b:02x}")).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
macro_rules! hash_algos {
|
|
||||||
($($name:ident),* $(,)?) => {
|
|
||||||
pub struct HashExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
$($name: OnceCell<String>,)*
|
|
||||||
}
|
|
||||||
|
|
||||||
impl HashExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
$($name: OnceCell::new(),)*
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static LABELS: std::sync::LazyLock<Vec<Label>> = std::sync::LazyLock::new(|| {
|
|
||||||
vec![$(Label::new(stringify!($name)).unwrap()),*]
|
|
||||||
});
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
hash_algos!(blake3, md5, sha1, sha224, sha256, sha384, sha512);
|
|
||||||
|
|
||||||
impl HashExtractor {
|
|
||||||
async fn compute(&self, name: &Label) -> Result<Option<String>, std::io::Error> {
|
|
||||||
let name_str = name.as_ref();
|
|
||||||
|
|
||||||
macro_rules! algo {
|
|
||||||
($cell:ident, $compute:expr) => {
|
|
||||||
if name_str == stringify!($cell) {
|
|
||||||
return Ok(Some(
|
|
||||||
self.$cell
|
|
||||||
.get_or_try_init(|| async {
|
|
||||||
let read = self.item.read().await?;
|
|
||||||
let mut read = SyncReadBridge::new_current(read);
|
|
||||||
tokio::task::spawn_blocking(move || {
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
read.read_to_end(&mut bytes)?;
|
|
||||||
Ok::<String, std::io::Error>($compute(&bytes))
|
|
||||||
})
|
|
||||||
.await?
|
|
||||||
})
|
|
||||||
.await?
|
|
||||||
.clone(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
algo!(blake3, |b: &Vec<u8>| blake3::hash(b).to_hex().to_string());
|
|
||||||
algo!(md5, |b: &Vec<u8>| format!("{:x}", md5::compute(b)));
|
|
||||||
algo!(sha1, |b: &Vec<u8>| {
|
|
||||||
use sha1::Digest;
|
|
||||||
to_hex(sha1::Sha1::digest(b).as_ref())
|
|
||||||
});
|
|
||||||
algo!(sha224, |b: &Vec<u8>| {
|
|
||||||
use sha2::Digest;
|
|
||||||
to_hex(sha2::Sha224::digest(b).as_ref())
|
|
||||||
});
|
|
||||||
algo!(sha256, |b: &Vec<u8>| {
|
|
||||||
use sha2::Digest;
|
|
||||||
to_hex(sha2::Sha256::digest(b).as_ref())
|
|
||||||
});
|
|
||||||
algo!(sha384, |b: &Vec<u8>| {
|
|
||||||
use sha2::Digest;
|
|
||||||
to_hex(sha2::Sha384::digest(b).as_ref())
|
|
||||||
});
|
|
||||||
algo!(sha512, |b: &Vec<u8>| {
|
|
||||||
use sha2::Digest;
|
|
||||||
to_hex(sha2::Sha512::digest(b).as_ref())
|
|
||||||
});
|
|
||||||
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for HashExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
Ok(self
|
|
||||||
.compute(name)
|
|
||||||
.await?
|
|
||||||
.map(|s| PileValue::String(Arc::new(s.into()))))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(LABELS.clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,241 +0,0 @@
|
|||||||
use id3::Tag;
|
|
||||||
use mime::Mime;
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
|
||||||
borrow::Cow,
|
|
||||||
collections::HashMap,
|
|
||||||
io::BufReader,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
use tracing::trace;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
|
||||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct Id3ImagesExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
cached_count: OnceLock<usize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Id3ImagesExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
cached_count: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn read_tag(&self) -> Result<Option<Tag>, std::io::Error> {
|
|
||||||
let item = self.item.clone();
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
tokio::task::spawn_blocking(move || match Tag::read_from2(BufReader::new(reader)) {
|
|
||||||
Ok(tag) => Ok(Some(tag)),
|
|
||||||
Err(id3::Error {
|
|
||||||
kind: id3::ErrorKind::Io(e),
|
|
||||||
..
|
|
||||||
}) => Err(e),
|
|
||||||
Err(error) => {
|
|
||||||
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.map_err(std::io::Error::other)?
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mime_ok(&self, state: &ExtractState) -> bool {
|
|
||||||
state.ignore_mime || self.item.mime().essence_str() == "audio/mpeg"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ListExtractor for Id3ImagesExtractor {
|
|
||||||
async fn get(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if !self.mime_ok(state) {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(tag) = self.read_tag().await? else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
|
|
||||||
let Some(picture) = tag.pictures().nth(idx) else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
|
|
||||||
let mime: Mime = picture
|
|
||||||
.mime_type
|
|
||||||
.parse()
|
|
||||||
.unwrap_or(mime::APPLICATION_OCTET_STREAM);
|
|
||||||
let data = picture.data.clone();
|
|
||||||
|
|
||||||
Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime,
|
|
||||||
bytes: ArcBytes(Arc::new(data)),
|
|
||||||
})))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
|
||||||
if !self.mime_ok(state) {
|
|
||||||
return Ok(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(x) = self.cached_count.get() {
|
|
||||||
return Ok(*x);
|
|
||||||
}
|
|
||||||
|
|
||||||
let count = match self.read_tag().await? {
|
|
||||||
Some(tag) => tag.pictures().count(),
|
|
||||||
None => 0,
|
|
||||||
};
|
|
||||||
Ok(*self.cached_count.get_or_init(|| count))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Id3Extractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
|
||||||
images: PileValue,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Id3Extractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
images: PileValue::ListExtractor(Arc::new(Id3ImagesExtractor::new(item))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(x);
|
|
||||||
}
|
|
||||||
|
|
||||||
trace!(message = "Reading id3 tags", key = ?self.item);
|
|
||||||
|
|
||||||
let item = self.item.clone();
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
|
||||||
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(Ok(tag)) => tag,
|
|
||||||
|
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
Ok(Err(id3::Error {
|
|
||||||
kind: id3::ErrorKind::Io(e),
|
|
||||||
..
|
|
||||||
})) => return Err(e),
|
|
||||||
|
|
||||||
Ok(Err(error)) => {
|
|
||||||
trace!(message = "Could not parse id3 tags", ?item, ?error);
|
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
|
||||||
for frame in tag.frames() {
|
|
||||||
if let Some(texts) = frame.content().text_values() {
|
|
||||||
let name = frame_id_to_field(frame.id());
|
|
||||||
if let Some(key) = Label::new(name) {
|
|
||||||
for text in texts {
|
|
||||||
output
|
|
||||||
.entry(key.clone())
|
|
||||||
.or_default()
|
|
||||||
.push(PileValue::String(Arc::new(text.into())));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let output = output
|
|
||||||
.into_iter()
|
|
||||||
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
return Ok(self.output.get_or_init(|| output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Translate an ID3 frame ID into the matching Vorbis Comment field name.
///
/// Known frame IDs map to a static name; any other ID is returned lowercased.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
	// spell:off
	let field: &'static str = match id {
		"TIT2" => "title",
		"TIT1" => "grouping",
		"TIT3" => "subtitle",
		"TPE1" => "artist",
		"TPE2" => "albumartist",
		"TPE3" => "conductor",
		"TOPE" => "originalartist",
		"TALB" => "album",
		"TOAL" => "originalalbum",
		"TRCK" => "tracknumber",
		"TPOS" => "discnumber",
		"TSST" => "discsubtitle",
		"TDRC" | "TYER" => "date",
		"TDOR" | "TORY" => "originaldate",
		"TCON" => "genre",
		"TCOM" => "composer",
		"TEXT" => "lyricist",
		"TPUB" => "label",
		"TSRC" => "isrc",
		"TBPM" => "bpm",
		"TLAN" => "language",
		"TMED" => "media",
		"TMOO" => "mood",
		"TCOP" => "copyright",
		"TENC" => "encodedby",
		"TSSE" => "encodersettings",
		"TSOA" => "albumsort",
		"TSOP" => "artistsort",
		"TSOT" => "titlesort",
		"MVNM" => "movement",
		"MVIN" => "movementnumber",
		// No mapping: fall back to the lowercased frame ID.
		_ => return Cow::Owned(id.to_lowercase()),
	};
	// spell:on

	Cow::Borrowed(field)
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for Id3Extractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "audio/mpeg" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if name.as_str() == "images" {
|
|
||||||
return Ok(Some(self.images.clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(self
|
|
||||||
.get_inner()
|
|
||||||
.await?
|
|
||||||
.keys()
|
|
||||||
.cloned()
|
|
||||||
.chain([Label::new("images").unwrap()])
|
|
||||||
.collect::<Vec<_>>())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
use image::ImageFormat;
|
|
||||||
use mime::Mime;
|
|
||||||
use pile_config::Label;
|
|
||||||
use pile_io::AsyncReader;
|
|
||||||
use std::{io::Cursor, str::FromStr, sync::Arc};
|
|
||||||
use tracing::trace;
|
|
||||||
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
|
||||||
|
|
||||||
mod transform;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct ImageExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImageExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self { item: item.clone() }
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn apply<T: ImageTransformer + Send + 'static>(
|
|
||||||
&self,
|
|
||||||
args: &str,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
let transformer = match T::parse_args(args) {
|
|
||||||
Ok(t) => t,
|
|
||||||
Err(_) => return Ok(None),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mime = self.item.mime().clone();
|
|
||||||
let bytes = self.item.read().await?.read_to_end().await?;
|
|
||||||
|
|
||||||
let Some(format) = ImageFormat::from_mime_type(&mime) else {
|
|
||||||
return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime,
|
|
||||||
bytes: ArcBytes(Arc::new(bytes)),
|
|
||||||
})));
|
|
||||||
};
|
|
||||||
|
|
||||||
let bytes_for_closure = bytes.clone();
|
|
||||||
let result = tokio::task::spawn_blocking(move || {
|
|
||||||
let mut img = image::load_from_memory_with_format(&bytes_for_closure, format)?;
|
|
||||||
transformer.transform(&mut img);
|
|
||||||
|
|
||||||
let mut out = Cursor::new(Vec::new());
|
|
||||||
img.write_to(&mut out, format)?;
|
|
||||||
|
|
||||||
let out_mime =
|
|
||||||
Mime::from_str(format.to_mime_type()).unwrap_or(mime::APPLICATION_OCTET_STREAM);
|
|
||||||
Ok::<_, image::ImageError>((out_mime, out.into_inner()))
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
match result {
|
|
||||||
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime: out_mime,
|
|
||||||
bytes: ArcBytes(Arc::new(out_bytes)),
|
|
||||||
}))),
|
|
||||||
|
|
||||||
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
|
|
||||||
mime,
|
|
||||||
bytes: ArcBytes(Arc::new(bytes)),
|
|
||||||
}))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for ImageExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
let Some(args) = args else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
|
|
||||||
trace!(?args, "Getting field {name:?} from ImageExtractor",);
|
|
||||||
|
|
||||||
match name.as_str() {
|
|
||||||
"maxdim" => self.apply::<MaxDimTransformer>(args).await,
|
|
||||||
"crop" => self.apply::<CropTransformer>(args).await,
|
|
||||||
_ => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(vec![
|
|
||||||
Label::new("maxdim").unwrap(),
|
|
||||||
Label::new("crop").unwrap(),
|
|
||||||
])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
mod pixeldim;
|
|
||||||
|
|
||||||
pub mod transformers;
|
|
||||||
pub use transformers::{CropTransformer, ImageTransformer, MaxDimTransformer};
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
use serde::{Deserialize, Deserializer};
|
|
||||||
use std::fmt;
|
|
||||||
use std::str::FromStr;
|
|
||||||
|
|
||||||
// TODO: parse -, + (100vw - 10px)
|
|
||||||
// TODO: parse 100vw [min] 10
|
|
||||||
// TODO: parse 100vw [max] 10
|
|
||||||
|
|
||||||
/// A length given either in absolute pixels or as a percentage of an
/// image's width (`vw`) or height (`vh`).
#[derive(Debug, Clone, PartialEq)]
pub enum PixelDim {
	/// An absolute number of pixels (`px`, or no unit at all).
	Pixels(u32),
	/// A percentage of the width (`vw`).
	WidthPercent(f32),
	/// A percentage of the height (`vh`).
	HeightPercent(f32),
}

impl FromStr for PixelDim {
	type Err = String;

	/// Parse strings such as `100`, `100px`, `50vw` or `12.5 vh`.
	///
	/// Surrounding whitespace is ignored. A missing unit means pixels.
	/// Returns a message naming the invalid quantity or unit on failure.
	fn from_str(s: &str) -> Result<Self, Self::Err> {
		// Trim first: leading whitespace would otherwise place the split point
		// at index 0 and make the whole input look like an invalid unit.
		let s = s.trim();

		// The quantity is the leading run of digits and dots; the rest is the unit.
		let numeric_end = s.find(|c: char| !c.is_ascii_digit() && c != '.');
		let (quantity, unit) = numeric_end.map(|x| s.split_at(x)).unwrap_or((s, "px"));
		let unit = unit.trim();

		match unit {
			"vw" => Ok(PixelDim::WidthPercent(
				quantity
					.parse()
					.map_err(|_err| format!("invalid quantity {quantity}"))?,
			)),

			"vh" => Ok(PixelDim::HeightPercent(
				quantity
					.parse()
					.map_err(|_err| format!("invalid quantity {quantity}"))?,
			)),

			"px" => Ok(PixelDim::Pixels(
				quantity
					.parse()
					.map_err(|_err| format!("invalid quantity {quantity}"))?,
			)),

			_ => Err(format!("invalid unit {unit}")),
		}
	}
}
|
|
||||||
|
|
||||||
impl<'de> Deserialize<'de> for PixelDim {
|
|
||||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: Deserializer<'de>,
|
|
||||||
{
|
|
||||||
let s = String::deserialize(deserializer)?;
|
|
||||||
FromStr::from_str(&s).map_err(serde::de::Error::custom)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for PixelDim {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
match self {
|
|
||||||
PixelDim::Pixels(px) => write!(f, "{px}"),
|
|
||||||
PixelDim::WidthPercent(p) => write!(f, "{p:.2}vw"),
|
|
||||||
PixelDim::HeightPercent(p) => write!(f, "{p:.2}vh"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,188 +0,0 @@
|
|||||||
use image::DynamicImage;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::{fmt::Display, str::FromStr};
|
|
||||||
use strum::{Display, EnumString};
|
|
||||||
|
|
||||||
use super::super::{pixeldim::PixelDim, transformers::ImageTransformer};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumString, Serialize, Deserialize, Display)]
|
|
||||||
pub enum Direction {
|
|
||||||
#[serde(rename = "n")]
|
|
||||||
#[strum(to_string = "n")]
|
|
||||||
#[strum(serialize = "north")]
|
|
||||||
North,
|
|
||||||
|
|
||||||
#[serde(rename = "e")]
|
|
||||||
#[strum(serialize = "e")]
|
|
||||||
#[strum(serialize = "east")]
|
|
||||||
East,
|
|
||||||
|
|
||||||
#[serde(rename = "s")]
|
|
||||||
#[strum(serialize = "s")]
|
|
||||||
#[strum(serialize = "south")]
|
|
||||||
South,
|
|
||||||
|
|
||||||
#[serde(rename = "w")]
|
|
||||||
#[strum(to_string = "w")]
|
|
||||||
#[strum(serialize = "west")]
|
|
||||||
West,
|
|
||||||
|
|
||||||
#[serde(rename = "c")]
|
|
||||||
#[strum(serialize = "c")]
|
|
||||||
#[strum(serialize = "center")]
|
|
||||||
Center,
|
|
||||||
|
|
||||||
#[serde(rename = "ne")]
|
|
||||||
#[strum(serialize = "ne")]
|
|
||||||
#[strum(serialize = "northeast")]
|
|
||||||
NorthEast,
|
|
||||||
|
|
||||||
#[serde(rename = "se")]
|
|
||||||
#[strum(serialize = "se")]
|
|
||||||
#[strum(serialize = "southeast")]
|
|
||||||
SouthEast,
|
|
||||||
|
|
||||||
#[serde(rename = "nw")]
|
|
||||||
#[strum(serialize = "nw")]
|
|
||||||
#[strum(serialize = "northwest")]
|
|
||||||
NorthWest,
|
|
||||||
|
|
||||||
#[serde(rename = "sw")]
|
|
||||||
#[strum(serialize = "sw")]
|
|
||||||
#[strum(serialize = "southwest")]
|
|
||||||
SouthWest,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Crop an image to (at most) the given size.
|
|
||||||
/// See [Self::new] for details.
|
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
|
||||||
pub struct CropTransformer {
|
|
||||||
w: PixelDim,
|
|
||||||
h: PixelDim,
|
|
||||||
float: Direction,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CropTransformer {
|
|
||||||
/// Create a new [CropTransformer] with the given parameters.
|
|
||||||
///
|
|
||||||
/// A [CropTransformer] creates an image of size `w x h`, but...
|
|
||||||
/// - does not reduce width if `w` is greater than image width
|
|
||||||
/// - does not reduce height if `h` is greater than image height
|
|
||||||
/// - does nothing if `w` or `h` is less than or equal to zero.
|
|
||||||
#[expect(dead_code)]
|
|
||||||
pub fn new(w: PixelDim, h: PixelDim, float: Direction) -> Self {
|
|
||||||
Self { w, h, float }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn crop_dim(&self, img_width: u32, img_height: u32) -> (u32, u32) {
|
|
||||||
let crop_width = match self.w {
|
|
||||||
PixelDim::Pixels(w) => w,
|
|
||||||
PixelDim::WidthPercent(pct) => ((img_width as f32) * pct / 100.0) as u32,
|
|
||||||
PixelDim::HeightPercent(pct) => ((img_height as f32) * pct / 100.0) as u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
let crop_height = match self.h {
|
|
||||||
PixelDim::Pixels(h) => h,
|
|
||||||
PixelDim::WidthPercent(pct) => ((img_width as f32) * pct / 100.0) as u32,
|
|
||||||
PixelDim::HeightPercent(pct) => ((img_height as f32) * pct / 100.0) as u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
(crop_width, crop_height)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::integer_division)]
|
|
||||||
fn crop_pos(
|
|
||||||
&self,
|
|
||||||
img_width: u32,
|
|
||||||
img_height: u32,
|
|
||||||
crop_width: u32,
|
|
||||||
crop_height: u32,
|
|
||||||
) -> (u32, u32) {
|
|
||||||
match self.float {
|
|
||||||
Direction::North => {
|
|
||||||
let x = (img_width - crop_width) / 2;
|
|
||||||
let y = 0;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::East => {
|
|
||||||
let x = img_width - crop_width;
|
|
||||||
let y = (img_height - crop_height) / 2;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::South => {
|
|
||||||
let x = (img_width - crop_width) / 2;
|
|
||||||
let y = img_height - crop_height;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::West => {
|
|
||||||
let x = 0;
|
|
||||||
let y = (img_height - crop_height) / 2;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::Center => {
|
|
||||||
let x = (img_width - crop_width) / 2;
|
|
||||||
let y = (img_height - crop_height) / 2;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::NorthEast => {
|
|
||||||
let x = img_width - crop_width;
|
|
||||||
let y = 0;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::SouthEast => {
|
|
||||||
let x = img_width - crop_width;
|
|
||||||
let y = img_height - crop_height;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::NorthWest => {
|
|
||||||
let x = 0;
|
|
||||||
let y = 0;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
Direction::SouthWest => {
|
|
||||||
let x = 0;
|
|
||||||
let y = img_height - crop_height;
|
|
||||||
(x, y)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Display for CropTransformer {
	/// Formats as `crop(<w>,<h>,<direction>)`.
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Self { w, h, float } = self;
		write!(f, "crop({},{},{})", w, h, float)
	}
}
|
|
||||||
|
|
||||||
impl ImageTransformer for CropTransformer {
|
|
||||||
fn parse_args(args: &str) -> Result<Self, String> {
|
|
||||||
let args: Vec<&str> = args.split(",").collect();
|
|
||||||
if args.len() != 3 {
|
|
||||||
return Err(format!("expected 3 args, got {}", args.len()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let w = args[0].trim().parse::<PixelDim>()?;
|
|
||||||
let h = args[1].trim().parse::<PixelDim>()?;
|
|
||||||
|
|
||||||
let direction = args[2].trim();
|
|
||||||
let direction = Direction::from_str(direction)
|
|
||||||
.map_err(|_err| format!("invalid direction {direction}"))?;
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
w,
|
|
||||||
h,
|
|
||||||
float: direction,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn transform(&self, input: &mut DynamicImage) {
|
|
||||||
let (img_width, img_height) = (input.width(), input.height());
|
|
||||||
let (crop_width, crop_height) = self.crop_dim(img_width, img_height);
|
|
||||||
|
|
||||||
if (crop_width < img_width || crop_height < img_height) && crop_width > 0 && crop_height > 0
|
|
||||||
{
|
|
||||||
let (x, y) = self.crop_pos(img_width, img_height, crop_width, crop_height);
|
|
||||||
*input = input.crop(x, y, crop_width, crop_height);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
use image::{DynamicImage, imageops::FilterType};
|
|
||||||
use std::fmt::Display;
|
|
||||||
|
|
||||||
use super::super::{pixeldim::PixelDim, transformers::ImageTransformer};
|
|
||||||
|
|
||||||
/// Scale an image until it fits in a configured bounding box.
|
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
|
||||||
pub struct MaxDimTransformer {
|
|
||||||
w: PixelDim,
|
|
||||||
h: PixelDim,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MaxDimTransformer {
|
|
||||||
/// Create a new [MaxDimTransformer] that scales an image down
|
|
||||||
/// until it fits in a box of dimension `w x h`.
|
|
||||||
///
|
|
||||||
/// Images are never scaled up.
|
|
||||||
#[expect(dead_code)]
|
|
||||||
pub fn new(w: PixelDim, h: PixelDim) -> Self {
|
|
||||||
Self { w, h }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn target_dim(&self, img_width: u32, img_height: u32) -> (u32, u32) {
|
|
||||||
let max_width = match self.w {
|
|
||||||
PixelDim::Pixels(w) => Some(w),
|
|
||||||
PixelDim::WidthPercent(pct) => Some(((img_width as f32) * pct / 100.0) as u32),
|
|
||||||
PixelDim::HeightPercent(_) => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let max_height = match self.h {
|
|
||||||
PixelDim::Pixels(h) => Some(h),
|
|
||||||
PixelDim::HeightPercent(pct) => Some(((img_height as f32) * pct / 100.0) as u32),
|
|
||||||
PixelDim::WidthPercent(_) => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
if max_width.map(|x| img_width <= x).unwrap_or(true)
|
|
||||||
&& max_height.map(|x| img_height <= x).unwrap_or(true)
|
|
||||||
{
|
|
||||||
return (img_width, img_height);
|
|
||||||
}
|
|
||||||
|
|
||||||
let width_ratio = max_width
|
|
||||||
.map(|x| x as f32 / img_width as f32)
|
|
||||||
.unwrap_or(1.0);
|
|
||||||
|
|
||||||
let height_ratio = max_height
|
|
||||||
.map(|x| x as f32 / img_height as f32)
|
|
||||||
.unwrap_or(1.0);
|
|
||||||
|
|
||||||
let ratio = width_ratio.min(height_ratio);
|
|
||||||
|
|
||||||
(
|
|
||||||
(img_width as f32 * ratio) as u32,
|
|
||||||
(img_height as f32 * ratio) as u32,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Display for MaxDimTransformer {
	/// Formats as `maxdim(<w>,<h>)`.
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Self { w, h } = self;
		write!(f, "maxdim({},{})", w, h)
	}
}
|
|
||||||
|
|
||||||
impl ImageTransformer for MaxDimTransformer {
|
|
||||||
fn parse_args(args: &str) -> Result<Self, String> {
|
|
||||||
let args: Vec<&str> = args.split(",").collect();
|
|
||||||
if args.len() != 2 {
|
|
||||||
return Err(format!("expected 2 args, got {}", args.len()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let w = args[0].parse::<PixelDim>()?;
|
|
||||||
let h = args[1].parse::<PixelDim>()?;
|
|
||||||
|
|
||||||
Ok(Self { w, h })
|
|
||||||
}
|
|
||||||
|
|
||||||
fn transform(&self, input: &mut DynamicImage) {
|
|
||||||
let (img_width, img_height) = (input.width(), input.height());
|
|
||||||
let (target_width, target_height) = self.target_dim(img_width, img_height);
|
|
||||||
|
|
||||||
// Only resize if needed
|
|
||||||
if target_width != img_width || target_height != img_height {
|
|
||||||
*input = input.resize(target_width, target_height, FilterType::Lanczos3);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
//! Defines all transformation steps we can apply to an image
|
|
||||||
|
|
||||||
use image::DynamicImage;
|
|
||||||
use std::fmt::{Debug, Display};
|
|
||||||
|
|
||||||
mod crop;
|
|
||||||
pub use crop::*;
|
|
||||||
|
|
||||||
mod maxdim;
|
|
||||||
pub use maxdim::*;
|
|
||||||
|
|
||||||
/// A single transformation that may be applied to an image.
|
|
||||||
pub trait ImageTransformer
|
|
||||||
where
|
|
||||||
Self: PartialEq,
|
|
||||||
Self: Sized + Clone,
|
|
||||||
Self: Display + Debug,
|
|
||||||
{
|
|
||||||
/// Transform the given image in place
|
|
||||||
fn transform(&self, input: &mut DynamicImage);
|
|
||||||
|
|
||||||
/// Parse an arg string.
|
|
||||||
///
|
|
||||||
/// `name({arg_string})`
|
|
||||||
fn parse_args(args: &str) -> Result<Self, String>;
|
|
||||||
}
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
use pile_config::Label;
|
|
||||||
use pile_io::AsyncReader;
|
|
||||||
use std::{
|
|
||||||
collections::HashMap,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
fn json_to_pile(value: serde_json::Value) -> PileValue {
|
|
||||||
match value {
|
|
||||||
serde_json::Value::Null => PileValue::Null,
|
|
||||||
serde_json::Value::Bool(b) => PileValue::String(Arc::new(b.to_string().into())),
|
|
||||||
serde_json::Value::Number(n) => PileValue::String(Arc::new(n.to_string().into())),
|
|
||||||
serde_json::Value::String(s) => PileValue::String(Arc::new(s.into())),
|
|
||||||
serde_json::Value::Array(a) => {
|
|
||||||
PileValue::Array(Arc::new(a.into_iter().map(json_to_pile).collect()))
|
|
||||||
}
|
|
||||||
serde_json::Value::Object(_) => PileValue::Null,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct JsonExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl JsonExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(x);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut reader = self.item.read().await?;
|
|
||||||
let bytes = reader.read_to_end().await?;
|
|
||||||
let json: serde_json::Value = match serde_json::from_slice(&bytes) {
|
|
||||||
Ok(x) => x,
|
|
||||||
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let output: HashMap<Label, PileValue> = match json {
|
|
||||||
serde_json::Value::Object(map) => map
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, json_to_pile(v))))
|
|
||||||
.collect(),
|
|
||||||
_ => HashMap::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
return Ok(self.output.get_or_init(|| output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for JsonExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime
|
|
||||||
&& (self.item.mime().type_() != mime::APPLICATION
|
|
||||||
&& self.item.mime().type_() != mime::TEXT)
|
|
||||||
{
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(self.get_inner().await?.keys().cloned().collect())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
use pile_config::Label;
|
|
||||||
use std::sync::Arc;
|
|
||||||
use tracing::trace;
|
|
||||||
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
mod pdf_pages;
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
pub use pdf_pages::*;
|
|
||||||
|
|
||||||
mod pdf_meta;
|
|
||||||
pub use pdf_meta::*;
|
|
||||||
|
|
||||||
mod pdf_text;
|
|
||||||
pub use pdf_text::*;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct PdfExtractor {
|
|
||||||
text: Arc<PdfTextExtractor>,
|
|
||||||
meta: Arc<PdfMetaExtractor>,
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
pages: Arc<PdfPagesExtractor>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PdfExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
text: Arc::new(PdfTextExtractor::new(item)),
|
|
||||||
meta: Arc::new(PdfMetaExtractor::new(item)),
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
pages: Arc::new(PdfPagesExtractor::new(item)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for PdfExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &pile_config::Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
trace!(
|
|
||||||
?args,
|
|
||||||
item = ?self.text.item,
|
|
||||||
"Getting field {name:?} from PdfExtractor",
|
|
||||||
);
|
|
||||||
|
|
||||||
match (name.as_str(), args) {
|
|
||||||
("text", args) => self.text.field(state, name, args).await,
|
|
||||||
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
("pages", None) => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
|
|
||||||
_ => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(vec![
|
|
||||||
Label::new("text").unwrap(),
|
|
||||||
Label::new("meta").unwrap(),
|
|
||||||
#[cfg(feature = "pdfium")]
|
|
||||||
Label::new("pages").unwrap(),
|
|
||||||
])
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
|
||||||
let keys = self.fields().await?;
|
|
||||||
let mut map = serde_json::Map::new();
|
|
||||||
for k in &keys {
|
|
||||||
let v = match self.field(state, k, None).await? {
|
|
||||||
Some(x) => x,
|
|
||||||
None => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
if k.as_str() == "text" {
|
|
||||||
map.insert(
|
|
||||||
k.to_string(),
|
|
||||||
serde_json::Value::String(format!(
|
|
||||||
"<String ({} bytes)",
|
|
||||||
match v {
|
|
||||||
PileValue::String(x) => x.len(),
|
|
||||||
_ => 0,
|
|
||||||
}
|
|
||||||
)),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(serde_json::Value::Object(map))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
use pile_config::Label;
|
|
||||||
use pile_io::AsyncReader;
|
|
||||||
use std::sync::{Arc, OnceLock};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct TextExtractor {
|
|
||||||
item: BinaryPileValue,
|
|
||||||
output: OnceLock<PileValue>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TextExtractor {
|
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
|
||||||
Self {
|
|
||||||
item: item.clone(),
|
|
||||||
output: OnceLock::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for TextExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime
|
|
||||||
&& (self.item.mime().type_() != mime::TEXT
|
|
||||||
&& self.item.mime().type_() != mime::APPLICATION)
|
|
||||||
{
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if name.as_str() != "text" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
if let Some(x) = self.output.get() {
|
|
||||||
return Ok(Some(x.clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut reader = self.item.read().await?;
|
|
||||||
let bytes = reader.read_to_end().await?;
|
|
||||||
let string = String::from_utf8(bytes).ok();
|
|
||||||
let value = match string {
|
|
||||||
Some(x) => PileValue::String(Arc::new(x.into())),
|
|
||||||
None => PileValue::Null,
|
|
||||||
};
|
|
||||||
|
|
||||||
return Ok(Some(self.output.get_or_init(|| value).clone()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
Ok(vec![Label::new("text").unwrap()])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
use std::{collections::HashMap, sync::Arc};
|
|
||||||
|
|
||||||
use pile_config::Label;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::{
|
|
||||||
misc::MapExtractor,
|
|
||||||
traits::{ExtractState, ObjectExtractor},
|
|
||||||
},
|
|
||||||
value::{Item, PileValue},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct ItemExtractor {
|
|
||||||
inner: MapExtractor,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ItemExtractor {
|
|
||||||
pub fn new(item: &Item) -> Self {
|
|
||||||
let files = {
|
|
||||||
let Item::File { files, .. } = &item;
|
|
||||||
let mut inner = HashMap::new();
|
|
||||||
for f in files {
|
|
||||||
inner.insert(f.0.clone(), f.1.clone());
|
|
||||||
}
|
|
||||||
PileValue::ObjectExtractor(Arc::new(MapExtractor { inner }))
|
|
||||||
};
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
let inner = MapExtractor {
|
|
||||||
inner: HashMap::from([
|
|
||||||
(Label::new("files").unwrap(), files),
|
|
||||||
(
|
|
||||||
Label::new("key").unwrap(),
|
|
||||||
PileValue::String(Arc::new(item.key())),
|
|
||||||
),
|
|
||||||
]),
|
|
||||||
};
|
|
||||||
|
|
||||||
Self { inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for ItemExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &pile_config::Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
self.inner.field(state, name, args).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
let fields = self.inner.fields().await?;
|
|
||||||
Ok(fields)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,6 +1,5 @@
|
|||||||
use epub::doc::EpubDoc;
|
use epub::doc::EpubDoc;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
@@ -8,17 +7,17 @@ use std::{
|
|||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::{BinaryPileValue, PileValue},
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubMetaExtractor {
|
pub struct EpubMetaExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EpubMetaExtractor {
|
impl EpubMetaExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -30,9 +29,16 @@ impl EpubMetaExtractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let key = self.item.key();
|
||||||
|
let ext = key.as_str().rsplit('.').next();
|
||||||
|
if !matches!(ext, Some("epub")) {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let raw_meta = tokio::task::spawn_blocking(move || {
|
let raw_meta = tokio::task::spawn_blocking(move || {
|
||||||
let doc = EpubDoc::from_reader(reader)?;
|
let doc = EpubDoc::from_reader(reader)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
|
||||||
let fields: &[&'static str] = &[
|
let fields: &[&'static str] = &[
|
||||||
"title",
|
"title",
|
||||||
@@ -48,19 +54,17 @@ impl EpubMetaExtractor {
|
|||||||
let meta: Vec<(&'static str, Option<String>)> =
|
let meta: Vec<(&'static str, Option<String>)> =
|
||||||
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
|
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
|
||||||
|
|
||||||
Ok::<_, anyhow::Error>(meta)
|
Ok::<_, std::io::Error>(meta)
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
let raw_meta = match raw_meta {
|
let raw_meta = match raw_meta {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
Err(error) => {
|
||||||
Ok(x) => return Err(x),
|
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||||
Err(error) => {
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
}
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
let mut output: HashMap<Label, PileValue> = HashMap::new();
|
||||||
@@ -81,20 +85,7 @@ impl EpubMetaExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for EpubMetaExtractor {
|
impl ObjectExtractor for EpubMetaExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1,24 +1,23 @@
|
|||||||
use epub::doc::EpubDoc;
|
use epub::doc::EpubDoc;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::{BinaryPileValue, PileValue},
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct EpubTextExtractor {
|
pub struct EpubTextExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EpubTextExtractor {
|
impl EpubTextExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -30,9 +29,16 @@ impl EpubTextExtractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let key = self.item.key();
|
||||||
|
let ext = key.as_str().rsplit('.').next();
|
||||||
|
if !matches!(ext, Some("epub")) {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
|
||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let raw_text = tokio::task::spawn_blocking(move || {
|
let raw_text = tokio::task::spawn_blocking(move || {
|
||||||
let mut doc = EpubDoc::from_reader(reader)?;
|
let mut doc = EpubDoc::from_reader(reader)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
|
||||||
let mut text_parts: Vec<String> = Vec::new();
|
let mut text_parts: Vec<String> = Vec::new();
|
||||||
|
|
||||||
@@ -45,19 +51,17 @@ impl EpubTextExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok::<_, anyhow::Error>(text_parts.join(" "))
|
Ok::<_, std::io::Error>(text_parts.join(" "))
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
let raw_text = match raw_text {
|
let raw_text = match raw_text {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => match error.downcast::<std::io::Error>() {
|
Err(error) => {
|
||||||
Ok(x) => return Err(x),
|
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
|
||||||
Err(error) => {
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
trace!(message = "Could not process epub", ?error, item = ?self.item);
|
}
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
@@ -91,20 +95,7 @@ fn strip_html(html: &str) -> String {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for EpubTextExtractor {
|
impl ObjectExtractor for EpubTextExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
46
crates/pile-value/src/extract/item/epub/mod.rs
Normal file
46
crates/pile-value/src/extract/item/epub/mod.rs
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
mod epub_meta;
|
||||||
|
pub use epub_meta::*;
|
||||||
|
|
||||||
|
mod epub_text;
|
||||||
|
pub use epub_text::*;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::ObjectExtractor,
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct EpubExtractor {
|
||||||
|
text: Arc<EpubTextExtractor>,
|
||||||
|
meta: Arc<EpubMetaExtractor>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EpubExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
text: Arc::new(EpubTextExtractor::new(item)),
|
||||||
|
meta: Arc::new(EpubMetaExtractor::new(item)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for EpubExtractor {
|
||||||
|
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
match name.as_str() {
|
||||||
|
"text" => self.text.field(name).await,
|
||||||
|
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||||
|
_ => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(vec![
|
||||||
|
Label::new("text").unwrap(),
|
||||||
|
Label::new("meta").unwrap(),
|
||||||
|
])
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::BufReader,
|
io::BufReader,
|
||||||
@@ -8,17 +7,17 @@ use std::{
|
|||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::{BinaryPileValue, PileValue},
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct ExifExtractor {
|
pub struct ExifExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExifExtractor {
|
impl ExifExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -33,7 +32,9 @@ impl ExifExtractor {
|
|||||||
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
let raw_fields = tokio::task::spawn_blocking(move || {
|
let raw_fields = tokio::task::spawn_blocking(move || {
|
||||||
let mut br = BufReader::new(reader);
|
let mut br = BufReader::new(reader);
|
||||||
let exif = exif::Reader::new().read_from_container(&mut br)?;
|
let exif = exif::Reader::new()
|
||||||
|
.read_from_container(&mut br)
|
||||||
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
|
||||||
|
|
||||||
let fields: Vec<(String, String)> = exif
|
let fields: Vec<(String, String)> = exif
|
||||||
.fields()
|
.fields()
|
||||||
@@ -45,15 +46,15 @@ impl ExifExtractor {
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
Ok::<_, exif::Error>(fields)
|
Ok::<_, std::io::Error>(fields)
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
let raw_fields = match raw_fields {
|
let raw_fields = match raw_fields {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(exif::Error::Io(x)) => return Err(x),
|
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process exif", ?error, item = ?self.item);
|
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -64,7 +65,6 @@ impl ExifExtractor {
|
|||||||
let Some(label) = tag_to_label(&tag_name) else {
|
let Some(label) = tag_to_label(&tag_name) else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
|
||||||
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
|
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
|
||||||
output
|
output
|
||||||
.entry(label)
|
.entry(label)
|
||||||
@@ -86,26 +86,7 @@ fn tag_to_label(tag: &str) -> Option<Label> {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for ExifExtractor {
|
impl ObjectExtractor for ExifExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
trace!(
|
|
||||||
?args,
|
|
||||||
item = ?self.item,
|
|
||||||
"Getting field {name:?} from ExifExtractor",
|
|
||||||
);
|
|
||||||
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().type_() != mime::IMAGE {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
162
crates/pile-value/src/extract/item/flac.rs
Normal file
162
crates/pile-value/src/extract/item/flac.rs
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
use mime::Mime;
|
||||||
|
use pile_config::Label;
|
||||||
|
use pile_flac::{FlacBlock, FlacReader};
|
||||||
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
io::BufReader,
|
||||||
|
sync::{Arc, OnceLock},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::{ListExtractor, ObjectExtractor},
|
||||||
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct FlacImagesExtractor {
|
||||||
|
item: Item,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FlacImagesExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self { item: item.clone() }
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_images(&self) -> Result<Vec<PileValue>, std::io::Error> {
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
let raw_images = tokio::task::spawn_blocking(move || {
|
||||||
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
|
let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
|
||||||
|
for block in reader {
|
||||||
|
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||||
|
FlacBlock::Picture(picture) => {
|
||||||
|
images.push((picture.mime, picture.img_data));
|
||||||
|
}
|
||||||
|
FlacBlock::AudioFrame(_) => break,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok::<_, std::io::Error>(images)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)??;
|
||||||
|
|
||||||
|
Ok(raw_images
|
||||||
|
.into_iter()
|
||||||
|
.map(|(mime, data)| PileValue::Blob {
|
||||||
|
mime,
|
||||||
|
bytes: Arc::new(data),
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ListExtractor for FlacImagesExtractor {
|
||||||
|
async fn get<'a>(&'a self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
Ok(self.get_images().await?.into_iter().nth(idx))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
|
Ok(self.get_images().await?.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FlacExtractor {
|
||||||
|
item: Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
|
images: Option<PileValue>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FlacExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
let is_flac = match item {
|
||||||
|
Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
|
||||||
|
Item::S3 { key, .. } => key.ends_with(".flac"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let images =
|
||||||
|
is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
|
||||||
|
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
output: OnceLock::new(),
|
||||||
|
images,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let key = match &self.item {
|
||||||
|
Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
|
||||||
|
Item::S3 { key, .. } => key.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
if !key.ends_with(".flac") {
|
||||||
|
let _ = self.output.set(HashMap::new());
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
return Ok(self.output.get().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
let raw_tags = tokio::task::spawn_blocking(move || {
|
||||||
|
let reader = FlacReader::new(BufReader::new(reader));
|
||||||
|
let mut tags: Vec<(String, String)> = Vec::new();
|
||||||
|
for block in reader {
|
||||||
|
match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
|
||||||
|
FlacBlock::VorbisComment(comment) => {
|
||||||
|
for (k, v) in comment.comment.comments {
|
||||||
|
tags.push((k.to_string().to_lowercase(), v.into()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
FlacBlock::AudioFrame(_) => break,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok::<_, std::io::Error>(tags)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)??;
|
||||||
|
|
||||||
|
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||||
|
for (k, v) in raw_tags {
|
||||||
|
if let Some(label) = Label::new(k) {
|
||||||
|
output
|
||||||
|
.entry(label)
|
||||||
|
.or_default()
|
||||||
|
.push(PileValue::String(Arc::new(v.into())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let output: HashMap<Label, PileValue> = output
|
||||||
|
.into_iter()
|
||||||
|
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let _ = self.output.set(output);
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
return Ok(self.output.get().unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for FlacExtractor {
|
||||||
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
if name.as_str() == "images"
|
||||||
|
&& let Some(ref images) = self.images
|
||||||
|
{
|
||||||
|
return Ok(Some(images.clone()));
|
||||||
|
}
|
||||||
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
|
||||||
|
if self.images.is_some() {
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
fields.push(Label::new("images").unwrap());
|
||||||
|
}
|
||||||
|
Ok(fields)
|
||||||
|
}
|
||||||
|
}
|
||||||
80
crates/pile-value/src/extract/item/fs.rs
Normal file
80
crates/pile-value/src/extract/item/fs.rs
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
path::Component,
|
||||||
|
sync::{Arc, OnceLock},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::ObjectExtractor,
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct FsExtractor {
|
||||||
|
item: Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FsExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let Item::File { path, .. } = &self.item else {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
};
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
let output = HashMap::from([
|
||||||
|
(
|
||||||
|
Label::new("extension").unwrap(),
|
||||||
|
path.extension()
|
||||||
|
.and_then(|x| x.to_str())
|
||||||
|
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||||
|
.unwrap_or(PileValue::Null),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Label::new("path").unwrap(),
|
||||||
|
path.to_str()
|
||||||
|
.map(|x| PileValue::String(Arc::new(x.into())))
|
||||||
|
.unwrap_or(PileValue::Null),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Label::new("segments").unwrap(),
|
||||||
|
path.components()
|
||||||
|
.map(|x| match x {
|
||||||
|
Component::CurDir => Some(".".to_owned()),
|
||||||
|
Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
|
||||||
|
Component::ParentDir => Some("..".to_owned()),
|
||||||
|
Component::RootDir => Some("/".to_owned()),
|
||||||
|
Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
|
||||||
|
})
|
||||||
|
.map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
|
||||||
|
.collect::<Option<Vec<_>>>()
|
||||||
|
.map(|v| PileValue::Array(Arc::new(v)))
|
||||||
|
.unwrap_or(PileValue::Null),
|
||||||
|
),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return Ok(self.output.get_or_init(|| output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for FsExtractor {
|
||||||
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
Ok(self.get_inner()?.get(name).cloned())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(self.get_inner()?.keys().cloned().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
133
crates/pile-value/src/extract/item/id3.rs
Normal file
133
crates/pile-value/src/extract/item/id3.rs
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
use id3::Tag;
|
||||||
|
use pile_config::Label;
|
||||||
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
|
collections::HashMap,
|
||||||
|
io::BufReader,
|
||||||
|
sync::{Arc, OnceLock},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::ObjectExtractor,
|
||||||
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct Id3Extractor {
|
||||||
|
item: Item,
|
||||||
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Id3Extractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
|
||||||
|
if let Some(x) = self.output.get() {
|
||||||
|
return Ok(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
let key = self.item.key();
|
||||||
|
let ext = key.as_str().rsplit('.').next();
|
||||||
|
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
let reader = SyncReadBridge::new_current(self.item.read().await?);
|
||||||
|
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(Ok(tag)) => tag,
|
||||||
|
|
||||||
|
Ok(Err(id3::Error {
|
||||||
|
kind: id3::ErrorKind::NoTag,
|
||||||
|
..
|
||||||
|
})) => {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Err(id3::Error {
|
||||||
|
kind: id3::ErrorKind::Io(e),
|
||||||
|
..
|
||||||
|
})) => return Err(e),
|
||||||
|
|
||||||
|
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
|
||||||
|
for frame in tag.frames() {
|
||||||
|
if let Some(text) = frame.content().text() {
|
||||||
|
let name = frame_id_to_field(frame.id());
|
||||||
|
if let Some(key) = Label::new(name) {
|
||||||
|
output
|
||||||
|
.entry(key)
|
||||||
|
.or_default()
|
||||||
|
.push(PileValue::String(Arc::new(text.into())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let output = output
|
||||||
|
.into_iter()
|
||||||
|
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
return Ok(self.output.get_or_init(|| output));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map an ID3 frame ID to the equivalent Vorbis Comment field name.
|
||||||
|
/// Falls back to the lowercased frame ID if no mapping exists.
|
||||||
|
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
|
||||||
|
match id {
|
||||||
|
// spell:off
|
||||||
|
"TIT2" => Cow::Borrowed("title"),
|
||||||
|
"TIT1" => Cow::Borrowed("grouping"),
|
||||||
|
"TIT3" => Cow::Borrowed("subtitle"),
|
||||||
|
"TPE1" => Cow::Borrowed("artist"),
|
||||||
|
"TPE2" => Cow::Borrowed("albumartist"),
|
||||||
|
"TPE3" => Cow::Borrowed("conductor"),
|
||||||
|
"TOPE" => Cow::Borrowed("originalartist"),
|
||||||
|
"TALB" => Cow::Borrowed("album"),
|
||||||
|
"TOAL" => Cow::Borrowed("originalalbum"),
|
||||||
|
"TRCK" => Cow::Borrowed("tracknumber"),
|
||||||
|
"TPOS" => Cow::Borrowed("discnumber"),
|
||||||
|
"TSST" => Cow::Borrowed("discsubtitle"),
|
||||||
|
"TDRC" | "TYER" => Cow::Borrowed("date"),
|
||||||
|
"TDOR" | "TORY" => Cow::Borrowed("originaldate"),
|
||||||
|
"TCON" => Cow::Borrowed("genre"),
|
||||||
|
"TCOM" => Cow::Borrowed("composer"),
|
||||||
|
"TEXT" => Cow::Borrowed("lyricist"),
|
||||||
|
"TPUB" => Cow::Borrowed("label"),
|
||||||
|
"TSRC" => Cow::Borrowed("isrc"),
|
||||||
|
"TBPM" => Cow::Borrowed("bpm"),
|
||||||
|
"TLAN" => Cow::Borrowed("language"),
|
||||||
|
"TMED" => Cow::Borrowed("media"),
|
||||||
|
"TMOO" => Cow::Borrowed("mood"),
|
||||||
|
"TCOP" => Cow::Borrowed("copyright"),
|
||||||
|
"TENC" => Cow::Borrowed("encodedby"),
|
||||||
|
"TSSE" => Cow::Borrowed("encodersettings"),
|
||||||
|
"TSOA" => Cow::Borrowed("albumsort"),
|
||||||
|
"TSOP" => Cow::Borrowed("artistsort"),
|
||||||
|
"TSOT" => Cow::Borrowed("titlesort"),
|
||||||
|
"MVNM" => Cow::Borrowed("movement"),
|
||||||
|
"MVIN" => Cow::Borrowed("movementnumber"),
|
||||||
|
_ => Cow::Owned(id.to_lowercase()),
|
||||||
|
// spell:on
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for Id3Extractor {
|
||||||
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(self.get_inner().await?.keys().cloned().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -18,38 +18,25 @@ pub use exif::*;
|
|||||||
mod pdf;
|
mod pdf;
|
||||||
pub use pdf::*;
|
pub use pdf::*;
|
||||||
|
|
||||||
mod json;
|
|
||||||
pub use json::*;
|
|
||||||
|
|
||||||
mod toml;
|
mod toml;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
pub use toml::*;
|
pub use toml::*;
|
||||||
|
|
||||||
mod text;
|
mod sidecar;
|
||||||
pub use text::*;
|
pub use sidecar::*;
|
||||||
|
|
||||||
mod image;
|
|
||||||
pub use image::*;
|
|
||||||
|
|
||||||
mod hash;
|
|
||||||
pub use hash::*;
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::{
|
extract::{misc::MapExtractor, traits::ObjectExtractor},
|
||||||
misc::MapExtractor,
|
value::{Item, PileValue},
|
||||||
traits::{ExtractState, ObjectExtractor},
|
|
||||||
},
|
|
||||||
value::{BinaryPileValue, PileValue},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct BinaryExtractor {
|
pub struct ItemExtractor {
|
||||||
inner: MapExtractor,
|
inner: MapExtractor,
|
||||||
image: Arc<ImageExtractor>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BinaryExtractor {
|
impl ItemExtractor {
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
let inner = MapExtractor {
|
let inner = MapExtractor {
|
||||||
inner: HashMap::from([
|
inner: HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -76,50 +63,37 @@ impl BinaryExtractor {
|
|||||||
Label::new("pdf").unwrap(),
|
Label::new("pdf").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
|
||||||
Label::new("json").unwrap(),
|
|
||||||
PileValue::ObjectExtractor(Arc::new(JsonExtractor::new(item))),
|
|
||||||
),
|
|
||||||
(
|
(
|
||||||
Label::new("toml").unwrap(),
|
Label::new("toml").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
Label::new("text").unwrap(),
|
Label::new("sidecar").unwrap(),
|
||||||
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
|
PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
|
||||||
),
|
|
||||||
(
|
|
||||||
Label::new("hash").unwrap(),
|
|
||||||
PileValue::ObjectExtractor(Arc::new(HashExtractor::new(item))),
|
|
||||||
),
|
),
|
||||||
]),
|
]),
|
||||||
};
|
};
|
||||||
|
|
||||||
Self {
|
Self { inner }
|
||||||
inner,
|
|
||||||
image: Arc::new(ImageExtractor::new(item)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for BinaryExtractor {
|
impl ObjectExtractor for ItemExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
self.inner.field(name).await
|
||||||
state: &ExtractState,
|
|
||||||
name: &pile_config::Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if self.image.fields().await?.contains(name) {
|
|
||||||
self.image.field(state, name, args).await
|
|
||||||
} else {
|
|
||||||
self.inner.field(state, name, args).await
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
let mut fields = self.inner.fields().await?;
|
return Ok(vec![
|
||||||
fields.extend(self.image.fields().await?);
|
Label::new("flac").unwrap(),
|
||||||
Ok(fields)
|
Label::new("id3").unwrap(),
|
||||||
|
Label::new("fs").unwrap(),
|
||||||
|
Label::new("epub").unwrap(),
|
||||||
|
Label::new("exif").unwrap(),
|
||||||
|
Label::new("pdf").unwrap(),
|
||||||
|
Label::new("sidecar").unwrap(),
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
61
crates/pile-value/src/extract/item/pdf/mod.rs
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
mod pdf_pages;
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
pub use pdf_pages::*;
|
||||||
|
|
||||||
|
mod pdf_meta;
|
||||||
|
pub use pdf_meta::*;
|
||||||
|
|
||||||
|
mod pdf_text;
|
||||||
|
pub use pdf_text::*;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
extract::traits::ObjectExtractor,
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct PdfExtractor {
|
||||||
|
text: Arc<PdfTextExtractor>,
|
||||||
|
meta: Arc<PdfMetaExtractor>,
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
pages: Arc<PdfPagesExtractor>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PdfExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
text: Arc::new(PdfTextExtractor::new(item)),
|
||||||
|
meta: Arc::new(PdfMetaExtractor::new(item)),
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
pages: Arc::new(PdfPagesExtractor::new(item)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for PdfExtractor {
|
||||||
|
async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
match name.as_str() {
|
||||||
|
"text" => self.text.field(name).await,
|
||||||
|
"meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
"pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
|
||||||
|
_ => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
Ok(vec![
|
||||||
|
Label::new("text").unwrap(),
|
||||||
|
Label::new("meta").unwrap(),
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
Label::new("cover").unwrap(),
|
||||||
|
#[cfg(feature = "pdfium")]
|
||||||
|
Label::new("pages").unwrap(),
|
||||||
|
])
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
use pdf::file::FileOptions;
|
use pdf::file::FileOptions;
|
||||||
use pdf::primitive::{Date, TimeRel};
|
use pdf::primitive::{Date, TimeRel};
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::BufReader,
|
io::BufReader,
|
||||||
@@ -9,19 +8,18 @@ use std::{
|
|||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::value::BinaryPileValue;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::PileValue,
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfMetaExtractor {
|
pub struct PdfMetaExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfMetaExtractor {
|
impl PdfMetaExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -83,7 +81,7 @@ impl PdfMetaExtractor {
|
|||||||
let (page_count, raw_meta) = match raw_meta {
|
let (page_count, raw_meta) = match raw_meta {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -124,20 +122,7 @@ fn format_date(d: &Date) -> String {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for PdfMetaExtractor {
|
impl ObjectExtractor for PdfMetaExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1,6 +1,5 @@
|
|||||||
use image::ImageFormat;
|
use image::ImageFormat;
|
||||||
use pdfium_render::prelude::*;
|
use pdfium_render::prelude::*;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
io::{BufReader, Cursor},
|
io::{BufReader, Cursor},
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
@@ -8,16 +7,16 @@ use std::{
|
|||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ListExtractor},
|
extract::traits::ListExtractor,
|
||||||
value::{ArcBytes, BinaryPileValue, PileValue},
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfPagesExtractor {
|
pub struct PdfPagesExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfPagesExtractor {
|
impl PdfPagesExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self { item: item.clone() }
|
Self { item: item.clone() }
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,20 +34,7 @@ impl PdfPagesExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ListExtractor for PdfPagesExtractor {
|
impl ListExtractor for PdfPagesExtractor {
|
||||||
async fn get(
|
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
trace!(
|
|
||||||
item = ?self.item,
|
|
||||||
"Getting index {idx} from PdfPagesExtractor",
|
|
||||||
);
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
let bytes = self.get_bytes().await?;
|
let bytes = self.get_bytes().await?;
|
||||||
let png = tokio::task::spawn_blocking(move || {
|
let png = tokio::task::spawn_blocking(move || {
|
||||||
let pdfium = Pdfium::default();
|
let pdfium = Pdfium::default();
|
||||||
@@ -78,23 +64,19 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
|
|
||||||
let value = match png {
|
let value = match png {
|
||||||
Ok(None) => return Ok(None),
|
Ok(None) => return Ok(None),
|
||||||
Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
|
Ok(Some(bytes)) => PileValue::Blob {
|
||||||
mime: mime::IMAGE_PNG,
|
mime: mime::IMAGE_PNG,
|
||||||
bytes: ArcBytes(Arc::new(bytes)),
|
bytes: Arc::new(bytes),
|
||||||
}),
|
},
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
|
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
|
||||||
PileValue::Null
|
PileValue::Null
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(Some(value))
|
Ok(Some(value))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
||||||
return Ok(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
let bytes = self.get_bytes().await?;
|
let bytes = self.get_bytes().await?;
|
||||||
let count = tokio::task::spawn_blocking(move || {
|
let count = tokio::task::spawn_blocking(move || {
|
||||||
let pdfium = Pdfium::default();
|
let pdfium = Pdfium::default();
|
||||||
@@ -108,7 +90,7 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
match count {
|
match count {
|
||||||
Ok(n) => Ok(n),
|
Ok(n) => Ok(n),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
|
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
|
||||||
Ok(0)
|
Ok(0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -116,10 +98,10 @@ impl ListExtractor for PdfPagesExtractor {
|
|||||||
|
|
||||||
// Override, extracting all pages is very slow,
|
// Override, extracting all pages is very slow,
|
||||||
// and we can't display binary in json anyway
|
// and we can't display binary in json anyway
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||||
Ok(serde_json::Value::String(format!(
|
Ok(serde_json::Value::String(format!(
|
||||||
"<PdfPages ({} pages)>",
|
"<PdfPages ({} pages)>",
|
||||||
self.len(state).await?
|
self.len().await?
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
use pdf::content::{Op, TextDrawAdjusted};
|
use pdf::content::{Op, TextDrawAdjusted};
|
||||||
use pdf::file::FileOptions;
|
use pdf::file::FileOptions;
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::SyncReadBridge;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::BufReader,
|
io::BufReader,
|
||||||
@@ -9,19 +8,18 @@ use std::{
|
|||||||
};
|
};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
|
|
||||||
use crate::value::BinaryPileValue;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::PileValue,
|
value::{Item, PileValue, SyncReadBridge},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct PdfTextExtractor {
|
pub struct PdfTextExtractor {
|
||||||
pub(super) item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfTextExtractor {
|
impl PdfTextExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -87,7 +85,7 @@ impl PdfTextExtractor {
|
|||||||
let raw_text = match raw_text {
|
let raw_text = match raw_text {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
trace!(message = "Could not process pdf", ?error, item = ?self.item);
|
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
|
||||||
return Ok(self.output.get_or_init(HashMap::new));
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -104,20 +102,7 @@ impl PdfTextExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for PdfTextExtractor {
|
impl ObjectExtractor for PdfTextExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
45
crates/pile-value/src/extract/item/sidecar.rs
Normal file
45
crates/pile-value/src/extract/item/sidecar.rs
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
use pile_config::Label;
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
use super::TomlExtractor;
|
||||||
|
use crate::{
|
||||||
|
extract::traits::ObjectExtractor,
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct SidecarExtractor {
|
||||||
|
item: Item,
|
||||||
|
output: OnceLock<Option<TomlExtractor>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SidecarExtractor {
|
||||||
|
pub fn new(item: &Item) -> Self {
|
||||||
|
Self {
|
||||||
|
item: item.clone(),
|
||||||
|
output: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ObjectExtractor for SidecarExtractor {
|
||||||
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
|
match self
|
||||||
|
.output
|
||||||
|
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||||
|
{
|
||||||
|
Some(x) => Ok(x.field(name).await?),
|
||||||
|
None => Ok(Some(PileValue::Null)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
||||||
|
match self
|
||||||
|
.output
|
||||||
|
.get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
|
||||||
|
{
|
||||||
|
Some(x) => Ok(x.fields().await?),
|
||||||
|
None => Ok(Vec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,13 +1,12 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use pile_io::AsyncReader;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
sync::{Arc, OnceLock},
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
extract::traits::ObjectExtractor,
|
||||||
value::{BinaryPileValue, PileValue},
|
value::{AsyncReader, Item, PileValue},
|
||||||
};
|
};
|
||||||
|
|
||||||
fn toml_to_pile(value: toml::Value) -> PileValue {
|
fn toml_to_pile(value: toml::Value) -> PileValue {
|
||||||
@@ -25,12 +24,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct TomlExtractor {
|
pub struct TomlExtractor {
|
||||||
item: BinaryPileValue,
|
item: Item,
|
||||||
output: OnceLock<HashMap<Label, PileValue>>,
|
output: OnceLock<HashMap<Label, PileValue>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TomlExtractor {
|
impl TomlExtractor {
|
||||||
pub fn new(item: &BinaryPileValue) -> Self {
|
pub fn new(item: &Item) -> Self {
|
||||||
Self {
|
Self {
|
||||||
item: item.clone(),
|
item: item.clone(),
|
||||||
output: OnceLock::new(),
|
output: OnceLock::new(),
|
||||||
@@ -42,7 +41,13 @@ impl TomlExtractor {
|
|||||||
return Ok(x);
|
return Ok(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut reader = self.item.read().await?;
|
let mut reader = match self.item.read().await {
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||||
|
return Ok(self.output.get_or_init(HashMap::new));
|
||||||
|
}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
};
|
||||||
let bytes = reader.read_to_end().await?;
|
let bytes = reader.read_to_end().await?;
|
||||||
let toml: toml::Value = match toml::from_slice(&bytes) {
|
let toml: toml::Value = match toml::from_slice(&bytes) {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
@@ -63,20 +68,7 @@ impl TomlExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for TomlExtractor {
|
impl ObjectExtractor for TomlExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
if !state.ignore_mime && self.item.mime().type_() != mime::TEXT {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.get_inner().await?.get(name).cloned())
|
Ok(self.get_inner().await?.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1,9 +1,6 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::{
|
use crate::{extract::traits::ListExtractor, value::PileValue};
|
||||||
extract::traits::{ExtractState, ListExtractor},
|
|
||||||
value::PileValue,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct ArrayExtractor {
|
pub struct ArrayExtractor {
|
||||||
inner: Arc<Vec<PileValue>>,
|
inner: Arc<Vec<PileValue>>,
|
||||||
@@ -17,15 +14,11 @@ impl ArrayExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ListExtractor for ArrayExtractor {
|
impl ListExtractor for ArrayExtractor {
|
||||||
async fn get(
|
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
Ok(self.inner.get(idx).cloned())
|
Ok(self.inner.get(idx).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
Ok(self.inner.len())
|
Ok(self.inner.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,7 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use crate::{
|
use crate::{extract::traits::ObjectExtractor, value::PileValue};
|
||||||
extract::traits::{ExtractState, ObjectExtractor},
|
|
||||||
value::PileValue,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MapExtractor {
|
pub struct MapExtractor {
|
||||||
@@ -13,16 +10,7 @@ pub struct MapExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for MapExtractor {
|
impl ObjectExtractor for MapExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.inner.get(name).cloned())
|
Ok(self.inner.get(name).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,4 @@
|
|||||||
use crate::{
|
use crate::{extract::traits::ListExtractor, value::PileValue};
|
||||||
extract::traits::{ExtractState, ListExtractor},
|
|
||||||
value::PileValue,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct VecExtractor {
|
pub struct VecExtractor {
|
||||||
@@ -10,15 +7,11 @@ pub struct VecExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ListExtractor for VecExtractor {
|
impl ListExtractor for VecExtractor {
|
||||||
async fn get(
|
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
Ok(self.inner.get(idx).cloned())
|
Ok(self.inner.get(idx).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
|
async fn len(&self) -> Result<usize, std::io::Error> {
|
||||||
Ok(self.inner.len())
|
Ok(self.inner.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
pub mod blob;
|
|
||||||
pub mod item;
|
pub mod item;
|
||||||
pub mod misc;
|
pub mod misc;
|
||||||
pub mod regex;
|
|
||||||
pub mod string;
|
pub mod string;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|||||||
@@ -1,104 +0,0 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use pile_config::Label;
|
|
||||||
use regex::Regex;
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
|
|
||||||
value::PileValue,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct RegexData {
|
|
||||||
regex: Arc<Regex>,
|
|
||||||
/// Captured substrings indexed by group index (0 = whole match).
|
|
||||||
captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RegexData {
|
|
||||||
fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
|
||||||
let caps = regex.captures(input)?;
|
|
||||||
let captures = caps
|
|
||||||
.iter()
|
|
||||||
.map(|m| m.map(|m| Arc::new(m.as_str().into())))
|
|
||||||
.collect();
|
|
||||||
Some(Self { regex, captures })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Exposes named capture groups as object fields.
|
|
||||||
pub struct RegexExtractor(Arc<RegexData>);
|
|
||||||
|
|
||||||
impl RegexExtractor {
|
|
||||||
/// Run `regex` against `input`. Returns `None` if there is no match.
|
|
||||||
pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
|
|
||||||
Some(Self(Arc::new(RegexData::new(regex, input)?)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ObjectExtractor for RegexExtractor {
|
|
||||||
async fn field(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
if args.is_some() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(idx) = self
|
|
||||||
.0
|
|
||||||
.regex
|
|
||||||
.capture_names()
|
|
||||||
.position(|n| n == Some(name.as_str()))
|
|
||||||
else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Some(
|
|
||||||
match self.0.captures.get(idx).and_then(|v| v.as_ref()) {
|
|
||||||
Some(s) => PileValue::String(s.clone()),
|
|
||||||
None => PileValue::Null,
|
|
||||||
},
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
Ok(self
|
|
||||||
.0
|
|
||||||
.regex
|
|
||||||
.capture_names()
|
|
||||||
.flatten()
|
|
||||||
.map(|n| Label::new(n).unwrap())
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
|
|
||||||
Some(Arc::new(RegexExtractor(self.0.clone())))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ListExtractor for RegexExtractor {
|
|
||||||
async fn get(
|
|
||||||
&self,
|
|
||||||
_state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
let raw_idx = idx + 1;
|
|
||||||
let Some(slot) = self.0.captures.get(raw_idx) else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
Ok(Some(match slot {
|
|
||||||
Some(s) => PileValue::String(s.clone()),
|
|
||||||
None => PileValue::Null,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
|
|
||||||
Ok(self.0.captures.len().saturating_sub(1))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,15 +1,22 @@
|
|||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use regex::Regex;
|
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::{
|
use crate::{extract::traits::ObjectExtractor, value::PileValue};
|
||||||
extract::{
|
|
||||||
regex::RegexExtractor,
|
fn parse_name(s: &str) -> (&str, Option<&str>) {
|
||||||
traits::{ExtractState, ObjectExtractor},
|
match s.find('(') {
|
||||||
},
|
None => (s, None),
|
||||||
value::PileValue,
|
Some(i) => {
|
||||||
};
|
let name = &s[..i];
|
||||||
|
let rest = &s[i + 1..];
|
||||||
|
match rest.strip_suffix(')') {
|
||||||
|
Some(args) => (name, Some(args)),
|
||||||
|
None => (name, None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct StringExtractor {
|
pub struct StringExtractor {
|
||||||
item: Arc<SmartString<LazyCompact>>,
|
item: Arc<SmartString<LazyCompact>>,
|
||||||
@@ -23,13 +30,9 @@ impl StringExtractor {
|
|||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ObjectExtractor for StringExtractor {
|
impl ObjectExtractor for StringExtractor {
|
||||||
async fn field(
|
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
|
||||||
&self,
|
let (name, args) = parse_name(name.as_str());
|
||||||
_state: &ExtractState,
|
Ok(match (name, args) {
|
||||||
name: &Label,
|
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<PileValue>, std::io::Error> {
|
|
||||||
Ok(match (name.as_str(), args) {
|
|
||||||
("trim", None) => Some(PileValue::String(Arc::new(
|
("trim", None) => Some(PileValue::String(Arc::new(
|
||||||
self.item.as_str().trim().into(),
|
self.item.as_str().trim().into(),
|
||||||
))),
|
))),
|
||||||
@@ -71,18 +74,6 @@ impl ObjectExtractor for StringExtractor {
|
|||||||
.collect(),
|
.collect(),
|
||||||
))),
|
))),
|
||||||
|
|
||||||
("regex", Some(pattern)) => {
|
|
||||||
let Ok(re) = Regex::new(pattern) else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
Some(
|
|
||||||
match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
|
|
||||||
Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
|
|
||||||
None => PileValue::Null,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => None,
|
_ => None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -94,15 +85,11 @@ impl ObjectExtractor for StringExtractor {
|
|||||||
Label::new("upper").unwrap(),
|
Label::new("upper").unwrap(),
|
||||||
Label::new("lower").unwrap(),
|
Label::new("lower").unwrap(),
|
||||||
Label::new("nonempty").unwrap(),
|
Label::new("nonempty").unwrap(),
|
||||||
Label::new("trimprefix").unwrap(),
|
|
||||||
Label::new("trimsuffix").unwrap(),
|
|
||||||
Label::new("split").unwrap(),
|
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[expect(clippy::expect_used)]
|
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
@@ -111,11 +98,8 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
#[expect(clippy::unwrap_used)]
|
||||||
async fn field(ext: &StringExtractor, name: &str, args: Option<&str>) -> Option<PileValue> {
|
async fn field(ext: &StringExtractor, name: &str) -> Option<PileValue> {
|
||||||
let state = ExtractState { ignore_mime: false };
|
ext.field(&Label::new(name).unwrap()).await.unwrap()
|
||||||
ext.field(&state, &Label::new(name).unwrap(), args)
|
|
||||||
.await
|
|
||||||
.unwrap()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn string(v: Option<PileValue>) -> Option<String> {
|
fn string(v: Option<PileValue>) -> Option<String> {
|
||||||
@@ -141,20 +125,20 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trim() {
|
async fn trim() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
string(field(&extractor(" hi "), "trim", None).await),
|
string(field(&extractor(" hi "), "trim").await),
|
||||||
Some("hi".into())
|
Some("hi".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trim_no_args() {
|
async fn trim_no_args() {
|
||||||
assert!(field(&extractor("x"), "trim", Some("foo")).await.is_none());
|
assert!(field(&extractor("x"), "trim(foo)").await.is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn nonempty_with_content() {
|
async fn nonempty_with_content() {
|
||||||
assert!(matches!(
|
assert!(matches!(
|
||||||
field(&extractor("hello"), "nonempty", None).await,
|
field(&extractor("hello"), "nonempty").await,
|
||||||
Some(PileValue::String(_))
|
Some(PileValue::String(_))
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
@@ -162,7 +146,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn nonempty_empty_string() {
|
async fn nonempty_empty_string() {
|
||||||
assert!(matches!(
|
assert!(matches!(
|
||||||
field(&extractor(""), "nonempty", None).await,
|
field(&extractor(""), "nonempty").await,
|
||||||
Some(PileValue::Null)
|
Some(PileValue::Null)
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
@@ -170,7 +154,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trimprefix_present() {
|
async fn trimprefix_present() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
string(field(&extractor("foobar"), "trimprefix", Some("foo")).await),
|
string(field(&extractor("foobar"), "trimprefix(foo)").await),
|
||||||
Some("bar".into())
|
Some("bar".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -178,24 +162,20 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trimprefix_absent() {
|
async fn trimprefix_absent() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
string(field(&extractor("foobar"), "trimprefix", Some("baz")).await),
|
string(field(&extractor("foobar"), "trimprefix(baz)").await),
|
||||||
Some("foobar".into())
|
Some("foobar".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trimprefix_no_args() {
|
async fn trimprefix_no_args() {
|
||||||
assert!(
|
assert!(field(&extractor("foobar"), "trimprefix").await.is_none());
|
||||||
field(&extractor("foobar"), "trimprefix", None)
|
|
||||||
.await
|
|
||||||
.is_none()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trimsuffix_present() {
|
async fn trimsuffix_present() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
string(field(&extractor("foobar"), "trimsuffix", Some("bar")).await),
|
string(field(&extractor("foobar"), "trimsuffix(bar)").await),
|
||||||
Some("foo".into())
|
Some("foo".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -203,7 +183,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn trimsuffix_absent() {
|
async fn trimsuffix_absent() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
string(field(&extractor("foobar"), "trimsuffix", Some("baz")).await),
|
string(field(&extractor("foobar"), "trimsuffix(baz)").await),
|
||||||
Some("foobar".into())
|
Some("foobar".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -211,7 +191,7 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn split_basic() {
|
async fn split_basic() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
array(field(&extractor("a,b,c"), "split", Some(",")).await),
|
array(field(&extractor("a,b,c"), "split(,)").await),
|
||||||
vec!["a", "b", "c"]
|
vec!["a", "b", "c"]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -219,18 +199,23 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn split_no_match() {
|
async fn split_no_match() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
array(field(&extractor("abc"), "split", Some(",")).await),
|
array(field(&extractor("abc"), "split(,)").await),
|
||||||
vec!["abc"]
|
vec!["abc"]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn split_no_args() {
|
async fn split_no_args() {
|
||||||
assert!(field(&extractor("abc"), "split", None).await.is_none());
|
assert!(field(&extractor("abc"), "split").await.is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn split_unclosed_paren() {
|
||||||
|
assert!(field(&extractor("abc"), "split(,").await.is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn unknown_field() {
|
async fn unknown_field() {
|
||||||
assert!(field(&extractor("abc"), "bogus", None).await.is_none());
|
assert!(field(&extractor("abc"), "bogus").await.is_none());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct ExtractState {
|
|
||||||
/// If true, extract all fields from all items.
|
|
||||||
/// Do not pre-filter using mime type.
|
|
||||||
///
|
|
||||||
/// This may detect additional fields, but
|
|
||||||
/// makes extraction take much longer
|
|
||||||
pub ignore_mime: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An attachment that extracts metadata from an [Item].
|
/// An attachment that extracts metadata from an [Item].
|
||||||
///
|
///
|
||||||
/// Metadata is exposed as an immutable map of {label: value},
|
/// Metadata is exposed as an immutable map of {label: value},
|
||||||
@@ -17,17 +7,9 @@ pub trait ObjectExtractor: Send + Sync {
|
|||||||
/// Get the field at `name` from `item`.
|
/// Get the field at `name` from `item`.
|
||||||
/// - returns `None` if `name` is not a valid field
|
/// - returns `None` if `name` is not a valid field
|
||||||
/// - returns `Some(Null)` if `name` is not available
|
/// - returns `Some(Null)` if `name` is not available
|
||||||
///
|
|
||||||
/// For extractors that parse binary, this fn should return
|
|
||||||
/// an error only if we failed to obtain the data we need (permission denied, etc).
|
|
||||||
///
|
|
||||||
/// If the underlying data has an invalid format (e.g, running a pdf extractor on a non-pdf file),
|
|
||||||
/// this fn should return `Ok(Some(None))`.
|
|
||||||
async fn field(
|
async fn field(
|
||||||
&self,
|
&self,
|
||||||
state: &ExtractState,
|
|
||||||
name: &pile_config::Label,
|
name: &pile_config::Label,
|
||||||
args: Option<&str>,
|
|
||||||
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
||||||
|
|
||||||
/// Return all fields in this extractor.
|
/// Return all fields in this extractor.
|
||||||
@@ -35,21 +17,16 @@ pub trait ObjectExtractor: Send + Sync {
|
|||||||
/// and [None] for all others.
|
/// and [None] for all others.
|
||||||
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;
|
||||||
|
|
||||||
/// Return a list view of this extractor, if supported.
|
|
||||||
fn as_list(&self) -> Option<std::sync::Arc<dyn ListExtractor>> {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convert this to a JSON value.
|
/// Convert this to a JSON value.
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||||
let keys = self.fields().await?;
|
let keys = self.fields().await?;
|
||||||
let mut map = serde_json::Map::new();
|
let mut map = serde_json::Map::new();
|
||||||
for k in &keys {
|
for k in &keys {
|
||||||
let v = match self.field(state, k, None).await? {
|
let v = match self.field(k).await? {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => continue,
|
None => continue,
|
||||||
};
|
};
|
||||||
map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
|
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(serde_json::Value::Object(map))
|
Ok(serde_json::Value::Object(map))
|
||||||
@@ -65,25 +42,25 @@ pub trait ListExtractor: Send + Sync {
|
|||||||
/// Indices start at zero, and must be consecutive.
|
/// Indices start at zero, and must be consecutive.
|
||||||
/// - returns `None` if `idx` is out of range
|
/// - returns `None` if `idx` is out of range
|
||||||
/// - returns `Some(Null)` if `None` is at `idx`
|
/// - returns `Some(Null)` if `None` is at `idx`
|
||||||
async fn get(
|
async fn get(&self, idx: usize) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
idx: usize,
|
|
||||||
) -> Result<Option<crate::value::PileValue>, std::io::Error>;
|
|
||||||
|
|
||||||
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error>;
|
async fn len(&self) -> Result<usize, std::io::Error>;
|
||||||
|
|
||||||
|
async fn is_empty(&self) -> Result<bool, std::io::Error> {
|
||||||
|
Ok(self.len().await? == 0)
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert this list to a JSON value.
|
/// Convert this list to a JSON value.
|
||||||
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
|
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
|
||||||
let len = self.len(state).await?;
|
let len = self.len().await?;
|
||||||
let mut list = Vec::with_capacity(len);
|
let mut list = Vec::with_capacity(len);
|
||||||
for i in 0..len {
|
for i in 0..len {
|
||||||
#[expect(clippy::expect_used)]
|
#[expect(clippy::expect_used)]
|
||||||
let v = self
|
let v = self
|
||||||
.get(state, i)
|
.get(i)
|
||||||
.await?
|
.await?
|
||||||
.expect("value must be present according to length");
|
.expect("value must be present according to length");
|
||||||
list.push(Box::pin(v.to_json(state)).await?);
|
list.push(Box::pin(v.to_json()).await?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(serde_json::Value::Array(list))
|
Ok(serde_json::Value::Array(list))
|
||||||
|
|||||||
@@ -1,137 +1,131 @@
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use pile_config::Label;
|
use pile_config::Label;
|
||||||
use regex::Regex;
|
use std::{path::PathBuf, sync::Arc};
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
use std::{
|
|
||||||
collections::{BTreeMap, HashMap},
|
|
||||||
path::PathBuf,
|
|
||||||
sync::{Arc, OnceLock},
|
|
||||||
};
|
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
source::{DataSource, misc::path_ts_latest},
|
source::{DataSource, misc::path_ts_latest},
|
||||||
value::{BinaryPileValue, Item, PileValue},
|
value::Item,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct DirDataSource {
|
pub struct DirDataSource {
|
||||||
pub name: Label,
|
pub name: Label,
|
||||||
pub dir: PathBuf,
|
pub dir: PathBuf,
|
||||||
pub base_pattern: Regex,
|
|
||||||
pub files: HashMap<Label, String>,
|
pub sidecars: bool,
|
||||||
pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DirDataSource {
|
impl DirDataSource {
|
||||||
pub async fn new(
|
pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
|
||||||
name: &Label,
|
Self {
|
||||||
dir: PathBuf,
|
|
||||||
base_pattern: Regex,
|
|
||||||
files: HashMap<Label, String>,
|
|
||||||
) -> Result<Arc<Self>, std::io::Error> {
|
|
||||||
let source = Arc::new(Self {
|
|
||||||
name: name.clone(),
|
name: name.clone(),
|
||||||
dir,
|
dir,
|
||||||
base_pattern,
|
sidecars,
|
||||||
files,
|
|
||||||
index: OnceLock::new(),
|
|
||||||
});
|
|
||||||
|
|
||||||
let mut index = BTreeMap::new();
|
|
||||||
'entry: for entry in WalkDir::new(&source.dir) {
|
|
||||||
let entry = match entry {
|
|
||||||
Err(e) => {
|
|
||||||
let msg = format!("walkdir error: {e:?}");
|
|
||||||
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
|
|
||||||
return Err(err);
|
|
||||||
}
|
|
||||||
Ok(e) => e,
|
|
||||||
};
|
|
||||||
|
|
||||||
if entry.file_type().is_dir() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let path = entry.into_path();
|
|
||||||
let rel_path = match path.strip_prefix(&source.dir) {
|
|
||||||
Ok(p) => p,
|
|
||||||
Err(_) => continue 'entry,
|
|
||||||
};
|
|
||||||
let path_str = match rel_path.to_str() {
|
|
||||||
Some(x) => x,
|
|
||||||
None => continue 'entry,
|
|
||||||
};
|
|
||||||
|
|
||||||
let captures = match source.base_pattern.captures(path_str) {
|
|
||||||
Some(c) => c,
|
|
||||||
None => continue 'entry,
|
|
||||||
};
|
|
||||||
let base = match captures.get(1) {
|
|
||||||
Some(m) => m.as_str(),
|
|
||||||
None => continue 'entry,
|
|
||||||
};
|
|
||||||
|
|
||||||
let key: SmartString<LazyCompact> = base.into();
|
|
||||||
if index.contains_key(&key) {
|
|
||||||
continue 'entry;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut item_files = HashMap::new();
|
|
||||||
for (label, template) in &source.files {
|
|
||||||
let file_path = source.dir.join(template.replace("{base}", base));
|
|
||||||
if file_path.exists() {
|
|
||||||
let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
|
|
||||||
item_files.insert(
|
|
||||||
label.clone(),
|
|
||||||
PileValue::Binary(BinaryPileValue::File {
|
|
||||||
mime,
|
|
||||||
path: file_path,
|
|
||||||
}),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
index.insert(
|
|
||||||
key.clone(),
|
|
||||||
Item::File {
|
|
||||||
key,
|
|
||||||
source: Arc::clone(&source),
|
|
||||||
files: item_files,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
source.index.get_or_init(|| index);
|
|
||||||
Ok(source)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DataSource for Arc<DirDataSource> {
|
impl DataSource for Arc<DirDataSource> {
|
||||||
#[expect(clippy::expect_used)]
|
|
||||||
fn len(&self) -> usize {
|
|
||||||
self.index.get().expect("index should be initialized").len()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::expect_used)]
|
|
||||||
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
||||||
return Ok(self
|
let key = match key.parse::<PathBuf>() {
|
||||||
.index
|
Ok(x) => self.dir.join(x),
|
||||||
.get()
|
Err(_) => return Ok(None),
|
||||||
.expect("index should be initialized")
|
};
|
||||||
.get(key)
|
|
||||||
.cloned());
|
if !key.is_file() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ignore toml files if sidecars are enabled
|
||||||
|
if self.sidecars && key.extension().and_then(|x| x.to_str()) == Some("toml") {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(Some(Item::File {
|
||||||
|
source: Arc::clone(self),
|
||||||
|
mime: mime_guess::from_path(&key).first_or_octet_stream(),
|
||||||
|
path: key.clone(),
|
||||||
|
sidecar: self.sidecars.then(|| {
|
||||||
|
Box::new(Item::File {
|
||||||
|
source: Arc::clone(self),
|
||||||
|
mime: mime_guess::from_path(key.with_extension("toml")).first_or_octet_stream(),
|
||||||
|
path: key.with_extension("toml"),
|
||||||
|
sidecar: None,
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[expect(clippy::expect_used)]
|
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||||
fn iter(&self) -> impl Iterator<Item = &Item> {
|
let (tx, rx) = tokio::sync::mpsc::channel(64);
|
||||||
self.index
|
let source = Arc::clone(self);
|
||||||
.get()
|
|
||||||
.expect("index should be initialized")
|
let dir = self.dir.clone();
|
||||||
.values()
|
tokio::task::spawn_blocking(move || {
|
||||||
|
for entry in WalkDir::new(dir) {
|
||||||
|
let entry = match entry {
|
||||||
|
Err(e) => {
|
||||||
|
let msg = format!("walkdir error: {e:?}");
|
||||||
|
let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
|
||||||
|
if tx.blocking_send(Err(err)).is_err() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(e) => e,
|
||||||
|
};
|
||||||
|
|
||||||
|
if entry.file_type().is_dir() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = entry.into_path();
|
||||||
|
|
||||||
|
let item = match path.extension().and_then(|x| x.to_str()) {
|
||||||
|
None => continue,
|
||||||
|
Some("toml") if source.sidecars => continue,
|
||||||
|
Some(_) => Item::File {
|
||||||
|
source: Arc::clone(&source),
|
||||||
|
mime: mime_guess::from_path(&path).first_or_octet_stream(),
|
||||||
|
path: path.clone(),
|
||||||
|
|
||||||
|
sidecar: source.sidecars.then(|| {
|
||||||
|
Box::new(Item::File {
|
||||||
|
source: Arc::clone(&source),
|
||||||
|
mime: mime_guess::from_path(path.with_extension("toml"))
|
||||||
|
.first_or_octet_stream(),
|
||||||
|
path: path.with_extension("toml"),
|
||||||
|
sidecar: None,
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
if tx.blocking_send(Ok(item)).is_err() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
ReceiverStream::new(rx)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||||
path_ts_latest(&self.dir)
|
let mut ts: Option<DateTime<Utc>> = None;
|
||||||
|
|
||||||
|
if !self.dir.exists() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let new = path_ts_latest(&self.dir)?;
|
||||||
|
match (ts, new) {
|
||||||
|
(_, None) => {}
|
||||||
|
(None, Some(new)) => ts = Some(new),
|
||||||
|
(Some(old), Some(new)) => ts = Some(old.max(new)),
|
||||||
|
};
|
||||||
|
|
||||||
|
return Ok(ts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,29 +1,27 @@
|
|||||||
mod dir;
|
mod dir;
|
||||||
pub use dir::*;
|
pub use dir::*;
|
||||||
|
|
||||||
|
mod s3;
|
||||||
|
pub use s3::*;
|
||||||
|
|
||||||
pub mod misc;
|
pub mod misc;
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
|
|
||||||
/// A read-only set of [Item]s.
|
/// A read-only set of [Item]s.
|
||||||
pub trait DataSource {
|
pub trait DataSource {
|
||||||
/// Get the number of items in this source
|
|
||||||
fn len(&self) -> usize;
|
|
||||||
|
|
||||||
/// Get an item from this datasource
|
/// Get an item from this datasource
|
||||||
fn get(
|
fn get(
|
||||||
&self,
|
&self,
|
||||||
key: &str,
|
key: &str,
|
||||||
) -> impl Future<Output = Result<Option<crate::value::Item>, std::io::Error>> + Send;
|
) -> impl Future<Output = Result<Option<crate::value::Item>, std::io::Error>> + Send;
|
||||||
|
|
||||||
/// Iterate over all items in this source in sorted key order
|
/// Iterate over all items in this source in an arbitrary order
|
||||||
fn iter(&self) -> impl Iterator<Item = &crate::value::Item>;
|
fn iter(&self) -> ReceiverStream<Result<crate::value::Item, std::io::Error>>;
|
||||||
|
|
||||||
/// Iterate over a page of items, sorted by key
|
|
||||||
fn iter_page(&self, offset: usize, limit: usize) -> impl Iterator<Item = &crate::value::Item> {
|
|
||||||
self.iter().skip(offset).take(limit)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the time of the latest change to the data in this source
|
/// Return the time of the latest change to the data in this source
|
||||||
fn latest_change(
|
fn latest_change(
|
||||||
&self,
|
&self,
|
||||||
) -> impl Future<Output = Result<Option<chrono::DateTime<chrono::Utc>>, std::io::Error>> + Send;
|
) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
|
||||||
}
|
}
|
||||||
|
|||||||
255
crates/pile-value/src/source/s3.rs
Normal file
255
crates/pile-value/src/source/s3.rs
Normal file
@@ -0,0 +1,255 @@
|
|||||||
|
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use pile_config::{Label, S3Credentials};
|
||||||
|
use smartstring::{LazyCompact, SmartString};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
|
|
||||||
|
use crate::{source::DataSource, value::Item};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct S3DataSource {
|
||||||
|
pub name: Label,
|
||||||
|
pub bucket: SmartString<LazyCompact>,
|
||||||
|
pub prefix: Option<SmartString<LazyCompact>>,
|
||||||
|
pub sidecars: bool,
|
||||||
|
pub client: Arc<aws_sdk_s3::Client>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl S3DataSource {
|
||||||
|
pub fn new(
|
||||||
|
name: &Label,
|
||||||
|
bucket: String,
|
||||||
|
prefix: Option<String>,
|
||||||
|
endpoint: Option<String>,
|
||||||
|
region: String,
|
||||||
|
credentials: &S3Credentials,
|
||||||
|
sidecars: bool,
|
||||||
|
) -> Result<Self, std::io::Error> {
|
||||||
|
let client = {
|
||||||
|
let creds = Credentials::new(
|
||||||
|
&credentials.access_key_id,
|
||||||
|
&credentials.secret_access_key,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
"pile",
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut s3_config = aws_sdk_s3::config::Builder::new()
|
||||||
|
.behavior_version(BehaviorVersion::latest())
|
||||||
|
.region(Region::new(region))
|
||||||
|
.credentials_provider(creds);
|
||||||
|
|
||||||
|
if let Some(ep) = endpoint {
|
||||||
|
s3_config = s3_config.endpoint_url(ep).force_path_style(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
aws_sdk_s3::Client::from_conf(s3_config.build())
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
name: name.clone(),
|
||||||
|
bucket: bucket.into(),
|
||||||
|
prefix: prefix.map(|x| x.into()),
|
||||||
|
sidecars,
|
||||||
|
client: Arc::new(client),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn find_sidecar_key(&self, key: &str) -> Option<SmartString<LazyCompact>> {
|
||||||
|
// First try {key}.toml
|
||||||
|
let full_toml = format!("{key}.toml");
|
||||||
|
if self
|
||||||
|
.client
|
||||||
|
.head_object()
|
||||||
|
.bucket(self.bucket.as_str())
|
||||||
|
.key(&full_toml)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.is_ok()
|
||||||
|
{
|
||||||
|
return Some(full_toml.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then try {key-with-extension-stripped}.toml
|
||||||
|
let stripped = std::path::Path::new(key).with_extension("toml");
|
||||||
|
if let Some(stripped_str) = stripped.to_str()
|
||||||
|
&& stripped_str != full_toml.as_str()
|
||||||
|
&& self
|
||||||
|
.client
|
||||||
|
.head_object()
|
||||||
|
.bucket(self.bucket.as_str())
|
||||||
|
.key(stripped_str)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.is_ok()
|
||||||
|
{
|
||||||
|
return Some(stripped_str.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
|
||||||
|
let key: SmartString<LazyCompact> = key.into();
|
||||||
|
let mime = mime_guess::from_path(key.as_str()).first_or_octet_stream();
|
||||||
|
|
||||||
|
let sidecar = if self.sidecars {
|
||||||
|
self.find_sidecar_key(key.as_str())
|
||||||
|
.await
|
||||||
|
.map(|sidecar_key| {
|
||||||
|
Box::new(Item::S3 {
|
||||||
|
source: Arc::clone(self),
|
||||||
|
mime: mime_guess::from_path(sidecar_key.as_str()).first_or_octet_stream(),
|
||||||
|
key: sidecar_key,
|
||||||
|
sidecar: None,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
Item::S3 {
|
||||||
|
source: Arc::clone(self),
|
||||||
|
mime,
|
||||||
|
key,
|
||||||
|
sidecar,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataSource for Arc<S3DataSource> {
|
||||||
|
async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
|
||||||
|
if self.sidecars && key.ends_with(".toml") {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = self
|
||||||
|
.client
|
||||||
|
.head_object()
|
||||||
|
.bucket(self.bucket.as_str())
|
||||||
|
.key(key)
|
||||||
|
.send()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Err(sdk_err) => {
|
||||||
|
let not_found = sdk_err
|
||||||
|
.as_service_error()
|
||||||
|
.map(|e| e.is_not_found())
|
||||||
|
.unwrap_or(false);
|
||||||
|
if not_found {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
Err(std::io::Error::other(sdk_err))
|
||||||
|
}
|
||||||
|
Ok(_) => Ok(Some(self.make_item(key).await)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
|
||||||
|
let (tx, rx) = tokio::sync::mpsc::channel(64);
|
||||||
|
let source = Arc::clone(self);
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut continuation_token: Option<String> = None;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let mut req = source
|
||||||
|
.client
|
||||||
|
.list_objects_v2()
|
||||||
|
.bucket(source.bucket.as_str());
|
||||||
|
|
||||||
|
if let Some(prefix) = &source.prefix {
|
||||||
|
req = req.prefix(prefix.as_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(token) = continuation_token {
|
||||||
|
req = req.continuation_token(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = match req.send().await {
|
||||||
|
Err(e) => {
|
||||||
|
let _ = tx.send(Err(std::io::Error::other(e))).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Ok(resp) => resp,
|
||||||
|
};
|
||||||
|
|
||||||
|
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||||
|
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||||
|
|
||||||
|
for obj in resp.contents() {
|
||||||
|
let key = match obj.key() {
|
||||||
|
Some(k) => k.to_owned(),
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
if source.sidecars && key.ends_with(".toml") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let item = source.make_item(key).await;
|
||||||
|
|
||||||
|
if tx.send(Ok(item)).await.is_err() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !is_truncated {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
continuation_token = next_token;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
ReceiverStream::new(rx)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
|
||||||
|
let mut ts: Option<DateTime<Utc>> = None;
|
||||||
|
let mut continuation_token: Option<String> = None;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let mut req = self.client.list_objects_v2().bucket(self.bucket.as_str());
|
||||||
|
|
||||||
|
if let Some(prefix) = &self.prefix {
|
||||||
|
req = req.prefix(prefix.as_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(token) = continuation_token {
|
||||||
|
req = req.continuation_token(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = match req.send().await {
|
||||||
|
Err(_) => return Ok(None),
|
||||||
|
Ok(resp) => resp,
|
||||||
|
};
|
||||||
|
|
||||||
|
let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
|
||||||
|
let is_truncated = resp.is_truncated().unwrap_or(false);
|
||||||
|
|
||||||
|
for obj in resp.contents() {
|
||||||
|
if let Some(last_modified) = obj.last_modified() {
|
||||||
|
let dt = DateTime::from_timestamp(
|
||||||
|
last_modified.secs(),
|
||||||
|
last_modified.subsec_nanos(),
|
||||||
|
);
|
||||||
|
if let Some(dt) = dt {
|
||||||
|
ts = Some(match ts {
|
||||||
|
None => dt,
|
||||||
|
Some(prev) => prev.max(dt),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !is_truncated {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
continuation_token = next_token;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ts)
|
||||||
|
}
|
||||||
|
}
|
||||||
158
crates/pile-value/src/source/s3reader.rs
Normal file
158
crates/pile-value/src/source/s3reader.rs
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
|
||||||
|
use mime::Mime;
|
||||||
|
use std::io::{Error as IoError, Seek, SeekFrom, Write};
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
use super::S3Client;
|
||||||
|
use crate::retry;
|
||||||
|
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
#[expect(clippy::large_enum_variant)]
|
||||||
|
pub enum S3ReaderError {
|
||||||
|
#[error("sdk error")]
|
||||||
|
SdkError(#[from] SdkError<GetObjectError>),
|
||||||
|
|
||||||
|
#[error("byte stream error")]
|
||||||
|
ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
|
||||||
|
|
||||||
|
#[error("i/o error")]
|
||||||
|
IoError(#[from] IoError),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
|
||||||
|
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
|
||||||
|
///
|
||||||
|
/// Also implements [`std::io::Seek`]
|
||||||
|
pub struct S3Reader {
|
||||||
|
pub(super) client: S3Client,
|
||||||
|
pub(super) bucket: String,
|
||||||
|
pub(super) key: String,
|
||||||
|
|
||||||
|
pub(super) cursor: u64,
|
||||||
|
pub(super) size: u64,
|
||||||
|
pub(super) mime: Mime,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl S3Reader {
|
||||||
|
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
|
||||||
|
let len_left = self.size - self.cursor;
|
||||||
|
if len_left == 0 || buf.is_empty() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)] // TODO: probably fits?
|
||||||
|
let start_byte = usize::try_from(self.cursor).unwrap();
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)] // usize fits in u64
|
||||||
|
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
|
||||||
|
|
||||||
|
#[expect(clippy::unwrap_used)] // must fit, we called min()
|
||||||
|
let len_to_read = usize::try_from(len_to_read).unwrap();
|
||||||
|
|
||||||
|
let end_byte = start_byte + len_to_read - 1;
|
||||||
|
|
||||||
|
let b = retry!(
|
||||||
|
self.client.retries,
|
||||||
|
self.client
|
||||||
|
.client
|
||||||
|
.get_object()
|
||||||
|
.bucket(self.bucket.as_str())
|
||||||
|
.key(self.key.as_str())
|
||||||
|
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// Looks like `bytes 31000000-31999999/33921176``
|
||||||
|
// println!("{:?}", b.content_range);
|
||||||
|
|
||||||
|
let mut bytes = b.body.collect().await?.into_bytes();
|
||||||
|
bytes.truncate(len_to_read);
|
||||||
|
let l = bytes.len();
|
||||||
|
|
||||||
|
// Memory to memory writes are infallible
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
buf.write_all(&bytes).unwrap();
|
||||||
|
|
||||||
|
// Cannot fail, usize should always fit into u64
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
{
|
||||||
|
self.cursor += u64::try_from(l).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(len_to_read);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_done(&self) -> bool {
|
||||||
|
return self.cursor == self.size;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mime(&self) -> &Mime {
|
||||||
|
&self.mime
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the entire contents of this reader to `r`.
|
||||||
|
///
|
||||||
|
/// This method always downloads the whole object,
|
||||||
|
/// and always preserves `self.cursor`.
|
||||||
|
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
|
||||||
|
let pos = self.stream_position()?;
|
||||||
|
|
||||||
|
const BUF_LEN: usize = 10_000_000;
|
||||||
|
#[expect(clippy::unwrap_used)] // Cannot fail
|
||||||
|
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
|
||||||
|
|
||||||
|
while !self.is_done() {
|
||||||
|
let b = self.read(&mut buf[..]).await?;
|
||||||
|
r.write_all(&buf[0..b])?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.seek(SeekFrom::Start(pos))?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Seek for S3Reader {
|
||||||
|
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
||||||
|
match pos {
|
||||||
|
SeekFrom::Start(x) => self.cursor = x.min(self.size - 1),
|
||||||
|
|
||||||
|
// Cannot panic, we handle all cases
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
SeekFrom::Current(x) => {
|
||||||
|
if x < 0 {
|
||||||
|
if u64::try_from(x.abs()).unwrap() > self.cursor {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::InvalidInput,
|
||||||
|
"cannot seek past start",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
self.cursor -= u64::try_from(x.abs()).unwrap();
|
||||||
|
} else {
|
||||||
|
self.cursor += u64::try_from(x).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cannot panic, we handle all cases
|
||||||
|
#[expect(clippy::unwrap_used)]
|
||||||
|
SeekFrom::End(x) => {
|
||||||
|
if x < 0 {
|
||||||
|
if u64::try_from(x.abs()).unwrap() > self.size {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::InvalidInput,
|
||||||
|
"cannot seek past start",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
// Cannot fail, is abs
|
||||||
|
self.cursor = self.size - u64::try_from(x.abs()).unwrap();
|
||||||
|
} else {
|
||||||
|
// Cannot fail, is positive
|
||||||
|
self.cursor = self.size + u64::try_from(x).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.cursor = self.cursor.min(self.size - 1);
|
||||||
|
return Ok(self.cursor);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,45 +1,105 @@
|
|||||||
use pile_config::Label;
|
use mime::Mime;
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{fs::File, path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
use crate::{source::DirDataSource, value::PileValue};
|
use crate::{
|
||||||
|
source::{DirDataSource, S3DataSource},
|
||||||
|
value::{ItemReader, S3Reader},
|
||||||
|
};
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: item
|
// MARK: item
|
||||||
//
|
//
|
||||||
|
|
||||||
/// A cheaply-cloneable pointer to an item in a dataset
|
/// A cheaply-cloneable pointer to an item in a dataset
|
||||||
#[derive(Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum Item {
|
pub enum Item {
|
||||||
File {
|
File {
|
||||||
key: SmartString<LazyCompact>,
|
|
||||||
source: Arc<DirDataSource>,
|
source: Arc<DirDataSource>,
|
||||||
files: HashMap<Label, PileValue>,
|
mime: Mime,
|
||||||
|
|
||||||
|
path: PathBuf,
|
||||||
|
sidecar: Option<Box<Item>>,
|
||||||
|
},
|
||||||
|
|
||||||
|
S3 {
|
||||||
|
source: Arc<S3DataSource>,
|
||||||
|
mime: Mime,
|
||||||
|
|
||||||
|
key: SmartString<LazyCompact>,
|
||||||
|
sidecar: Option<Box<Item>>,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for Item {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
match self {
|
|
||||||
Self::File { key, files, .. } => f
|
|
||||||
.debug_struct("Item::File")
|
|
||||||
.field("key", key)
|
|
||||||
.field("files", &files.keys().collect::<Vec<_>>())
|
|
||||||
.finish(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Item {
|
impl Item {
|
||||||
|
/// Open the item for reading. For S3, performs a HEAD request to determine
|
||||||
|
/// the object size.
|
||||||
|
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
||||||
|
Ok(match self {
|
||||||
|
Self::File { path, .. } => ItemReader::File(File::open(path)?),
|
||||||
|
|
||||||
|
Self::S3 { source, key, .. } => {
|
||||||
|
let head = source
|
||||||
|
.client
|
||||||
|
.head_object()
|
||||||
|
.bucket(source.bucket.as_str())
|
||||||
|
.key(key.as_str())
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(std::io::Error::other)?;
|
||||||
|
|
||||||
|
let size = head.content_length().unwrap_or(0) as u64;
|
||||||
|
|
||||||
|
ItemReader::S3(S3Reader {
|
||||||
|
client: source.client.clone(),
|
||||||
|
bucket: source.bucket.clone(),
|
||||||
|
key: key.to_owned(),
|
||||||
|
cursor: 0,
|
||||||
|
size,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub fn source_name(&self) -> &pile_config::Label {
|
pub fn source_name(&self) -> &pile_config::Label {
|
||||||
match self {
|
match self {
|
||||||
Self::File { source, .. } => &source.name,
|
Self::File { source, .. } => &source.name,
|
||||||
|
Self::S3 { source, .. } => &source.name,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::expect_used)]
|
||||||
pub fn key(&self) -> SmartString<LazyCompact> {
|
pub fn key(&self) -> SmartString<LazyCompact> {
|
||||||
match self {
|
match self {
|
||||||
Self::File { key, .. } => key.clone(),
|
Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
|
||||||
|
Self::S3 { key, .. } => key.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
|
||||||
|
match self {
|
||||||
|
Self::File { path, .. } => {
|
||||||
|
let mut hasher = blake3::Hasher::new();
|
||||||
|
let mut file = std::fs::File::open(path)?;
|
||||||
|
std::io::copy(&mut file, &mut hasher)?;
|
||||||
|
return Ok(hasher.finalize());
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::S3 { .. } => todo!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mime(&self) -> &Mime {
|
||||||
|
match self {
|
||||||
|
Self::File { mime, .. } => mime,
|
||||||
|
Self::S3 { mime, .. } => mime,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sidecar(&self) -> Option<&Self> {
|
||||||
|
match self {
|
||||||
|
Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
|
||||||
|
Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,81 @@
|
|||||||
use pile_io::{AsyncReader, AsyncSeekReader};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::{
|
use std::{
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{Cursor, Seek},
|
io::{Read, Seek, SeekFrom},
|
||||||
|
sync::Arc,
|
||||||
};
|
};
|
||||||
|
use tokio::runtime::Handle;
|
||||||
|
|
||||||
use crate::value::ArcBytes;
|
//
|
||||||
|
// MARK: traits
|
||||||
|
//
|
||||||
|
|
||||||
|
/// A source of bytes that is read asynchronously.
pub trait AsyncReader: Send {
    /// Read a chunk of bytes into `buf`, resolving to the number of bytes
    /// written. `read_to_end` treats a result of `0` as end of input.
    fn read(
        &mut self,
        buf: &mut [u8],
    ) -> impl Future<Output = Result<usize, std::io::Error>> + Send;

    /// Read all remaining bytes into a `Vec`.
    ///
    /// Calls [`AsyncReader::read`] repeatedly with a 64 KiB scratch buffer
    /// until it reports `0` bytes; the first error aborts the read.
    fn read_to_end(&mut self) -> impl Future<Output = Result<Vec<u8>, std::io::Error>> + Send {
        async {
            let mut scratch = vec![0u8; 65536];
            let mut collected = Vec::new();

            loop {
                let got = self.read(&mut scratch).await?;
                if got == 0 {
                    break;
                }
                collected.extend_from_slice(&scratch[..got]);
            }

            Ok(collected)
        }
    }
}
|
||||||
|
|
||||||
|
pub trait AsyncSeekReader: AsyncReader {
|
||||||
|
fn seek(&mut self, pos: SeekFrom) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// MARK: sync bridge
|
||||||
|
//
|
||||||
|
|
||||||
|
/// Turn an async [Reader] into a sync [Read] + [Seek].
|
||||||
|
///
|
||||||
|
/// Never use this outside of [tokio::task::spawn_blocking],
|
||||||
|
/// the async runtime will deadlock if this struct blocks
|
||||||
|
/// the runtime.
|
||||||
|
pub struct SyncReadBridge<R: AsyncReader> {
|
||||||
|
inner: R,
|
||||||
|
handle: Handle,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: AsyncReader> SyncReadBridge<R> {
|
||||||
|
/// Creates a new adapter using a handle to the current runtime.
|
||||||
|
/// Panics if called outside of tokio
|
||||||
|
pub fn new_current(inner: R) -> Self {
|
||||||
|
Self::new(inner, Handle::current())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a new adapter using a handle to an existing runtime.
|
||||||
|
pub fn new(inner: R, handle: Handle) -> Self {
|
||||||
|
Self { inner, handle }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: AsyncReader> Read for SyncReadBridge<R> {
    /// Blocks the current thread until the wrapped reader's `read` future
    /// completes, then returns its result unchanged.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        let pending = self.inner.read(buf);
        self.handle.block_on(pending)
    }
}
|
||||||
|
|
||||||
|
impl<R: AsyncReader + AsyncSeekReader> Seek for SyncReadBridge<R> {
|
||||||
|
fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
||||||
|
self.handle.block_on(self.inner.seek(pos))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// MARK: itemreader
|
// MARK: itemreader
|
||||||
@@ -12,14 +83,14 @@ use crate::value::ArcBytes;
|
|||||||
|
|
||||||
pub enum ItemReader {
|
pub enum ItemReader {
|
||||||
File(File),
|
File(File),
|
||||||
Vec(Cursor<ArcBytes>),
|
S3(S3Reader),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AsyncReader for ItemReader {
|
impl AsyncReader for ItemReader {
|
||||||
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::File(x) => std::io::Read::read(x, buf),
|
Self::File(x) => std::io::Read::read(x, buf),
|
||||||
Self::Vec(x) => std::io::Read::read(x, buf),
|
Self::S3(x) => x.read(buf).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -28,7 +99,95 @@ impl AsyncSeekReader for ItemReader {
|
|||||||
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::File(x) => x.seek(pos),
|
Self::File(x) => x.seek(pos),
|
||||||
Self::Vec(x) => x.seek(pos),
|
Self::S3(x) => x.seek(pos).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// MARK: S3Reader
|
||||||
|
//
|
||||||
|
|
||||||
|
pub struct S3Reader {
|
||||||
|
pub client: Arc<aws_sdk_s3::Client>,
|
||||||
|
pub bucket: SmartString<LazyCompact>,
|
||||||
|
pub key: SmartString<LazyCompact>,
|
||||||
|
pub cursor: u64,
|
||||||
|
pub size: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AsyncReader for S3Reader {
    /// Fetch up to `buf.len()` bytes starting at `cursor` with one ranged
    /// `get_object` request, copy them into `buf`, and advance `cursor`.
    ///
    /// Returns `Ok(0)` without contacting S3 when `buf` is empty or the
    /// cursor is at (or past) `size`. Request and body errors are wrapped
    /// with `std::io::Error::other`.
    async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        let remaining = self.size.saturating_sub(self.cursor);
        if buf.is_empty() || remaining == 0 {
            return Ok(0);
        }

        // The range header below names both the first and the last byte,
        // hence the `- 1`.
        let want = remaining.min(buf.len() as u64);
        let first = self.cursor;
        let last = first + want - 1;

        let response = self
            .client
            .get_object()
            .bucket(self.bucket.as_str())
            .key(self.key.as_str())
            .range(format!("bytes={first}-{last}"))
            .send()
            .await
            .map_err(std::io::Error::other)?;

        let payload = response
            .body
            .collect()
            .await
            .map_err(std::io::Error::other)?
            .into_bytes();

        // Never copy more than the caller's buffer can hold, even if the
        // response is larger than requested.
        let copied = payload.len().min(buf.len());
        buf[..copied].copy_from_slice(&payload[..copied]);
        self.cursor += copied as u64;
        Ok(copied)
    }
}
|
||||||
|
|
||||||
|
impl AsyncSeekReader for S3Reader {
|
||||||
|
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
|
||||||
|
match pos {
|
||||||
|
SeekFrom::Start(x) => self.cursor = x.min(self.size),
|
||||||
|
|
||||||
|
SeekFrom::Current(x) => {
|
||||||
|
if x < 0 {
|
||||||
|
let abs = x.unsigned_abs();
|
||||||
|
if abs > self.cursor {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::InvalidInput,
|
||||||
|
"cannot seek past start",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
self.cursor -= abs;
|
||||||
|
} else {
|
||||||
|
self.cursor += x as u64;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::io::SeekFrom::End(x) => {
|
||||||
|
if x < 0 {
|
||||||
|
let abs = x.unsigned_abs();
|
||||||
|
if abs > self.size {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::InvalidInput,
|
||||||
|
"cannot seek past start",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
self.cursor = self.size - abs;
|
||||||
|
} else {
|
||||||
|
self.cursor = self.size + x as u64;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.cursor = self.cursor.min(self.size);
|
||||||
|
Ok(self.cursor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,61 +2,18 @@ use mime::Mime;
|
|||||||
use pile_config::objectpath::{ObjectPath, PathSegment};
|
use pile_config::objectpath::{ObjectPath, PathSegment};
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
extract::{
|
extract::{
|
||||||
blob::BinaryExtractor,
|
|
||||||
item::ItemExtractor,
|
item::ItemExtractor,
|
||||||
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
misc::{ArrayExtractor, MapExtractor, VecExtractor},
|
||||||
string::StringExtractor,
|
string::StringExtractor,
|
||||||
traits::{ExtractState, ListExtractor, ObjectExtractor},
|
traits::{ListExtractor, ObjectExtractor},
|
||||||
},
|
},
|
||||||
value::{Item, ItemReader},
|
value::Item,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct ArcBytes(pub Arc<Vec<u8>>);
|
|
||||||
impl Debug for ArcBytes {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("ArcBytes")
|
|
||||||
.field("len()", &self.0.len())
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AsRef<[u8]> for ArcBytes {
|
|
||||||
fn as_ref(&self) -> &[u8] {
|
|
||||||
&self.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub enum BinaryPileValue {
|
|
||||||
/// A binary blob
|
|
||||||
Blob { mime: Mime, bytes: ArcBytes },
|
|
||||||
|
|
||||||
/// An pointer to a file
|
|
||||||
File { mime: Mime, path: PathBuf },
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BinaryPileValue {
|
|
||||||
/// Open the item for reading.
|
|
||||||
pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
|
|
||||||
match self {
|
|
||||||
Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
|
|
||||||
Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn mime(&self) -> &Mime {
|
|
||||||
match self {
|
|
||||||
Self::Blob { mime, .. } => mime,
|
|
||||||
Self::File { mime, .. } => mime,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An immutable, cheaply-cloneable, lazily-computed value.
|
/// An immutable, cheaply-cloneable, lazily-computed value.
|
||||||
/// Very similar to [serde_json::Value].
|
/// Very similar to [serde_json::Value].
|
||||||
pub enum PileValue {
|
pub enum PileValue {
|
||||||
@@ -70,6 +27,12 @@ pub enum PileValue {
|
|||||||
/// An array of values
|
/// An array of values
|
||||||
Array(Arc<Vec<PileValue>>),
|
Array(Arc<Vec<PileValue>>),
|
||||||
|
|
||||||
|
/// A binary blob
|
||||||
|
Blob {
|
||||||
|
mime: Mime,
|
||||||
|
bytes: Arc<Vec<u8>>,
|
||||||
|
},
|
||||||
|
|
||||||
/// A lazily-computed map of {label: value}
|
/// A lazily-computed map of {label: value}
|
||||||
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
ObjectExtractor(Arc<dyn ObjectExtractor>),
|
||||||
|
|
||||||
@@ -78,9 +41,6 @@ pub enum PileValue {
|
|||||||
|
|
||||||
/// An pointer to an item in this dataset
|
/// An pointer to an item in this dataset
|
||||||
Item(Item),
|
Item(Item),
|
||||||
|
|
||||||
/// Binary data
|
|
||||||
Binary(BinaryPileValue),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for PileValue {
|
impl Clone for PileValue {
|
||||||
@@ -93,8 +53,11 @@ impl Clone for PileValue {
|
|||||||
Self::Array(x) => Self::Array(x.clone()),
|
Self::Array(x) => Self::Array(x.clone()),
|
||||||
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
|
||||||
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
|
||||||
|
Self::Blob { mime, bytes } => Self::Blob {
|
||||||
|
mime: mime.clone(),
|
||||||
|
bytes: bytes.clone(),
|
||||||
|
},
|
||||||
Self::Item(i) => Self::Item(i.clone()),
|
Self::Item(i) => Self::Item(i.clone()),
|
||||||
Self::Binary(b) => Self::Binary(b.clone()),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -107,10 +70,10 @@ impl PileValue {
|
|||||||
Self::I64(_) => Arc::new(MapExtractor::default()),
|
Self::I64(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::Array(_) => Arc::new(MapExtractor::default()),
|
Self::Array(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
Self::String(s) => Arc::new(StringExtractor::new(s)),
|
||||||
|
Self::Blob { .. } => Arc::new(MapExtractor::default()),
|
||||||
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
|
||||||
Self::ObjectExtractor(e) => e.clone(),
|
Self::ObjectExtractor(e) => e.clone(),
|
||||||
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
Self::Item(i) => Arc::new(ItemExtractor::new(i)),
|
||||||
Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,26 +84,20 @@ impl PileValue {
|
|||||||
Self::I64(_) => Arc::new(VecExtractor::default()),
|
Self::I64(_) => Arc::new(VecExtractor::default()),
|
||||||
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
|
||||||
Self::String(_) => Arc::new(VecExtractor::default()),
|
Self::String(_) => Arc::new(VecExtractor::default()),
|
||||||
|
Self::Blob { .. } => Arc::new(VecExtractor::default()),
|
||||||
Self::ListExtractor(e) => e.clone(),
|
Self::ListExtractor(e) => e.clone(),
|
||||||
Self::ObjectExtractor(e) => e
|
Self::ObjectExtractor(_) => Arc::new(VecExtractor::default()),
|
||||||
.as_list()
|
|
||||||
.unwrap_or_else(|| Arc::new(VecExtractor::default())),
|
|
||||||
Self::Item(_) => Arc::new(VecExtractor::default()),
|
Self::Item(_) => Arc::new(VecExtractor::default()),
|
||||||
Self::Binary(_) => Arc::new(VecExtractor::default()),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn query(
|
pub async fn query(&self, query: &ObjectPath) -> Result<Option<Self>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
query: &ObjectPath,
|
|
||||||
) -> Result<Option<Self>, std::io::Error> {
|
|
||||||
let mut out: Option<PileValue> = Some(self.clone());
|
let mut out: Option<PileValue> = Some(self.clone());
|
||||||
|
|
||||||
for s in &query.segments {
|
for s in &query.segments {
|
||||||
match s {
|
match s {
|
||||||
PathSegment::Root => out = Some(self.clone()),
|
PathSegment::Root => out = Some(self.clone()),
|
||||||
PathSegment::Field { name, args } => {
|
PathSegment::Field(field) => {
|
||||||
let e = match out.map(|x| x.object_extractor()) {
|
let e = match out.map(|x| x.object_extractor()) {
|
||||||
Some(e) => e,
|
Some(e) => e,
|
||||||
None => {
|
None => {
|
||||||
@@ -149,7 +106,7 @@ impl PileValue {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
out = e.field(state, name, args.as_deref()).await?;
|
out = e.field(field).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
PathSegment::Index(idx) => {
|
PathSegment::Index(idx) => {
|
||||||
@@ -164,7 +121,7 @@ impl PileValue {
|
|||||||
let idx = if *idx >= 0 {
|
let idx = if *idx >= 0 {
|
||||||
usize::try_from(*idx).ok()
|
usize::try_from(*idx).ok()
|
||||||
} else {
|
} else {
|
||||||
usize::try_from(e.len(state).await? as i64 - idx).ok()
|
usize::try_from(e.len().await? as i64 - idx).ok()
|
||||||
};
|
};
|
||||||
|
|
||||||
let idx = match idx {
|
let idx = match idx {
|
||||||
@@ -175,41 +132,7 @@ impl PileValue {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
out = e.get(state, idx).await?;
|
out = e.get(idx).await?;
|
||||||
}
|
|
||||||
|
|
||||||
PathSegment::Range {
|
|
||||||
start,
|
|
||||||
end,
|
|
||||||
inclusive,
|
|
||||||
} => {
|
|
||||||
let e = match out.map(|x| x.list_extractor()) {
|
|
||||||
Some(e) => e,
|
|
||||||
None => {
|
|
||||||
out = None;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let len = e.len(state).await? as i64;
|
|
||||||
|
|
||||||
let start_idx = if *start >= 0 { *start } else { len + start };
|
|
||||||
let end_idx = if *end >= 0 { *end } else { len + end };
|
|
||||||
let end_idx = if *inclusive { end_idx + 1 } else { end_idx };
|
|
||||||
|
|
||||||
let start_idx = start_idx.max(0) as usize;
|
|
||||||
let end_idx = (end_idx.max(0) as usize).min(len as usize);
|
|
||||||
|
|
||||||
let mut items = Vec::new();
|
|
||||||
for i in start_idx..end_idx {
|
|
||||||
match e.get(state, i).await? {
|
|
||||||
Some(v) => items.push(v),
|
|
||||||
None => break,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: lazy view?
|
|
||||||
out = Some(PileValue::Array(Arc::new(items)));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -224,33 +147,27 @@ impl PileValue {
|
|||||||
/// - `ObjectExtractor` is recursed into; returns `Some(Object(map))` with
|
/// - `ObjectExtractor` is recursed into; returns `Some(Object(map))` with
|
||||||
/// only the fields that had data, or `None` if all fields were absent.
|
/// only the fields that had data, or `None` if all fields were absent.
|
||||||
/// - `Array` / `ListExtractor` are treated as opaque leaf values (not descended into).
|
/// - `Array` / `ListExtractor` are treated as opaque leaf values (not descended into).
|
||||||
pub async fn count_fields(
|
pub async fn count_fields(&self) -> Result<Option<Value>, std::io::Error> {
|
||||||
&self,
|
|
||||||
state: &ExtractState,
|
|
||||||
) -> Result<Option<Value>, std::io::Error> {
|
|
||||||
Ok(match self {
|
Ok(match self {
|
||||||
Self::Null => None,
|
Self::Null => None,
|
||||||
|
|
||||||
Self::U64(_)
|
Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
|
||||||
| Self::I64(_)
|
Some(Value::Number(1u64.into()))
|
||||||
| Self::String(_)
|
}
|
||||||
| Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
|
|
||||||
|
|
||||||
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
|
||||||
Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
|
Self::ListExtractor(x) => (!x.is_empty().await?).then(|| Value::Number(1u64.into())),
|
||||||
|
|
||||||
Self::ObjectExtractor(_)
|
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||||
| Self::Item(_)
|
|
||||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
|
||||||
let e = self.object_extractor();
|
let e = self.object_extractor();
|
||||||
let keys = e.fields().await?;
|
let keys = e.fields().await?;
|
||||||
let mut map = Map::new();
|
let mut map = Map::new();
|
||||||
for k in &keys {
|
for k in &keys {
|
||||||
let v = match e.field(state, k, None).await? {
|
let v = match e.field(k).await? {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => continue,
|
None => continue,
|
||||||
};
|
};
|
||||||
if let Some(counted) = Box::pin(v.count_fields(state)).await? {
|
if let Some(counted) = Box::pin(v.count_fields()).await? {
|
||||||
map.insert(k.to_string(), counted);
|
map.insert(k.to_string(), counted);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -270,7 +187,7 @@ impl PileValue {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn to_json(&self, state: &ExtractState) -> Result<Value, std::io::Error> {
|
pub async fn to_json(&self) -> Result<Value, std::io::Error> {
|
||||||
Ok(match self {
|
Ok(match self {
|
||||||
Self::Null => Value::Null,
|
Self::Null => Value::Null,
|
||||||
Self::U64(x) => Value::Number((*x).into()),
|
Self::U64(x) => Value::Number((*x).into()),
|
||||||
@@ -278,20 +195,34 @@ impl PileValue {
|
|||||||
Self::String(x) => Value::String(x.to_string()),
|
Self::String(x) => Value::String(x.to_string()),
|
||||||
|
|
||||||
// TODO: replace with something meaningful?
|
// TODO: replace with something meaningful?
|
||||||
Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
|
Self::Blob { mime, bytes } => {
|
||||||
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
|
Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::expect_used)]
|
||||||
Self::Array(_) | Self::ListExtractor(_) => {
|
Self::Array(_) | Self::ListExtractor(_) => {
|
||||||
let e = self.list_extractor();
|
let e = self.list_extractor();
|
||||||
return e.to_json(state).await;
|
let len = e.len().await?;
|
||||||
|
let mut arr = Vec::new();
|
||||||
|
for i in 0..len {
|
||||||
|
let v = e.get(i).await?.expect("item must be present");
|
||||||
|
arr.push(Box::pin(v.to_json()).await?);
|
||||||
|
}
|
||||||
|
Value::Array(arr)
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::ObjectExtractor(_)
|
Self::ObjectExtractor(_) | Self::Item(_) => {
|
||||||
| Self::Item(_)
|
|
||||||
| Self::Binary(BinaryPileValue::File { .. }) => {
|
|
||||||
let e = self.object_extractor();
|
let e = self.object_extractor();
|
||||||
return e.to_json(state).await;
|
let keys = e.fields().await?;
|
||||||
|
let mut map = Map::new();
|
||||||
|
for k in &keys {
|
||||||
|
let v = match e.field(k).await? {
|
||||||
|
Some(x) => x,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
map.insert(k.to_string(), Box::pin(v.to_json()).await?);
|
||||||
|
}
|
||||||
|
Value::Object(map)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,14 +9,14 @@ workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
pile-toolbox = { workspace = true }
|
pile-toolbox = { workspace = true }
|
||||||
pile-dataset = { workspace = true }
|
pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
|
||||||
pile-serve = { workspace = true }
|
pile-value = { workspace = true, features = ["pdfium"] }
|
||||||
pile-value = { workspace = true }
|
|
||||||
pile-config = { workspace = true }
|
pile-config = { workspace = true }
|
||||||
|
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
tracing-subscriber = { workspace = true }
|
tracing-subscriber = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
tokio-stream = { workspace = true }
|
||||||
clap = { workspace = true }
|
clap = { workspace = true }
|
||||||
#clap_complete = { workspace = true }
|
#clap_complete = { workspace = true }
|
||||||
serde = { workspace = true }
|
serde = { workspace = true }
|
||||||
@@ -27,15 +27,3 @@ anstyle = { workspace = true }
|
|||||||
toml = { workspace = true }
|
toml = { workspace = true }
|
||||||
serde_json = { workspace = true }
|
serde_json = { workspace = true }
|
||||||
axum = { workspace = true }
|
axum = { workspace = true }
|
||||||
utoipa = { workspace = true }
|
|
||||||
utoipa-swagger-ui = { workspace = true }
|
|
||||||
url = { workspace = true }
|
|
||||||
tracing-loki = { workspace = true }
|
|
||||||
base64 = { workspace = true }
|
|
||||||
dotenvy = { workspace = true }
|
|
||||||
envy = { workspace = true }
|
|
||||||
thiserror = { workspace = true }
|
|
||||||
|
|
||||||
[features]
|
|
||||||
default = ["pdfium"]
|
|
||||||
pdfium = ["pile-dataset/pdfium", "pile-serve/pdfium", "pile-value/pdfium"]
|
|
||||||
|
|||||||
106
crates/pile/src/command/annotate.rs
Normal file
106
crates/pile/src/command/annotate.rs
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
use anyhow::{Context, Result};
|
||||||
|
use clap::Args;
|
||||||
|
use pile_config::{Label, Source};
|
||||||
|
use pile_dataset::{Datasets, index::DbFtsIndex};
|
||||||
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
|
use pile_value::{
|
||||||
|
source::{DataSource, DirDataSource},
|
||||||
|
value::{Item, PileValue},
|
||||||
|
};
|
||||||
|
use std::{path::PathBuf, sync::Arc};
|
||||||
|
use tokio_stream::StreamExt;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
|
||||||
|
use crate::{CliCmd, GlobalContext};
|
||||||
|
|
||||||
|
#[derive(Debug, Args)]
|
||||||
|
pub struct AnnotateCommand {
|
||||||
|
/// The schema field to read (must be defined in pile.toml)
|
||||||
|
field: String,
|
||||||
|
|
||||||
|
/// Sidecar path to write to (e.g. meta.title)
|
||||||
|
dest: String,
|
||||||
|
|
||||||
|
/// Path to dataset config
|
||||||
|
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||||
|
config: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AnnotateCommand {
|
||||||
|
fn parse_dest(dest: &str) -> Result<Vec<Label>> {
|
||||||
|
dest.split('.')
|
||||||
|
.map(|s| {
|
||||||
|
Label::new(s).ok_or_else(|| anyhow::anyhow!("invalid label {s:?} in dest path"))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CliCmd for AnnotateCommand {
|
||||||
|
async fn run(
|
||||||
|
self,
|
||||||
|
_ctx: GlobalContext,
|
||||||
|
_flag: CancelFlag,
|
||||||
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
|
let field = Label::new(&self.field)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
|
||||||
|
let dest_path = Self::parse_dest(&self.dest)?;
|
||||||
|
|
||||||
|
let ds = Datasets::open(&self.config)
|
||||||
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
|
if !ds.config.schema.contains_key(&field) {
|
||||||
|
return Err(anyhow::anyhow!("field {:?} is not defined in schema", self.field).into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
|
||||||
|
let count = 0u64;
|
||||||
|
|
||||||
|
for (name, source) in &ds.config.dataset.source {
|
||||||
|
match source {
|
||||||
|
Source::Filesystem { path, sidecars } => {
|
||||||
|
if !sidecars {
|
||||||
|
warn!("Source {name} does not have sidecars enabled, skipping");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let source = Arc::new(DirDataSource::new(name, path.clone(), *sidecars));
|
||||||
|
|
||||||
|
let mut stream = source.iter();
|
||||||
|
while let Some(res) = stream.next().await {
|
||||||
|
let item = res.with_context(|| format!("while reading source {name}"))?;
|
||||||
|
|
||||||
|
let Item::File { path, .. } = &item else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let item = PileValue::Item(item.clone());
|
||||||
|
let Some(value) =
|
||||||
|
index.get_field(&item, &field).await.with_context(|| {
|
||||||
|
format!("while extracting field from {}", path.display())
|
||||||
|
})?
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: implement sidecar writing
|
||||||
|
let _ = (&dest_path, &value);
|
||||||
|
todo!("write_sidecar not yet implemented");
|
||||||
|
|
||||||
|
#[expect(unreachable_code)]
|
||||||
|
{
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Source::S3 { .. } => {
|
||||||
|
warn!("Source {name} is an S3 source; sidecar annotation is not yet supported");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Annotated {count} items");
|
||||||
|
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -13,10 +13,6 @@ pub struct CheckCommand {
|
|||||||
/// Path to dataset config
|
/// Path to dataset config
|
||||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||||
config: PathBuf,
|
config: PathBuf,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for CheckCommand {
|
impl CliCmd for CheckCommand {
|
||||||
@@ -47,8 +43,7 @@ impl CliCmd for CheckCommand {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let ds = Datasets::open(&self.config, self.workdir)
|
let ds = Datasets::open(&self.config)
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let ts_fts = ds.ts_fts().context("while determining fts age")?;
|
let ts_fts = ds.ts_fts().context("while determining fts age")?;
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_config::objectpath::ObjectPath;
|
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_value::value::PileValue;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::{Map, Value};
|
||||||
use std::{path::PathBuf, time::Instant};
|
use std::{path::PathBuf, time::Instant};
|
||||||
use tokio::task::JoinSet;
|
use tokio::task::JoinSet;
|
||||||
|
use tokio_stream::StreamExt;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
use crate::{CliCmd, GlobalContext};
|
||||||
@@ -41,17 +41,9 @@ pub struct FieldsCommand {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
max_percent: Option<f64>,
|
max_percent: Option<f64>,
|
||||||
|
|
||||||
/// Print counts of non-null schema fields instead of raw fields
|
|
||||||
#[arg(long)]
|
|
||||||
schema: bool,
|
|
||||||
|
|
||||||
/// Restrict to these sources (all sources if empty)
|
/// Restrict to these sources (all sources if empty)
|
||||||
#[arg(long, short = 's')]
|
#[arg(long, short = 's')]
|
||||||
source: Vec<String>,
|
source: Vec<String>,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for FieldsCommand {
|
impl CliCmd for FieldsCommand {
|
||||||
@@ -62,26 +54,13 @@ impl CliCmd for FieldsCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let ds = Datasets::open(&self.config)
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let mut total_counts: Map<String, Value> = Map::new();
|
let mut total_counts: Map<String, Value> = Map::new();
|
||||||
let mut total_items = 0u64;
|
let mut total_items = 0u64;
|
||||||
let jobs = self.jobs.max(1);
|
let jobs = self.jobs.max(1);
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
|
|
||||||
// Pre-collect schema fields for the --schema mode
|
|
||||||
let schema_fields: Vec<(String, Vec<ObjectPath>)> = if self.schema {
|
|
||||||
ds.config
|
|
||||||
.schema
|
|
||||||
.iter()
|
|
||||||
.map(|(name, spec)| (name.to_string(), spec.path.clone()))
|
|
||||||
.collect()
|
|
||||||
} else {
|
|
||||||
Vec::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
||||||
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
||||||
@@ -108,56 +87,25 @@ impl CliCmd for FieldsCommand {
|
|||||||
return Err(CancelableTaskError::Cancelled);
|
return Err(CancelableTaskError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
match stream.next() {
|
match stream.next().await {
|
||||||
None => break,
|
None => break,
|
||||||
Some(item) => {
|
Some(item_result) => {
|
||||||
let item = item.clone();
|
let item =
|
||||||
|
item_result.with_context(|| format!("while reading source {name}"))?;
|
||||||
let name = name.clone();
|
let name = name.clone();
|
||||||
let state = state.clone();
|
join_set.spawn(async move {
|
||||||
if self.schema {
|
let item = PileValue::Item(item);
|
||||||
let schema_fields = schema_fields.clone();
|
let result = item.count_fields().await.with_context(|| {
|
||||||
join_set.spawn(async move {
|
format!("while counting fields in source {name}")
|
||||||
let pv = PileValue::Item(item);
|
})?;
|
||||||
let mut counts = Map::new();
|
Ok(result.and_then(|v| {
|
||||||
for (field_name, paths) in &schema_fields {
|
if let Value::Object(m) = v {
|
||||||
let mut present = false;
|
Some(m)
|
||||||
for path in paths {
|
} else {
|
||||||
let v =
|
None
|
||||||
pv.query(&state, path).await.with_context(|| {
|
|
||||||
format!(
|
|
||||||
"while extracting field {field_name} in source {name}"
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
if let Some(v) = v
|
|
||||||
&& !matches!(v, PileValue::Null)
|
|
||||||
{
|
|
||||||
present = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
counts.insert(
|
|
||||||
field_name.clone(),
|
|
||||||
Value::Number((present as u64).into()),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
Ok(Some(counts))
|
}))
|
||||||
});
|
});
|
||||||
} else {
|
|
||||||
join_set.spawn(async move {
|
|
||||||
let item = PileValue::Item(item);
|
|
||||||
let result =
|
|
||||||
item.count_fields(&state).await.with_context(|| {
|
|
||||||
format!("while counting fields in source {name}")
|
|
||||||
})?;
|
|
||||||
Ok(result.and_then(|v| {
|
|
||||||
if let Value::Object(m) = v {
|
|
||||||
Some(m)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ use anyhow::{Context, Result};
|
|||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::extract::traits::ExtractState;
|
|
||||||
use std::{fmt::Debug, path::PathBuf};
|
use std::{fmt::Debug, path::PathBuf};
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
use crate::{CliCmd, GlobalContext};
|
||||||
@@ -16,10 +15,6 @@ pub struct IndexCommand {
|
|||||||
/// Number of threads to use for indexing
|
/// Number of threads to use for indexing
|
||||||
#[arg(long, short = 'j', default_value = "3")]
|
#[arg(long, short = 'j', default_value = "3")]
|
||||||
jobs: usize,
|
jobs: usize,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for IndexCommand {
|
impl CliCmd for IndexCommand {
|
||||||
@@ -28,21 +23,17 @@ impl CliCmd for IndexCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let ds = Datasets::open(&self.config)
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
|
||||||
ds.fts_refresh(&state, self.jobs, Some(flag))
|
x.map_err(|x| {
|
||||||
.await
|
anyhow::Error::from(x).context(format!(
|
||||||
.map_err(|x| {
|
"while refreshing fts for {}",
|
||||||
x.map_err(|x| {
|
self.config.display()
|
||||||
anyhow::Error::from(x).context(format!(
|
))
|
||||||
"while refreshing fts for {}",
|
})
|
||||||
self.config.display()
|
})?;
|
||||||
))
|
|
||||||
})
|
|
||||||
})?;
|
|
||||||
|
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,117 +0,0 @@
|
|||||||
use anyhow::{Context, Result};
|
|
||||||
use clap::Args;
|
|
||||||
use pile_config::{Label, objectpath::ObjectPath};
|
|
||||||
use pile_dataset::Datasets;
|
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
|
||||||
use std::path::PathBuf;
|
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
|
||||||
|
|
||||||
#[derive(Debug, Args)]
|
|
||||||
pub struct ItemCommand {
|
|
||||||
/// Source name (as defined in pile.toml)
|
|
||||||
source: String,
|
|
||||||
|
|
||||||
/// Item key within the source
|
|
||||||
key: String,
|
|
||||||
|
|
||||||
/// If present, extract a specific field
|
|
||||||
#[arg(long, short = 'p')]
|
|
||||||
path: Option<String>,
|
|
||||||
|
|
||||||
/// If present, print the schema fields instead of item data
|
|
||||||
#[arg(long)]
|
|
||||||
schema: bool,
|
|
||||||
|
|
||||||
#[arg(long, short = 'x')]
|
|
||||||
exclude: Vec<String>,
|
|
||||||
|
|
||||||
/// Path to dataset config
|
|
||||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
|
||||||
config: PathBuf,
|
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CliCmd for ItemCommand {
|
|
||||||
#[expect(clippy::print_stdout)]
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
async fn run(
|
|
||||||
self,
|
|
||||||
_ctx: GlobalContext,
|
|
||||||
_flag: CancelFlag,
|
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
|
||||||
let source = Label::new(&self.source)
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
|
|
||||||
|
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
|
||||||
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
|
|
||||||
let item = ds.get(&source, &self.key).await.ok_or_else(|| {
|
|
||||||
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
|
||||||
})?;
|
|
||||||
let pv = PileValue::Item(item);
|
|
||||||
|
|
||||||
if self.schema {
|
|
||||||
let mut map = serde_json::Map::new();
|
|
||||||
for (name, spec) in &ds.config.schema {
|
|
||||||
if self.exclude.contains(&name.to_string()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut value = None;
|
|
||||||
for path in &spec.path {
|
|
||||||
let v = pv
|
|
||||||
.query(&state, path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting field {name}"))?;
|
|
||||||
if let Some(v) = v
|
|
||||||
&& !matches!(v, PileValue::Null)
|
|
||||||
{
|
|
||||||
let j = v
|
|
||||||
.to_json(&state)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting field {name}"))?;
|
|
||||||
value = Some(j);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
map.insert(name.to_string(), value.unwrap_or(serde_json::Value::Null));
|
|
||||||
}
|
|
||||||
let json = serde_json::to_string_pretty(&serde_json::Value::Object(map)).unwrap();
|
|
||||||
println!("{json}");
|
|
||||||
return Ok(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
let json = if let Some(path_str) = self.path {
|
|
||||||
let path: ObjectPath = path_str
|
|
||||||
.parse()
|
|
||||||
.with_context(|| format!("invalid path {path_str:?}"))?;
|
|
||||||
|
|
||||||
let v = pv
|
|
||||||
.query(&state, &path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting {}", self.key))?
|
|
||||||
.ok_or_else(|| {
|
|
||||||
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
|
||||||
})?;
|
|
||||||
v.to_json(&state)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting {}", self.key))?
|
|
||||||
} else {
|
|
||||||
pv.to_json(&state)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting {}", self.key))?
|
|
||||||
};
|
|
||||||
|
|
||||||
let json = serde_json::to_string_pretty(&json).unwrap();
|
|
||||||
println!("{json}");
|
|
||||||
return Ok(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,30 +1,25 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_config::Label;
|
|
||||||
use pile_config::objectpath::ObjectPath;
|
use pile_config::objectpath::ObjectPath;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_value::value::PileValue;
|
||||||
use std::{path::PathBuf, str::FromStr, sync::Arc};
|
use std::{path::PathBuf, str::FromStr, sync::Arc};
|
||||||
use tokio::task::JoinSet;
|
use tokio::task::JoinSet;
|
||||||
|
use tokio_stream::StreamExt;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
use crate::{CliCmd, GlobalContext};
|
||||||
|
|
||||||
#[derive(Debug, Args)]
|
#[derive(Debug, Args)]
|
||||||
pub struct ListCommand {
|
pub struct ListCommand {
|
||||||
/// Path to query, e.g. $.flac.artist (or schema field name when --schema is set)
|
/// Path to query, e.g. $.flac.artist
|
||||||
#[clap(default_value = "$")]
|
|
||||||
path: String,
|
path: String,
|
||||||
|
|
||||||
/// Only print items where the value is null (inverse of default)
|
/// Only print items where the value is null (inverse of default)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
invert: bool,
|
invert: bool,
|
||||||
|
|
||||||
/// Treat path as a schema field name and resolve via schema paths
|
|
||||||
#[arg(long)]
|
|
||||||
schema: bool,
|
|
||||||
|
|
||||||
/// Path to dataset config
|
/// Path to dataset config
|
||||||
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||||
config: PathBuf,
|
config: PathBuf,
|
||||||
@@ -36,10 +31,6 @@ pub struct ListCommand {
|
|||||||
/// Restrict to these sources (all sources if empty)
|
/// Restrict to these sources (all sources if empty)
|
||||||
#[arg(long, short = 's')]
|
#[arg(long, short = 's')]
|
||||||
source: Vec<String>,
|
source: Vec<String>,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for ListCommand {
|
impl CliCmd for ListCommand {
|
||||||
@@ -49,26 +40,14 @@ impl CliCmd for ListCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let path = ObjectPath::from_str(&self.path)
|
||||||
.await
|
.with_context(|| format!("invalid path {:?}", self.path))?;
|
||||||
|
let path = Arc::new(path);
|
||||||
|
|
||||||
|
let ds = Datasets::open(&self.config)
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
// Resolve path arg: either schema field paths or a single ObjectPath
|
|
||||||
let schema_paths: Arc<Vec<ObjectPath>> = if self.schema {
|
|
||||||
let label = Label::new(&self.path)
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("invalid schema field name {:?}", self.path))?;
|
|
||||||
let spec = ds.config.schema.get(&label).ok_or_else(|| {
|
|
||||||
anyhow::anyhow!("schema field {:?} not found in config", self.path)
|
|
||||||
})?;
|
|
||||||
Arc::new(spec.path.clone())
|
|
||||||
} else {
|
|
||||||
let path = ObjectPath::from_str(&self.path)
|
|
||||||
.with_context(|| format!("invalid path {:?}", self.path))?;
|
|
||||||
Arc::new(vec![path])
|
|
||||||
};
|
|
||||||
|
|
||||||
let jobs = self.jobs.max(1);
|
let jobs = self.jobs.max(1);
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
|
|
||||||
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
|
||||||
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
|
||||||
@@ -90,26 +69,22 @@ impl CliCmd for ListCommand {
|
|||||||
return Err(CancelableTaskError::Cancelled);
|
return Err(CancelableTaskError::Cancelled);
|
||||||
}
|
}
|
||||||
|
|
||||||
match stream.next() {
|
match stream.next().await {
|
||||||
None => break,
|
None => break,
|
||||||
Some(item) => {
|
Some(item_result) => {
|
||||||
let item = item.clone();
|
let item =
|
||||||
|
item_result.with_context(|| format!("while reading source {name}"))?;
|
||||||
let source_name = name.to_string();
|
let source_name = name.to_string();
|
||||||
let key = item.key().to_string();
|
let key = item.key().to_string();
|
||||||
let schema_paths = schema_paths.clone();
|
let path = path.clone();
|
||||||
let invert = self.invert;
|
let invert = self.invert;
|
||||||
let state = state.clone();
|
|
||||||
|
|
||||||
join_set.spawn(async move {
|
join_set.spawn(async move {
|
||||||
let pv = PileValue::Item(item);
|
let item = PileValue::Item(item);
|
||||||
let mut is_present = false;
|
let value = item.query(&path).await?;
|
||||||
for path in schema_paths.as_ref() {
|
|
||||||
let value = pv.query(&state, path).await?;
|
let is_present =
|
||||||
if matches!(value, Some(v) if !matches!(v, PileValue::Null)) {
|
matches!(value, Some(v) if !matches!(v, PileValue::Null));
|
||||||
is_present = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let should_print = if invert { !is_present } else { is_present };
|
let should_print = if invert { !is_present } else { is_present };
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ use anyhow::{Context, Result};
|
|||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::extract::traits::ExtractState;
|
|
||||||
use std::{fmt::Debug, path::PathBuf};
|
use std::{fmt::Debug, path::PathBuf};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@@ -31,10 +30,6 @@ pub struct LookupCommand {
|
|||||||
/// Number of threads to use for indexing
|
/// Number of threads to use for indexing
|
||||||
#[arg(long, short = 'j', default_value = "3")]
|
#[arg(long, short = 'j', default_value = "3")]
|
||||||
jobs: usize,
|
jobs: usize,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for LookupCommand {
|
impl CliCmd for LookupCommand {
|
||||||
@@ -44,24 +39,19 @@ impl CliCmd for LookupCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let ds = Datasets::open(&self.config)
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
|
|
||||||
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
|
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
|
||||||
info!("FTS index is missing or out-of-date, regenerating");
|
info!("FTS index is missing or out-of-date, regenerating");
|
||||||
ds.fts_refresh(&state, self.jobs, Some(flag))
|
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
|
||||||
.await
|
x.map_err(|x| {
|
||||||
.map_err(|x| {
|
anyhow::Error::from(x).context(format!(
|
||||||
x.map_err(|x| {
|
"while refreshing fts for {}",
|
||||||
anyhow::Error::from(x).context(format!(
|
self.config.display()
|
||||||
"while refreshing fts for {}",
|
))
|
||||||
self.config.display()
|
})
|
||||||
))
|
})?;
|
||||||
})
|
|
||||||
})?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let results = ds
|
let results = ds
|
||||||
|
|||||||
@@ -4,16 +4,15 @@ use pile_toolbox::cancelabletask::{
|
|||||||
CancelFlag, CancelableTask, CancelableTaskError, CancelableTaskResult,
|
CancelFlag, CancelableTask, CancelableTaskError, CancelableTaskResult,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
mod annotate;
|
||||||
mod check;
|
mod check;
|
||||||
mod fields;
|
mod fields;
|
||||||
mod index;
|
mod index;
|
||||||
mod init;
|
mod init;
|
||||||
mod item;
|
|
||||||
mod list;
|
mod list;
|
||||||
mod lookup;
|
mod lookup;
|
||||||
mod probe;
|
mod probe;
|
||||||
mod serve;
|
mod serve;
|
||||||
mod server;
|
|
||||||
|
|
||||||
use crate::{Cli, GlobalContext};
|
use crate::{Cli, GlobalContext};
|
||||||
|
|
||||||
@@ -23,6 +22,12 @@ pub enum SubCommand {
|
|||||||
#[clap(alias = "doc")]
|
#[clap(alias = "doc")]
|
||||||
Docs {},
|
Docs {},
|
||||||
|
|
||||||
|
/// Annotate all items with a field, writing it to a sidecar path
|
||||||
|
Annotate {
|
||||||
|
#[command(flatten)]
|
||||||
|
cmd: annotate::AnnotateCommand,
|
||||||
|
},
|
||||||
|
|
||||||
/// Create an empty dataset
|
/// Create an empty dataset
|
||||||
Init {
|
Init {
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
@@ -55,49 +60,36 @@ pub enum SubCommand {
|
|||||||
},
|
},
|
||||||
|
|
||||||
/// Print an overview of all fields present in this dataset
|
/// Print an overview of all fields present in this dataset
|
||||||
Fields {
|
Overview {
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
cmd: fields::FieldsCommand,
|
cmd: fields::FieldsCommand,
|
||||||
},
|
},
|
||||||
|
|
||||||
/// Print all metadata from a file
|
/// Print all metadata from an item
|
||||||
Probe {
|
Probe {
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
cmd: probe::ProbeCommand,
|
cmd: probe::ProbeCommand,
|
||||||
},
|
},
|
||||||
|
|
||||||
/// Print all metadata from an item
|
/// Expose a dataset via an http api
|
||||||
Item {
|
|
||||||
#[command(flatten)]
|
|
||||||
cmd: item::ItemCommand,
|
|
||||||
},
|
|
||||||
|
|
||||||
/// Expose one dataset via a simple http api
|
|
||||||
Serve {
|
Serve {
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
cmd: serve::ServeCommand,
|
cmd: serve::ServeCommand,
|
||||||
},
|
},
|
||||||
|
|
||||||
/// Serve many datasets under an authenticated http api
|
|
||||||
Server {
|
|
||||||
#[command(flatten)]
|
|
||||||
cmd: server::ServerCommand,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmdDispatch for SubCommand {
|
impl CliCmdDispatch for SubCommand {
|
||||||
fn start(self, ctx: GlobalContext) -> Result<CancelableTask<Result<i32>>> {
|
fn start(self, ctx: GlobalContext) -> Result<CancelableTask<Result<i32>>> {
|
||||||
match self {
|
match self {
|
||||||
|
Self::Annotate { cmd } => cmd.start(ctx),
|
||||||
Self::Init { cmd } => cmd.start(ctx),
|
Self::Init { cmd } => cmd.start(ctx),
|
||||||
Self::Check { cmd } => cmd.start(ctx),
|
Self::Check { cmd } => cmd.start(ctx),
|
||||||
Self::Index { cmd } => cmd.start(ctx),
|
Self::Index { cmd } => cmd.start(ctx),
|
||||||
Self::List { cmd } => cmd.start(ctx),
|
Self::List { cmd } => cmd.start(ctx),
|
||||||
Self::Lookup { cmd } => cmd.start(ctx),
|
Self::Lookup { cmd } => cmd.start(ctx),
|
||||||
Self::Fields { cmd } => cmd.start(ctx),
|
Self::Overview { cmd } => cmd.start(ctx),
|
||||||
Self::Probe { cmd } => cmd.start(ctx),
|
Self::Probe { cmd } => cmd.start(ctx),
|
||||||
Self::Item { cmd } => cmd.start(ctx),
|
|
||||||
Self::Serve { cmd } => cmd.start(ctx),
|
Self::Serve { cmd } => cmd.start(ctx),
|
||||||
Self::Server { cmd } => cmd.start(ctx),
|
|
||||||
|
|
||||||
Self::Docs {} => {
|
Self::Docs {} => {
|
||||||
print_help_recursively(&mut Cli::command(), None);
|
print_help_recursively(&mut Cli::command(), None);
|
||||||
|
|||||||
@@ -1,21 +1,28 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_config::objectpath::ObjectPath;
|
use pile_config::{Label, objectpath::ObjectPath};
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::{extract::traits::ExtractState, value::PileValue};
|
use pile_value::value::PileValue;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
use crate::{CliCmd, GlobalContext};
|
||||||
|
|
||||||
#[derive(Debug, Args)]
|
#[derive(Debug, Args)]
|
||||||
pub struct ProbeCommand {
|
pub struct ProbeCommand {
|
||||||
/// The file to probe
|
/// Source name (as defined in pile.toml)
|
||||||
file: PathBuf,
|
source: String,
|
||||||
|
|
||||||
|
/// Item key within the source
|
||||||
|
key: String,
|
||||||
|
|
||||||
/// If present, extract a specific field
|
/// If present, extract a specific field
|
||||||
#[arg(long, short = 'p')]
|
#[arg(long, short = 'p')]
|
||||||
path: Option<String>,
|
path: Option<String>,
|
||||||
|
|
||||||
|
/// Path to dataset config
|
||||||
|
#[arg(long, short = 'c', default_value = "./pile.toml")]
|
||||||
|
config: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for ProbeCommand {
|
impl CliCmd for ProbeCommand {
|
||||||
@@ -26,37 +33,32 @@ impl CliCmd for ProbeCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
_flag: CancelFlag,
|
_flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::virt(".")
|
let source = Label::new(&self.source)
|
||||||
.await
|
.ok_or_else(|| anyhow::anyhow!("invalid source name {:?}", self.source))?;
|
||||||
.with_context(|| "while opening virtual dataset".to_owned())?;
|
|
||||||
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
let ds = Datasets::open(&self.config)
|
||||||
let key = self.file.to_str().context("path is not utf-8")?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let json = if let Some(path_str) = self.path {
|
let json = if let Some(path_str) = self.path {
|
||||||
let path: ObjectPath = path_str
|
let path: ObjectPath = path_str
|
||||||
.parse()
|
.parse()
|
||||||
.with_context(|| format!("invalid path {path_str:?}"))?;
|
.with_context(|| format!("invalid path {path_str:?}"))?;
|
||||||
|
|
||||||
ds.get_field(
|
ds.get_field(&source, &self.key, &path)
|
||||||
&state,
|
|
||||||
&Datasets::virt_source(),
|
|
||||||
self.file.to_str().context("path is not utf-8")?,
|
|
||||||
&path,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while extracting {key}"))?
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("{key:?} not found"))?
|
|
||||||
} else {
|
|
||||||
let item = ds
|
|
||||||
.get(&Datasets::virt_source(), key)
|
|
||||||
.await
|
.await
|
||||||
.ok_or_else(|| anyhow::anyhow!("{key:?} not found"))?;
|
.with_context(|| format!("while extracting {}", self.key))?
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
||||||
|
})?
|
||||||
|
} else {
|
||||||
|
let item = ds.get(&source, &self.key).await.ok_or_else(|| {
|
||||||
|
anyhow::anyhow!("{:?} not found in source {:?}", self.key, self.source)
|
||||||
|
})?;
|
||||||
|
|
||||||
let item = PileValue::Item(item);
|
let item = PileValue::Item(item);
|
||||||
item.to_json(&state)
|
item.to_json()
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("while extracting {key}"))?
|
.with_context(|| format!("while extracting {}", self.key))?
|
||||||
};
|
};
|
||||||
|
|
||||||
let json = serde_json::to_string_pretty(&json).unwrap();
|
let json = serde_json::to_string_pretty(&json).unwrap();
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ use anyhow::{Context, Result};
|
|||||||
use clap::Args;
|
use clap::Args;
|
||||||
use pile_dataset::Datasets;
|
use pile_dataset::Datasets;
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
||||||
use pile_value::extract::traits::ExtractState;
|
|
||||||
use std::{fmt::Debug, path::PathBuf, sync::Arc};
|
use std::{fmt::Debug, path::PathBuf, sync::Arc};
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
@@ -25,10 +24,6 @@ pub struct ServeCommand {
|
|||||||
/// Number of threads to use for indexing
|
/// Number of threads to use for indexing
|
||||||
#[arg(long, short = 'j', default_value = "3")]
|
#[arg(long, short = 'j', default_value = "3")]
|
||||||
jobs: usize,
|
jobs: usize,
|
||||||
|
|
||||||
/// Working directory root
|
|
||||||
#[arg(long, default_value = "./.pile")]
|
|
||||||
workdir: PathBuf,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CliCmd for ServeCommand {
|
impl CliCmd for ServeCommand {
|
||||||
@@ -37,15 +32,12 @@ impl CliCmd for ServeCommand {
|
|||||||
_ctx: GlobalContext,
|
_ctx: GlobalContext,
|
||||||
flag: CancelFlag,
|
flag: CancelFlag,
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
||||||
let ds = Datasets::open(&self.config, &self.workdir)
|
let ds = Datasets::open(&self.config)
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
|
||||||
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
|
|
||||||
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
|
if self.refresh && ds.needs_fts().await.context("while checking dataset fts")? {
|
||||||
info!("FTS index is missing or out-of-date, regenerating");
|
info!("FTS index is missing or out-of-date, regenerating");
|
||||||
ds.fts_refresh(&state, self.jobs, Some(flag.clone()))
|
ds.fts_refresh(self.jobs, Some(flag.clone()))
|
||||||
.await
|
.await
|
||||||
.map_err(|x| {
|
.map_err(|x| {
|
||||||
x.map_err(|x| {
|
x.map_err(|x| {
|
||||||
@@ -57,7 +49,8 @@ impl CliCmd for ServeCommand {
|
|||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let app = pile_serve::router(Arc::new(ds), true)
|
let app = Arc::new(ds)
|
||||||
|
.router(true)
|
||||||
.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
||||||
|
|
||||||
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
let listener = match tokio::net::TcpListener::bind(self.addr.clone()).await {
|
||||||
|
|||||||
@@ -1,241 +0,0 @@
|
|||||||
use anyhow::{Context, Result};
|
|
||||||
use axum::{
|
|
||||||
Json, Router,
|
|
||||||
extract::{Request, State},
|
|
||||||
http::StatusCode,
|
|
||||||
middleware::{Next, from_fn_with_state},
|
|
||||||
response::{IntoResponse, Response},
|
|
||||||
routing::get,
|
|
||||||
};
|
|
||||||
use clap::Args;
|
|
||||||
use pile_dataset::{DatasetError, Datasets};
|
|
||||||
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
|
|
||||||
use pile_value::extract::traits::ExtractState;
|
|
||||||
use serde::Serialize;
|
|
||||||
use std::{fmt::Debug, path::PathBuf, sync::Arc, time::Duration};
|
|
||||||
use tracing::{error, info};
|
|
||||||
use utoipa::{OpenApi, ToSchema};
|
|
||||||
use utoipa_swagger_ui::SwaggerUi;
|
|
||||||
|
|
||||||
use crate::{CliCmd, GlobalContext};
|
|
||||||
|
|
||||||
#[derive(Debug, Args)]
|
|
||||||
pub struct ServerCommand {
|
|
||||||
/// The datasets we should serve. Can be repeated.
|
|
||||||
#[arg(long, short = 'c')]
|
|
||||||
config: Vec<PathBuf>,
|
|
||||||
|
|
||||||
/// If provided, do not serve docs
|
|
||||||
#[arg(long)]
|
|
||||||
no_docs: bool,
|
|
||||||
|
|
||||||
/// If provided, never auto-refresh indices
|
|
||||||
#[arg(long)]
|
|
||||||
no_refresh: bool,
|
|
||||||
|
|
||||||
/// Number of threads to use to refresh indices
|
|
||||||
#[arg(long, default_value = "5")]
|
|
||||||
refresh_jobs: usize,
|
|
||||||
|
|
||||||
/// Refresh indices every `n` seconds
|
|
||||||
#[arg(long, default_value = "300")]
|
|
||||||
refresh_delay: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CliCmd for ServerCommand {
|
|
||||||
async fn run(
|
|
||||||
self,
|
|
||||||
ctx: GlobalContext,
|
|
||||||
flag: CancelFlag,
|
|
||||||
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
|
|
||||||
let datasets = {
|
|
||||||
let mut datasets = Vec::new();
|
|
||||||
for c in &self.config {
|
|
||||||
let ds = Datasets::open(&c, &ctx.config.workdir_root)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("while opening dataset for {}", c.display()))?;
|
|
||||||
datasets.push(Arc::new(ds));
|
|
||||||
}
|
|
||||||
|
|
||||||
Arc::new(datasets)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Start auto-refresh task
|
|
||||||
if !self.no_refresh {
|
|
||||||
let datasets = datasets.clone();
|
|
||||||
let jobs = self.refresh_jobs.max(1);
|
|
||||||
let delay = self.refresh_delay.max(1);
|
|
||||||
|
|
||||||
async fn refresh_dataset(ds: &Datasets, jobs: usize) -> Result<(), DatasetError> {
|
|
||||||
if ds.needs_fts().await? {
|
|
||||||
let state = ExtractState { ignore_mime: false };
|
|
||||||
match ds.fts_refresh(&state, jobs, None).await {
|
|
||||||
Ok(()) => {}
|
|
||||||
Err(CancelableTaskError::Error(err)) => return Err(err),
|
|
||||||
Err(CancelableTaskError::Cancelled) => unreachable!(),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
tokio::task::spawn(async move {
|
|
||||||
loop {
|
|
||||||
for ds in datasets.iter() {
|
|
||||||
match refresh_dataset(ds, jobs).await {
|
|
||||||
Ok(x) => x,
|
|
||||||
Err(error) => {
|
|
||||||
error!(
|
|
||||||
message = "Error while refreshing dataset",
|
|
||||||
dataset = ds.config.dataset.name.as_str(),
|
|
||||||
?error
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tokio::time::sleep(Duration::from_secs(10)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokio::time::sleep(Duration::from_secs(delay as u64)).await;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let bearer = BearerToken(ctx.config.api_token.clone().map(Arc::new));
|
|
||||||
|
|
||||||
let mut router = Router::new();
|
|
||||||
for d in datasets.iter() {
|
|
||||||
let prefix = format!("/{}", d.config.dataset.name);
|
|
||||||
router = router.merge(pile_serve::router_prefix(
|
|
||||||
d.clone(),
|
|
||||||
!self.no_docs,
|
|
||||||
Some(&prefix),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
router = router.merge(
|
|
||||||
Router::new()
|
|
||||||
.route("/datasets", get(list_datasets))
|
|
||||||
.with_state(datasets.clone()),
|
|
||||||
);
|
|
||||||
|
|
||||||
if !self.no_docs {
|
|
||||||
let docs_path = "/docs";
|
|
||||||
let docs = SwaggerUi::new(docs_path)
|
|
||||||
.url(format!("{}/openapi.json", docs_path), Api::openapi());
|
|
||||||
|
|
||||||
router = router.merge(docs);
|
|
||||||
}
|
|
||||||
|
|
||||||
router = router.layer(from_fn_with_state(bearer, bearer_auth_middleware));
|
|
||||||
|
|
||||||
let app = router.into_make_service_with_connect_info::<std::net::SocketAddr>();
|
|
||||||
|
|
||||||
let listener = match tokio::net::TcpListener::bind(ctx.config.server_addr.clone()).await {
|
|
||||||
Ok(x) => x,
|
|
||||||
Err(error) => {
|
|
||||||
match error.kind() {
|
|
||||||
std::io::ErrorKind::AddrInUse => {
|
|
||||||
error!(
|
|
||||||
message = "Cannot bind to address, already in use",
|
|
||||||
addr = ctx.config.server_addr
|
|
||||||
);
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
error!(message = "Error while starting server", ?error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match listener.local_addr() {
|
|
||||||
Ok(x) => info!("listening on http://{x}"),
|
|
||||||
Err(error) => {
|
|
||||||
error!(message = "Could not determine local address", ?error);
|
|
||||||
return Err(anyhow::Error::from(error).into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
match axum::serve(listener, app)
|
|
||||||
.with_graceful_shutdown(async move { flag.await_cancel().await })
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(_) => {}
|
|
||||||
Err(error) => {
|
|
||||||
error!(message = "Error while serving api", ?error);
|
|
||||||
return Err(anyhow::Error::from(error).into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Err(CancelableTaskError::Cancelled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: bearer auth middleware
|
|
||||||
//
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct BearerToken(Option<Arc<String>>);
|
|
||||||
|
|
||||||
async fn bearer_auth_middleware(
|
|
||||||
State(BearerToken(expected)): State<BearerToken>,
|
|
||||||
request: Request,
|
|
||||||
next: Next,
|
|
||||||
) -> Response {
|
|
||||||
let Some(expected) = expected else {
|
|
||||||
return next.run(request).await;
|
|
||||||
};
|
|
||||||
|
|
||||||
let authorized = request
|
|
||||||
.headers()
|
|
||||||
.get(axum::http::header::AUTHORIZATION)
|
|
||||||
.and_then(|v| v.to_str().ok())
|
|
||||||
.and_then(|v| v.strip_prefix("Bearer "))
|
|
||||||
.is_some_and(|token| token == expected.as_str());
|
|
||||||
|
|
||||||
if authorized {
|
|
||||||
next.run(request).await
|
|
||||||
} else {
|
|
||||||
StatusCode::UNAUTHORIZED.into_response()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: routes
|
|
||||||
//
|
|
||||||
|
|
||||||
#[derive(OpenApi)]
|
|
||||||
#[openapi(
|
|
||||||
tags(),
|
|
||||||
paths(list_datasets),
|
|
||||||
components(schemas(ListDatasetsResponse))
|
|
||||||
)]
|
|
||||||
pub(crate) struct Api;
|
|
||||||
|
|
||||||
#[derive(Serialize, ToSchema)]
|
|
||||||
pub struct ListDatasetsResponse {
|
|
||||||
name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// List all datasets served by this server
|
|
||||||
#[utoipa::path(
|
|
||||||
get,
|
|
||||||
path = "/datasets",
|
|
||||||
responses(
|
|
||||||
(status = 200, description = "List of datasets"),
|
|
||||||
(status = 500, description = "Internal server error"),
|
|
||||||
)
|
|
||||||
)]
|
|
||||||
pub async fn list_datasets(State(state): State<Arc<Vec<Arc<Datasets>>>>) -> Response {
|
|
||||||
let datasets = state
|
|
||||||
.iter()
|
|
||||||
.map(|x| ListDatasetsResponse {
|
|
||||||
name: x.config.dataset.name.clone().into(),
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
return (StatusCode::OK, Json(datasets)).into_response();
|
|
||||||
}
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
use serde::Deserialize;
|
|
||||||
use std::{num::NonZeroUsize, path::PathBuf};
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
use crate::config::{
|
|
||||||
env::load_env,
|
|
||||||
logging::{LoggingFormat, LoggingInitializer, LoggingPreset, LoggingTarget, LokiConfig},
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Note that the field of this struct are not capitalized.
|
|
||||||
/// Envy is case-insensitive, and expects Rust fields to be snake_case.
|
|
||||||
#[derive(Debug, Deserialize, Clone)]
|
|
||||||
pub struct PileServerConfig {
|
|
||||||
#[serde(flatten)]
|
|
||||||
pub loki: Option<LokiConfig>,
|
|
||||||
|
|
||||||
/// The logging level to run with
|
|
||||||
#[serde(default)]
|
|
||||||
pub loglevel: LoggingPreset,
|
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
pub logformat: LoggingFormat,
|
|
||||||
|
|
||||||
/// How many worker threads to use
|
|
||||||
pub threads: Option<NonZeroUsize>,
|
|
||||||
|
|
||||||
/// IP and port to bind to
|
|
||||||
/// Should look like `127.0.0.1:3030`
|
|
||||||
pub server_addr: String,
|
|
||||||
|
|
||||||
pub api_token: Option<String>,
|
|
||||||
pub workdir_root: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for PileServerConfig {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
loki: None,
|
|
||||||
loglevel: LoggingPreset::Debug,
|
|
||||||
logformat: LoggingFormat::Ansi,
|
|
||||||
threads: None,
|
|
||||||
server_addr: "0.0.0.0:3000".into(),
|
|
||||||
api_token: None,
|
|
||||||
workdir_root: "./.pile".into(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PileServerConfig {
|
|
||||||
pub fn load(with_env: bool, cli_log_level: LoggingPreset) -> Self {
|
|
||||||
let config = match with_env {
|
|
||||||
false => Self::default(),
|
|
||||||
true => {
|
|
||||||
let env = match load_env::<Self>() {
|
|
||||||
Ok(x) => x,
|
|
||||||
|
|
||||||
#[expect(clippy::print_stdout)]
|
|
||||||
Err(err) => {
|
|
||||||
println!("Error while loading .env: {err}");
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
env.get_config().clone()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
{
|
|
||||||
let res = LoggingInitializer {
|
|
||||||
app_name: "pile-server",
|
|
||||||
loki: config.loki.clone(),
|
|
||||||
preset: if with_env {
|
|
||||||
config.loglevel
|
|
||||||
} else {
|
|
||||||
cli_log_level
|
|
||||||
},
|
|
||||||
target: LoggingTarget::Stderr {
|
|
||||||
format: config.logformat,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
.initialize();
|
|
||||||
|
|
||||||
if let Err(e) = res {
|
|
||||||
#[expect(clippy::print_stderr)]
|
|
||||||
for e in e.chain() {
|
|
||||||
eprintln!("{e}");
|
|
||||||
}
|
|
||||||
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(message = "Config loaded", ?config);
|
|
||||||
|
|
||||||
return config;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn make_runtime(&self) -> tokio::runtime::Runtime {
|
|
||||||
let mut rt = tokio::runtime::Builder::new_multi_thread();
|
|
||||||
rt.enable_all();
|
|
||||||
if let Some(threads) = self.threads {
|
|
||||||
rt.worker_threads(threads.into());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[expect(clippy::unwrap_used)]
|
|
||||||
let rt = rt.build().unwrap();
|
|
||||||
|
|
||||||
return rt;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,108 +0,0 @@
|
|||||||
#![expect(dead_code)]
|
|
||||||
|
|
||||||
use serde::de::DeserializeOwned;
|
|
||||||
use std::{
|
|
||||||
collections::HashMap,
|
|
||||||
env::VarError,
|
|
||||||
io::ErrorKind,
|
|
||||||
path::{Path, PathBuf},
|
|
||||||
};
|
|
||||||
use thiserror::Error;
|
|
||||||
|
|
||||||
/// An error we might encounter when loading an env
|
|
||||||
#[derive(Debug, Error)]
|
|
||||||
pub enum EnvLoadError {
|
|
||||||
#[error("i/o error")]
|
|
||||||
IOError(#[from] std::io::Error),
|
|
||||||
|
|
||||||
#[error("varerror")]
|
|
||||||
VarError(#[from] VarError),
|
|
||||||
|
|
||||||
#[error("line parse error: `{on_line}` at char {at_char}")]
|
|
||||||
LineParse { on_line: String, at_char: usize },
|
|
||||||
|
|
||||||
#[error("other dotenvy error")]
|
|
||||||
Other(#[from] dotenvy::Error),
|
|
||||||
|
|
||||||
#[error("missing value {0}")]
|
|
||||||
MissingValue(String),
|
|
||||||
|
|
||||||
#[error("parse error: {0}")]
|
|
||||||
OtherParseError(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A successfully loaded config of type `T`, tagged with where it came from.
pub enum LoadedEnv<T> {
    /// We loaded config from `.env` and env vars.
    /// `path` is the `.env` file that was found.
    FoundFile { config: T, path: PathBuf },

    /// We could not find `.env` and only loaded env vars
    OnlyVars(T),
}
|
|
||||||
|
|
||||||
impl<T> LoadedEnv<T> {
|
|
||||||
pub fn get_config(&self) -> &T {
|
|
||||||
match self {
|
|
||||||
Self::FoundFile { config, .. } => config,
|
|
||||||
Self::OnlyVars(config) => config,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Load the configuration type `T` from the current environment,
|
|
||||||
/// including the `.env` if it exists.
|
|
||||||
#[expect(clippy::wildcard_enum_match_arm)]
|
|
||||||
pub fn load_env<T: DeserializeOwned>() -> Result<LoadedEnv<T>, EnvLoadError> {
|
|
||||||
let env_path = match dotenvy::dotenv() {
|
|
||||||
Ok(path) => Some(path),
|
|
||||||
|
|
||||||
Err(dotenvy::Error::Io(err)) => match err.kind() {
|
|
||||||
ErrorKind::NotFound => None,
|
|
||||||
_ => return Err(EnvLoadError::IOError(err)),
|
|
||||||
},
|
|
||||||
|
|
||||||
Err(dotenvy::Error::EnvVar(err)) => {
|
|
||||||
return Err(EnvLoadError::VarError(err));
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(dotenvy::Error::LineParse(on_line, at_char)) => {
|
|
||||||
return Err(EnvLoadError::LineParse { on_line, at_char });
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(err) => {
|
|
||||||
return Err(EnvLoadError::Other(err));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match envy::from_env::<T>() {
|
|
||||||
Ok(config) => {
|
|
||||||
if let Some(path) = env_path {
|
|
||||||
return Ok(LoadedEnv::FoundFile { path, config });
|
|
||||||
} else {
|
|
||||||
return Ok(LoadedEnv::OnlyVars(config));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(envy::Error::MissingValue(value)) => {
|
|
||||||
return Err(EnvLoadError::MissingValue(value.into()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(envy::Error::Custom(message)) => {
|
|
||||||
return Err(EnvLoadError::OtherParseError(message));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Load an .env file to a hashmap.
|
|
||||||
///
|
|
||||||
/// This function does not read the current env,
|
|
||||||
/// only parsing vars explicitly declared in the given file.
|
|
||||||
pub fn load_env_dict(p: impl AsRef<Path>) -> Result<HashMap<String, String>, EnvLoadError> {
|
|
||||||
let mut out = HashMap::new();
|
|
||||||
|
|
||||||
for item in dotenvy::from_filename_iter(p)? {
|
|
||||||
let (key, val) = item?;
|
|
||||||
out.insert(key, val);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(out);
|
|
||||||
}
|
|
||||||
@@ -1,13 +1,7 @@
|
|||||||
use anyhow::Result;
|
|
||||||
use clap::ValueEnum;
|
use clap::ValueEnum;
|
||||||
use indicatif::MultiProgress;
|
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use std::{fmt::Display, str::FromStr};
|
use std::{fmt::Display, str::FromStr};
|
||||||
use tracing_indicatif::IndicatifWriter;
|
use tracing_subscriber::EnvFilter;
|
||||||
use tracing_subscriber::{
|
|
||||||
EnvFilter, Layer, fmt::MakeWriter, layer::SubscriberExt, util::SubscriberInitExt,
|
|
||||||
};
|
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug, Default)]
|
||||||
pub enum LogLevel {
|
pub enum LogLevel {
|
||||||
@@ -38,7 +32,6 @@ pub enum LoggingPreset {
|
|||||||
Info,
|
Info,
|
||||||
Debug,
|
Debug,
|
||||||
Trace,
|
Trace,
|
||||||
Loki,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LoggingConfig {
|
pub struct LoggingConfig {
|
||||||
@@ -145,203 +138,6 @@ impl LoggingPreset {
|
|||||||
pile_dataset: LogLevel::Trace,
|
pile_dataset: LogLevel::Trace,
|
||||||
pile_toolbox: LogLevel::Trace,
|
pile_toolbox: LogLevel::Trace,
|
||||||
},
|
},
|
||||||
|
|
||||||
Self::Loki => LoggingConfig {
|
|
||||||
other: LogLevel::Warn,
|
|
||||||
extractor: LogLevel::Error,
|
|
||||||
|
|
||||||
pile: LogLevel::Trace,
|
|
||||||
pile_flac: LogLevel::Trace,
|
|
||||||
pile_config: LogLevel::Trace,
|
|
||||||
pile_dataset: LogLevel::Trace,
|
|
||||||
pile_toolbox: LogLevel::Trace,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// MARK: initializer
|
|
||||||
//
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Clone)]
|
|
||||||
pub struct LokiConfig {
|
|
||||||
pub loki_host: Url,
|
|
||||||
pub loki_user: String,
|
|
||||||
pub loki_pass: String,
|
|
||||||
pub loki_node_name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Where to print logs
|
|
||||||
#[expect(dead_code)]
|
|
||||||
pub enum LoggingTarget {
|
|
||||||
/// Send logs to stdout
|
|
||||||
Stdout { format: LoggingFormat },
|
|
||||||
|
|
||||||
/// Send logs to stderr
|
|
||||||
Stderr { format: LoggingFormat },
|
|
||||||
|
|
||||||
/// Send logs to an IndicatifWriter.
|
|
||||||
///
|
|
||||||
/// This is the same as Stderr { format: Ansi {color:true} },
|
|
||||||
/// but uses an indicatifwriter with the given multiprogress.
|
|
||||||
Indicatif(MultiProgress),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// How to print logs
|
|
||||||
#[derive(Debug, Clone, Copy, Deserialize, Default)]
|
|
||||||
pub enum LoggingFormat {
|
|
||||||
#[default]
|
|
||||||
Ansi,
|
|
||||||
AnsiNoColor,
|
|
||||||
Json,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct LoggingInitializer {
|
|
||||||
pub app_name: &'static str,
|
|
||||||
|
|
||||||
/// If `Some`, send logs to the given loki server
|
|
||||||
pub loki: Option<LokiConfig>,
|
|
||||||
|
|
||||||
/// Log filter for printed logs
|
|
||||||
pub preset: LoggingPreset,
|
|
||||||
|
|
||||||
/// Where to print logs
|
|
||||||
pub target: LoggingTarget,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LoggingInitializer {
|
|
||||||
pub fn initialize(self) -> Result<()> {
|
|
||||||
let mut stderr_ansi_layer = None;
|
|
||||||
let mut stderr_json_layer = None;
|
|
||||||
let mut stdout_ansi_layer = None;
|
|
||||||
let mut stdout_json_layer = None;
|
|
||||||
let mut indicatif_layer = None;
|
|
||||||
match self.target {
|
|
||||||
LoggingTarget::Stderr {
|
|
||||||
format: LoggingFormat::Ansi,
|
|
||||||
} => {
|
|
||||||
stderr_ansi_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(true)
|
|
||||||
.with_writer(std::io::stderr)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Stderr {
|
|
||||||
format: LoggingFormat::AnsiNoColor,
|
|
||||||
} => {
|
|
||||||
stderr_ansi_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(false)
|
|
||||||
.with_writer(std::io::stderr)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Stderr {
|
|
||||||
format: LoggingFormat::Json,
|
|
||||||
} => {
|
|
||||||
stderr_json_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(false)
|
|
||||||
.json()
|
|
||||||
.flatten_event(true)
|
|
||||||
.with_writer(std::io::stderr)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Stdout {
|
|
||||||
format: LoggingFormat::Ansi,
|
|
||||||
} => {
|
|
||||||
stdout_ansi_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(true)
|
|
||||||
.with_writer(std::io::stdout)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Stdout {
|
|
||||||
format: LoggingFormat::AnsiNoColor,
|
|
||||||
} => {
|
|
||||||
stdout_ansi_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(false)
|
|
||||||
.with_writer(std::io::stdout)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Stdout {
|
|
||||||
format: LoggingFormat::Json,
|
|
||||||
} => {
|
|
||||||
stdout_json_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(false)
|
|
||||||
.json()
|
|
||||||
.flatten_event(true)
|
|
||||||
.with_writer(std::io::stdout)
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
LoggingTarget::Indicatif(mp) => {
|
|
||||||
let writer: IndicatifWriter<tracing_indicatif::writer::Stderr> =
|
|
||||||
IndicatifWriter::new(mp);
|
|
||||||
|
|
||||||
indicatif_layer = Some(
|
|
||||||
tracing_subscriber::fmt::Layer::default()
|
|
||||||
.without_time()
|
|
||||||
.with_ansi(true)
|
|
||||||
.with_writer(writer.make_writer())
|
|
||||||
.with_filter::<EnvFilter>(self.preset.get_config().into()),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let loki_layer = {
|
|
||||||
if let Some(cfg) = self.loki {
|
|
||||||
use anyhow::Context;
|
|
||||||
use base64::{Engine, prelude::BASE64_STANDARD};
|
|
||||||
|
|
||||||
let basic_auth = format!("{}:{}", cfg.loki_user, cfg.loki_pass);
|
|
||||||
let encoded_basic_auth = BASE64_STANDARD.encode(basic_auth.as_bytes());
|
|
||||||
|
|
||||||
let (layer, task) = tracing_loki::builder()
|
|
||||||
.label("node_name", cfg.loki_node_name)
|
|
||||||
.context("while building loki node_name label")?
|
|
||||||
.label("app", self.app_name)
|
|
||||||
.context("while building loki app label")?
|
|
||||||
.http_header("Authorization", format!("Basic {encoded_basic_auth}"))
|
|
||||||
.context("while building loki header")?
|
|
||||||
.build_url(cfg.loki_host)
|
|
||||||
.context("while building loki layer")?;
|
|
||||||
|
|
||||||
tokio::spawn(task);
|
|
||||||
Some(layer.with_filter::<EnvFilter>(LoggingPreset::Loki.get_config().into()))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
tracing_subscriber::registry()
|
|
||||||
.with(loki_layer)
|
|
||||||
.with(stdout_ansi_layer)
|
|
||||||
.with(stdout_json_layer)
|
|
||||||
.with(stderr_ansi_layer)
|
|
||||||
.with(stderr_json_layer)
|
|
||||||
.with(indicatif_layer)
|
|
||||||
.init();
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,2 @@
|
|||||||
pub mod env;
|
mod logging;
|
||||||
pub mod logging;
|
pub use logging::*;
|
||||||
|
|
||||||
#[expect(clippy::module_inception)]
|
|
||||||
mod config;
|
|
||||||
pub use config::*;
|
|
||||||
|
|||||||
@@ -1,13 +1,15 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
use config::LoggingPreset;
|
||||||
use indicatif::MultiProgress;
|
use indicatif::MultiProgress;
|
||||||
use pile_toolbox::cancelabletask::CancelableTaskResult;
|
use pile_toolbox::cancelabletask::CancelableTaskResult;
|
||||||
use std::process::ExitCode;
|
use std::process::ExitCode;
|
||||||
use tracing::{error, warn};
|
use tracing::{error, warn};
|
||||||
|
use tracing_indicatif::{IndicatifWriter, writer::Stderr};
|
||||||
|
use tracing_subscriber::fmt::MakeWriter;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
command::{CliCmd, CliCmdDispatch, SubCommand},
|
command::{CliCmd, CliCmdDispatch, SubCommand},
|
||||||
config::{PileServerConfig, logging::LoggingPreset},
|
|
||||||
signal::start_signal_task,
|
signal::start_signal_task,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -33,12 +35,19 @@ struct Cli {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct GlobalContext {
|
pub struct GlobalContext {
|
||||||
pub mp: MultiProgress,
|
#[expect(dead_code)]
|
||||||
pub config: PileServerConfig,
|
mp: MultiProgress,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> ExitCode {
|
fn main() -> ExitCode {
|
||||||
match main_inner() {
|
#[expect(clippy::unwrap_used)]
|
||||||
|
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.enable_all()
|
||||||
|
.worker_threads(10)
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
match rt.block_on(main_inner()) {
|
||||||
Ok(code) => {
|
Ok(code) => {
|
||||||
std::process::exit(code);
|
std::process::exit(code);
|
||||||
}
|
}
|
||||||
@@ -51,7 +60,7 @@ fn main() -> ExitCode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main_inner() -> Result<i32> {
|
async fn main_inner() -> Result<i32> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
|
|
||||||
let level_i: i16 = cli.v as i16 - cli.q as i16;
|
let level_i: i16 = cli.v as i16 - cli.q as i16;
|
||||||
@@ -72,32 +81,36 @@ fn main_inner() -> Result<i32> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mp = MultiProgress::new();
|
let mp = MultiProgress::new();
|
||||||
let config = PileServerConfig::load(matches!(cli.cmd, SubCommand::Server { .. }), level);
|
let writer: IndicatifWriter<Stderr> = IndicatifWriter::new(mp.clone());
|
||||||
let rt = config.make_runtime();
|
|
||||||
|
|
||||||
let ctx = GlobalContext { mp, config };
|
tracing_subscriber::fmt()
|
||||||
|
.with_env_filter(level.get_config())
|
||||||
|
.without_time()
|
||||||
|
.with_ansi(true)
|
||||||
|
.with_writer(writer.make_writer())
|
||||||
|
.init();
|
||||||
|
|
||||||
rt.block_on(async {
|
let ctx = GlobalContext { mp };
|
||||||
let task = cli.cmd.start(ctx).context("while starting task")?;
|
|
||||||
let signal_task = start_signal_task(task.flag().clone());
|
|
||||||
|
|
||||||
match task.join().await {
|
let task = cli.cmd.start(ctx).context("while starting task")?;
|
||||||
Ok(CancelableTaskResult::Finished(Ok(code))) => Ok(code),
|
let signal_task = start_signal_task(task.flag().clone());
|
||||||
Ok(CancelableTaskResult::Cancelled) => {
|
|
||||||
signal_task.abort();
|
|
||||||
warn!("Task cancelled successfully");
|
|
||||||
Ok(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(err) => {
|
match task.join().await {
|
||||||
signal_task.abort();
|
Ok(CancelableTaskResult::Finished(Ok(code))) => Ok(code),
|
||||||
Err(err).context("while joining task")
|
Ok(CancelableTaskResult::Cancelled) => {
|
||||||
}
|
signal_task.abort();
|
||||||
|
warn!("Task cancelled successfully");
|
||||||
Ok(CancelableTaskResult::Finished(Err(err))) => {
|
Ok(1)
|
||||||
signal_task.abort();
|
|
||||||
Err(err).context("while running task")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
})
|
|
||||||
|
Err(err) => {
|
||||||
|
signal_task.abort();
|
||||||
|
Err(err).context("while joining task")
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(CancelableTaskResult::Finished(Err(err))) => {
|
||||||
|
signal_task.abort();
|
||||||
|
Err(err).context("while running task")
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,10 +42,7 @@ pileRustPlatform.buildRustPackage {
|
|||||||
|
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
wrapProgram $out/bin/pile \
|
wrapProgram $out/bin/pile \
|
||||||
${if pkgs.stdenv.isDarwin then
|
--prefix LD_LIBRARY_PATH : ${pkgs.lib.makeLibraryPath [ pkgs.pdfium-binaries ]}
|
||||||
"--prefix DYLD_LIBRARY_PATH : ${pkgs.lib.makeLibraryPath [ pkgs.pdfium-binaries ]}"
|
|
||||||
else
|
|
||||||
"--prefix LD_LIBRARY_PATH : ${pkgs.lib.makeLibraryPath [ pkgs.pdfium-binaries ]}"}
|
|
||||||
'';
|
'';
|
||||||
|
|
||||||
meta = {
|
meta = {
|
||||||
|
|||||||
@@ -1,24 +0,0 @@
|
|||||||
# Example Compose deployment of the pile server.
services:
  pile:
    # Uses a locally available `pile` image; the registry image is kept
    # here as a commented-out alternative.
    #image: git.betalupi.com/mark/pile:latest
    image: pile
    container_name: pile
    restart: unless-stopped

    # Must match the port in SERVER_ADDR below.
    ports:
      - 7100:7100
    volumes:
      # Dataset directory (read-only); also holds the pile.toml passed to `command`.
      - "./x.ignore/books:/data/books:ro"
      # Working directory, see WORKDIR_ROOT.
      - "./pile:/workdir"

    environment:
      SERVER_ADDR: "0.0.0.0:7100"
      WORKDIR_ROOT: "/workdir"
      # Placeholder value; replace before exposing the server.
      API_TOKEN: "pile_token"
      THREADS: 8
      # Optional Loki log shipping.
      #LOKI_HOST: "http://loki:3100"
      #LOKI_USER: "user"
      #LOKI_PASS: "pass"
      #LOKI_NODE_NAME: "pile"

    command: "pile server -c /data/books/pile.toml"
|
|
||||||
Reference in New Issue
Block a user