Compare commits

...

61 Commits

Author SHA1 Message Date
450ea7aa86 Typos
Some checks failed
CI / Typos (push) Successful in 30s
CI / Clippy (push) Successful in 1m39s
CI / Build and test (push) Successful in 2m30s
Docker / build-and-push (push) Failing after 2m15s
CI / Build and test (all features) (push) Successful in 6m16s
2026-04-15 09:29:06 -07:00
3bc66ddc48 Split server into crate
Some checks failed
CI / Typos (push) Failing after 17s
CI / Build and test (push) Successful in 1m41s
CI / Clippy (push) Has been cancelled
CI / Build and test (all features) (push) Has been cancelled
Docker / build-and-push (push) Has been cancelled
2026-04-15 09:23:52 -07:00
251d130987 Tweak status codes
Some checks failed
CI / Clippy (push) Successful in 1m2s
CI / Typos (push) Failing after 1m10s
Docker / build-and-push (push) Failing after 2m21s
CI / Build and test (all features) (push) Successful in 3m27s
CI / Build and test (push) Successful in 6m7s
2026-04-03 12:35:01 -07:00
0281a33f86 Client tweaks
Some checks failed
CI / Typos (push) Failing after 20s
CI / Build and test (push) Successful in 1m47s
CI / Clippy (push) Successful in 2m25s
Docker / build-and-push (push) Successful in 4m4s
CI / Build and test (all features) (push) Successful in 6m44s
2026-04-03 09:01:51 -07:00
d3ab2684f4 Auto-refresh
Some checks failed
CI / Typos (push) Failing after 31s
CI / Clippy (push) Successful in 58s
Docker / build-and-push (push) Failing after 3m13s
CI / Build and test (all features) (push) Successful in 3m33s
CI / Build and test (push) Successful in 4m3s
2026-04-03 08:57:43 -07:00
4d4e9c93a2 Add hash extractor 2026-04-03 08:57:37 -07:00
e6e340d082 Add --schema arg to list command
Some checks failed
CI / Typos (push) Failing after 1m3s
CI / Build and test (push) Successful in 2m39s
CI / Clippy (push) Successful in 3m37s
Docker / build-and-push (push) Successful in 4m49s
CI / Build and test (all features) (push) Successful in 7m21s
2026-03-28 11:34:57 -07:00
8b4dfb1a1a Add --schema arg to fields command 2026-03-28 11:29:19 -07:00
60dc755561 Extract id3 covers 2026-03-28 11:25:30 -07:00
5527b61d39 Refactor grouping 2026-03-28 11:20:16 -07:00
9967e066bb Tweak schema api
All checks were successful
CI / Typos (push) Successful in 32s
CI / Clippy (push) Successful in 1m15s
CI / Build and test (push) Successful in 1m42s
CI / Build and test (all features) (push) Successful in 6m9s
Docker / build-and-push (push) Successful in 3m23s
2026-03-27 03:13:15 -07:00
336480469c Add hidden attribute
All checks were successful
CI / Typos (push) Successful in 19s
CI / Build and test (push) Successful in 1m50s
CI / Clippy (push) Successful in 4m5s
Docker / build-and-push (push) Successful in 4m9s
CI / Build and test (all features) (push) Successful in 8m5s
2026-03-26 21:51:59 -07:00
5807733e62 Configure server though env
All checks were successful
CI / Typos (push) Successful in 21s
CI / Build and test (push) Successful in 2m33s
CI / Clippy (push) Successful in 3m12s
Docker / build-and-push (push) Successful in 5m35s
CI / Build and test (all features) (push) Successful in 7m1s
2026-03-26 20:46:51 -07:00
256af68382 Schema endpoint 2026-03-26 20:46:49 -07:00
fac300431a Docker
All checks were successful
CI / Typos (push) Successful in 16s
CI / Build and test (push) Successful in 1m43s
CI / Clippy (push) Successful in 2m42s
Docker / build-and-push (push) Successful in 4m28s
CI / Build and test (all features) (push) Successful in 6m28s
2026-03-26 19:49:36 -07:00
47a0adbaff Workdir config
All checks were successful
CI / Typos (push) Successful in 23s
CI / Build and test (push) Successful in 1m53s
CI / Clippy (push) Successful in 4m4s
Docker / build-and-push (push) Successful in 4m28s
CI / Build and test (all features) (push) Successful in 7m2s
2026-03-26 19:31:40 -07:00
80f4ebdbe6 Remove S3 + encryption
All checks were successful
CI / Typos (push) Successful in 20s
CI / Clippy (push) Successful in 2m44s
CI / Build and test (push) Successful in 3m10s
Docker / build-and-push (push) Successful in 5m6s
CI / Build and test (all features) (push) Successful in 6m51s
2026-03-26 14:37:18 -07:00
ec7326a55e Image transformation
Some checks failed
CI / Typos (push) Successful in 20s
CI / Build and test (push) Failing after 2m41s
CI / Clippy (push) Successful in 3m23s
CI / Build and test (all features) (push) Failing after 10m11s
Docker / build-and-push (push) Failing after 1m1s
2026-03-26 14:03:30 -07:00
599c38ac26 Docker 2026-03-26 14:03:27 -07:00
f9a39d5ff9 Add name parameter 2026-03-26 09:58:32 -07:00
5b0953d250 Add field endpoint
Some checks failed
CI / Typos (push) Successful in 17s
CI / Build and test (push) Failing after 2m37s
CI / Clippy (push) Successful in 3m29s
CI / Build and test (all features) (push) Has been cancelled
2026-03-24 09:44:01 -07:00
f7ea25f059 Return len
Some checks failed
CI / Typos (push) Successful in 31s
CI / Clippy (push) Successful in 1m17s
CI / Build and test (all features) (push) Failing after 5m32s
CI / Build and test (push) Failing after 7m49s
2026-03-24 03:43:26 -07:00
6f267880c8 Many field paths
Some checks failed
CI / Typos (push) Successful in 22s
CI / Build and test (push) Failing after 2m40s
CI / Clippy (push) Successful in 3m28s
CI / Build and test (all features) (push) Failing after 11m10s
2026-03-23 22:41:07 -07:00
d95ebeaba0 Extract epub covers 2026-03-23 22:40:32 -07:00
0792b2f2c6 Proxy router
Some checks failed
CI / Typos (push) Successful in 34s
CI / Clippy (push) Successful in 1m17s
CI / Build and test (all features) (push) Failing after 5m15s
CI / Build and test (push) Failing after 10m29s
2026-03-23 22:25:23 -07:00
e83c522e78 Add server client
Some checks failed
CI / Typos (push) Successful in 24s
CI / Clippy (push) Successful in 1m16s
CI / Build and test (all features) (push) Failing after 5m5s
CI / Build and test (push) Failing after 6m55s
2026-03-23 21:53:39 -07:00
dfcb4b0a24 Add server subcommand 2026-03-23 21:43:18 -07:00
76d38d48c5 Reorganize S3 clients 2026-03-23 21:09:22 -07:00
5da81679be Configure content disposition 2026-03-23 16:26:28 -07:00
9008a248c1 Dataset length 2026-03-23 14:48:49 -07:00
4737acbcf4 Add S3 encryption
All checks were successful
CI / Typos (push) Successful in 19s
CI / Build and test (push) Successful in 2m36s
CI / Clippy (push) Successful in 3m33s
CI / Build and test (all features) (push) Successful in 8m52s
2026-03-21 21:05:48 -07:00
39f3c7707b Fix S3 source 2026-03-21 21:05:41 -07:00
c2f4b12e35 Transparent Nulls 2026-03-21 10:29:01 -07:00
302d2acef3 Slice arrays 2026-03-21 10:20:41 -07:00
7caf2553bc Get fields in item cmd 2026-03-21 10:20:41 -07:00
44466f16cf Tweak fs extractor 2026-03-21 10:20:39 -07:00
48262bab48 Exclude large strings 2026-03-21 09:32:22 -07:00
b6cb5870b4 Add regex extractor 2026-03-21 09:27:12 -07:00
ed169b3ab4 Add item subcommand 2026-03-21 08:49:48 -07:00
2f2eb323d5 Add text extractor 2026-03-18 20:47:29 -07:00
915d10bd0e Implement hash() for S3
All checks were successful
CI / Typos (push) Successful in 20s
CI / Build and test (push) Successful in 2m32s
CI / Clippy (push) Successful in 6m3s
CI / Build and test (all features) (push) Successful in 15m23s
2026-03-16 22:31:36 -07:00
1c1c47f5b2 Cancel fix 2026-03-16 22:31:36 -07:00
053459f340 Refactor sidecars 2026-03-16 22:31:33 -07:00
f2f5726d7b FLAC error edits 2026-03-16 19:17:51 -07:00
b1f76b0741 Cross-platform builds 2026-03-16 19:03:14 -07:00
583a1aa6b1 Flatten arrays for FTS index
Some checks failed
CI / Typos (push) Successful in 31s
CI / Clippy (push) Failing after 1m13s
CI / Build and test (all features) (push) Successful in 4m22s
CI / Build and test (push) Successful in 6m2s
2026-03-16 09:56:48 -07:00
2a2d5af36c Sidecar fixes 2026-03-16 09:56:48 -07:00
1d90306408 Add json extractor 2026-03-16 09:56:48 -07:00
979fbb9b0d Filter by mime 2026-03-16 09:56:48 -07:00
8041fc7531 upload subcommand 2026-03-16 09:56:48 -07:00
4ce563ae80 Consistent paths, disable sources 2026-03-16 09:56:48 -07:00
26a428dedc Refactor errors 2026-03-16 09:56:48 -07:00
60483dd53d FLAC image extractor tweak 2026-03-16 09:56:48 -07:00
eea01616a3 Improve arg parsing 2026-03-16 09:56:48 -07:00
2af318c0ec Fix tests 2026-03-16 09:56:48 -07:00
8dd617a24d Bump versions 2026-03-16 09:56:48 -07:00
6c7b23a9e3 More string extractors 2026-03-16 09:56:48 -07:00
08578c7655 Add nix files 2026-03-16 09:56:48 -07:00
d138b6ac95 /item range requests 2026-03-16 09:56:48 -07:00
24428f956c Stream items in /item 2026-03-16 09:56:48 -07:00
078801be40 Extractor rewrite 2026-03-16 09:56:46 -07:00
116 changed files with 7130 additions and 3784 deletions

13
.editorconfig Normal file
View File

@@ -0,0 +1,13 @@
# Top-most EditorConfig file; editors stop searching parent directories here.
root = true

# Defaults for every file in the repository.
[*]
indent_style = tab
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

# YAML forbids tab indentation; use 2-space indents instead.
[*.yml]
indent_style = space
indent_size = 2

View File

@@ -0,0 +1,30 @@
# Build the container image and push it to the Gitea registry
# on every push to `main`.
name: Docker

on:
  push:
    branches:
      - main

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Log in to Gitea container registry
        uses: docker/login-action@v3
        with:
          registry: git.betalupi.com
          username: ${{ gitea.actor }}
          password: ${{ secrets.DEPLOY_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Layer cache is stored in the registry alongside the image,
      # so CI runs can reuse it across runners.
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: git.betalupi.com/mark/pile:latest
          cache-from: type=registry,ref=git.betalupi.com/mark/pile:cache
          cache-to: type=registry,ref=git.betalupi.com/mark/pile:cache,mode=max

1262
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ resolver = "2"
[workspace.package]
rust-version = "1.94.0"
edition = "2024"
version = "0.0.1"
version = "0.0.2"
[workspace.lints.rust]
unused_import_braces = "deny"
@@ -57,6 +57,7 @@ unimplemented = "deny"
unwrap_used = "warn"
expect_used = "warn"
type_complexity = "allow"
len_without_is_empty = "allow"
#
# MARK: dependencies
@@ -67,30 +68,35 @@ pile-toolbox = { path = "crates/pile-toolbox" }
pile-config = { path = "crates/pile-config" }
pile-flac = { path = "crates/pile-flac" }
pile-dataset = { path = "crates/pile-dataset" }
pile-value = { path = "crates/pile-value" }
pile-io = { path = "crates/pile-io" }
pile-client = { path = "crates/pile-client" }
pile-serve = { path = "crates/pile-serve" }
# Clients & servers
# MARK: Clients & servers
tantivy = "0.25.0"
servable = { version = "0.0.7", features = ["image"] }
axum = { version = "0.8.8", features = ["macros", "multipart"] }
utoipa = { version = "5.4.0", features = [
"axum_extras",
"chrono",
"url",
"uuid",
"axum_extras",
"chrono",
"url",
"uuid",
] }
utoipa-swagger-ui = { version = "9.0.2", features = [
"axum",
"debug-embed",
"vendored",
"axum",
"debug-embed",
"vendored",
] }
reqwest = { version = "0.12", features = ["blocking", "json", "stream"] }
tracing-loki = "0.2.6"
# Async & Parallelism
# MARK: Async & Parallelism
tokio = { version = "1.49.0", features = ["full"] }
tokio-stream = "0.1"
async-trait = "0.1"
aws-sdk-s3 = "1"
aws-config = "1"
# CLI & logging
# MARK: CLI & logging
tracing = "0.1.44"
tracing-subscriber = { version = "0.3.22", features = ["env-filter", "json"] }
indicatif = { version = "0.18.4", features = ["improved_unicode"] }
@@ -98,16 +104,21 @@ tracing-indicatif = "0.3.14"
anstyle = "1.0.13"
clap = { version = "4.5.60", features = ["derive"] }
# Serialization & formats
# MARK: Serialization & formats
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.149"
base64 = "0.22.1"
bytes = "1"
toml = "1.0.3"
toml_edit = "0.25.4"
sha2 = "0.11.0-rc.5"
sha1 = "0.10"
md5 = "0.7"
blake3 = "1.8.3"
dotenvy = "0.15.7"
envy = "0.4.2"
# Extractors
# MARK: Extractors
pdf = "0.10.0"
id3 = "1.16.4"
epub = "1.2.2"
@@ -115,7 +126,7 @@ kamadak-exif = "0.6.1"
pdfium-render = "0.8"
image = { version = "0.25", default-features = false, features = ["png"] }
# Misc helpers
# MARK: Misc helpers
thiserror = "2.0.18"
anyhow = "1.0.102"
itertools = "0.14.0"
@@ -126,6 +137,9 @@ mime = "0.3.17"
mime_guess = "2.0.5"
paste = "1.0.15"
smartstring = "1.0.1"
regex = "1"
chrono = "0.4.43"
parking_lot = "0.12.5"
rayon = "1.11.0"
percent-encoding = "2"
url = { version = "2.5.8", features = ["serde"] }

38
Dockerfile Normal file
View File

@@ -0,0 +1,38 @@
FROM rust:1.94-bookworm AS base

#
# MARK: Build
#

FROM base AS build

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates wget unzip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app/rust
COPY . .

RUN cargo build --release --workspace
RUN cargo test --release --workspace

#
# MARK: Release
#

FROM debian:bookworm AS deploy
WORKDIR /app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# The pile binary is shipped next to libpdfium.so, which it loads at runtime.
COPY --from=build \
    /app/rust/target/release/pile \
    /app/rust/target/release/libpdfium.so \
    /app/bin/

ENV PATH="/app/bin:$PATH"
ENV RUST_BACKTRACE=full

# Clear any inherited entrypoint so the command is supplied at `docker run`.
# NOTE: the previous `ENTRYPOINT [""]` tries to exec an empty string and fails
# to start; the empty-array form is the documented way to reset an entrypoint.
ENTRYPOINT []

26
bump-version.sh Executable file
View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Bump the workspace version in Cargo.toml and default.nix in lockstep.
set -euo pipefail

CARGO_TOML="$(dirname "$0")/Cargo.toml"
DEFAULT_NIX="$(dirname "$0")/default.nix"

# Read current version from workspace Cargo.toml
current=$(grep '^version = ' "$CARGO_TOML" | head -1 | sed 's/version = "\(.*\)"/\1/')
echo "Current version: $current"

read -rp "New version: " new
if [[ -z "$new" ]]; then
    echo "No version entered. Aborting." >&2
    exit 1
fi

# Escape regex metacharacters in the current version so the sed patterns
# match it literally (previously "0.1.0" could also match e.g. "0a1b0").
current_re=$(printf '%s' "$current" | sed 's/[.[\*^$]/\\&/g')

# Update Cargo.toml workspace version
sed -i "s/^version = \"$current_re\"/version = \"$new\"/" "$CARGO_TOML"

# Update default.nix version field
sed -i "s/version = \"$current_re\";/version = \"$new\";/" "$DEFAULT_NIX"

echo "Bumped $current -> $new in:"
echo "  $CARGO_TOML"
echo "  $DEFAULT_NIX"

View File

@@ -0,0 +1,18 @@
[package]
name = "pile-client"
# Version, MSRV, and edition are inherited from the workspace root.
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }

[lints]
workspace = true

[dependencies]
# Shared request/response types, re-exported by this crate.
pile-serve = { workspace = true }
reqwest = { workspace = true }
serde = { workspace = true }
thiserror = { workspace = true }
bytes = { workspace = true }
axum = { workspace = true }
tracing = { workspace = true }

View File

@@ -0,0 +1,339 @@
use axum::{
Router, body::Body as AxumBody, extract::State, response::Response as AxumResponse,
routing::any,
};
use bytes::Bytes;
use reqwest::{Client, StatusCode, header};
use serde::Deserialize;
use thiserror::Error;
use tracing::{trace, warn};
pub use pile_serve::{
ApiValue, FieldSpec, FieldsResponse, ItemsResponse, LookupRequest, LookupResponse,
SchemaResponse,
};
/// Errors returned by [`PileClient`] and [`DatasetClient`] calls.
#[derive(Debug, Error)]
pub enum ClientError {
    /// The bearer token could not be encoded as an HTTP header value.
    #[error("invalid bearer token")]
    InvalidToken,

    /// The server answered with a non-success status.
    /// `body` is the raw response text (empty if it could not be read).
    #[error("HTTP {status}: {body}")]
    Http { status: StatusCode, body: String },

    /// Transport-level failure from the underlying HTTP client.
    #[error(transparent)]
    Reqwest(#[from] reqwest::Error),
}
/// One entry of the `GET /datasets` listing.
#[derive(Debug, Deserialize)]
pub struct DatasetInfo {
    // The dataset's name; used as the URL path prefix for dataset routes.
    pub name: String,
}

/// Raw field response: the content-type and body bytes as returned by the server.
pub struct FieldResponse {
    // Upstream `Content-Type` header value; callers receive
    // `application/octet-stream` when the header is missing or unreadable.
    pub content_type: String,
    // The full response body, buffered in memory.
    pub data: Bytes,
}
//
// MARK: PileClient
//
/// A client for a pile server. Use [`PileClient::dataset`] to get a dataset-scoped client.
pub struct PileClient {
    // Server base URL; paths are appended as `/{...}`, so it should not
    // end with a slash.
    base_url: String,
    // Shared HTTP client; the bearer token (if any) is installed as a
    // default Authorization header.
    client: Client,
    // Token kept separately so dataset-scoped proxies can re-attach it.
    token: Option<String>,
}
impl PileClient {
    /// Builds a client for the server at `base_url`.
    ///
    /// When `token` is given, it is installed as a default
    /// `Authorization: Bearer …` header on every request.
    ///
    /// # Errors
    /// Returns [`ClientError::InvalidToken`] if the token cannot be encoded
    /// as an HTTP header value, or [`ClientError::Reqwest`] if the HTTP
    /// client cannot be constructed.
    pub fn new(base_url: impl Into<String>, token: Option<&str>) -> Result<Self, ClientError> {
        let mut default_headers = header::HeaderMap::new();
        match token {
            Some(t) => {
                let bearer = format!("Bearer {t}");
                let header_value = header::HeaderValue::from_str(&bearer)
                    .map_err(|_| ClientError::InvalidToken)?;
                default_headers.insert(header::AUTHORIZATION, header_value);
            }
            None => {}
        }

        let http = Client::builder()
            .default_headers(default_headers)
            .build()
            .map_err(ClientError::Reqwest)?;

        Ok(Self {
            base_url: base_url.into(),
            client: http,
            token: token.map(str::to_owned),
        })
    }

    /// Returns a client scoped to a specific dataset (i.e. `/{name}/...`).
    pub fn dataset(&self, name: &str) -> DatasetClient {
        let scoped_url = format!("{}/{name}", self.base_url);
        DatasetClient {
            base_url: scoped_url,
            client: self.client.clone(),
            token: self.token.clone(),
        }
    }

    /// `GET /datasets` — list all datasets served by this server.
    pub async fn list_datasets(&self) -> Result<Vec<DatasetInfo>, ClientError> {
        let url = format!("{}/datasets", self.base_url);
        trace!(url, "GET /datasets");
        let response = self.client.get(url).send().await?;
        check_status(response).await?.json().await.map_err(Into::into)
    }
}
//
// MARK: DatasetClient
//
/// A client scoped to a single dataset on the server.
pub struct DatasetClient {
    // Server base URL including the dataset path (`{server}/{dataset}`).
    base_url: String,
    // Shared HTTP client inherited from the parent [`PileClient`].
    client: Client,
    // Bearer token; forwarded by the proxy router built in `proxy_router`.
    token: Option<String>,
}
impl DatasetClient {
    /// `POST /lookup` — full-text search within this dataset.
    ///
    /// `limit`, when given, caps the number of returned hits.
    pub async fn lookup(
        &self,
        query: impl Into<String>,
        limit: Option<usize>,
    ) -> Result<LookupResponse, ClientError> {
        let body = LookupRequest {
            query: query.into(),
            limit,
        };
        let url = format!("{}/lookup", self.base_url);
        trace!(url, "POST /lookup");
        let resp = self.client.post(url).json(&body).send().await?;
        check_status(resp).await?.json().await.map_err(Into::into)
    }

    /// `GET /extract` — extract a field from an item by object path (e.g. `$.flac.title`).
    pub async fn get_extract(
        &self,
        source: &str,
        key: &str,
        path: &str,
    ) -> Result<FieldResponse, ClientError> {
        let url = format!("{}/extract", self.base_url);
        trace!(url, source, key, path, "GET /extract");
        let resp = self
            .client
            .get(url)
            .query(&[("source", source), ("key", key), ("path", path)])
            .send()
            .await?;
        Self::read_field_response(check_status(resp).await?).await
    }

    /// `GET /schema/{field}` — get a single schema field value from an item.
    pub async fn schema_field(
        &self,
        source: &str,
        key: &str,
        field: &str,
    ) -> Result<FieldResponse, ClientError> {
        let url = format!("{}/schema/{field}", self.base_url);
        trace!(url, source, key, field, "GET /schema/{field}");
        let resp = self
            .client
            .get(url)
            .query(&[("source", source), ("key", key)])
            .send()
            .await?;
        Self::read_field_response(check_status(resp).await?).await
    }

    /// `GET /schema` — get all schema field values for a single item.
    ///
    /// `hidden` controls whether fields marked hidden are included.
    pub async fn schema(
        &self,
        source: &str,
        key: &str,
        hidden: bool,
    ) -> Result<SchemaResponse, ClientError> {
        let url = format!("{}/schema", self.base_url);
        trace!(url, source, key, hidden, "GET /schema");
        // `hidden` is a bool, so it needs its own `query` call: the string
        // parameters and the bool cannot share one homogeneous slice.
        let resp = self
            .client
            .get(url)
            .query(&[("source", source), ("key", key)])
            .query(&[("hidden", hidden)])
            .send()
            .await?;
        check_status(resp).await?.json().await.map_err(Into::into)
    }

    /// `GET /config/schema` — retrieve this dataset's schema spec.
    pub async fn config_schema(&self) -> Result<FieldsResponse, ClientError> {
        let url = format!("{}/config/schema", self.base_url);
        trace!(url, "GET /config/schema");
        let resp = self.client.get(url).send().await?;
        check_status(resp).await?.json().await.map_err(Into::into)
    }

    /// `GET /items` — paginate over all items in this dataset, ordered by (source, key).
    pub async fn list_items(
        &self,
        offset: usize,
        limit: usize,
    ) -> Result<ItemsResponse, ClientError> {
        let url = format!("{}/items", self.base_url);
        trace!(url, offset, limit, "GET /items");
        let resp = self
            .client
            .get(url)
            .query(&[("offset", offset), ("limit", limit)])
            .send()
            .await?;
        check_status(resp).await?.json().await.map_err(Into::into)
    }

    /// Returns an axum [`Router`] that proxies all requests to this dataset's
    /// endpoints on the remote pile server, streaming responses without buffering.
    /// All headers are forwarded; hop-by-hop headers are stripped.
    pub fn proxy_router(&self) -> Router {
        let state = ProxyState {
            base_url: self.base_url.clone(),
            client: self.client.clone(),
            token: self.token.clone(),
        };
        Router::new()
            .route("/", any(proxy_handler))
            .route("/{*path}", any(proxy_handler))
            .with_state(state)
    }

    /// Reads a successful response into a [`FieldResponse`], defaulting the
    /// content type to `application/octet-stream` when the header is missing
    /// or not valid UTF-8. Shared by `get_extract` and `schema_field`.
    async fn read_field_response(resp: reqwest::Response) -> Result<FieldResponse, ClientError> {
        let content_type = resp
            .headers()
            .get(header::CONTENT_TYPE)
            .and_then(|v| v.to_str().ok())
            .unwrap_or("application/octet-stream")
            .to_owned();
        let data = resp.bytes().await?;
        Ok(FieldResponse { content_type, data })
    }
}
//
// MARK: Proxy
//
/// State shared by every proxied route: where to forward to and how to authenticate.
#[derive(Clone)]
struct ProxyState {
    // Upstream base URL; already includes the dataset path.
    base_url: String,
    // HTTP client used for upstream requests.
    client: Client,
    // Bearer token attached to every proxied request, replacing any
    // Authorization header the caller sent.
    token: Option<String>,
}
/// Forwards one incoming request to the upstream pile server and streams the
/// response back. Neither the request body nor the response body is buffered
/// in memory. Upstream connection failures are reported as `502 Bad Gateway`
/// with the error text as the body.
async fn proxy_handler(
    State(state): State<ProxyState>,
    req: axum::extract::Request,
) -> AxumResponse {
    // Rebuild the upstream URL from the incoming path and query string.
    let path = req.uri().path().to_owned();
    let query_str = req
        .uri()
        .query()
        .map(|q| format!("?{q}"))
        .unwrap_or_default();
    let method = req.method().clone();
    let url = format!("{}{}{}", state.base_url, path, query_str);
    trace!(method = %method, url, "proxying request");
    let mut req_builder = state.client.request(method, &url);

    // Forward all request headers except hop-by-hop and Host.
    // Authorization is skipped so the client's default bearer token is used.
    for (name, value) in req.headers() {
        if !is_hop_by_hop(name) && name != header::HOST && name != header::AUTHORIZATION {
            req_builder = req_builder.header(name, value);
        }
    }

    // Attach bearer token if present (overrides client default for clarity).
    if let Some(ref token) = state.token
        && let Ok(value) = header::HeaderValue::from_str(&format!("Bearer {token}"))
    {
        req_builder = req_builder.header(header::AUTHORIZATION, value);
    }

    // Stream the request body upstream.
    let body_stream = req.into_body().into_data_stream();
    req_builder = req_builder.body(reqwest::Body::wrap_stream(body_stream));

    let upstream = match req_builder.send().await {
        Ok(r) => r,
        Err(e) => {
            // Upstream unreachable or request failed before a response:
            // answer 502 with the error text.
            warn!(error = %e, "upstream request failed");
            return AxumResponse::builder()
                .status(StatusCode::BAD_GATEWAY.as_u16())
                .body(AxumBody::from(e.to_string()))
                .unwrap_or_else(|_| AxumResponse::new(AxumBody::empty()));
        }
    };

    // Mirror the upstream status and headers, again stripping hop-by-hop ones.
    let status = upstream.status().as_u16();
    trace!(status, "upstream response");
    let resp_headers = upstream.headers().clone();
    let mut builder = AxumResponse::builder().status(status);
    for (name, value) in &resp_headers {
        if !is_hop_by_hop(name) {
            builder = builder.header(name, value);
        }
    }

    // Stream the response body without buffering.
    builder
        .body(AxumBody::from_stream(upstream.bytes_stream()))
        .unwrap_or_else(|_| AxumResponse::new(AxumBody::empty()))
}
/// Returns true for hop-by-hop headers, which describe a single transport
/// connection and must not be forwarded by a proxy (RFC 9110 §7.6.1,
/// RFC 2616 §13.5.1).
fn is_hop_by_hop(name: &header::HeaderName) -> bool {
    name == header::CONNECTION
        || name == header::TRANSFER_ENCODING
        || name == header::TE
        // The actual header name is `Trailer`; previously only the RFC 2616
        // misspelling "trailers" was stripped, letting `Trailer` leak through.
        || name == header::TRAILER
        || name == header::UPGRADE
        || name == header::PROXY_AUTHORIZATION
        || name == header::PROXY_AUTHENTICATE
        // No named constant for Keep-Alive; compare the lowercase name.
        || name.as_str() == "keep-alive"
        // RFC 2616 lists "Trailers" (sic); keep matching it too, to be safe.
        || name.as_str() == "trailers"
}
//
// MARK: helpers
//
/// Passes a successful response through unchanged; turns any non-success
/// status into [`ClientError::Http`], capturing the response body as text
/// (empty string if the body cannot be read).
async fn check_status(resp: reqwest::Response) -> Result<reqwest::Response, ClientError> {
    match resp.status() {
        status if status.is_success() => Ok(resp),
        status => {
            let body = resp.text().await.unwrap_or_default();
            Err(ClientError::Http { status, body })
        }
    }
}

View File

@@ -9,7 +9,7 @@ name = "dataset"
# working_dir = ".pile"
# Data sources available in this dataset
source."music" = { type = "filesystem", path = "music" }
source."music" = { type = "filesystem", path = "library" }
# This dataset's schema.
# Defines normalized fields that are extracted from source entries on-demand.
@@ -21,18 +21,18 @@ source."music" = { type = "filesystem", path = "music" }
# # only text is supported in this version.
# type = "text",
#
# # An array of jsonpaths (rfc9535) used to extract this field from each source entry.
# # How to extract this field from each source entry.
# # These are evaluated in order, the first non-null value is used.
# # A single string is equivalent to an array with one element.
# path = "$.json.path"
# path = [ "$.json.path" ]
# }
[schema]
album = { type = "text", path = "$.Album" }
isrc = { type = "text", path = "$.Isrc" }
artist = { type = "text", path = ["$.Artist", "$.TrackArtist"] }
lyrics = { type = "text", path = "$.Lyrics" }
genre = { type = "text", path = "$.Genre" }
title = { type = "text", path = ["$.Title", "$.TrackTitle"] }
album = { type = "text", path = ["$.flac.album"] }
isrc = { type = "text", path = ["$.flac.isrc"] }
artist = { type = "text", path = ["$.flac.artist", "$.flac.trackartist"] }
lyrics = { type = "text", path = ["$.flac.lyrics"] }
genre = { type = "text", path = ["$.flac.genre"] }
title = { type = "text", path = ["$.flac.tracktitle", "$.flac.title"] }
# Fts configuration.
# Determines which fields (defined in `schema`) are included in the fts index.

View File

@@ -1,15 +1,13 @@
use serde::Deserialize;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fmt::Debug, path::PathBuf};
mod post;
pub use post::*;
use crate::objectpath::ObjectPath;
mod misc;
pub use misc::*;
use crate::objectpath::ObjectPath;
pub mod objectpath;
pub mod pattern;
pub static INIT_DB_TOML: &str = include_str!("./config.toml");
@@ -17,6 +15,15 @@ fn default_true() -> bool {
true
}
pub fn default_base() -> String {
"(.*)".to_owned()
}
#[expect(clippy::unwrap_used)]
pub fn default_files() -> HashMap<Label, String> {
[(Label::new("item").unwrap(), "{base}".to_owned())].into()
}
#[test]
#[expect(clippy::expect_used)]
fn init_db_toml_valid() {
@@ -35,21 +42,8 @@ pub struct DatasetConfig {
/// Must be unique
pub name: Label,
/// Root dir for indices
pub working_dir: Option<PathBuf>,
/// Where to find this field
pub source: HashMap<Label, Source>,
/// How to post-process this field
#[serde(default)]
pub post: Vec<FieldSpecPost>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct S3Credentials {
pub access_key_id: String,
pub secret_access_key: String,
}
#[derive(Debug, Clone, Deserialize)]
@@ -58,34 +52,25 @@ pub struct S3Credentials {
pub enum Source {
/// A directory of files
Filesystem {
/// If false, ignore this dataset
#[serde(default = "default_true")]
enabled: bool,
/// The directories to scan.
/// Must be relative.
path: PathBuf,
/// If true, all toml files are ignored.
/// Metadata can be added to any file using a matching `.toml` sidecar file.
///
/// If false, toml files are treated as regular files
/// and sidecar metadata is disabled.
#[serde(default = "default_true")]
sidecars: bool,
},
/// Regex that extracts an item key from a file path.
/// - File paths are relative to `path`.
/// - The first group in this regex is the file's item key.
#[serde(default = "default_base")]
base_pattern: String,
/// An S3-compatible object store bucket
S3 {
bucket: String,
prefix: Option<String>,
/// Custom endpoint URL (for MinIO, etc.)
endpoint: Option<String>,
region: String,
credentials: S3Credentials,
/// If true, all .toml objects are treated as sidecar metadata files.
#[serde(default = "default_true")]
sidecars: bool,
/// Map of files included in each item.
/// `{base}` is replaced with the string extracted by base_pattern.
/// Default is `{ item: "{base}" }`
#[serde(default = "default_files")]
files: HashMap<Label, String>,
},
}
@@ -93,23 +78,16 @@ pub enum Source {
// MARK: schema
//
#[derive(Debug, Clone, Deserialize)]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct FieldSpec {
/// The type of this field
pub r#type: FieldType,
/// If true, do not display this field.
/// This attribute has no effect on pile, it
/// is intended for consumers of data.
#[serde(default)]
pub hidden: bool,
/// How to find this field in a data entry
pub path: Vec<ObjectPath>,
/// How to post-process this field
#[serde(default)]
pub post: Vec<FieldSpecPost>,
}
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum FieldType {
Text,
}
//

View File

@@ -1,7 +1,7 @@
use std::{fmt, str::FromStr};
use serde::{
Deserialize, Deserializer,
Deserialize, Deserializer, Serialize, Serializer,
de::{self, Visitor},
};
use smartstring::{LazyCompact, SmartString};
@@ -41,11 +41,21 @@ pub enum PathSegment {
/// Go to root node (`$` identifier)
Root,
/// Go to a child of the current object
Field(Label),
/// Go to a child of the current object.
Field {
name: Label,
args: Option<SmartString<LazyCompact>>,
},
/// Go to an element of the current list
Index(i64),
/// Go to a slice of the current list
Range {
start: i64,
end: i64,
inclusive: bool,
},
}
/// A path to a `PathSegment::Field` inside a nested object,
@@ -55,11 +65,44 @@ pub enum PathSegment {
/// - `$` refers to the root object
/// - `.<name>` selects a `PathSegment::Field` of an object
/// - `[n]` selects an item of an array
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ObjectPath {
pub segments: Vec<PathSegment>,
}
impl fmt::Display for ObjectPath {
    /// Renders the path in its source syntax: `$` for the root, `.name` /
    /// `.name(args)` for fields, `[i]` for indices, and `[a..b]` / `[a..=b]`
    /// for ranges.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for segment in &self.segments {
            match segment {
                PathSegment::Root => f.write_str("$")?,
                PathSegment::Field { name, args } => {
                    write!(f, ".{name}")?;
                    if let Some(args) = args {
                        write!(f, "({args})")?;
                    }
                }
                PathSegment::Index(idx) => write!(f, "[{idx}]")?,
                PathSegment::Range {
                    start,
                    end,
                    inclusive,
                } => {
                    let op = if *inclusive { "..=" } else { ".." };
                    write!(f, "[{start}{op}{end}]")?;
                }
            }
        }
        Ok(())
    }
}
impl Serialize for ObjectPath {
    /// Serializes the path as the string produced by its `Display` impl.
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        // `collect_str` formats `self` via `Display` directly into the
        // serializer, producing the same string as `serialize_str(&self.to_string())`.
        serializer.collect_str(self)
    }
}
impl<'de> Deserialize<'de> for ObjectPath {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
struct PathVisitor;

View File

@@ -1,10 +1,80 @@
use std::str::FromStr;
use smartstring::{LazyCompact, SmartString};
use crate::{
Label,
objectpath::{PathParseError, PathSegment, tokenizer::Token},
};
/// Parse an ident token into a `PathSegment::Field`, handling optional args of
/// the form `name(args)`. Parens inside args may be nested; `\(` and `\)` are
/// escaped and do not affect depth counting.
///
/// `position` is this token's offset within the full path string; it is used
/// only to locate errors in the reported `PathParseError`.
fn parse_field(ident: &str, position: usize) -> Result<PathSegment, PathParseError> {
    let bytes = ident.as_bytes();
    let mut i = 0;
    // Find the first unescaped '(' — everything before it is the name.
    let open_paren: Option<usize> = loop {
        if i >= bytes.len() {
            break None;
        }
        match bytes[i] {
            b'\\' => i += 2, // skip escaped character
            b'(' => break Some(i),
            _ => i += 1,
        }
    };
    // The name is passed to Label::new verbatim — backslash escapes are NOT
    // stripped here.
    let name_str = &ident[..open_paren.unwrap_or(bytes.len())];
    let name = Label::new(name_str).ok_or_else(|| PathParseError::InvalidField {
        position,
        str: name_str.into(),
    })?;
    // No '(' found: this is a plain field with no args.
    let Some(open_pos) = open_paren else {
        return Ok(PathSegment::Field { name, args: None });
    };
    // Scan args, tracking paren depth.
    let args_start = open_pos + 1;
    let mut depth: usize = 1;
    let mut j = args_start;
    while j < bytes.len() {
        match bytes[j] {
            b'\\' => j += 2, // skip escaped character
            b'(' => {
                depth += 1;
                j += 1;
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    // Closing paren must be the last character.
                    if j + 1 != bytes.len() {
                        return Err(PathParseError::Syntax {
                            position: position + j + 1,
                        });
                    }
                    // Args are kept verbatim, escapes included.
                    let args: SmartString<LazyCompact> = ident[args_start..j].into();
                    return Ok(PathSegment::Field {
                        name,
                        args: Some(args),
                    });
                }
                j += 1;
            }
            _ => j += 1,
        }
    }
    // Reached end of ident without finding the matching ')'.
    Err(PathParseError::Syntax {
        position: position + ident.len(),
    })
}
enum State {
Start,
@@ -17,6 +87,15 @@ enum State {
/// We are indexing an array, waiting for a number
Index,
/// We parsed the start index, waiting for `]` or the first `.` of `..`
IndexAfterStart(i64),
/// We saw one `.` after the start index, waiting for the second `.`
IndexRangeDot1(i64),
/// We saw `..`, waiting for the end index (optionally prefixed with `=`)
IndexRangeDot2(i64),
/// We are indexing an array, waiting for a close-bracket
IndexClose,
}
@@ -72,14 +151,7 @@ impl Parser {
// MARK: dot
//
(State::Dot, (p, Token::Ident(ident))) => {
self.segments
.push(PathSegment::Field(Label::new(*ident).ok_or_else(|| {
PathParseError::InvalidField {
position: *p,
str: (*ident).into(),
}
})?));
self.segments.push(parse_field(ident, *p)?);
self.state = State::Selected;
}
@@ -101,8 +173,7 @@ impl Parser {
}
})?;
self.segments.push(PathSegment::Index(idx));
self.state = State::IndexClose;
self.state = State::IndexAfterStart(idx);
}
(State::Index, (p, Token::Root))
@@ -112,6 +183,49 @@ impl Parser {
return Err(PathParseError::Syntax { position: *p });
}
(State::IndexAfterStart(idx), (_, Token::SqbClose)) => {
self.segments.push(PathSegment::Index(idx));
self.state = State::Selected;
}
(State::IndexAfterStart(idx), (_, Token::Dot)) => {
self.state = State::IndexRangeDot1(idx);
}
(State::IndexAfterStart(_), (p, _)) => {
return Err(PathParseError::Syntax { position: *p });
}
(State::IndexRangeDot1(idx), (_, Token::Dot)) => {
self.state = State::IndexRangeDot2(idx);
}
(State::IndexRangeDot1(_), (p, _)) => {
return Err(PathParseError::Syntax { position: *p });
}
(State::IndexRangeDot2(start), (p, Token::Ident(ident))) => {
let (end_str, inclusive) = if let Some(stripped) = ident.strip_prefix('=') {
(stripped, true)
} else {
(*ident, false)
};
let end: i64 = i64::from_str(end_str).map_err(|_err| {
PathParseError::InvalidIndexString {
position: *p,
str: (*ident).into(),
}
})?;
self.segments.push(PathSegment::Range {
start,
end,
inclusive,
});
self.state = State::IndexClose;
}
(State::IndexRangeDot2(_), (p, _)) => {
return Err(PathParseError::Syntax { position: *p });
}
(State::IndexClose, (_, Token::SqbClose)) => self.state = State::Selected,
(State::IndexClose, (p, _)) => {
return Err(PathParseError::Syntax { position: *p });
@@ -124,6 +238,9 @@ impl Parser {
State::Start => Err(PathParseError::Syntax { position: 0 }),
State::Dot => Err(PathParseError::Syntax { position }),
State::Index => Err(PathParseError::Syntax { position }),
State::IndexAfterStart(_) => Err(PathParseError::Syntax { position }),
State::IndexRangeDot1(_) => Err(PathParseError::Syntax { position }),
State::IndexRangeDot2(_) => Err(PathParseError::Syntax { position }),
State::IndexClose => Err(PathParseError::Syntax { position }),
State::Selected => Ok(()),
}?;
@@ -161,27 +278,30 @@ mod tests {
parse_test("$", Ok(&[PathSegment::Root]));
}
fn field(name: &str) -> PathSegment {
PathSegment::Field {
name: Label::new(name).unwrap(),
args: None,
}
}
fn field_args(name: &str, args: &str) -> PathSegment {
PathSegment::Field {
name: Label::new(name).unwrap(),
args: Some(args.into()),
}
}
#[test]
fn single_field() {
parse_test(
"$.foo",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
]),
);
parse_test("$.foo", Ok(&[PathSegment::Root, field("foo")]));
}
#[test]
fn nested_fields() {
parse_test(
"$.foo.bar.baz",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("foo").unwrap()),
PathSegment::Field(Label::new("bar").unwrap()),
PathSegment::Field(Label::new("baz").unwrap()),
]),
Ok(&[PathSegment::Root, field("foo"), field("bar"), field("baz")]),
);
}
@@ -189,11 +309,7 @@ mod tests {
fn array_index() {
parse_test(
"$.items[0]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("items").unwrap()),
PathSegment::Index(0),
]),
Ok(&[PathSegment::Root, field("items"), PathSegment::Index(0)]),
);
}
@@ -203,7 +319,7 @@ mod tests {
"$.a[1][2]",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
field("a"),
PathSegment::Index(1),
PathSegment::Index(2),
]),
@@ -216,9 +332,9 @@ mod tests {
"$.a[0].b",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
field("a"),
PathSegment::Index(0),
PathSegment::Field(Label::new("b").unwrap()),
field("b"),
]),
);
}
@@ -227,14 +343,94 @@ mod tests {
fn negative_index() {
parse_test(
"$.a[-1]",
Ok(&[PathSegment::Root, field("a"), PathSegment::Index(-1)]),
);
}
// MARK: args
#[test]
fn field_with_simple_args() {
parse_test(
"$.foo(bar)",
Ok(&[PathSegment::Root, field_args("foo", "bar")]),
);
}
#[test]
fn field_with_empty_args() {
parse_test("$.foo()", Ok(&[PathSegment::Root, field_args("foo", "")]));
}
#[test]
fn field_with_nested_parens_in_args() {
parse_test(
"$.foo(a(b)c)",
Ok(&[PathSegment::Root, field_args("foo", "a(b)c")]),
);
}
#[test]
fn field_with_deeply_nested_parens_in_args() {
parse_test(
"$.foo(a(b(c))d)",
Ok(&[PathSegment::Root, field_args("foo", "a(b(c))d")]),
);
}
#[test]
fn field_with_escaped_open_paren_in_args() {
// "$.foo(a\(b)" — '\(' is escaped, so depth never rises above 1; ')' closes it
parse_test(
r"$.foo(a\(b)",
Ok(&[PathSegment::Root, field_args("foo", r"a\(b")]),
);
}
#[test]
fn field_with_escaped_close_paren_in_args() {
// "$.foo(a\)b)" — '\)' is escaped, the second ')' closes at depth 0
parse_test(
r"$.foo(a\)b)",
Ok(&[PathSegment::Root, field_args("foo", r"a\)b")]),
);
}
#[test]
fn field_with_both_escaped_parens_in_args() {
parse_test(
r"$.foo(a\(b\)c)",
Ok(&[PathSegment::Root, field_args("foo", r"a\(b\)c")]),
);
}
#[test]
fn field_args_with_multiple_segments() {
parse_test(
"$.foo(x).bar(y)",
Ok(&[
PathSegment::Root,
PathSegment::Field(Label::new("a").unwrap()),
PathSegment::Index(-1),
field_args("foo", "x"),
field_args("bar", "y"),
]),
);
}
#[test]
fn field_args_unclosed_paren_error() {
// Missing closing ')' → Syntax error at end of source
parse_test("$.foo(bar", Err(PathParseError::Syntax { position: 9 }));
}
#[test]
fn field_args_trailing_chars_after_close_error() {
// Closing ')' is not the last char → Syntax error at the trailing char
parse_test(
"$.foo(bar)baz",
Err(PathParseError::Syntax { position: 10 }),
);
}
#[test]
fn non_ascii_error() {
parse_test(
@@ -245,4 +441,46 @@ mod tests {
}),
);
}
// MARK: range
fn range(start: i64, end: i64, inclusive: bool) -> PathSegment {
PathSegment::Range {
start,
end,
inclusive,
}
}
#[test]
fn exclusive_range() {
parse_test(
"$.a[0..5]",
Ok(&[PathSegment::Root, field("a"), range(0, 5, false)]),
);
}
#[test]
fn inclusive_range() {
parse_test(
"$.a[1..=2]",
Ok(&[PathSegment::Root, field("a"), range(1, 2, true)]),
);
}
#[test]
fn range_with_negative_end() {
parse_test(
"$.a[0..-1]",
Ok(&[PathSegment::Root, field("a"), range(0, -1, false)]),
);
}
#[test]
fn range_with_negative_start() {
parse_test(
"$.a[-3..-1]",
Ok(&[PathSegment::Root, field("a"), range(-3, -1, false)]),
);
}
}

View File

@@ -21,7 +21,52 @@ impl Tokenizer {
let mut tokens = Vec::new();
let mut window_start = None;
// Paren depth: while > 0, `.` / `[` / `]` / `$` are part of the ident.
let mut paren_depth: usize = 0;
// When true, the current char is escaped by a preceding `\` and is
// treated as a plain ident character with no special meaning.
let mut skip_next = false;
for (i, c) in source.char_indices() {
if skip_next {
skip_next = false;
// Escaped char: just extend the ident window (already opened by `\`).
continue;
}
if c == '\\' {
if window_start.is_none() {
window_start = Some(i);
}
skip_next = true;
continue;
}
if paren_depth > 0 {
// Inside parens: only track depth changes, everything else is ident.
match c {
'(' => {
if window_start.is_none() {
window_start = Some(i);
}
paren_depth += 1;
}
')' => {
if window_start.is_none() {
window_start = Some(i);
}
paren_depth -= 1;
}
x if x.is_ascii() => {
if window_start.is_none() {
window_start = Some(i);
}
}
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
}
continue;
}
match c {
'$' => {
if let Some(s) = window_start.take() {
@@ -51,10 +96,26 @@ impl Tokenizer {
tokens.push((i, Token::SqbClose));
}
x if x.is_ascii() => match window_start {
None => window_start = Some(i),
Some(_) => continue,
},
'(' => {
if window_start.is_none() {
window_start = Some(i);
}
paren_depth += 1;
}
')' => {
if window_start.is_none() {
window_start = Some(i);
}
// paren_depth is 0 here — stray `)` is an ident char and
// parse_field will surface the error later.
}
x if x.is_ascii() => {
if window_start.is_none() {
window_start = Some(i);
}
}
char => return Err(PathParseError::NonAsciiChar { position: i, char }),
}

View File

@@ -0,0 +1,49 @@
use std::collections::HashMap;
use serde::{Deserialize, Deserializer, de};
use smartstring::{LazyCompact, SmartString};
use thiserror::Error;
use crate::{Label, objectpath::PathParseError as ObjectPathError};
mod parser;
pub use parser::GroupSegment;
#[derive(Debug, Error, PartialEq)]
pub enum GroupPatternParseError {
    /// A `{` or `}` appeared in an invalid position, or a `{` was never closed.
    #[error("syntax error at index {position}")]
    Syntax { position: usize },
    /// The contents of a `{...}` block could not be parsed as an object path.
    #[error("invalid object path {path:?}: {source}")]
    InvalidObjectPath {
        /// Byte offset of the opening `{` in the pattern source.
        start: usize,
        /// Byte offset one past the closing `}`.
        end: usize,
        /// The raw text between the braces that failed to parse.
        path: SmartString<LazyCompact>,
        /// The underlying object-path parse error.
        source: ObjectPathError,
    },
}
/// A set of parsed group patterns, keyed by group label.
#[derive(Debug, Clone, Default)]
pub struct GroupPattern {
    /// Each label maps to the ordered segments of its parsed pattern string.
    pub pattern: HashMap<Label, Vec<GroupSegment>>,
}
impl<'de> Deserialize<'de> for GroupPattern {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
let raw = HashMap::<String, String>::deserialize(deserializer)?;
let mut parts = HashMap::with_capacity(raw.len());
for (key, value) in raw {
let label = Label::try_from(key.as_str()).map_err(de::Error::custom)?;
let segments = parser::Parser::new()
.parse(&value)
.map_err(de::Error::custom)?
.into_iter()
.map(|(_, seg)| seg)
.collect();
parts.insert(label, segments);
}
Ok(GroupPattern { pattern: parts })
}
}

View File

@@ -0,0 +1,203 @@
use smartstring::{LazyCompact, SmartString};
use crate::{objectpath::ObjectPath, pattern::GroupPatternParseError};
/// One piece of a group pattern: either an object-path expression
/// (written `{...}` in the pattern source) or a verbatim literal.
#[cfg_attr(test, derive(PartialEq))]
#[derive(Debug, Clone)]
pub enum GroupSegment {
    Path(ObjectPath),
    Literal(SmartString<LazyCompact>),
}
/// Parser for group-pattern strings (`{path}.literal{path}...`).
///
/// The parser is stateless; `Default` is derived (clippy `new_without_default`)
/// so call sites can use either `Parser::new()` or `Parser::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Parser {}

impl Parser {
    /// Create a new (stateless) parser.
    pub fn new() -> Self {
        Self {}
    }

    /// Parse a pattern string of the form `{path}.literal{path}...`.
    ///
    /// - `{...}` delimiters are parsed as [`ObjectPath`] expressions.
    ///   Nested `{}` inside a path are allowed; depth is tracked to find the
    ///   matching closing brace.
    /// - Everything outside `{...}` is a `Literal` segment.
    /// - A bare `}` in literal position (depth == 0) is a syntax error.
    /// - An unclosed `{` is a syntax error.
    ///
    /// Each returned segment is paired with its byte offset in `source`;
    /// path segments are anchored at their opening `{`.
    pub fn parse(self, source: &str) -> Result<Vec<(usize, GroupSegment)>, GroupPatternParseError> {
        let mut tokens = Vec::new();
        // `depth` > 0 means we are currently inside a `{...}` path expression.
        let mut depth: usize = 0;
        // Start of the current segment (literal text or path content).
        let mut window_start: usize = 0;
        // Source position of the opening `{` for the current path (used for error reporting).
        let mut open_brace: usize = 0;
        for (i, c) in source.char_indices() {
            match c {
                '{' => {
                    if depth == 0 {
                        // Emit any accumulated literal.
                        if i > window_start {
                            tokens.push((
                                window_start,
                                GroupSegment::Literal(source[window_start..i].into()),
                            ));
                        }
                        open_brace = i;
                        // Path content starts after the opening brace.
                        window_start = i + 1;
                        depth = 1;
                    } else {
                        // Nested brace inside a path — keep counting.
                        depth += 1;
                    }
                }
                '}' => {
                    if depth == 0 {
                        // Unmatched `}` outside any path.
                        return Err(GroupPatternParseError::Syntax { position: i });
                    }
                    depth -= 1;
                    if depth == 0 {
                        // Closing brace of the outermost path expression — parse as ObjectPath.
                        let path_str = &source[window_start..i];
                        let path = path_str.parse::<ObjectPath>().map_err(|e| {
                            GroupPatternParseError::InvalidObjectPath {
                                start: open_brace,
                                end: i + 1,
                                path: path_str.into(),
                                source: e,
                            }
                        })?;
                        tokens.push((open_brace, GroupSegment::Path(path)));
                        // Literal content (if any) starts after this `}`.
                        window_start = i + 1;
                    }
                }
                // Any other char just extends the current literal/path window.
                _ => {}
            }
        }
        // Unclosed `{`.
        if depth > 0 {
            return Err(GroupPatternParseError::Syntax {
                position: open_brace,
            });
        }
        // Emit any trailing literal.
        if window_start < source.len() {
            tokens.push((
                window_start,
                GroupSegment::Literal(source[window_start..].into()),
            ));
        }
        Ok(tokens)
    }
}
//
// MARK: tests
//
#[expect(clippy::unwrap_used)]
#[cfg(test)]
mod tests {
    // Tests for the group-pattern parser: segment splitting, reported byte
    // offsets, and the two syntax-error cases (unmatched `{` and stray `}`).
    use super::*;

    // Shorthand for running the parser on a pattern string.
    fn parse(source: &str) -> Result<Vec<(usize, GroupSegment)>, GroupPatternParseError> {
        Parser::new().parse(source)
    }

    // Build an expected Path segment from object-path source text.
    fn path(s: &str) -> GroupSegment {
        GroupSegment::Path(s.parse().unwrap())
    }

    // Build an expected Literal segment.
    fn lit(s: &str) -> GroupSegment {
        GroupSegment::Literal(s.into())
    }

    #[test]
    fn regex() {
        // A realistic path with nested parens and escapes survives round-trip.
        assert_eq!(
            parse("{$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]}").unwrap(),
            vec![(0, path("$.split(/)[-1].regex((.*).pub \\((.*)\\).pdf)[0]"))]
        );
    }

    #[test]
    fn single_path() {
        assert_eq!(parse("{$.foo}").unwrap(), vec![(0, path("$.foo"))]);
    }

    #[test]
    fn single_literal() {
        assert_eq!(parse("hello").unwrap(), vec![(0, lit("hello"))]);
    }

    #[test]
    fn path_then_literal() {
        // Offsets: path anchored at its '{', literal starts after the '}'.
        assert_eq!(
            parse("{$.foo}.txt").unwrap(),
            vec![(0, path("$.foo")), (7, lit(".txt"))]
        );
    }

    #[test]
    fn literal_then_path() {
        assert_eq!(
            parse("prefix/{$.foo}").unwrap(),
            vec![(0, lit("prefix/")), (7, path("$.foo"))]
        );
    }

    #[test]
    fn interleaved() {
        assert_eq!(
            parse("{$.a}.sep.{$.b}").unwrap(),
            vec![(0, path("$.a")), (5, lit(".sep.")), (10, path("$.b")),]
        );
    }

    #[test]
    fn unmatched_open_brace_error() {
        // Error position is the offending '{', not end-of-input.
        assert_eq!(
            parse("{$.foo"),
            Err(GroupPatternParseError::Syntax { position: 0 })
        );
    }

    #[test]
    fn unmatched_close_brace_in_literal_error() {
        assert_eq!(
            parse("foo}bar"),
            Err(GroupPatternParseError::Syntax { position: 3 })
        );
    }

    #[test]
    fn invalid_path_error() {
        // `end` is one past the closing '}'; `path` is the raw inner text.
        assert_eq!(
            parse("{not-a-path}"),
            Err(GroupPatternParseError::InvalidObjectPath {
                start: 0,
                end: 12,
                path: "not-a-path".into(),
                source: crate::objectpath::PathParseError::MustStartWithRoot { position: 0 },
            })
        );
    }

    #[test]
    fn literal_between_paths() {
        assert_eq!(
            parse("foo{$.x}bar").unwrap(),
            vec![(0, lit("foo")), (3, path("$.x")), (8, lit("bar")),]
        );
    }
}

View File

@@ -1,18 +0,0 @@
use serde::Deserialize;
/// A post-processing step applied to an extracted field value.
///
/// `untagged`: each variant is recognized by its distinct key
/// (`trim_suffix`, `trim_prefix`, `case`, `join`, `notempty`) in config.
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
pub enum FieldSpecPost {
    TrimSuffix { trim_suffix: String },
    TrimPrefix { trim_prefix: String },
    SetCase { case: Case },
    Join { join: String },
    NotEmpty { notempty: bool },
}
/// Target letter case for [`FieldSpecPost::SetCase`]; serialized as
/// `"lower"` / `"upper"`.
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum Case {
    Lower,
    Upper,
}

View File

@@ -10,37 +10,17 @@ workspace = true
[dependencies]
pile-config = { workspace = true }
pile-toolbox = { workspace = true }
pile-flac = { workspace = true }
pile-value = { workspace = true }
regex = { workspace = true }
serde_json = { workspace = true }
itertools = { workspace = true }
walkdir = { workspace = true }
tantivy = { workspace = true }
tracing = { workspace = true }
chrono = { workspace = true }
toml = { workspace = true }
thiserror = { workspace = true }
smartstring = { workspace = true }
blake3 = { workspace = true }
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
pdfium-render = { workspace = true, optional = true }
image = { workspace = true, optional = true }
id3 = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
async-trait = { workspace = true }
aws-sdk-s3 = { workspace = true }
mime = { workspace = true }
mime_guess = { workspace = true }
serde = { workspace = true }
axum = { workspace = true, optional = true }
utoipa = { workspace = true, optional = true }
utoipa-swagger-ui = { workspace = true, optional = true }
[features]
default = []
pdfium = ["dep:pdfium-render", "dep:image"]
axum = ["dep:axum", "dep:utoipa", "dep:utoipa-swagger-ui"]
pdfium = ["pile-value/pdfium"]

View File

@@ -1,57 +0,0 @@
use std::env;
use std::path::PathBuf;
const PDFIUM_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7725/pdfium-linux-x64.tgz";
#[expect(clippy::expect_used)]
#[expect(clippy::unwrap_used)]
#[expect(clippy::print_stderr)]
#[expect(clippy::expect_used)]
#[expect(clippy::unwrap_used)]
/// Build script: when the `pdfium` feature is enabled, download a prebuilt
/// PDFium shared library (once) into `target/<profile>/` and emit the link
/// directives for it.
fn main() {
    println!("cargo:rerun-if-changed=build.rs");
    if env::var("CARGO_FEATURE_PDFIUM").is_err() {
        // Feature disabled: nothing to download or link.
        return;
    }
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    // OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
    // Go up 3 levels to reach target/<profile>/
    let profile_dir = out_dir
        .ancestors()
        .nth(3)
        .expect("unexpected OUT_DIR structure")
        .to_path_buf();
    let lib_path = profile_dir.join("libpdfium.so");
    if !lib_path.exists() {
        let tgz_path = out_dir.join("pdfium.tgz");
        // BUGFIX: cargo only interprets `cargo:` directives written to the
        // build script's *stdout*; `eprintln!` sent this warning to stderr
        // where it was never surfaced as a warning.
        println!("cargo:warning=Downloading PDFium from {PDFIUM_URL}");
        let status = std::process::Command::new("curl")
            .args(["-L", "--fail", "-o", tgz_path.to_str().unwrap(), PDFIUM_URL])
            .status()
            .expect("failed to run curl");
        assert!(status.success(), "curl failed to download PDFium");
        let status = std::process::Command::new("tar")
            .args([
                "-xzf",
                tgz_path.to_str().unwrap(),
                "-C",
                out_dir.to_str().unwrap(),
            ])
            .status()
            .expect("failed to run tar");
        assert!(status.success(), "tar failed to extract PDFium");
        // The archive unpacks to lib/libpdfium.so; place it next to the binaries.
        std::fs::copy(out_dir.join("lib").join("libpdfium.so"), &lib_path)
            .expect("failed to copy libpdfium.so");
    }
    println!("cargo:rustc-link-search=native={}", profile_dir.display());
    println!("cargo:rustc-link-lib=dylib=pdfium");
    // Let the produced binary find libpdfium.so in its own directory at runtime.
    println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
}

View File

@@ -1,21 +1,22 @@
use chrono::{DateTime, Utc};
use pile_config::{ConfigToml, Label, Source, objectpath::ObjectPath};
use pile_config::{
ConfigToml, DatasetConfig, Label, Source, default_base, default_files, objectpath::ObjectPath,
};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{
extract::traits::ExtractState,
source::{DataSource, DirDataSource, misc::path_ts_earliest},
value::{Item, PileValue},
};
use regex::Regex;
use serde_json::Value;
use std::{collections::HashMap, io::ErrorKind, path::PathBuf, sync::Arc, time::Instant};
use tantivy::{Executor, Index, IndexWriter, TantivyError, collector::TopDocs};
use thiserror::Error;
use tokio::task::JoinSet;
use tokio_stream::{StreamExt, wrappers::ReceiverStream};
use tracing::{debug, info, trace, warn};
use crate::{
DataSource, Item, PileValue,
extract::MetaExtractor,
index::{DbFtsIndex, FtsLookupResult},
path_ts_earliest,
source::{DirDataSource, S3DataSource},
};
use crate::index::{DbFtsIndex, FtsLookupResult};
#[derive(Debug, Error)]
pub enum DatasetError {
@@ -33,31 +34,43 @@ pub enum DatasetError {
// MARK: Dataset enum
//
/// An opened data source — either a local filesystem directory or an S3 bucket.
/// An opened data source
pub enum Dataset {
Dir(Arc<DirDataSource>),
S3(Arc<S3DataSource>),
}
impl Dataset {
pub async fn get(&self, key: &str) -> Option<Item> {
pub fn len(&self) -> usize {
match self {
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
Self::S3(ds) => ds.get(key).await.ok().flatten(),
Self::Dir(ds) => ds.len(),
}
}
pub fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
pub async fn get(&self, key: &str) -> Option<Item> {
match self {
Self::Dir(ds) => ds.iter(),
Self::S3(ds) => ds.iter(),
Self::Dir(ds) => ds.get(key).await.ok().flatten(),
}
}
pub fn iter(&self) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
match self {
Self::Dir(ds) => Box::new(ds.iter()),
}
}
pub fn iter_page(
&self,
offset: usize,
limit: usize,
) -> Box<dyn Iterator<Item = &Item> + Send + '_> {
match self {
Self::Dir(ds) => Box::new(ds.iter_page(offset, limit)),
}
}
pub async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
match self {
Self::Dir(ds) => ds.latest_change().await,
Self::S3(ds) => ds.latest_change().await,
}
}
}
@@ -68,16 +81,103 @@ impl Dataset {
/// An opened dataset: config, working directory, and all opened sources.
pub struct Datasets {
pub path_config: PathBuf,
pub path_config: Option<PathBuf>,
pub path_parent: PathBuf,
pub path_workdir: PathBuf,
pub path_workdir: Option<PathBuf>,
pub config: ConfigToml,
pub sources: HashMap<Label, Dataset>,
pub disabled_sources: HashMap<Label, Dataset>,
}
impl Datasets {
pub fn open(config: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
#[expect(clippy::unwrap_used)]
pub fn virt_source() -> Label {
Label::new("virtual-source").unwrap()
}
#[expect(clippy::unwrap_used)]
pub async fn virt(parent: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
let path_parent = parent.into();
let config = ConfigToml {
dataset: DatasetConfig {
name: Label::new("virtual-dataset").unwrap(),
source: [(
Self::virt_source(),
Source::Filesystem {
enabled: true,
path: path_parent.clone(),
base_pattern: default_base(),
files: default_files(),
},
)]
.into_iter()
.collect(),
},
schema: HashMap::new(),
fts: None,
};
let mut sources = HashMap::new();
let mut disabled_sources = HashMap::new();
for (label, source) in &config.dataset.source {
match source {
Source::Filesystem {
enabled,
path,
base_pattern,
files,
} => {
let target = match enabled {
true => &mut sources,
false => &mut disabled_sources,
};
let base_regex = Regex::new(base_pattern).map_err(|e| {
std::io::Error::new(
ErrorKind::InvalidInput,
format!("invalid base_pattern: {e}"),
)
})?;
if base_regex.captures_len() != 2 {
return Err(std::io::Error::new(
ErrorKind::InvalidInput,
"base_pattern must have exactly one capture group",
));
}
target.insert(
label.clone(),
Dataset::Dir(
DirDataSource::new(
label,
path_parent.join(path),
base_regex,
files.clone(),
)
.await?,
),
);
}
}
}
return Ok(Self {
path_config: None,
path_workdir: None,
path_parent,
config,
sources,
disabled_sources,
});
}
pub async fn open(
config: impl Into<PathBuf>,
working_dir_root: impl Into<PathBuf>,
) -> Result<Self, std::io::Error> {
let path_config = config.into();
let path_parent = path_config
.parent()
@@ -106,61 +206,59 @@ impl Datasets {
}
};
let path_workdir = config
.dataset
.working_dir
.clone()
.unwrap_or(path_parent.join(".pile"))
.join(config.dataset.name.as_str());
let path_workdir = working_dir_root.into().join(config.dataset.name.as_str());
let mut sources = HashMap::new();
let mut disabled_sources = HashMap::new();
for (label, source) in &config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
sources.insert(
label.clone(),
Dataset::Dir(Arc::new(DirDataSource::new(
label,
path_parent.join(path),
*sidecars,
))),
);
}
Source::S3 {
bucket,
prefix,
endpoint,
region,
credentials,
sidecars,
Source::Filesystem {
enabled,
path,
base_pattern,
files,
} => {
match S3DataSource::new(
label,
bucket.clone(),
prefix.clone(),
endpoint.clone(),
region.clone(),
credentials,
*sidecars,
) {
Ok(ds) => {
sources.insert(label.clone(), Dataset::S3(Arc::new(ds)));
}
Err(err) => {
warn!("Could not open S3 source {label}: {err}");
}
let target = match enabled {
true => &mut sources,
false => &mut disabled_sources,
};
let base_regex = Regex::new(base_pattern).map_err(|e| {
std::io::Error::new(
ErrorKind::InvalidInput,
format!("invalid base_pattern: {e}"),
)
})?;
if base_regex.captures_len() != 2 {
return Err(std::io::Error::new(
ErrorKind::InvalidInput,
"base_pattern must have exactly one capture group",
));
}
target.insert(
label.clone(),
Dataset::Dir(
DirDataSource::new(
label,
path_parent.join(path),
base_regex,
files.clone(),
)
.await?,
),
);
}
}
}
return Ok(Self {
path_config,
path_config: Some(path_config),
path_workdir: Some(path_workdir),
path_parent,
path_workdir,
config,
sources,
disabled_sources,
});
}
@@ -176,6 +274,7 @@ impl Datasets {
/// Returns `None` if the item or field is not found.
pub async fn get_field(
&self,
state: &ExtractState,
source: &Label,
key: &str,
path: &ObjectPath,
@@ -183,12 +282,13 @@ impl Datasets {
let Some(item) = self.get(source, key).await else {
return Ok(None);
};
let extractor = MetaExtractor::new(&item);
let root = PileValue::ObjectExtractor(Arc::new(extractor));
let Some(value) = root.query(path).await? else {
let item = PileValue::Item(item);
let Some(value) = item.query(state, path).await? else {
return Ok(None);
};
Ok(Some(value.to_json().await?))
Ok(Some(value.to_json(state).await?))
}
//
@@ -198,11 +298,29 @@ impl Datasets {
/// Refresh this dataset's fts index.
pub async fn fts_refresh(
&self,
state: &ExtractState,
_threads: usize,
flag: Option<CancelFlag>,
) -> Result<(), CancelableTaskError<DatasetError>> {
let fts_tmp_dir = self.path_workdir.join(".tmp-fts");
let fts_dir = self.path_workdir.join("fts");
let start = Instant::now();
let workdir = match self.path_workdir.as_ref() {
Some(x) => x,
None => {
warn!("Skipping fts_refresh, no workdir");
return Ok(());
}
};
let fts_tmp_dir = workdir.join(".tmp-fts");
let fts_dir = workdir.join("fts");
debug!(
message = "Rebuilding fts index",
dataset = self.config.dataset.name.as_str(),
?fts_dir,
?fts_tmp_dir,
?workdir
);
if fts_tmp_dir.is_dir() {
warn!("Removing temporary index in {}", fts_dir.display());
@@ -230,7 +348,7 @@ impl Datasets {
index_writer.add_document(doc).map_err(DatasetError::from)?;
total += 1;
if logged_at.elapsed().as_secs() >= 5 {
debug!("Indexed {total} documents so far");
debug!("Indexed {total} documents");
logged_at = Instant::now();
}
}
@@ -243,19 +361,20 @@ impl Datasets {
for (name, dataset) in &self.sources {
info!("Loading source {name}");
let mut stream = dataset.iter();
while let Some(item_result) = stream.next().await {
let stream = dataset.iter();
for item in stream {
if let Some(flag) = &flag
&& flag.is_cancelled()
{
return Err(CancelableTaskError::Cancelled);
}
let item = item_result.map_err(DatasetError::from)?;
let db = Arc::clone(&db_index);
let state = state.clone();
let item = item.clone();
join_set.spawn(async move {
let key = item.key();
let result = db.entry_to_document(&item).await;
let result = db.entry_to_document(&state, &item).await;
(key, result)
});
@@ -282,9 +401,18 @@ impl Datasets {
return Err(CancelableTaskError::Cancelled);
}
info!("Committing {total} documents");
index_writer.commit().map_err(DatasetError::from)?;
debug!(
message = "Rebuilt fts index",
dataset = self.config.dataset.name.as_str(),
?fts_dir,
?fts_tmp_dir,
?workdir,
n_docs = total,
time_ms = start.elapsed().as_millis()
);
if fts_dir.is_dir() {
warn!("Removing existing index in {}", fts_dir.display());
std::fs::remove_dir_all(&fts_dir).map_err(DatasetError::from)?;
@@ -299,7 +427,15 @@ impl Datasets {
query: &str,
top_n: usize,
) -> Result<Vec<FtsLookupResult>, DatasetError> {
let fts_dir = self.path_workdir.join("fts");
let workdir = match self.path_workdir.as_ref() {
Some(x) => x,
None => {
warn!("Skipping fts_lookup, no workdir");
return Ok(Vec::new());
}
};
let fts_dir = workdir.join("fts");
if !fts_dir.exists() {
return Err(DatasetError::NoFtsIndex);
@@ -319,7 +455,12 @@ impl Datasets {
/// Time at which fts was created
pub fn ts_fts(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
let fts_dir = self.path_workdir.join("fts");
let workdir = match self.path_workdir.as_ref() {
Some(x) => x,
None => return Ok(None),
};
let fts_dir = workdir.join("fts");
if !fts_dir.exists() {
return Ok(None);

View File

@@ -1,43 +0,0 @@
use pile_config::Label;
use std::sync::Arc;
mod epub_meta;
pub use epub_meta::*;
mod epub_text;
pub use epub_text::*;
use crate::{Item, PileValue, extract::ObjectExtractor};
/// Extractor for EPUB items, combining a text and a metadata sub-extractor.
pub struct EpubExtractor {
    text: Arc<EpubTextExtractor>,
    meta: Arc<EpubMetaExtractor>,
}
impl EpubExtractor {
    /// Construct both sub-extractors for `item`.
    pub fn new(item: &Item) -> Self {
        Self {
            text: Arc::new(EpubTextExtractor::new(item)),
            meta: Arc::new(EpubMetaExtractor::new(item)),
        }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor {
    /// Dispatch field lookup: `text` delegates to the text extractor,
    /// `meta` exposes the metadata extractor as a nested object value,
    /// anything else is `None`.
    async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
        match name.as_str() {
            "text" => self.text.field(name).await,
            "meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
            _ => Ok(None),
        }
    }

    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        // Static field set; "text"/"meta" are valid labels, so unwrap cannot fail.
        Ok(vec![
            Label::new("text").unwrap(),
            Label::new("meta").unwrap(),
        ])
    }
}

View File

@@ -1,162 +0,0 @@
use mime::Mime;
use pile_config::Label;
use pile_flac::{FlacBlock, FlacReader};
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use crate::{
Item, PileValue, SyncReadBridge,
extract::{ListExtractor, ObjectExtractor},
};
/// Lists the pictures embedded in a FLAC item's metadata blocks.
pub struct FlacImagesExtractor {
    item: Item,
}
impl FlacImagesExtractor {
    pub fn new(item: &Item) -> Self {
        Self { item: item.clone() }
    }

    /// Read the item and collect all `Picture` metadata blocks as blob values.
    ///
    /// NOTE(review): no caching here — every call re-reads and re-parses the
    /// FLAC header blocks (see `get`/`len` below, which each call this).
    async fn get_images(&self) -> Result<Vec<PileValue>, std::io::Error> {
        // Bridge the async reader into the blocking FLAC parser thread.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let raw_images = tokio::task::spawn_blocking(move || {
            let reader = FlacReader::new(BufReader::new(reader));
            let mut images: Vec<(Mime, Vec<u8>)> = Vec::new();
            for block in reader {
                match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
                    FlacBlock::Picture(picture) => {
                        images.push((picture.mime, picture.img_data));
                    }
                    // Metadata blocks precede audio; stop at the first audio frame.
                    FlacBlock::AudioFrame(_) => break,
                    _ => {}
                }
            }
            Ok::<_, std::io::Error>(images)
        })
        .await
        // Outer error: the blocking task panicked or was cancelled.
        .map_err(std::io::Error::other)??;
        Ok(raw_images
            .into_iter()
            .map(|(mime, data)| PileValue::Blob {
                mime,
                bytes: Arc::new(data),
            })
            .collect())
    }
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor {
    /// Fetch the picture at `idx`, or `None` if out of range.
    /// Re-parses the file per call (see `get_images`).
    async fn get<'a>(&'a self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
        Ok(self.get_images().await?.into_iter().nth(idx))
    }

    /// Number of embedded pictures; also re-parses the file per call.
    async fn len(&self) -> Result<usize, std::io::Error> {
        Ok(self.get_images().await?.len())
    }
}
/// Extracts Vorbis Comment tags (and, for `.flac` files, embedded images)
/// from a FLAC item.
pub struct FlacExtractor {
    item: Item,
    // Memoized tag map; filled on first `get_inner` call.
    output: OnceLock<HashMap<Label, PileValue>>,
    // Present only when the item key ends in ".flac".
    images: Option<PileValue>,
}
impl FlacExtractor {
    /// Build the extractor; the `images` list value is only offered for
    /// items whose path/key ends in `.flac`.
    pub fn new(item: &Item) -> Self {
        let is_flac = match item {
            Item::File { path, .. } => path.to_str().unwrap_or_default().ends_with(".flac"),
            Item::S3 { key, .. } => key.ends_with(".flac"),
        };
        let images =
            is_flac.then(|| PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))));
        Self {
            item: item.clone(),
            output: OnceLock::new(),
            images,
        }
    }

    /// Read and memoize the Vorbis Comment tags as `label -> array of strings`.
    ///
    /// Non-`.flac` items memoize an empty map without touching the file.
    /// Tag keys are lowercased; keys that are not valid labels are dropped.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }
        let key = match &self.item {
            Item::File { path, .. } => path.to_str().unwrap_or_default().to_owned(),
            Item::S3 { key, .. } => key.to_string(),
        };
        if !key.ends_with(".flac") {
            // `set` may lose a race with another caller; either way the value
            // stored is the same empty map.
            let _ = self.output.set(HashMap::new());
            #[expect(clippy::unwrap_used)]
            return Ok(self.output.get().unwrap());
        }
        // Bridge the async reader into the blocking FLAC parser thread.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let raw_tags = tokio::task::spawn_blocking(move || {
            let reader = FlacReader::new(BufReader::new(reader));
            let mut tags: Vec<(String, String)> = Vec::new();
            for block in reader {
                match block.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))? {
                    FlacBlock::VorbisComment(comment) => {
                        for (k, v) in comment.comment.comments {
                            tags.push((k.to_string().to_lowercase(), v.into()));
                        }
                    }
                    // Metadata precedes audio; stop at the first audio frame.
                    FlacBlock::AudioFrame(_) => break,
                    _ => {}
                }
            }
            Ok::<_, std::io::Error>(tags)
        })
        .await
        .map_err(std::io::Error::other)??;
        // Group repeated tags under one label, as an array value.
        let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
        for (k, v) in raw_tags {
            if let Some(label) = Label::new(k) {
                output
                    .entry(label)
                    .or_default()
                    .push(PileValue::String(Arc::new(v.into())));
            }
        }
        let output: HashMap<Label, PileValue> = output
            .into_iter()
            .map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
            .collect();
        let _ = self.output.set(output);
        #[expect(clippy::unwrap_used)]
        return Ok(self.output.get().unwrap());
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor {
    /// `images` (when available) shadows any same-named tag; everything else
    /// is looked up in the memoized tag map.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        if name.as_str() == "images"
            && let Some(ref images) = self.images
        {
            return Ok(Some(images.clone()));
        }
        Ok(self.get_inner().await?.get(name).cloned())
    }

    /// Tag names plus `images` for `.flac` items.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let mut fields = self.get_inner().await?.keys().cloned().collect::<Vec<_>>();
        if self.images.is_some() {
            #[expect(clippy::unwrap_used)]
            fields.push(Label::new("images").unwrap());
        }
        Ok(fields)
    }
}

View File

@@ -1,77 +0,0 @@
use pile_config::Label;
use std::{
collections::HashMap,
path::Component,
sync::{Arc, OnceLock},
};
use crate::{Item, PileValue, extract::ObjectExtractor};
/// Exposes filesystem-derived fields (`extension`, `path`, `segments`)
/// for file-backed items.
pub struct FsExtractor {
    item: Item,
    // Memoized field map; computed once on first access.
    output: OnceLock<HashMap<Label, PileValue>>,
}
impl FsExtractor {
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Compute and memoize the field map.
    ///
    /// Non-`File` items get an empty map. Any path component that is not
    /// valid UTF-8 yields `Null` for the affected field.
    fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }
        let Item::File { path, .. } = &self.item else {
            return Ok(self.output.get_or_init(HashMap::new));
        };
        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([
            (
                // File extension without the dot, or Null if absent/non-UTF-8.
                Label::new("extension").unwrap(),
                path.extension()
                    .and_then(|x| x.to_str())
                    .map(|x| PileValue::String(Arc::new(x.into())))
                    .unwrap_or(PileValue::Null),
            ),
            (
                // Full path as a string, or Null if non-UTF-8.
                Label::new("path").unwrap(),
                path.to_str()
                    .map(|x| PileValue::String(Arc::new(x.into())))
                    .unwrap_or(PileValue::Null),
            ),
            (
                // Path components as an array of strings; Null if any
                // component is non-UTF-8.
                Label::new("segments").unwrap(),
                path.components()
                    .map(|x| match x {
                        Component::CurDir => Some(".".to_owned()),
                        Component::Normal(x) => x.to_str().map(|x| x.to_owned()),
                        Component::ParentDir => Some("..".to_owned()),
                        Component::RootDir => Some("/".to_owned()),
                        Component::Prefix(x) => x.as_os_str().to_str().map(|x| x.to_owned()),
                    })
                    .map(|x| x.map(|x| PileValue::String(Arc::new(x.into()))))
                    .collect::<Option<Vec<_>>>()
                    .map(|v| PileValue::Array(Arc::new(v)))
                    .unwrap_or(PileValue::Null),
            ),
        ]);
        return Ok(self.output.get_or_init(|| output));
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for FsExtractor {
    /// Look up one memoized filesystem field by name.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        Ok(self.get_inner()?.get(name).cloned())
    }

    /// All available filesystem field names.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(self.get_inner()?.keys().cloned().collect())
    }
}

View File

@@ -1,130 +0,0 @@
use id3::Tag;
use pile_config::Label;
use std::{
borrow::Cow,
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
pub struct Id3Extractor {
item: Item,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl Id3Extractor {
pub fn new(item: &Item) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("mp3") | Some("aiff") | Some("aif") | Some("wav")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
.await
{
Ok(Ok(tag)) => tag,
Ok(Err(id3::Error {
kind: id3::ErrorKind::NoTag,
..
})) => {
return Ok(self.output.get_or_init(HashMap::new));
}
Ok(Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
})) => return Err(e),
Ok(Err(e)) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
Err(e) => return Err(e.into()),
};
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for frame in tag.frames() {
if let Some(text) = frame.content().text() {
let name = frame_id_to_field(frame.id());
if let Some(key) = Label::new(name) {
output
.entry(key)
.or_default()
.push(PileValue::String(Arc::new(text.into())));
}
}
}
let output = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
return Ok(self.output.get_or_init(|| output));
}
}
/// Map an ID3 frame ID to the equivalent Vorbis Comment field name.
/// Unknown frame IDs fall back to the lowercased frame ID.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
    // spell:off
    let known: Option<&'static str> = match id {
        "TIT2" => Some("title"),
        "TIT1" => Some("grouping"),
        "TIT3" => Some("subtitle"),
        "TPE1" => Some("artist"),
        "TPE2" => Some("albumartist"),
        "TPE3" => Some("conductor"),
        "TOPE" => Some("originalartist"),
        "TALB" => Some("album"),
        "TOAL" => Some("originalalbum"),
        "TRCK" => Some("tracknumber"),
        "TPOS" => Some("discnumber"),
        "TSST" => Some("discsubtitle"),
        "TDRC" | "TYER" => Some("date"),
        "TDOR" | "TORY" => Some("originaldate"),
        "TCON" => Some("genre"),
        "TCOM" => Some("composer"),
        "TEXT" => Some("lyricist"),
        "TPUB" => Some("label"),
        "TSRC" => Some("isrc"),
        "TBPM" => Some("bpm"),
        "TLAN" => Some("language"),
        "TMED" => Some("media"),
        "TMOO" => Some("mood"),
        "TCOP" => Some("copyright"),
        "TENC" => Some("encodedby"),
        "TSSE" => Some("encodersettings"),
        "TSOA" => Some("albumsort"),
        "TSOP" => Some("artistsort"),
        "TSOT" => Some("titlesort"),
        "MVNM" => Some("movement"),
        "MVIN" => Some("movementnumber"),
        _ => None,
    };
    // spell:on
    match known {
        Some(name) => Cow::Borrowed(name),
        None => Cow::Owned(id.to_lowercase()),
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for Id3Extractor {
    /// Look up one extracted tag field by label.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        let map = self.get_inner().await?;
        Ok(map.get(name).cloned())
    }
    /// List every label present in the extracted tag.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let map = self.get_inner().await?;
        let mut labels = Vec::with_capacity(map.len());
        labels.extend(map.keys().cloned());
        Ok(labels)
    }
}

View File

@@ -1,19 +0,0 @@
use pile_config::Label;
use std::collections::HashMap;
use crate::{PileValue, extract::ObjectExtractor};
/// The simplest [ObjectExtractor]: a plain, pre-computed label→value map.
pub struct MapExtractor {
    pub(crate) inner: HashMap<Label, PileValue>,
}
#[async_trait::async_trait]
impl ObjectExtractor for MapExtractor {
    /// Look up one field by label; clones the stored value.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        match self.inner.get(name) {
            Some(value) => Ok(Some(value.clone())),
            None => Ok(None),
        }
    }
    /// List every label in the map.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let mut labels = Vec::with_capacity(self.inner.len());
        labels.extend(self.inner.keys().cloned());
        Ok(labels)
    }
}

View File

@@ -1,165 +0,0 @@
use pile_config::Label;
use std::{collections::HashMap, sync::Arc};
mod flac;
pub use flac::*;
mod id3;
pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod exif;
pub use exif::*;
mod pdf;
pub use pdf::*;
mod toml;
pub use toml::*;
mod map;
pub use map::*;
mod sidecar;
pub use sidecar::*;
use crate::{Item, PileValue};
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
    /// Get the field at `name` from `item`.
    /// - returns `None` if `name` is not a valid field
    /// - returns `Some(Null)` if `name` is not available
    async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error>;
    /// Return all fields in this extractor.
    /// `Self::field` must return [Some] for all these keys
    /// and [None] for all others.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error>;
    /// Convert this to a JSON value.
    ///
    /// Fields whose lookup returns `None` are silently omitted.
    async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
        let keys = self.fields().await?;
        let mut map = serde_json::Map::new();
        for k in &keys {
            let v = match self.field(k).await? {
                Some(x) => x,
                None => continue,
            };
            // Box::pin breaks the recursive async type cycle: `to_json` may
            // recurse through nested extractors.
            map.insert(k.to_string(), Box::pin(v.to_json()).await?);
        }
        Ok(serde_json::Value::Object(map))
    }
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
    /// Get the item at index `idx`.
    /// Indices start at zero, and must be consecutive.
    /// - returns `None` if `idx` is out of range
    /// - returns `Some(Null)` if `None` is at `idx`
    async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error>;
    /// Number of elements; `get` must return `Some` for every index below this.
    async fn len(&self) -> Result<usize, std::io::Error>;
    async fn is_empty(&self) -> Result<bool, std::io::Error> {
        Ok(self.len().await? == 0)
    }
    /// Convert this list to a JSON value.
    async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
        let len = self.len().await?;
        let mut list = Vec::with_capacity(len);
        for i in 0..len {
            // Relies on the trait contract: indices 0..len are all present.
            #[expect(clippy::expect_used)]
            let v = self
                .get(i)
                .await?
                .expect("value must be present according to length");
            list.push(Box::pin(v.to_json()).await?);
        }
        Ok(serde_json::Value::Array(list))
    }
}
/// Root extractor exposing every format-specific extractor as a field
/// (`flac`, `id3`, `fs`, `epub`, `exif`, `pdf`, `toml`, `sidecar`).
pub struct MetaExtractor {
    inner: MapExtractor,
}
impl MetaExtractor {
    /// Build the extractor map for `item`.
    ///
    /// All sub-extractors are lazy, so construction performs no I/O.
    /// The label literals are compile-time constants known to be valid,
    /// hence the unwraps.
    #[expect(clippy::unwrap_used)]
    pub fn new(item: &Item) -> Self {
        let mut inner = HashMap::new();
        inner.insert(
            Label::new("flac").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
        );
        inner.insert(
            Label::new("id3").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
        );
        inner.insert(
            Label::new("fs").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
        );
        inner.insert(
            Label::new("epub").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
        );
        inner.insert(
            Label::new("exif").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
        );
        inner.insert(
            Label::new("pdf").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
        );
        inner.insert(
            Label::new("toml").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
        );
        inner.insert(
            Label::new("sidecar").unwrap(),
            crate::PileValue::ObjectExtractor(Arc::new(SidecarExtractor::new(item))),
        );
        Self {
            inner: MapExtractor { inner },
        }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for MetaExtractor {
    /// Delegate field lookup to the pre-built extractor map.
    async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
        self.inner.field(name).await
    }
    /// List every registered sub-extractor label.
    ///
    /// Fix: "toml" was missing from this list even though `new` registers a
    /// TomlExtractor under that label, violating the [ObjectExtractor]
    /// contract that `field` returns [Some] exactly for the labels listed
    /// here. Labels are compile-time constants known to be valid.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(vec![
            Label::new("flac").unwrap(),
            Label::new("id3").unwrap(),
            Label::new("fs").unwrap(),
            Label::new("epub").unwrap(),
            Label::new("exif").unwrap(),
            Label::new("pdf").unwrap(),
            Label::new("toml").unwrap(),
            Label::new("sidecar").unwrap(),
        ])
    }
}

View File

@@ -1,69 +0,0 @@
use pile_config::Label;
use std::sync::Arc;
#[cfg(feature = "pdfium")]
mod pdf_cover;
#[cfg(feature = "pdfium")]
pub use pdf_cover::*;
#[cfg(feature = "pdfium")]
mod pdf_pages;
#[cfg(feature = "pdfium")]
pub use pdf_pages::*;
mod pdf_meta;
pub use pdf_meta::*;
mod pdf_text;
pub use pdf_text::*;
use crate::{Item, PileValue, extract::ObjectExtractor};
/// Aggregates the PDF sub-extractors (`text`, `meta`, and — with the
/// `pdfium` feature — `cover` and `pages`) under one object.
pub struct PdfExtractor {
    text: Arc<PdfTextExtractor>,
    meta: Arc<PdfMetaExtractor>,
    #[cfg(feature = "pdfium")]
    cover: Arc<PdfCoverExtractor>,
    #[cfg(feature = "pdfium")]
    pages: Arc<PdfPagesExtractor>,
}
impl PdfExtractor {
    /// Build all sub-extractors for `item`; they are lazy, so no I/O
    /// happens here.
    pub fn new(item: &Item) -> Self {
        Self {
            text: Arc::new(PdfTextExtractor::new(item)),
            meta: Arc::new(PdfMetaExtractor::new(item)),
            #[cfg(feature = "pdfium")]
            cover: Arc::new(PdfCoverExtractor::new(item)),
            #[cfg(feature = "pdfium")]
            pages: Arc::new(PdfPagesExtractor::new(item)),
        }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfExtractor {
    async fn field(&self, name: &pile_config::Label) -> Result<Option<PileValue>, std::io::Error> {
        // "text"/"cover" delegate to the sub-extractor's field of the same
        // name, while "meta"/"pages" hand back the whole sub-extractor.
        // NOTE(review): the asymmetry looks intentional (text/cover hold a
        // single value, meta/pages are nested) — confirm before unifying.
        match name.as_str() {
            "text" => self.text.field(name).await,
            "meta" => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
            #[cfg(feature = "pdfium")]
            "cover" => self.cover.field(name).await,
            #[cfg(feature = "pdfium")]
            "pages" => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
            _ => Ok(None),
        }
    }
    // Labels are compile-time constants known to be valid.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(vec![
            Label::new("text").unwrap(),
            Label::new("meta").unwrap(),
            #[cfg(feature = "pdfium")]
            Label::new("cover").unwrap(),
            #[cfg(feature = "pdfium")]
            Label::new("pages").unwrap(),
        ])
    }
}

View File

@@ -1,95 +0,0 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use pile_config::Label;
use std::{
collections::HashMap,
io::{BufReader, Cursor},
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
/// Renders the first page of a PDF (via pdfium) to a 1024-px-wide PNG,
/// exposed under the single field `cover`.
pub struct PdfCoverExtractor {
    // The item to render; cloned so the extractor owns its data.
    item: Item,
    // Rendered result, computed at most once; empty map when rendering failed.
    output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfCoverExtractor {
    /// Build an extractor for `item`. Rendering happens on first field access.
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }
    /// Render (once) and return the field map.
    ///
    /// Render failures are logged at `trace` level and yield an empty map
    /// rather than an error, so a broken PDF does not poison extraction;
    /// only task-level failures (panic/cancel) surface as errors.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }
        // pdfium is a blocking C library: bridge the async reader and do all
        // rendering on a blocking task.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let cover = tokio::task::spawn_blocking(move || {
            // NOTE: buffers the whole PDF in memory before rendering.
            let mut bytes = Vec::new();
            std::io::Read::read_to_end(&mut BufReader::new(reader), &mut bytes)?;
            let pdfium = Pdfium::default();
            let document = pdfium
                .load_pdf_from_byte_slice(&bytes, None)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
            let render_config = PdfRenderConfig::new().set_target_width(1024);
            // Page 0 is the cover.
            let page = document
                .pages()
                .get(0)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
            let image = page
                .render_with_config(&render_config)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?
                .as_image();
            let mut png_bytes = Vec::new();
            image
                .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
                .map_err(|e| std::io::Error::other(e.to_string()))?;
            Ok::<_, std::io::Error>(png_bytes)
        })
        .await
        // Outer error: the blocking task itself panicked or was cancelled.
        .map_err(std::io::Error::other)?;
        let output = match cover {
            Ok(data) => {
                // The label is a compile-time constant known to be valid.
                #[expect(clippy::unwrap_used)]
                let label = Label::new("cover").unwrap();
                HashMap::from([(
                    label,
                    PileValue::Blob {
                        mime: mime::IMAGE_PNG,
                        bytes: Arc::new(data),
                    },
                )])
            }
            Err(error) => {
                trace!(message = "Could not render pdf cover", ?error, key = ?self.item.key());
                HashMap::new()
            }
        };
        return Ok(self.output.get_or_init(|| output));
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfCoverExtractor {
    /// Look up the (only) `cover` field by label.
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        let map = self.get_inner().await?;
        Ok(map.get(name).cloned())
    }
    /// List the available labels (empty when rendering failed).
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let map = self.get_inner().await?;
        let mut labels = Vec::with_capacity(map.len());
        labels.extend(map.keys().cloned());
        Ok(labels)
    }
}

View File

@@ -1,44 +0,0 @@
use pile_config::Label;
use std::sync::OnceLock;
use crate::{
Item, PileValue,
extract::{ObjectExtractor, TomlExtractor},
};
/// Exposes an item's TOML sidecar file (if any) as metadata fields.
pub struct SidecarExtractor {
    // The item whose sidecar is read; cloned so the extractor owns its data.
    item: Item,
    // Lazily-created TOML extractor over the sidecar item; an initialized
    // `None` means the item has no sidecar.
    output: OnceLock<Option<TomlExtractor>>,
}
impl SidecarExtractor {
    /// Build an extractor for `item`. No I/O happens until a field is read.
    pub fn new(item: &Item) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for SidecarExtractor {
    async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
        match self
            .output
            .get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
        {
            Some(x) => Ok(x.field(name).await?),
            // NOTE(review): with no sidecar, `field` answers Some(Null) for
            // *every* name while `fields` returns an empty list, which
            // deviates from the ObjectExtractor contract ("field must return
            // None for keys not in fields"). Confirm callers rely on this
            // before changing it.
            None => Ok(Some(PileValue::Null)),
        }
    }
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        match self
            .output
            .get_or_init(|| self.item.sidecar().map(TomlExtractor::new))
        {
            Some(x) => Ok(x.fields().await?),
            None => Ok(Vec::new()),
        }
    }
}

View File

@@ -1,18 +1,16 @@
use itertools::Itertools;
use pile_config::{Case, ConfigToml, DatasetFts, FieldSpecPost, Label};
use std::{
path::PathBuf,
sync::{Arc, LazyLock},
use pile_config::{ConfigToml, DatasetFts, Label, objectpath::ObjectPath};
use pile_value::{
extract::traits::ExtractState,
value::{Item, PileValue},
};
use std::{path::PathBuf, sync::LazyLock};
use tantivy::{
DocAddress, Index, ReloadPolicy, TantivyDocument, TantivyError,
collector::Collector,
query::QueryParser,
schema::{self, Schema, Value as TantivyValue},
};
use tracing::{debug, trace, warn};
use crate::{Item, PileValue, extract::MetaExtractor};
use tracing::warn;
#[derive(Debug, Clone)]
pub struct FtsLookupResult {
@@ -68,6 +66,7 @@ impl DbFtsIndex {
/// Turn an entry into a tantivy document
pub async fn entry_to_document(
&self,
state: &ExtractState,
item: &Item,
) -> Result<Option<TantivyDocument>, TantivyError> {
let mut doc = TantivyDocument::default();
@@ -76,22 +75,16 @@ impl DbFtsIndex {
doc.add_text(self.schema.get_field("_meta_source")?, item.source_name());
doc.add_text(self.schema.get_field("_meta_key")?, key);
let extractor = PileValue::ObjectExtractor(Arc::new(MetaExtractor::new(item)));
let item = PileValue::Item(item.clone());
let mut empty = true;
for name in self.fts_cfg().fields.keys() {
let x = self.get_field(&extractor, name).await?;
let vals = self.get_field(state, &item, name).await?;
let field = self.schema.get_field(name)?;
let val = match x {
Some(x) => x,
None => continue,
};
empty = false;
let field = self.schema.get_field(name);
if let Ok(field) = field {
doc.add_text(field, val);
for v in vals {
empty = false;
doc.add_text(field, v);
}
}
@@ -104,110 +97,32 @@ impl DbFtsIndex {
pub async fn get_field(
&self,
state: &ExtractState,
extractor: &PileValue,
field_name: &Label,
) -> Result<Option<String>, std::io::Error> {
) -> Result<Vec<String>, std::io::Error> {
let field = match self.cfg.schema.get(field_name) {
Some(x) => x,
None => {
warn!("Unknown field {field_name:?}");
return Ok(None);
return Ok(Vec::new());
}
};
// Try paths in order, using the first value we find
'outer: for path in field.path.as_slice() {
let val = match extractor.query(path).await? {
for path in field.path.as_slice() {
let val = match extractor.query(state, path).await? {
Some(PileValue::Null) | None => continue,
Some(x) => x,
None => return Ok(None),
};
let mut val = match val {
PileValue::Null => {
trace!(
message = "Skipping field, is null",
field = field_name.to_string(),
?path,
// value = ?val
);
continue;
}
x => x.clone(),
};
for post in &field.post {
val = match apply(post, &val) {
Some(x) => x,
None => return Ok(None),
};
}
loop {
val = match val {
PileValue::String(x) => return Ok(Some(x.to_string())),
PileValue::U64(x) => return Ok(Some(x.to_string())),
PileValue::I64(x) => return Ok(Some(x.to_string())),
PileValue::Array(x) => {
if x.len() == 1 {
x[0].clone()
} else if x.len() > 1 {
debug!(
message = "Skipping field, is array with more than one element",
field = field_name.to_string(),
?path,
);
continue 'outer;
} else {
debug!(
message = "Skipping field, is empty array",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
}
PileValue::Null => {
trace!(
message = "Skipping field, is null",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::ObjectExtractor(_) => {
trace!(
message = "Skipping field, is object",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::ListExtractor(_) => {
trace!(
message = "Skipping field, is ListExtractor",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
PileValue::Blob { .. } => {
trace!(
message = "Skipping field, is blob",
field = field_name.to_string(),
?path,
);
continue 'outer;
}
}
let val = val_to_string(state, &val, path, field_name).await?;
if !val.is_empty() {
return Ok(val);
}
}
return Ok(None);
return Ok(Vec::new());
}
/// Run the given query on this table's ftx index.
@@ -297,103 +212,41 @@ impl DbFtsIndex {
}
}
pub fn apply(post: &FieldSpecPost, val: &PileValue) -> Option<PileValue> {
Some(match post {
FieldSpecPost::NotEmpty { notempty: false } => val.clone(),
FieldSpecPost::NotEmpty { notempty: true } => match val {
PileValue::Null => return None,
PileValue::String(x) if x.is_empty() => return None,
PileValue::Array(x) if x.is_empty() => return None,
x => x.clone(),
},
async fn val_to_string(
state: &ExtractState,
val: &PileValue,
path: &ObjectPath,
field_name: &str,
) -> Result<Vec<String>, std::io::Error> {
match val {
PileValue::String(x) => return Ok(vec![x.to_string()]),
PileValue::U64(x) => return Ok(vec![x.to_string()]),
PileValue::I64(x) => return Ok(vec![x.to_string()]),
FieldSpecPost::SetCase { case: Case::Lower } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_lowercase().into())),
PileValue::Array(x) => {
let mut out = Vec::new();
for x in x.iter() {
out.extend(Box::pin(val_to_string(state, x, path, field_name)).await?);
}
return Ok(out);
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter().map(|x| apply(post, x)).collect::<Option<_>>()?,
)),
},
#[expect(clippy::unwrap_used)]
PileValue::ListExtractor(x) => {
let mut out = Vec::new();
let len = x.len(state).await?;
for i in 0..len {
let v = x.get(state, i).await?;
out.extend(Box::pin(val_to_string(state, &v.unwrap(), path, field_name)).await?);
}
return Ok(out);
}
FieldSpecPost::SetCase { case: Case::Upper } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(x.as_str().to_uppercase().into())),
PileValue::Null => {}
PileValue::ObjectExtractor(_) => {}
PileValue::Item(_) => {}
PileValue::Binary(_) => {}
}
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimSuffix { trim_suffix } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_suffix(trim_suffix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::TrimPrefix { trim_prefix } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(Arc::new(
x.strip_prefix(trim_prefix).unwrap_or(x.as_str()).into(),
)),
PileValue::Array(x) => PileValue::Array(Arc::new(
x.iter()
.map(|x| apply(post, x))
.collect::<Option<Vec<_>>>()?,
)),
},
FieldSpecPost::Join { join } => match val {
PileValue::Null => return None,
PileValue::U64(_) => return None,
PileValue::I64(_) => return None,
PileValue::Blob { .. } => return None,
PileValue::ObjectExtractor(_) => return None,
PileValue::ListExtractor(_) => return None,
PileValue::String(x) => PileValue::String(x.clone()),
PileValue::Array(x) => PileValue::String(Arc::new(
x.iter()
.map(|x| apply(post, x))
.map(|x| x.and_then(|x| x.as_str().map(|x| x.to_owned())))
.collect::<Option<Vec<_>>>()?
.into_iter()
.join(join)
.into(),
)),
},
})
return Ok(Vec::new());
}

View File

@@ -1,294 +0,0 @@
use mime::Mime;
use smartstring::{LazyCompact, SmartString};
use std::{
fs::File,
io::{Read, Seek, SeekFrom},
path::PathBuf,
sync::Arc,
};
use tokio::runtime::Handle;
use crate::source::{DirDataSource, S3DataSource};
//
// MARK: item
//
/// A cheaply-clonable pointer to an item in a dataset
#[derive(Debug, Clone)]
pub enum Item {
    /// An item on the local filesystem.
    File {
        source: Arc<DirDataSource>,
        mime: Mime,
        path: PathBuf,
        // Optional metadata sidecar (a sibling item).
        sidecar: Option<Box<Item>>,
    },
    /// An object in an S3 bucket.
    S3 {
        source: Arc<S3DataSource>,
        mime: Mime,
        key: SmartString<LazyCompact>,
        sidecar: Option<Box<Item>>,
    },
}
impl Item {
    /// Open the item for reading. For S3, performs a HEAD request to determine
    /// the object size.
    pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
        Ok(match self {
            Self::File { path, .. } => ItemReader::File(File::open(path)?),
            Self::S3 { source, key, .. } => {
                let head = source
                    .client
                    .head_object()
                    .bucket(source.bucket.as_str())
                    .key(key.as_str())
                    .send()
                    .await
                    .map_err(std::io::Error::other)?;
                // NOTE(review): a missing content-length falls back to 0,
                // which makes every subsequent read return EOF — confirm
                // this is the intended behavior.
                let size = head.content_length().unwrap_or(0) as u64;
                ItemReader::S3(S3Reader {
                    client: source.client.clone(),
                    bucket: source.bucket.clone(),
                    key: key.to_owned(),
                    cursor: 0,
                    size,
                })
            }
        })
    }
    /// Label of the data source this item belongs to.
    pub fn source_name(&self) -> &pile_config::Label {
        match self {
            Self::File { source, .. } => &source.name,
            Self::S3 { source, .. } => &source.name,
        }
    }
    /// The item's key: the file path for local files, the object key for S3.
    ///
    /// # Panics
    /// Panics if a local file path is not valid UTF-8.
    #[expect(clippy::expect_used)]
    pub fn key(&self) -> SmartString<LazyCompact> {
        match self {
            Self::File { path, .. } => path.to_str().expect("path is not utf-8").into(),
            Self::S3 { key, .. } => key.clone(),
        }
    }
    /// Blake3 hash of the item's full contents.
    ///
    /// Reads the whole file synchronously.
    ///
    /// # Panics
    /// Not yet implemented for S3 items (`todo!`).
    pub fn hash(&self) -> Result<blake3::Hash, std::io::Error> {
        match self {
            Self::File { path, .. } => {
                let mut hasher = blake3::Hasher::new();
                let mut file = std::fs::File::open(path)?;
                std::io::copy(&mut file, &mut hasher)?;
                return Ok(hasher.finalize());
            }
            Self::S3 { .. } => todo!(),
        }
    }
    /// MIME type guessed when the item was created.
    pub fn mime(&self) -> &Mime {
        match self {
            Self::File { mime, .. } => mime,
            Self::S3 { mime, .. } => mime,
        }
    }
    /// The item's metadata sidecar, if one is configured.
    pub fn sidecar(&self) -> Option<&Self> {
        match self {
            Self::File { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
            Self::S3 { sidecar, .. } => sidecar.as_ref().map(|x| &**x),
        }
    }
}
//
// MARK: reader
//
/// A minimal async byte reader, implementable without boxing futures.
pub trait AsyncReader: Send {
    /// Read a chunk of bytes.
    ///
    /// Returns the number of bytes read; 0 signals end-of-stream
    /// (matching [std::io::Read] semantics).
    fn read(
        &mut self,
        buf: &mut [u8],
    ) -> impl Future<Output = Result<usize, std::io::Error>> + Send;
    /// Read all remaining bytes into a `Vec`.
    fn read_to_end(&mut self) -> impl Future<Output = Result<Vec<u8>, std::io::Error>> + Send {
        async {
            let mut buf = Vec::new();
            // Read in 64 KiB chunks until the reader reports EOF.
            let mut chunk = vec![0u8; 65536];
            loop {
                let n = self.read(&mut chunk).await?;
                if n == 0 {
                    break;
                }
                buf.extend_from_slice(&chunk[..n]);
            }
            Ok(buf)
        }
    }
}
/// An [AsyncReader] that also supports seeking.
pub trait AsyncSeekReader: AsyncReader {
    /// Move the cursor, returning the new absolute position.
    fn seek(&mut self, pos: SeekFrom) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
}
//
// MARK: sync bridge
//
/// Turn an async [Reader] into a sync [Read] + [Seek].
///
/// Never use this outside of [tokio::task::spawn_blocking],
/// the async runtime will deadlock if this struct blocks
/// the runtime.
pub struct SyncReadBridge<R: AsyncReader> {
    // The async reader being adapted.
    inner: R,
    // Runtime handle used to drive `inner`'s futures to completion.
    handle: Handle,
}
impl<R: AsyncReader> SyncReadBridge<R> {
    /// Creates a new adapter using a handle to the current runtime.
    /// Panics if called outside of tokio
    pub fn new_current(inner: R) -> Self {
        Self::new(inner, Handle::current())
    }
    /// Creates a new adapter using a handle to an existing runtime.
    pub fn new(inner: R, handle: Handle) -> Self {
        Self { inner, handle }
    }
}
impl<R: AsyncReader> Read for SyncReadBridge<R> {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        // Blocks the current (blocking-pool) thread until the async read
        // completes.
        self.handle.block_on(self.inner.read(buf))
    }
}
impl<R: AsyncReader + AsyncSeekReader> Seek for SyncReadBridge<R> {
    fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
        self.handle.block_on(self.inner.seek(pos))
    }
}
//
// MARK: itemreader
//
/// Concrete reader for the two [Item] kinds.
pub enum ItemReader {
    File(File),
    S3(S3Reader),
}
impl AsyncReader for ItemReader {
    async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        match self {
            // Local file reads are synchronous; they are fast enough to run
            // inline on the async task.
            Self::File(x) => std::io::Read::read(x, buf),
            Self::S3(x) => x.read(buf).await,
        }
    }
}
impl AsyncSeekReader for ItemReader {
    async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
        match self {
            Self::File(x) => x.seek(pos),
            Self::S3(x) => x.seek(pos).await,
        }
    }
}
//
// MARK: S3Reader
//
/// A cursor-based reader over an S3 object, fetching one HTTP range
/// request per `read` call.
pub struct S3Reader {
    client: Arc<aws_sdk_s3::Client>,
    bucket: SmartString<LazyCompact>,
    key: SmartString<LazyCompact>,
    // Current logical read position within the object.
    cursor: u64,
    // Object size, captured from the HEAD request in [Item::read].
    size: u64,
}
impl AsyncReader for S3Reader {
    async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        let len_left = self.size.saturating_sub(self.cursor);
        if len_left == 0 || buf.is_empty() {
            return Ok(0);
        }
        // HTTP Range headers are inclusive on both ends, hence the -1.
        let start_byte = self.cursor;
        let len_to_read = (buf.len() as u64).min(len_left);
        let end_byte = start_byte + len_to_read - 1;
        let resp = self
            .client
            .get_object()
            .bucket(self.bucket.as_str())
            .key(self.key.as_str())
            .range(format!("bytes={start_byte}-{end_byte}"))
            .send()
            .await
            .map_err(std::io::Error::other)?;
        let bytes = resp
            .body
            .collect()
            .await
            .map(|x| x.into_bytes())
            .map_err(std::io::Error::other)?;
        // Guard against the server returning more bytes than requested.
        let n = bytes.len().min(buf.len());
        buf[..n].copy_from_slice(&bytes[..n]);
        self.cursor += n as u64;
        Ok(n)
    }
}
impl AsyncSeekReader for S3Reader {
    /// Update the logical cursor; performs no network I/O.
    ///
    /// Positions past the end of the object are clamped to `size`
    /// (a ranged GET past EOF would fail anyway).
    ///
    /// # Errors
    /// Returns `InvalidInput` when the target position would be negative,
    /// or — fix — when the offset arithmetic overflows `u64` (the old code
    /// used unchecked `+`, which panics in debug and wraps in release for
    /// huge positive offsets).
    async fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
        // Small helper so every failure path builds the same error kind.
        fn invalid(msg: &'static str) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::InvalidInput, msg)
        }
        let target = match pos {
            SeekFrom::Start(x) => x,
            SeekFrom::Current(x) => {
                if x < 0 {
                    self.cursor
                        .checked_sub(x.unsigned_abs())
                        .ok_or_else(|| invalid("cannot seek past start"))?
                } else {
                    self.cursor
                        .checked_add(x as u64)
                        .ok_or_else(|| invalid("seek offset overflow"))?
                }
            }
            SeekFrom::End(x) => {
                if x < 0 {
                    self.size
                        .checked_sub(x.unsigned_abs())
                        .ok_or_else(|| invalid("cannot seek past start"))?
                } else {
                    self.size
                        .checked_add(x as u64)
                        .ok_or_else(|| invalid("seek offset overflow"))?
                }
            }
        };
        // Clamp to EOF, matching the original behavior.
        self.cursor = target.min(self.size);
        Ok(self.cursor)
    }
}

View File

@@ -1,21 +1,4 @@
mod traits;
pub use traits::*;
mod misc;
pub use misc::*;
mod dataset;
pub use dataset::{Dataset, DatasetError, Datasets};
mod item;
pub use item::*;
mod value;
pub use value::*;
pub mod extract;
pub mod index;
pub mod source;
#[cfg(feature = "axum")]
pub mod serve;

View File

@@ -1,99 +0,0 @@
use axum::{
Json,
extract::{Query, State},
http::{StatusCode, header},
response::{IntoResponse, Response},
};
use pile_config::{Label, objectpath::ObjectPath};
use serde::Deserialize;
use std::{sync::Arc, time::Instant};
use tracing::debug;
use utoipa::ToSchema;
use crate::{Datasets, PileValue, extract::MetaExtractor};
#[derive(Deserialize, ToSchema)]
pub struct FieldQuery {
    // Source label identifying the dataset.
    source: String,
    // Item key within the source.
    key: String,
    // Object path into the item's metadata, e.g. "$.flac.title".
    path: String,
}
/// Extract a specific field from an item's metadata
#[utoipa::path(
    get,
    path = "/field",
    params(
        ("source" = String, Query, description = "Source label"),
        ("key" = String, Query, description = "Item key"),
        ("path" = String, Query, description = "Object path (e.g. $.flac.title)"),
    ),
    responses(
        (status = 200, description = "Field value as JSON"),
        (status = 400, description = "Invalid source label or path"),
        (status = 404, description = "Item or field not found"),
        (status = 500, description = "Internal server error"),
    )
)]
pub async fn get_field(
    State(state): State<Arc<Datasets>>,
    Query(params): Query<FieldQuery>,
) -> Response {
    let start = Instant::now();
    debug!(
        message = "Serving /field",
        source = params.source,
        key = params.key,
        path = params.path,
    );
    // Validate the user-supplied source label and object path up front.
    let label = match Label::try_from(params.source.clone()) {
        Ok(l) => l,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    let path: ObjectPath = match params.path.parse() {
        Ok(p) => p,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    let Some(item) = state.get(&label, &params.key).await else {
        return StatusCode::NOT_FOUND.into_response();
    };
    // Walk the object path starting from the root metadata extractor.
    let extractor = MetaExtractor::new(&item);
    let root: PileValue = PileValue::ObjectExtractor(Arc::new(extractor));
    let value = match root.query(&path).await {
        Ok(Some(v)) => v,
        Ok(None) => return StatusCode::NOT_FOUND.into_response(),
        Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
    };
    debug!(
        message = "Served /field",
        source = params.source,
        key = params.key,
        path = params.path,
        time_ms = start.elapsed().as_millis()
    );
    // Strings are served as text/plain, blobs under their own MIME type,
    // everything else as JSON.
    match value {
        PileValue::String(s) => (
            StatusCode::OK,
            [(header::CONTENT_TYPE, "text/plain")],
            s.to_string(),
        )
            .into_response(),
        PileValue::Blob { mime, bytes } => (
            StatusCode::OK,
            [(header::CONTENT_TYPE, mime.to_string())],
            bytes.as_ref().clone(),
        )
            .into_response(),
        _ => match value.to_json().await {
            Ok(json) => (StatusCode::OK, Json(json)).into_response(),
            Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        },
    }
}

View File

@@ -1,73 +0,0 @@
use axum::{
extract::{Query, State},
http::{StatusCode, header},
response::{IntoResponse, Response},
};
use pile_config::Label;
use serde::Deserialize;
use std::{sync::Arc, time::Instant};
use tracing::debug;
use utoipa::ToSchema;
use crate::{AsyncReader, Datasets};
#[derive(Deserialize, ToSchema)]
pub struct ItemQuery {
    // Source label identifying the dataset.
    source: String,
    // Item key within the source.
    key: String,
}
/// Fetch the raw bytes of an item by source and key
#[utoipa::path(
    get,
    path = "/item",
    params(
        ("source" = String, Query, description = "Source label"),
        ("key" = String, Query, description = "Item key"),
    ),
    responses(
        (status = 200, description = "Raw item bytes"),
        (status = 400, description = "Invalid source label"),
        (status = 404, description = "Item not found"),
        (status = 500, description = "Internal server error"),
    )
)]
pub async fn item_get(
    State(state): State<Arc<Datasets>>,
    Query(params): Query<ItemQuery>,
) -> Response {
    let start = Instant::now();
    debug!(
        message = "Serving /item",
        source = params.source,
        key = params.key
    );
    // Validate the user-supplied source label up front.
    let label = match Label::try_from(params.source.clone()) {
        Ok(l) => l,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    let Some(item) = state.get(&label, &params.key).await else {
        return StatusCode::NOT_FOUND.into_response();
    };
    let mime = item.mime().to_string();
    let mut reader = match item.read().await {
        Ok(r) => r,
        Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
    };
    // NOTE: buffers the entire item in memory; consider a streaming body
    // for very large objects.
    let bytes = match reader.read_to_end().await {
        Ok(b) => b,
        Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
    };
    // Fix: log completion only after the body has actually been read — the
    // old code logged "Served /item" (with time_ms) before the read, so the
    // timing excluded the transfer and read failures still logged success.
    debug!(
        message = "Served /item",
        source = params.source,
        key = params.key,
        time_ms = start.elapsed().as_millis()
    );
    (StatusCode::OK, [(header::CONTENT_TYPE, mime)], bytes).into_response()
}

View File

@@ -1,47 +0,0 @@
use axum::{
Router,
routing::{get, post},
};
use std::sync::Arc;
use utoipa::OpenApi;
use utoipa_swagger_ui::SwaggerUi;
use crate::Datasets;
mod lookup;
pub use lookup::*;
mod item;
pub use item::*;
mod field;
pub use field::*;
// OpenAPI specification for the dataset HTTP API. Register new handlers in
// `paths(...)` and their request/response schemas in `components(...)`.
#[derive(OpenApi)]
#[openapi(
    tags(),
    paths(lookup, item_get, get_field),
    components(schemas(LookupRequest, LookupResponse, LookupResult, ItemQuery, FieldQuery))
)]
pub(crate) struct Api;
impl Datasets {
    /// Build the axum router for the dataset API.
    ///
    /// When `with_docs` is set, a Swagger UI (plus its `openapi.json`)
    /// is mounted under `/docs`.
    #[inline]
    pub fn router(self: Arc<Self>, with_docs: bool) -> Router<()> {
        let api = Router::new()
            .route("/lookup", post(lookup))
            .route("/item", get(item_get))
            .route("/field", get(get_field))
            .with_state(self.clone());
        if !with_docs {
            return api;
        }
        let docs_path = "/docs";
        let docs =
            SwaggerUi::new(docs_path).url(format!("{}/openapi.json", docs_path), Api::openapi());
        api.merge(docs)
    }
}

View File

@@ -1,128 +0,0 @@
use chrono::{DateTime, Utc};
use pile_config::Label;
use std::{path::PathBuf, sync::Arc};
use tokio_stream::wrappers::ReceiverStream;
use walkdir::WalkDir;
use crate::{DataSource, Item, path_ts_latest};
/// A data source backed by a directory on the local filesystem.
#[derive(Debug)]
pub struct DirDataSource {
    /// Label this source is registered under.
    pub name: Label,
    /// Root directory all keys are resolved against.
    pub dir: PathBuf,
    /// When true, a neighboring `.toml` file (same path with the extension
    /// replaced) is attached as an item's sidecar, and `.toml` files
    /// themselves are hidden from `get`/`iter`.
    pub sidecars: bool,
}
impl DirDataSource {
    /// Create a directory-backed source rooted at `dir`.
    ///
    /// `sidecars` enables `.toml` sidecar handling for items in this source.
    pub fn new(name: &Label, dir: PathBuf, sidecars: bool) -> Self {
        Self {
            name: name.clone(),
            dir,
            sidecars,
        }
    }
}
impl DataSource for Arc<DirDataSource> {
    /// Look up a single file by its path relative to `self.dir`.
    ///
    /// Returns `Ok(None)` if the path does not name a regular file, or if it
    /// is a `.toml` file while sidecars are enabled.
    async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
        // NOTE(review): `key` is joined to the root unchecked, so a key
        // containing `..` components can escape `self.dir` — confirm callers
        // sanitize keys, or reject parent-dir components here.
        //
        // `PathBuf::from` is infallible, so the old `parse()` error branch
        // was dead code.
        let path = self.dir.join(PathBuf::from(key));
        if !path.is_file() {
            return Ok(None);
        }
        // Ignore toml files if sidecars are enabled
        if self.sidecars && path.extension().and_then(|x| x.to_str()) == Some("toml") {
            return Ok(None);
        }
        Ok(Some(Item::File {
            source: Arc::clone(self),
            mime: mime_guess::from_path(&path).first_or_octet_stream(),
            path: path.clone(),
            // Sidecar is the same path with its extension replaced by `.toml`
            // (e.g. `a/b.flac` -> `a/b.toml`); existence is not checked here.
            sidecar: self.sidecars.then(|| {
                Box::new(Item::File {
                    source: Arc::clone(self),
                    mime: mime_guess::from_path(path.with_extension("toml")).first_or_octet_stream(),
                    path: path.with_extension("toml"),
                    sidecar: None,
                })
            }),
        }))
    }
    /// Walk the directory tree and stream every item found.
    fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
        let (tx, rx) = tokio::sync::mpsc::channel(64);
        let source = Arc::clone(self);
        let dir = self.dir.clone();
        // Directory walking is blocking I/O; keep it off the async runtime.
        tokio::task::spawn_blocking(move || {
            for entry in WalkDir::new(dir) {
                let entry = match entry {
                    Err(e) => {
                        let msg = format!("walkdir error: {e:?}");
                        // `unwrap_or_else` so the fallback error is only built
                        // when needed.
                        let err = e.into_io_error().unwrap_or_else(|| std::io::Error::other(msg));
                        if tx.blocking_send(Err(err)).is_err() {
                            // Receiver dropped; stop walking.
                            return;
                        }
                        continue;
                    }
                    Ok(e) => e,
                };
                if entry.file_type().is_dir() {
                    continue;
                }
                let path = entry.into_path();
                let item = match path.extension().and_then(|x| x.to_str()) {
                    // NOTE(review): extensionless files are skipped here but
                    // remain reachable via `get` — confirm this asymmetry is
                    // intended.
                    None => continue,
                    Some("toml") if source.sidecars => continue,
                    Some(_) => Item::File {
                        source: Arc::clone(&source),
                        mime: mime_guess::from_path(&path).first_or_octet_stream(),
                        path: path.clone(),
                        sidecar: source.sidecars.then(|| {
                            Box::new(Item::File {
                                source: Arc::clone(&source),
                                mime: mime_guess::from_path(path.with_extension("toml"))
                                    .first_or_octet_stream(),
                                path: path.with_extension("toml"),
                                sidecar: None,
                            })
                        }),
                    },
                };
                if tx.blocking_send(Ok(item)).is_err() {
                    return;
                }
            }
        });
        ReceiverStream::new(rx)
    }
    /// Newest modification timestamp anywhere under `self.dir`,
    /// or `Ok(None)` if the directory does not exist.
    async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
        if !self.dir.exists() {
            return Ok(None);
        }
        // `path_ts_latest` already returns the newest timestamp under the
        // root; the old `match (ts, new)` fold was dead code since `ts` was
        // always `None`.
        Ok(path_ts_latest(&self.dir)?)
    }
}

View File

@@ -1,5 +0,0 @@
mod dir;
pub use dir::*;
mod s3;
pub use s3::*;

View File

@@ -1,255 +0,0 @@
use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region};
use chrono::{DateTime, Utc};
use pile_config::{Label, S3Credentials};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use tokio_stream::wrappers::ReceiverStream;
use crate::{DataSource, Item};
/// A data source backed by an S3 bucket, optionally restricted to a key prefix.
#[derive(Debug)]
pub struct S3DataSource {
    /// Label this source is registered under.
    pub name: Label,
    /// Bucket all keys are resolved against.
    pub bucket: SmartString<LazyCompact>,
    /// Optional key prefix, applied when listing (`iter` / `latest_change`).
    pub prefix: Option<SmartString<LazyCompact>>,
    /// When true, `.toml` objects are treated as sidecars and hidden from results.
    pub sidecars: bool,
    /// Shared S3 client; cloned (via the Arc on `self`) into spawned listing tasks.
    pub client: Arc<aws_sdk_s3::Client>,
}
impl S3DataSource {
    /// Build an S3-backed source and its client from static credentials.
    ///
    /// Setting `endpoint` also switches the client to path-style addressing,
    /// as needed by most non-AWS S3-compatible servers.
    ///
    /// Currently always returns `Ok`; the `Result` leaves room for future
    /// validation.
    pub fn new(
        name: &Label,
        bucket: String,
        prefix: Option<String>,
        endpoint: Option<String>,
        region: String,
        credentials: &S3Credentials,
        sidecars: bool,
    ) -> Result<Self, std::io::Error> {
        let client = {
            let creds = Credentials::new(
                &credentials.access_key_id,
                &credentials.secret_access_key,
                None,
                None,
                // Provider name, shown in SDK diagnostics.
                "pile",
            );
            let mut s3_config = aws_sdk_s3::config::Builder::new()
                .behavior_version(BehaviorVersion::latest())
                .region(Region::new(region))
                .credentials_provider(creds);
            if let Some(ep) = endpoint {
                s3_config = s3_config.endpoint_url(ep).force_path_style(true);
            }
            aws_sdk_s3::Client::from_conf(s3_config.build())
        };
        Ok(Self {
            name: name.clone(),
            bucket: bucket.into(),
            prefix: prefix.map(|x| x.into()),
            sidecars,
            client: Arc::new(client),
        })
    }
    /// Probe (via `HeadObject`) for a sidecar key belonging to `key`:
    /// first `{key}.toml`, then `{key with extension replaced}.toml`.
    ///
    /// NOTE(review): any HeadObject failure — including transient network
    /// errors, not just 404 — is treated as "no sidecar"; confirm this
    /// best-effort behavior is intended.
    async fn find_sidecar_key(&self, key: &str) -> Option<SmartString<LazyCompact>> {
        // First try {key}.toml
        let full_toml = format!("{key}.toml");
        if self
            .client
            .head_object()
            .bucket(self.bucket.as_str())
            .key(&full_toml)
            .send()
            .await
            .is_ok()
        {
            return Some(full_toml.into());
        }
        // Then try {key-with-extension-stripped}.toml
        // (skipped when it would be identical to the first probe).
        let stripped = std::path::Path::new(key).with_extension("toml");
        if let Some(stripped_str) = stripped.to_str()
            && stripped_str != full_toml.as_str()
            && self
                .client
                .head_object()
                .bucket(self.bucket.as_str())
                .key(stripped_str)
                .send()
                .await
                .is_ok()
        {
            return Some(stripped_str.into());
        }
        None
    }
    /// Build an S3 item for `key`.
    ///
    /// When sidecars are enabled, this issues up to two extra HeadObject
    /// requests (via `find_sidecar_key`) per item.
    async fn make_item(self: &Arc<Self>, key: impl Into<SmartString<LazyCompact>>) -> Item {
        let key: SmartString<LazyCompact> = key.into();
        let mime = mime_guess::from_path(key.as_str()).first_or_octet_stream();
        let sidecar = if self.sidecars {
            self.find_sidecar_key(key.as_str())
                .await
                .map(|sidecar_key| {
                    Box::new(Item::S3 {
                        source: Arc::clone(self),
                        mime: mime_guess::from_path(sidecar_key.as_str()).first_or_octet_stream(),
                        key: sidecar_key,
                        sidecar: None,
                    })
                })
        } else {
            None
        };
        Item::S3 {
            source: Arc::clone(self),
            mime,
            key,
            sidecar,
        }
    }
}
impl DataSource for Arc<S3DataSource> {
    /// Look up a single object by key (existence checked via `HeadObject`).
    ///
    /// Returns `Ok(None)` for missing objects and for `.toml` keys while
    /// sidecars are enabled; other SDK failures are surfaced as I/O errors.
    ///
    /// NOTE(review): unlike `iter`, this does not restrict `key` to
    /// `self.prefix` — confirm out-of-prefix fetches are intended.
    async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
        if self.sidecars && key.ends_with(".toml") {
            return Ok(None);
        }
        let result = self
            .client
            .head_object()
            .bucket(self.bucket.as_str())
            .key(key)
            .send()
            .await;
        match result {
            Err(sdk_err) => {
                // Only a service-level "not found" maps to Ok(None);
                // everything else (auth, network, ...) is a real error.
                let not_found = sdk_err
                    .as_service_error()
                    .map(|e| e.is_not_found())
                    .unwrap_or(false);
                if not_found {
                    return Ok(None);
                }
                Err(std::io::Error::other(sdk_err))
            }
            Ok(_) => Ok(Some(self.make_item(key).await)),
        }
    }
    /// Stream every object under `self.prefix` via paginated ListObjectsV2.
    ///
    /// A listing error is sent down the channel once and ends the stream.
    fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>> {
        let (tx, rx) = tokio::sync::mpsc::channel(64);
        let source = Arc::clone(self);
        tokio::spawn(async move {
            let mut continuation_token: Option<String> = None;
            loop {
                let mut req = source
                    .client
                    .list_objects_v2()
                    .bucket(source.bucket.as_str());
                if let Some(prefix) = &source.prefix {
                    req = req.prefix(prefix.as_str());
                }
                if let Some(token) = continuation_token {
                    req = req.continuation_token(token);
                }
                let resp = match req.send().await {
                    Err(e) => {
                        let _ = tx.send(Err(std::io::Error::other(e))).await;
                        break;
                    }
                    Ok(resp) => resp,
                };
                let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
                let is_truncated = resp.is_truncated().unwrap_or(false);
                for obj in resp.contents() {
                    let key = match obj.key() {
                        Some(k) => k.to_owned(),
                        None => continue,
                    };
                    if source.sidecars && key.ends_with(".toml") {
                        continue;
                    }
                    // With sidecars enabled, `make_item` can issue extra
                    // HeadObject calls per key (see `find_sidecar_key`).
                    let item = source.make_item(key).await;
                    if tx.send(Ok(item)).await.is_err() {
                        // Receiver dropped; stop listing.
                        return;
                    }
                }
                if !is_truncated {
                    break;
                }
                continuation_token = next_token;
            }
        });
        ReceiverStream::new(rx)
    }
    /// Newest `LastModified` across all listed objects under the prefix.
    ///
    /// NOTE(review): listing errors are swallowed (`Err(_) => Ok(None)`),
    /// so a transient failure looks like "no data" — confirm this
    /// best-effort behavior is intended for the auto-refresh path.
    async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
        let mut ts: Option<DateTime<Utc>> = None;
        let mut continuation_token: Option<String> = None;
        loop {
            let mut req = self.client.list_objects_v2().bucket(self.bucket.as_str());
            if let Some(prefix) = &self.prefix {
                req = req.prefix(prefix.as_str());
            }
            if let Some(token) = continuation_token {
                req = req.continuation_token(token);
            }
            let resp = match req.send().await {
                Err(_) => return Ok(None),
                Ok(resp) => resp,
            };
            let next_token = resp.next_continuation_token().map(ToOwned::to_owned);
            let is_truncated = resp.is_truncated().unwrap_or(false);
            for obj in resp.contents() {
                if let Some(last_modified) = obj.last_modified() {
                    let dt = DateTime::from_timestamp(
                        last_modified.secs(),
                        last_modified.subsec_nanos(),
                    );
                    // Keep the maximum timestamp seen so far.
                    if let Some(dt) = dt {
                        ts = Some(match ts {
                            None => dt,
                            Some(prev) => prev.max(dt),
                        });
                    }
                }
            }
            if !is_truncated {
                break;
            }
            continuation_token = next_token;
        }
        Ok(ts)
    }
}

View File

@@ -1,158 +0,0 @@
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
use mime::Mime;
use std::io::{Error as IoError, Seek, SeekFrom, Write};
use thiserror::Error;
use super::S3Client;
use crate::retry;
/// Errors produced while reading an S3 object via [`S3Reader`].
#[derive(Debug, Error)]
#[expect(clippy::large_enum_variant)]
pub enum S3ReaderError {
    /// The ranged GetObject request itself failed.
    #[error("sdk error")]
    SdkError(#[from] SdkError<GetObjectError>),
    /// Collecting the response body stream failed.
    #[error("byte stream error")]
    ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
    /// A local I/O failure (e.g. while writing to a download target).
    #[error("i/o error")]
    IoError(#[from] IoError),
}
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
///
/// Also implements [`std::io::Seek`]
pub struct S3Reader {
    // Client (with retry settings) used for ranged GetObject requests.
    pub(super) client: S3Client,
    // Bucket containing the object.
    pub(super) bucket: String,
    // Key of the object being read.
    pub(super) key: String,
    // Current read position in bytes; `cursor == size` means end-of-object.
    pub(super) cursor: u64,
    // Total object size in bytes.
    pub(super) size: u64,
    // MIME type reported for this object.
    pub(super) mime: Mime,
}
impl S3Reader {
    /// Read up to `buf.len()` bytes from the current cursor position.
    ///
    /// Each call issues one ranged `GetObject` request (with retries), so
    /// callers should prefer large buffers. Returns the number of bytes
    /// read; `Ok(0)` means end-of-object or an empty buffer.
    pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
        let len_left = self.size - self.cursor;
        if len_left == 0 || buf.is_empty() {
            return Ok(0);
        }
        #[expect(clippy::unwrap_used)] // TODO: probably fits?
        let start_byte = usize::try_from(self.cursor).unwrap();
        #[expect(clippy::unwrap_used)] // usize fits in u64
        let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
        #[expect(clippy::unwrap_used)] // must fit, we called min()
        let len_to_read = usize::try_from(len_to_read).unwrap();
        // HTTP ranges are inclusive on both ends.
        let end_byte = start_byte + len_to_read - 1;
        let b = retry!(
            self.client.retries,
            self.client
                .client
                .get_object()
                .bucket(self.bucket.as_str())
                .key(self.key.as_str())
                .range(format!("bytes={start_byte}-{end_byte}"))
                .send()
                .await
        )?;
        // Looks like `bytes 31000000-31999999/33921176``
        // println!("{:?}", b.content_range);
        let mut bytes = b.body.collect().await?.into_bytes();
        // Guard against a response longer than requested.
        bytes.truncate(len_to_read);
        let l = bytes.len();
        // Memory to memory writes are infallible
        #[expect(clippy::unwrap_used)]
        buf.write_all(&bytes).unwrap();
        // Cannot fail, usize should always fit into u64
        #[expect(clippy::unwrap_used)]
        {
            self.cursor += u64::try_from(l).unwrap();
        }
        return Ok(len_to_read);
    }
    /// True once the cursor has reached the end of the object.
    pub fn is_done(&self) -> bool {
        return self.cursor == self.size;
    }
    /// MIME type reported for this object.
    pub fn mime(&self) -> &Mime {
        &self.mime
    }
    /// Write the entire contents of this reader to `r`.
    ///
    /// This method always downloads the whole object,
    /// and always preserves `self.cursor`.
    pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
        // Remember the cursor so it can be restored afterwards.
        let pos = self.stream_position()?;
        // Download in ~10 MB ranged requests.
        const BUF_LEN: usize = 10_000_000;
        #[expect(clippy::unwrap_used)] // Cannot fail
        let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
        while !self.is_done() {
            let b = self.read(&mut buf[..]).await?;
            r.write_all(&buf[0..b])?;
        }
        self.seek(SeekFrom::Start(pos))?;
        Ok(())
    }
}
impl Seek for S3Reader {
    /// Reposition the cursor within the object.
    ///
    /// The cursor is clamped to `[0, self.size]`; seeking before byte 0 is an
    /// `InvalidInput` error, mirroring `std::io` semantics. A cursor equal to
    /// `self.size` means end-of-object (`is_done()` returns true).
    ///
    /// Fixes over the previous version:
    /// - clamping to `size - 1` underflowed (panicked) on zero-length objects;
    /// - it also made seeking *to* the end impossible, even though `read`
    ///   treats `cursor == size` as EOF;
    /// - `x.abs()` panicked on `i64::MIN` (replaced with `unsigned_abs`).
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        let target = match pos {
            SeekFrom::Start(x) => x,
            SeekFrom::Current(x) => {
                if x < 0 {
                    let back = x.unsigned_abs();
                    if back > self.cursor {
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::InvalidInput,
                            "cannot seek past start",
                        ));
                    }
                    self.cursor - back
                } else {
                    // x >= 0, so `unsigned_abs` is just the value itself;
                    // saturate rather than overflow (we clamp to size below).
                    self.cursor.saturating_add(x.unsigned_abs())
                }
            }
            SeekFrom::End(x) => {
                if x < 0 {
                    let back = x.unsigned_abs();
                    if back > self.size {
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::InvalidInput,
                            "cannot seek past start",
                        ));
                    }
                    self.size - back
                } else {
                    self.size.saturating_add(x.unsigned_abs())
                }
            }
        };
        // Clamp to `size` (not `size - 1`): allows seeking to end-of-object
        // and is well-defined for empty objects.
        self.cursor = target.min(self.size);
        Ok(self.cursor)
    }
}

View File

@@ -1,18 +0,0 @@
use chrono::{DateTime, Utc};
use tokio_stream::wrappers::ReceiverStream;
use crate::Item;
/// A read-only set of [Item]s.
pub trait DataSource {
    /// Get an item from this datasource
    ///
    /// Returns `Ok(None)` when `key` does not name an item in this source.
    fn get(&self, key: &str) -> impl Future<Output = Result<Option<Item>, std::io::Error>> + Send;
    /// Iterate over all items in this source in an arbitrary order
    fn iter(&self) -> ReceiverStream<Result<Item, std::io::Error>>;
    /// Return the time of the latest change to the data in this source
    ///
    /// `Ok(None)` means no timestamp is available.
    fn latest_change(
        &self,
    ) -> impl Future<Output = Result<Option<DateTime<Utc>>, std::io::Error>> + Send;
}

View File

@@ -1,194 +0,0 @@
use mime::Mime;
use pile_config::objectpath::{ObjectPath, PathSegment};
use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::extract::{ListExtractor, ObjectExtractor};
/// An immutable, cheaply-clonable, lazily-computed value.
/// Very similar to [serde_json::Value].
pub enum PileValue {
    /// Absence of a value
    Null,
    /// An unsigned integer
    U64(u64),
    /// A signed integer
    I64(i64),
    /// A string
    String(Arc<SmartString<LazyCompact>>),
    /// An array of values
    Array(Arc<Vec<PileValue>>),
    /// A binary blob
    Blob {
        /// MIME type of the blob's contents
        mime: Mime,
        /// Raw bytes of the blob
        bytes: Arc<Vec<u8>>,
    },
    /// A lazily-computed map of {label: value}
    ObjectExtractor(Arc<dyn ObjectExtractor>),
    /// A lazily-computed array
    ListExtractor(Arc<dyn ListExtractor>),
}
impl Clone for PileValue {
fn clone(&self) -> Self {
match self {
Self::Null => Self::Null,
Self::U64(x) => Self::U64(*x),
Self::I64(x) => Self::I64(*x),
Self::String(x) => Self::String(x.clone()),
Self::Array(x) => Self::Array(x.clone()),
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
Self::Blob { mime, bytes } => Self::Blob {
mime: mime.clone(),
bytes: bytes.clone(),
},
}
}
}
impl PileValue {
    /// Resolve `query` against this value, one path segment at a time.
    ///
    /// - `$` (root) resets the cursor to `self`.
    /// - `.field` descends only into `ObjectExtractor`s.
    /// - `[idx]` indexes `Array`s and `ListExtractor`s; negative indices
    ///   count from the end (`-1` is the last element).
    ///
    /// Returns `Ok(None)` as soon as a segment cannot be resolved.
    pub async fn query(&self, query: &ObjectPath) -> Result<Option<Self>, std::io::Error> {
        let mut out: Option<PileValue> = Some(self.clone());
        for s in &query.segments {
            match s {
                PathSegment::Root => out = Some(self.clone()),
                PathSegment::Field(field) => {
                    out = match out {
                        None => return Ok(None),
                        Some(Self::ObjectExtractor(e)) => e.field(field).await?,
                        // Scalars, arrays, blobs, and lists have no named fields.
                        Some(Self::Null)
                        | Some(Self::U64(_))
                        | Some(Self::I64(_))
                        | Some(Self::Array(_))
                        | Some(Self::String(_))
                        | Some(Self::Blob { .. })
                        | Some(Self::ListExtractor(_)) => None,
                    }
                }
                PathSegment::Index(idx) => {
                    out = match &out {
                        None => return Ok(None),
                        Some(Self::Array(v)) => {
                            // Effective index for a negative `idx` is `len + idx`.
                            // BUG FIX: this previously computed `len - idx`,
                            // i.e. `len + |idx|`, which is out of bounds for
                            // every negative index.
                            let i = if *idx >= 0 {
                                usize::try_from(*idx).ok()
                            } else {
                                usize::try_from(v.len() as i64 + *idx).ok()
                            };
                            i.and_then(|i| v.get(i)).cloned()
                        }
                        Some(Self::ListExtractor(e)) => {
                            // Same end-relative fix as the `Array` arm above.
                            let i = if *idx >= 0 {
                                usize::try_from(*idx).ok()
                            } else {
                                usize::try_from(e.len().await? as i64 + *idx).ok()
                            };
                            match i {
                                Some(i) => e.get(i).await?,
                                None => None,
                            }
                        }
                        // Scalars, objects, and blobs are not indexable.
                        Some(Self::Null)
                        | Some(Self::U64(_))
                        | Some(Self::I64(_))
                        | Some(Self::Blob { .. })
                        | Some(Self::String(_))
                        | Some(Self::ObjectExtractor(_)) => None,
                    }
                }
            }
        }
        // `out` is already owned; no clone needed.
        Ok(out)
    }
    /// Like `to_json`, but counts populated fields instead of collecting values.
    ///
    /// - Leaf values (non-null scalars, arrays, blobs) contribute `Some(1)`.
    /// - `Null` contributes `None`.
    /// - `ObjectExtractor` is recursed into; returns `Some(Object(map))` with
    ///   only the fields that had data, or `None` if all fields were absent.
    /// - `Array` / `ListExtractor` are treated as opaque leaf values (not descended into).
    pub async fn count_fields(&self) -> Result<Option<Value>, std::io::Error> {
        Ok(match self {
            Self::Null => None,
            Self::U64(_) | Self::I64(_) | Self::String(_) | Self::Blob { .. } => {
                Some(Value::Number(1u64.into()))
            }
            Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
            Self::ListExtractor(x) => (!x.is_empty().await?).then(|| Value::Number(1u64.into())),
            Self::ObjectExtractor(e) => {
                let keys = e.fields().await?;
                let mut map = Map::new();
                for k in &keys {
                    let v = match e.field(k).await? {
                        Some(x) => x,
                        None => continue,
                    };
                    // `Box::pin` keeps this async recursion's future sized.
                    if let Some(counted) = Box::pin(v.count_fields()).await? {
                        map.insert(k.to_string(), counted);
                    }
                }
                if map.is_empty() {
                    None
                } else {
                    Some(Value::Object(map))
                }
            }
        })
    }
    /// Borrow this value as a string slice, if it is a `String`.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::String(x) => Some(x),
            _ => None,
        }
    }
    /// Eagerly convert this value into JSON, running any lazy extractors.
    pub async fn to_json(&self) -> Result<Value, std::io::Error> {
        Ok(match self {
            Self::Null => Value::Null,
            Self::U64(x) => Value::Number((*x).into()),
            Self::I64(x) => Value::Number((*x).into()),
            // TODO: replace with something meaningful
            Self::Blob { mime, bytes } => {
                Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.len()))
            }
            Self::String(x) => Value::String(x.to_string()),
            Self::Array(x) => {
                let mut arr = Vec::new();
                for item in &**x {
                    arr.push(Box::pin(item.to_json()).await?);
                }
                Value::Array(arr)
            }
            Self::ObjectExtractor(e) => {
                let keys = e.fields().await?;
                let mut map = Map::new();
                for k in &keys {
                    let v = match e.field(k).await? {
                        Some(x) => x,
                        None => continue,
                    };
                    map.insert(k.to_string(), Box::pin(v.to_json()).await?);
                }
                Value::Object(map)
            }
            Self::ListExtractor(e) => e.to_json().await?,
        })
    }
}

View File

@@ -1,10 +1,20 @@
use std::io::{Read, Seek, SeekFrom};
use std::io::{ErrorKind, Read, Seek, SeekFrom};
use crate::{
FlacBlock, FlacDecodeError,
blocks::{FlacAudioFrame, FlacMetablockHeader, FlacMetablockType},
};
/// Read exactly `buf.len()` bytes, mapping a premature EOF to
/// [`FlacDecodeError::MalformedBlock`]: a truncated stream means the block
/// being decoded is malformed, not that I/O failed.
fn read_exact_flac<R: Read>(reader: &mut R, buf: &mut [u8]) -> Result<(), FlacDecodeError> {
    match reader.read_exact(buf) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == ErrorKind::UnexpectedEof => Err(FlacDecodeError::MalformedBlock),
        Err(e) => Err(e.into()),
    }
}
// TODO: quickly skip blocks we do not need
/// The next block we expect to read
@@ -42,9 +52,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
ReaderState::MagicBits => {
let mut data = [0u8; 4];
if let Err(e) = self.inner.read_exact(&mut data[..4]) {
if let Err(e) = read_exact_flac(&mut self.inner, &mut data[..4]) {
self.state = ReaderState::Done;
return Some(Err(e.into()));
return Some(Err(e));
}
if data != [0x66, 0x4C, 0x61, 0x43] {
@@ -57,9 +67,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
ReaderState::MetablockHeader { is_first } => {
let mut data = [0u8; 4];
if let Err(e) = self.inner.read_exact(&mut data[..]) {
if let Err(e) = read_exact_flac(&mut self.inner, &mut data[..]) {
self.state = ReaderState::Done;
return Some(Err(e.into()));
return Some(Err(e));
}
let header = match FlacMetablockHeader::decode(&data) {
@@ -80,9 +90,9 @@ impl<R: Read + Seek> Iterator for FlacReader<R> {
ReaderState::MetaBlock { header } => {
let mut data = vec![0u8; header.length as usize];
if let Err(e) = self.inner.read_exact(&mut data) {
if let Err(e) = read_exact_flac(&mut self.inner, &mut data) {
self.state = ReaderState::Done;
return Some(Err(e.into()));
return Some(Err(e));
}
let block = match FlacBlock::decode(header.block_type, &data) {

11
crates/pile-io/Cargo.toml Normal file
View File

@@ -0,0 +1,11 @@
[package]
name = "pile-io"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }
[lints]
workspace = true
[dependencies]
tokio = { workspace = true }

View File

@@ -0,0 +1,75 @@
use std::io::{Read, Seek, SeekFrom};
use tokio::runtime::Handle;
//
// MARK: asyncreader
//
/// An `async` equivalent of [std::io::Read].
pub trait AsyncReader: Send {
    /// Read a chunk of bytes.
    ///
    /// Returns the number of bytes written into `buf`; `Ok(0)` signals
    /// end-of-stream, matching [std::io::Read::read].
    fn read(
        &mut self,
        buf: &mut [u8],
    ) -> impl Future<Output = Result<usize, std::io::Error>> + Send;
    /// Read all remaining bytes into a `Vec`.
    ///
    /// Default implementation: repeatedly `read` into a 64 KiB scratch
    /// buffer until the reader reports end-of-stream.
    fn read_to_end(&mut self) -> impl Future<Output = Result<Vec<u8>, std::io::Error>> + Send {
        async {
            let mut collected = Vec::new();
            let mut scratch = vec![0u8; 65536];
            loop {
                match self.read(&mut scratch).await? {
                    0 => return Ok(collected),
                    n => collected.extend_from_slice(&scratch[..n]),
                }
            }
        }
    }
}
/// An `async` equivalent of [std::io::Read] + [std::io::Seek].
pub trait AsyncSeekReader: AsyncReader {
    /// Seek to `pos`, returning the new position from the start of the stream.
    fn seek(&mut self, pos: SeekFrom) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
}
//
// MARK: sync bridge
//
/// Turn an async [Reader] into a sync [Read] + [Seek].
///
/// Never use this outside of [tokio::task::spawn_blocking],
/// the async runtime will deadlock if this struct blocks
/// the runtime.
pub struct SyncReadBridge<R: AsyncReader> {
    // Wrapped async reader; each sync call is forwarded via `block_on`.
    inner: R,
    // Runtime handle used to drive the inner reader's futures.
    handle: Handle,
}
impl<R: AsyncReader> SyncReadBridge<R> {
    /// Creates a new adapter using a handle to the current runtime.
    ///
    /// # Panics
    /// Panics if called outside of a tokio context.
    pub fn new_current(inner: R) -> Self {
        Self::new(inner, Handle::current())
    }
    /// Creates a new adapter using a handle to an existing runtime.
    pub fn new(inner: R, handle: Handle) -> Self {
        Self { inner, handle }
    }
}
impl<R: AsyncReader> Read for SyncReadBridge<R> {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        // Blocks the current thread until the async read completes; see the
        // struct docs for why this must only run on a blocking thread.
        self.handle.block_on(self.inner.read(buf))
    }
}
// `AsyncSeekReader` already requires `AsyncReader` (supertrait), so the
// previous `AsyncReader + AsyncSeekReader` bound was redundant.
impl<R: AsyncSeekReader> Seek for SyncReadBridge<R> {
    fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
        // Blocks until the async seek completes (blocking-thread only; see
        // the struct docs).
        self.handle.block_on(self.inner.seek(pos))
    }
}

View File

@@ -0,0 +1,2 @@
mod asyncreader;
pub use asyncreader::*;

View File

@@ -0,0 +1,28 @@
[package]
name = "pile-serve"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }
[lints]
workspace = true
[dependencies]
pile-config = { workspace = true }
pile-value = { workspace = true }
pile-dataset = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
tokio-util = { version = "0.7", features = ["io"] }
serde = { workspace = true }
axum = { workspace = true }
percent-encoding = { workspace = true }
utoipa = { workspace = true }
utoipa-swagger-ui = { workspace = true }
[features]
default = []
pdfium = ["pile-value/pdfium"]

View File

@@ -0,0 +1,30 @@
use axum::{
Json,
extract::State,
http::StatusCode,
response::{IntoResponse, Response},
};
use pile_dataset::Datasets;
use std::{collections::HashMap, sync::Arc};
pub use pile_config::FieldSpec;
pub type FieldsResponse = HashMap<String, FieldSpec>;
/// Retrieve this dataset's schema.
#[utoipa::path(
get,
path = "/config/schema",
responses(
(status = 200, description = "This dataset's schema"),
)
)]
pub async fn config_schema(State(state): State<Arc<Datasets>>) -> Response {
let fields: FieldsResponse = state
.config
.schema
.iter()
.map(|(k, v)| (k.as_str().to_owned(), v.clone()))
.collect();
(StatusCode::OK, Json(fields)).into_response()
}

View File

@@ -0,0 +1,190 @@
use axum::{
Json,
body::Body,
extract::{Query, RawQuery, State},
http::{StatusCode, header},
response::{IntoResponse, Response},
};
use percent_encoding::percent_decode_str;
use pile_config::{Label, objectpath::ObjectPath};
use pile_dataset::Datasets;
use pile_value::{
extract::traits::ExtractState,
value::{BinaryPileValue, PileValue},
};
use serde::Deserialize;
use std::{sync::Arc, time::Instant};
use tokio_util::io::ReaderStream;
use tracing::debug;
use utoipa::ToSchema;
/// Query parameters for `/extract`. The repeated `path` parameter is parsed
/// separately from the raw query string, since the `Query` extractor cannot
/// express repeated keys.
#[derive(Deserialize, ToSchema)]
pub struct ExtractQuery {
    // Source label to look the item up in.
    source: String,
    // Key of the item within the source.
    key: String,
    // When true, serve with `Content-Disposition: attachment`.
    #[serde(default)]
    download: bool,
    // Filename for Content-Disposition; defaults to the last `/` segment of `key`.
    name: Option<String>,
}
/// Extract a specific field from an item's metadata.
/// Multiple `path` parameters may be provided; the first non-null result is returned.
#[utoipa::path(
get,
path = "/extract",
params(
("source" = String, Query, description = "Source label"),
("key" = String, Query, description = "Item key"),
("path" = String, Query, description = "Object path (e.g. $.flac.title); repeat for fallbacks"),
("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
),
responses(
(status = 200, description = "Field value as JSON"),
(status = 400, description = "Invalid source label or path"),
(status = 404, description = "Item or field not found"),
(status = 500, description = "Internal server error"),
)
)]
pub async fn get_extract(
    State(state): State<Arc<Datasets>>,
    Query(params): Query<ExtractQuery>,
    RawQuery(raw_query): RawQuery,
) -> Response {
    let start = Instant::now();
    let label = match Label::try_from(params.source.clone()) {
        Ok(l) => l,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    // Collect all `path` query params in order (supports repeated ?path=...&path=...)
    // The `Query` extractor cannot express repeated keys, hence the manual parse.
    let raw = raw_query.as_deref().unwrap_or("");
    let paths: Vec<ObjectPath> = {
        let mut result = Vec::new();
        for part in raw.split('&') {
            if let Some((k, v)) = part.split_once('=')
                && k == "path"
            {
                // NOTE(review): only the value is percent-decoded; a
                // percent-encoded key (e.g. `%70ath`) would be missed —
                // confirm that is acceptable.
                let v = percent_decode_str(v).decode_utf8_lossy();
                match v.parse::<ObjectPath>() {
                    Ok(p) => result.push(p),
                    Err(e) => {
                        return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response();
                    }
                }
            }
        }
        result
    };
    if paths.is_empty() {
        return (StatusCode::BAD_REQUEST, "Missing `path` query parameter").into_response();
    }
    debug!(
        message = "Serving /extract",
        source = params.source,
        key = params.key,
        paths = paths.len(),
    );
    let Some(item) = state.get(&label, &params.key).await else {
        return StatusCode::NOT_FOUND.into_response();
    };
    let extract_state = ExtractState { ignore_mime: false };
    let item = PileValue::Item(item);
    // Try each path in order, returning the first non-null result.
    // A `Null` result is remembered, but later fallbacks may still replace it.
    let mut value = None;
    for path in &paths {
        match item.query(&extract_state, path).await {
            Ok(None) => continue,
            Ok(Some(PileValue::Null)) => {
                value = Some(PileValue::Null);
                continue;
            }
            Ok(Some(v)) => {
                value = Some(v);
                break;
            }
            Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        }
    }
    let Some(value) = value else {
        // NOTE(review): the route docs advertise 404 for "field not found",
        // but this returns 400 — confirm which is intended.
        return (StatusCode::BAD_REQUEST, "no value").into_response();
    };
    debug!(
        message = "Served /extract",
        source = params.source,
        key = params.key,
        time_ms = start.elapsed().as_millis()
    );
    let disposition_type = if params.download {
        "attachment"
    } else {
        "inline"
    };
    // Default filename: last `/`-separated segment of the key.
    let file_name = params.name.unwrap_or_else(|| {
        params
            .key
            .rsplit('/')
            .next()
            .unwrap_or(&params.key)
            .to_owned()
    });
    // NOTE(review): `file_name` is interpolated unescaped; a name containing
    // `"` or control characters yields an invalid header value — consider
    // sanitizing.
    let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
    match value {
        // Strings are served as plain text rather than JSON-quoted.
        PileValue::String(s) => (
            StatusCode::OK,
            [
                (header::CONTENT_TYPE, "text/plain".to_owned()),
                (header::CONTENT_DISPOSITION, disposition),
            ],
            s.to_string(),
        )
            .into_response(),
        // Binary values keep their own MIME type; files are streamed.
        PileValue::Binary(binary) => {
            let mime = binary.mime().to_string();
            let body = match binary {
                BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
                BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
                    Ok(file) => Body::from_stream(ReaderStream::new(file)),
                    Err(e) => {
                        return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
                            .into_response();
                    }
                },
            };
            (
                StatusCode::OK,
                [
                    (header::CONTENT_TYPE, mime),
                    (header::CONTENT_DISPOSITION, disposition),
                ],
                body,
            )
                .into_response()
        }
        // Everything else (numbers, arrays, objects, null) is serialized as JSON.
        _ => match value.to_json(&extract_state).await {
            Ok(json) => (
                StatusCode::OK,
                [(header::CONTENT_DISPOSITION, disposition)],
                Json(json),
            )
                .into_response(),
            Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        },
    }
}

View File

@@ -0,0 +1,103 @@
use axum::{
Json,
extract::{Query, State},
http::StatusCode,
response::{IntoResponse, Response},
};
use pile_dataset::Datasets;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tracing::debug;
use utoipa::ToSchema;
/// Pagination parameters for `/items`.
#[derive(Deserialize, ToSchema)]
pub struct ItemsQuery {
    // Number of items to skip from the start of the global ordering.
    #[serde(default)]
    offset: usize,
    // Requested page size; the handler clamps this to 1000.
    #[serde(default = "default_limit")]
    limit: usize,
}
// serde default for `ItemsQuery::limit`.
fn default_limit() -> usize {
    100
}
/// One page of the global item listing.
#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct ItemsResponse {
    /// Items in this page, ordered by source label and then by each
    /// source's own paging order.
    pub items: Vec<ItemRef>,
    /// Total number of items across all sources.
    pub total: usize,
    /// Echo of the requested offset.
    pub offset: usize,
    /// Effective (clamped) limit.
    pub limit: usize,
}
/// Reference to a single item: the source it lives in, and its key.
#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct ItemRef {
    /// Label of the source containing the item.
    pub source: String,
    /// Key of the item within its source.
    pub key: String,
}
/// List all items across all sources with consistent ordering, paginated by offset and limit
#[utoipa::path(
get,
path = "/items",
params(
("offset" = usize, Query, description = "Number of items to skip"),
("limit" = usize, Query, description = "Maximum number of items to return (max 1000)"),
),
responses(
(status = 200, description = "Paginated list of items", body = ItemsResponse),
)
)]
pub async fn items_list(
    State(state): State<Arc<Datasets>>,
    Query(params): Query<ItemsQuery>,
) -> Response {
    // Hard cap so a single response stays bounded.
    let limit = params.limit.min(1000);
    let offset = params.offset;
    debug!(message = "Serving /items", offset, limit);
    // Sort sources by label for a consistent global order: (source, key)
    let mut source_labels: Vec<_> = state.sources.keys().collect();
    source_labels.sort();
    let mut items: Vec<ItemRef> = Vec::with_capacity(limit);
    // Running total over *all* sources — accumulated even after the page is
    // full, so the response reports the full count.
    let mut total = 0usize;
    // How much of the requested offset still has to be consumed.
    let mut remaining_offset = offset;
    for label in source_labels {
        let dataset = &state.sources[label];
        let source_len = dataset.len();
        if remaining_offset >= source_len {
            // This entire source is before our window; skip it efficiently
            remaining_offset -= source_len;
            total += source_len;
            continue;
        }
        // Take as many items from this source as the page still needs.
        // Once the page is full, `want` is 0 (assumes `iter_page` yields
        // nothing for a zero count — confirm).
        let want = (limit - items.len()).min(source_len - remaining_offset);
        let source_str = label.as_str().to_owned();
        for item in dataset.iter_page(remaining_offset, want) {
            items.push(ItemRef {
                source: source_str.clone(),
                key: item.key().to_string(),
            });
        }
        // Only the first partially-skipped source consumes an offset.
        remaining_offset = 0;
        total += source_len;
    }
    debug!(message = "Served /items", offset, limit, total);
    (
        StatusCode::OK,
        Json(ItemsResponse {
            items,
            total,
            offset,
            limit,
        }),
    )
    .into_response()
}

View File

@@ -0,0 +1,89 @@
use axum::{
Router,
routing::{get, post},
};
use pile_dataset::Datasets;
use std::sync::Arc;
use utoipa::OpenApi;
use utoipa_swagger_ui::SwaggerUi;
mod lookup;
pub use lookup::*;
mod extract;
pub use extract::*;
mod items;
pub use items::*;
mod config_schema;
pub use config_schema::*;
mod schema_field;
pub use schema_field::*;
mod schema;
pub use schema::*;
/// OpenAPI document covering all routes registered by `router_prefix`.
#[derive(OpenApi)]
#[openapi(
    tags(),
    paths(
        lookup,
        get_extract,
        items_list,
        config_schema,
        schema_field,
        schema_all
    ),
    components(schemas(
        LookupRequest,
        LookupResponse,
        LookupResult,
        ExtractQuery,
        ItemsQuery,
        ItemsResponse,
        ItemRef
    ))
)]
pub(crate) struct Api;
/// Build the server router with no path prefix; see `router_prefix`.
#[inline]
pub fn router(ds: Arc<Datasets>, with_docs: bool) -> Router<()> {
    router_prefix(ds, with_docs, None)
}
/// Build the server router, optionally nested under `prefix` and optionally
/// serving Swagger UI at `{prefix}/docs`.
#[inline]
pub fn router_prefix(ds: Arc<Datasets>, with_docs: bool, prefix: Option<&str>) -> Router<()> {
    let mut router = Router::new()
        .route("/lookup", post(lookup))
        .route("/extract", get(get_extract))
        .route("/items", get(items_list))
        .route("/config/schema", get(config_schema))
        .route("/schema", get(schema_all))
        .route("/schema/{field}", get(schema_field))
        .with_state(ds.clone());
    if let Some(prefix) = prefix {
        // Mount everything under `prefix` (e.g. `/api`).
        router = Router::new().nest(prefix, router);
    }
    if with_docs {
        // Docs are merged *after* nesting so the Swagger routes live at
        // `{prefix}/docs`; the OpenAPI document itself is also nested so its
        // paths include the prefix.
        let docs_path = match prefix {
            None => "/docs".into(),
            Some(prefix) => format!("{prefix}/docs"),
        };
        let api = Api::openapi();
        let api = match prefix {
            None => api,
            Some(prefix) => utoipa::openapi::OpenApi::default().nest(prefix, api),
        };
        let docs =
            SwaggerUi::new(docs_path.clone()).url(format!("{}/openapi.json", docs_path), api);
        router = router.merge(docs);
    }
    router
}

View File

@@ -4,13 +4,12 @@ use axum::{
http::StatusCode,
response::{IntoResponse, Response},
};
use pile_dataset::Datasets;
use serde::{Deserialize, Serialize};
use std::{sync::Arc, time::Instant};
use tracing::debug;
use utoipa::ToSchema;
use crate::Datasets;
#[derive(Serialize, Deserialize, ToSchema, Debug)]
pub struct LookupRequest {
pub query: String,
@@ -22,6 +21,7 @@ pub struct LookupRequest {
#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct LookupResponse {
pub results: Vec<LookupResult>,
pub total: u64,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
@@ -48,13 +48,10 @@ pub async fn lookup(
Json(body): Json<LookupRequest>,
) -> Response {
let start = Instant::now();
debug!(
message = "Serving /lookup",
query = body.query,
limit = body.limit.unwrap_or(10)
);
let limit = body.limit.unwrap_or(128).min(1024);
debug!(message = "Serving /lookup", query = body.query, limit);
let results: Vec<LookupResult> = match state.fts_lookup(&body.query, body.limit.unwrap_or(10)) {
let results: Vec<LookupResult> = match state.fts_lookup(&body.query, limit) {
Ok(x) => x
.into_iter()
.map(|x| LookupResult {
@@ -69,6 +66,8 @@ pub async fn lookup(
}
};
let total: u64 = state.sources.iter().map(|x| x.1.len() as u64).sum();
debug!(
message = "Served /lookup",
query = body.query,
@@ -76,5 +75,5 @@ pub async fn lookup(
time_ms = start.elapsed().as_millis()
);
return (StatusCode::OK, Json(LookupResponse { results })).into_response();
return (StatusCode::OK, Json(LookupResponse { results, total })).into_response();
}

View File

@@ -0,0 +1,129 @@
use axum::{
Json,
extract::{Query, State},
http::StatusCode,
response::{IntoResponse, Response},
};
use pile_config::Label;
use pile_dataset::Datasets;
use pile_value::{extract::traits::ExtractState, value::PileValue};
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, sync::Arc};
use utoipa::IntoParams;
// Query parameters for the `/schema` endpoint.
// NOTE: plain `//` comments on purpose — `///` doc comments would be picked
// up by the `IntoParams` derive and alter the generated OpenAPI description.
#[derive(Deserialize, IntoParams)]
pub struct SchemaQuery {
    // Source label identifying which dataset to look in.
    source: String,
    // Key of the item within that source.
    key: String,
    // Include hidden schema fields; defaults to false via `#[serde(default)]`.
    #[serde(default)]
    hidden: bool,
}
/// JSON-facing representation of an extracted value.
///
/// `#[serde(untagged)]` means serde matches variants in declaration order on
/// deserialize, so the marker objects (`Binary`, `Object`) come before the
/// catch-all scalar variants.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ApiValue {
    /// Marker for binary payloads; carries the MIME type instead of bytes.
    Binary { binary: bool, mime: String },
    /// Marker for nested objects that are not expanded inline.
    Object { object: bool },
    /// List of converted values.
    Array(Vec<ApiValue>),
    /// Plain string value.
    String(String),
    /// Numeric value (both u64 and i64 map here).
    Number(serde_json::Number),
    /// Explicit null.
    Null,
}

/// Response body for `/schema`: map of schema field label to extracted value.
pub type SchemaResponse = HashMap<String, ApiValue>;
/// Convert a `PileValue` into its JSON-facing `ApiValue` form.
///
/// Scalars convert directly; binary payloads and extractor-backed values are
/// summarized as marker objects rather than expanded. Arrays convert element
/// by element, recursing through `Box::pin` because the future would
/// otherwise be self-referential.
async fn pile_value_to_api(
    state: &ExtractState,
    value: PileValue,
) -> Result<ApiValue, std::io::Error> {
    let converted = match value {
        PileValue::Null => ApiValue::Null,
        PileValue::String(s) => ApiValue::String(s.to_string()),
        PileValue::U64(n) => ApiValue::Number(n.into()),
        PileValue::I64(n) => ApiValue::Number(n.into()),
        PileValue::Binary(b) => ApiValue::Binary {
            binary: true,
            mime: b.mime().to_string(),
        },
        PileValue::Array(items) => {
            let mut converted_items = Vec::with_capacity(items.len());
            for element in items.iter() {
                let element = element.clone();
                converted_items.push(Box::pin(pile_value_to_api(state, element)).await?);
            }
            ApiValue::Array(converted_items)
        }
        PileValue::ObjectExtractor(_) | PileValue::ListExtractor(_) | PileValue::Item(_) => {
            ApiValue::Object { object: true }
        }
    };
    Ok(converted)
}
/// Get all schema field values for a single item.
#[utoipa::path(
    get,
    path = "/schema",
    params(
        ("source" = String, Query, description = "Source label"),
        ("key" = String, Query, description = "Item key"),
        ("hidden" = bool, Query, description = "Include hidden fields (default: false)"),
    ),
    responses(
        (status = 200, description = "Schema field values as a map of label to value"),
        (status = 400, description = "Invalid source label"),
        (status = 404, description = "Item not found"),
        (status = 500, description = "Internal server error"),
    )
)]
pub async fn schema_all(
    State(state): State<Arc<Datasets>>,
    Query(params): Query<SchemaQuery>,
) -> Response {
    // Validate the source label before touching the datasets.
    let label = match Label::try_from(params.source.clone()) {
        Ok(l) => l,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    let Some(item) = state.get(&label, &params.key).await else {
        return StatusCode::NOT_FOUND.into_response();
    };
    // Respect each extractor's MIME gating when resolving schema paths.
    let extract_state = ExtractState { ignore_mime: false };
    let item = PileValue::Item(item);
    let mut result: SchemaResponse = HashMap::new();
    for (field_label, field_spec) in &state.config.schema {
        // Hidden fields are skipped unless explicitly requested.
        if field_spec.hidden && !params.hidden {
            continue;
        }
        // Try each configured path in order; the first non-null hit wins.
        let mut value = None;
        for path in &field_spec.path {
            match item.query(&extract_state, path).await {
                Ok(Some(PileValue::Null)) | Ok(None) => continue,
                Ok(Some(v)) => {
                    value = Some(v);
                    break;
                }
                Err(e) => {
                    return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response();
                }
            }
        }
        // Fields with no resolvable value are simply omitted from the map.
        let Some(v) = value else { continue };
        let api_value = match pile_value_to_api(&extract_state, v).await {
            Ok(v) => v,
            Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        };
        result.insert(field_label.as_str().to_owned(), api_value);
    }
    (StatusCode::OK, Json(result)).into_response()
}

View File

@@ -0,0 +1,173 @@
use axum::{
Json,
body::Body,
extract::{Path, Query, State},
http::{StatusCode, header},
response::{IntoResponse, Response},
};
use pile_config::Label;
use pile_dataset::Datasets;
use pile_value::{
extract::traits::ExtractState,
value::{BinaryPileValue, PileValue},
};
use serde::Deserialize;
use std::{sync::Arc, time::Instant};
use tokio_util::io::ReaderStream;
use tracing::debug;
use utoipa::IntoParams;
// Query parameters for the `/schema/{field}` endpoint.
// NOTE: plain `//` comments on purpose — `///` doc comments would be picked
// up by the `IntoParams` derive and alter the generated OpenAPI description.
#[derive(Deserialize, IntoParams)]
pub struct SchemaFieldQuery {
    // Source label identifying which dataset to look in.
    source: String,
    // Key of the item within that source.
    key: String,
    // Serve as an attachment (force download) instead of inline.
    #[serde(default)]
    download: bool,
    // Override the Content-Disposition filename; defaults to the last
    // segment of the key.
    name: Option<String>,
}
/// Extract a specific schema field from an item's metadata.
#[utoipa::path(
    get,
    path = "/schema/{field}",
    params(
        ("field" = String, Path, description = "Schema field"),
        ("source" = String, Query, description = "Source label"),
        ("key" = String, Query, description = "Item key"),
        ("name" = Option<String>, Query, description = "Downloaded filename; defaults to the last segment of the key"),
    ),
    responses(
        (status = 200, description = "Field value as JSON"),
        (status = 400, description = "Invalid source label or path"),
        (status = 404, description = "Item or field not found"),
        (status = 500, description = "Internal server error"),
    )
)]
pub async fn schema_field(
    State(state): State<Arc<Datasets>>,
    Path(field): Path<String>,
    Query(params): Query<SchemaFieldQuery>,
) -> Response {
    let start = Instant::now();
    let label = match Label::try_from(params.source.clone()) {
        Ok(l) => l,
        Err(e) => return (StatusCode::BAD_REQUEST, format!("{e:?}")).into_response(),
    };
    debug!(
        message = "Serving /schema/{field}",
        source = params.source,
        key = params.key,
        field = field,
    );
    let Some(item) = state.get(&label, &params.key).await else {
        return StatusCode::NOT_FOUND.into_response();
    };
    // The requested field must both be a valid label and exist in the schema.
    let field_label = match Label::new(&field) {
        Some(x) => x,
        None => return StatusCode::NOT_FOUND.into_response(),
    };
    let paths = match state.config.schema.get(&field_label) {
        Some(x) => &x.path,
        None => return StatusCode::NOT_FOUND.into_response(),
    };
    let extract_state = ExtractState { ignore_mime: false };
    let item = PileValue::Item(item);
    // Try each configured path in order. An explicit Null is remembered but
    // the search continues, in case a later path yields a concrete value;
    // a plain miss (Ok(None)) is skipped outright.
    let mut value = None;
    for path in paths {
        match item.query(&extract_state, path).await {
            Ok(None) => continue,
            Ok(Some(PileValue::Null)) => {
                value = Some(PileValue::Null);
                continue;
            }
            Ok(Some(v)) => {
                value = Some(v);
                break;
            }
            Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        }
    }
    let Some(value) = value else {
        return (StatusCode::BAD_REQUEST, "no value").into_response();
    };
    debug!(
        message = "Served /schema/{field}",
        source = params.source,
        key = params.key,
        field = field,
        time_ms = start.elapsed().as_millis()
    );
    // `attachment` forces a browser download; `inline` lets it render.
    let disposition_type = if params.download {
        "attachment"
    } else {
        "inline"
    };
    // Default the filename to the last path segment of the item key.
    let file_name = params.name.unwrap_or_else(|| {
        params
            .key
            .rsplit('/')
            .next()
            .unwrap_or(&params.key)
            .to_owned()
    });
    let disposition = format!("{disposition_type}; filename=\"{file_name}\"");
    match value {
        // Strings are served as plain text.
        PileValue::String(s) => (
            StatusCode::OK,
            [
                (header::CONTENT_TYPE, "text/plain".to_owned()),
                (header::CONTENT_DISPOSITION, disposition),
            ],
            s.to_string(),
        )
            .into_response(),
        // Binary values are served raw: in-memory blobs directly, on-disk
        // files as a stream.
        PileValue::Binary(binary) => {
            let mime = binary.mime().to_string();
            let body = match binary {
                BinaryPileValue::Blob { bytes, .. } => Body::from(bytes.0.to_vec()),
                BinaryPileValue::File { path, .. } => match tokio::fs::File::open(&path).await {
                    Ok(file) => Body::from_stream(ReaderStream::new(file)),
                    Err(e) => {
                        return (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}"))
                            .into_response();
                    }
                },
            };
            (
                StatusCode::OK,
                [
                    (header::CONTENT_TYPE, mime),
                    (header::CONTENT_DISPOSITION, disposition),
                ],
                body,
            )
                .into_response()
        }
        // Everything else is rendered through its JSON representation.
        _ => match value.to_json(&extract_state).await {
            Ok(json) => (
                StatusCode::OK,
                [(header::CONTENT_DISPOSITION, disposition)],
                Json(json),
            )
                .into_response(),
            Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:?}")).into_response(),
        },
    }
}

View File

@@ -80,10 +80,12 @@ impl CancelFlag {
#[inline]
pub async fn await_cancel(&self) {
if self.is_cancelled() {
return;
let notified = self.notify.notified();
tokio::pin!(notified);
notified.as_mut().enable();
if !self.is_cancelled() {
notified.await;
}
self.notify.notified().await;
}
#[inline]

View File

@@ -0,0 +1,45 @@
# Manifest for the pile-value crate: PileValue types plus metadata extractors
# (EPUB, EXIF, FLAC, ID3, PDF, hashes, filesystem info).
[package]
name = "pile-value"
version = { workspace = true }
rust-version = { workspace = true }
edition = { workspace = true }

[lints]
workspace = true

[dependencies]
# Workspace-internal crates.
pile-io = { workspace = true }
pile-config = { workspace = true }
pile-flac = { workspace = true }
# General utilities.
anyhow = { workspace = true }
serde_json = { workspace = true }
walkdir = { workspace = true }
tracing = { workspace = true }
chrono = { workspace = true }
toml = { workspace = true }
smartstring = { workspace = true }
regex = { workspace = true }
# Hash algorithms used by the hash extractor.
blake3 = { workspace = true }
sha2 = { workspace = true }
sha1 = { workspace = true }
md5 = { workspace = true }
# Format-specific parsers for the extractors.
epub = { workspace = true }
kamadak-exif = { workspace = true }
pdf = { workspace = true }
# Only pulled in when the `pdfium` feature is enabled (see [features]).
pdfium-render = { workspace = true, optional = true }
image = { workspace = true }
id3 = { workspace = true }
# Async runtime and helpers.
tokio = { workspace = true }
async-trait = { workspace = true }
mime = { workspace = true }
mime_guess = { workspace = true }
serde = { workspace = true }
strum = { workspace = true }

[build-dependencies]
# build.rs downloads prebuilt PDFium binaries when the feature is on.
reqwest = { workspace = true }

[features]
default = []
pdfium = ["dep:pdfium-render"]

View File

@@ -0,0 +1,99 @@
use std::env;
use std::path::PathBuf;
/// Pinned pdfium-binaries release tag (URL-encoded "chromium/7725").
const PDFIUM_VERSION: &str = "chromium%2F7725";

/// Build the download URL for the prebuilt PDFium archive matching the
/// target OS/arch pair.
///
/// Panics for platform combinations with no published prebuilt binary.
fn pdfium_url(os: &str, arch: &str) -> String {
    let platform = match os {
        "linux" => match arch {
            "x86_64" => "linux-x64",
            "aarch64" => "linux-arm64",
            _ => panic!("unsupported platform: {os}-{arch}"),
        },
        "macos" => match arch {
            "x86_64" => "mac-x64",
            "aarch64" => "mac-arm64",
            _ => panic!("unsupported platform: {os}-{arch}"),
        },
        _ => panic!("unsupported platform: {os}-{arch}"),
    };
    format!(
        "https://github.com/bblanchon/pdfium-binaries/releases/download/{PDFIUM_VERSION}/pdfium-{platform}.tgz"
    )
}
/// Platform-specific file name of the PDFium shared library.
fn lib_name(os: &str) -> &'static str {
    if os == "macos" {
        "libpdfium.dylib"
    } else {
        "libpdfium.so"
    }
}
/// Linker flag embedding a relative rpath so the binary finds the PDFium
/// library sitting next to it (`@loader_path` on macOS, `$ORIGIN` elsewhere).
fn rpath_flag(os: &str) -> &'static str {
    if os == "macos" {
        "-Wl,-rpath,@loader_path"
    } else {
        "-Wl,-rpath,$ORIGIN"
    }
}
/// Build script: when the `pdfium` feature is on, ensure a PDFium shared
/// library is available in target/<profile>/ (downloading a prebuilt one if
/// necessary) and emit the link/rpath directives for it.
#[expect(clippy::expect_used)]
#[expect(clippy::unwrap_used)]
fn main() {
    println!("cargo:rerun-if-changed=build.rs");
    // Nothing to do unless the `pdfium` feature is enabled.
    if env::var("CARGO_FEATURE_PDFIUM").is_err() {
        return;
    }
    let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
    let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    // OUT_DIR is target/<profile>/build/<pkg>-<hash>/out
    // Go up 3 levels to reach target/<profile>/
    let profile_dir = out_dir
        .ancestors()
        .nth(3)
        .expect("unexpected OUT_DIR structure")
        .to_path_buf();
    // If PDFIUM_LIB_DIR is set (e.g. by Nix), use the pre-installed library directly.
    if let Ok(lib_dir) = env::var("PDFIUM_LIB_DIR") {
        println!("cargo:rustc-link-search=native={lib_dir}");
        println!("cargo:rustc-link-lib=dylib=pdfium");
        return;
    }
    let lib_file = lib_name(&os);
    let lib_path = profile_dir.join(lib_file);
    if !lib_path.exists() {
        let url = pdfium_url(&os, &arch);
        let tgz_path = out_dir.join("pdfium.tgz");
        // BUG FIX: `cargo:warning=` directives are only recognized on stdout;
        // this was previously eprintln!, which Cargo ignores (stderr is only
        // shown on build-script failure), so the warning never surfaced.
        println!("cargo:warning=Downloading PDFium from {url}");
        let response = reqwest::blocking::get(&url).expect("failed to download PDFium");
        assert!(
            response.status().is_success(),
            "failed to download PDFium: {}",
            response.status()
        );
        let bytes = response.bytes().expect("failed to read PDFium response");
        std::fs::write(&tgz_path, &bytes).expect("failed to write pdfium.tgz");
        // Shell out to tar rather than pulling in an archive crate.
        let status = std::process::Command::new("tar")
            .args([
                "-xzf",
                tgz_path.to_str().unwrap(),
                "-C",
                out_dir.to_str().unwrap(),
            ])
            .status()
            .expect("failed to run tar");
        assert!(status.success(), "tar failed to extract PDFium");
        // Place the library next to the built binaries so the rpath resolves.
        std::fs::copy(out_dir.join("lib").join(lib_file), &lib_path)
            .expect("failed to copy pdfium library");
    }
    println!("cargo:rustc-link-search=native={}", profile_dir.display());
    println!("cargo:rustc-link-lib=dylib=pdfium");
    println!("cargo:rustc-link-arg={}", rpath_flag(&os));
}

View File

@@ -0,0 +1,75 @@
use epub::doc::EpubDoc;
use mime::Mime;
use pile_io::SyncReadBridge;
use std::sync::{Arc, OnceLock};
use tracing::trace;
use crate::{
extract::traits::ExtractState,
value::{ArcBytes, BinaryPileValue, PileValue},
};
/// Lazily extracts the cover image from an EPUB binary value, caching the
/// outcome (including "no cover") after the first successful parse.
pub struct EpubCoverExtractor {
    // The EPUB file/blob to read from.
    item: BinaryPileValue,
    // Cached result: inner `None` once initialized means no usable cover.
    output: OnceLock<Option<(Mime, Vec<u8>)>>,
}
impl EpubCoverExtractor {
    /// Wrap a binary value; nothing is read until `get` is called.
    pub fn new(item: &BinaryPileValue) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Parse the EPUB and pull out its cover image, caching the outcome.
    ///
    /// Parse failures are logged at trace level and cached as "no cover";
    /// I/O errors are propagated and NOT cached, so a later call can retry.
    async fn get_inner(&self) -> Result<Option<&(Mime, Vec<u8>)>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x.as_ref());
        }
        // Bridge the async reader into the blocking epub parser.
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let result = tokio::task::spawn_blocking(move || {
            let mut doc = EpubDoc::from_reader(reader)?;
            let cover_id = match doc.get_cover_id() {
                Ok(id) => id,
                Err(_) => return Ok::<_, anyhow::Error>(None),
            };
            // Fall back to image/jpeg when the resource has no parsable MIME.
            let mime: Mime = doc
                .resources
                .get(&cover_id)
                .and_then(|(_, mime_str)| mime_str.parse().ok())
                .unwrap_or(mime::IMAGE_JPEG);
            let bytes = doc.get_cover()?;
            Ok(Some((mime, bytes)))
        })
        .await?;
        let result = match result {
            Ok(x) => x,
            // I/O errors bubble up; any other error means "not a usable epub".
            Err(error) => match error.downcast::<std::io::Error>() {
                Ok(x) => return Err(x),
                Err(error) => {
                    trace!(message = "Could not extract epub cover", ?error, item = ?self.item);
                    None
                }
            },
        };
        Ok(self.output.get_or_init(|| result).as_ref())
    }

    /// Return the cover as a binary blob value, or `None` for non-EPUB items
    /// (unless `ignore_mime` is set) and EPUBs without a cover.
    pub async fn get(&self, state: &ExtractState) -> Result<Option<PileValue>, std::io::Error> {
        if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
            return Ok(None);
        }
        Ok(self.get_inner().await?.map(|(mime, bytes)| {
            PileValue::Binary(BinaryPileValue::Blob {
                mime: mime.clone(),
                bytes: ArcBytes(Arc::new(bytes.clone())),
            })
        }))
    }
}

View File

@@ -1,20 +1,24 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
pub struct EpubMetaExtractor {
item: Item,
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubMetaExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -26,16 +30,9 @@ impl EpubMetaExtractor {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_meta = tokio::task::spawn_blocking(move || {
let doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let doc = EpubDoc::from_reader(reader)?;
let fields: &[&'static str] = &[
"title",
@@ -51,17 +48,19 @@ impl EpubMetaExtractor {
let meta: Vec<(&'static str, Option<String>)> =
fields.iter().map(|&key| (key, doc.mdata(key))).collect();
Ok::<_, std::io::Error>(meta)
Ok::<_, anyhow::Error>(meta)
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_meta = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x),
Err(error) => {
trace!(message = "Could not process epub", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new));
}
},
};
let mut output: HashMap<Label, PileValue> = HashMap::new();
@@ -82,7 +81,20 @@ impl EpubMetaExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for EpubMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -1,20 +1,24 @@
use epub::doc::EpubDoc;
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use tracing::debug;
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
pub struct EpubTextExtractor {
item: Item,
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl EpubTextExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -26,16 +30,9 @@ impl EpubTextExtractor {
return Ok(x);
}
let key = self.item.key();
let ext = key.as_str().rsplit('.').next();
if !matches!(ext, Some("epub")) {
return Ok(self.output.get_or_init(HashMap::new));
}
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_text = tokio::task::spawn_blocking(move || {
let mut doc = EpubDoc::from_reader(reader)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let mut doc = EpubDoc::from_reader(reader)?;
let mut text_parts: Vec<String> = Vec::new();
@@ -48,17 +45,19 @@ impl EpubTextExtractor {
}
}
Ok::<_, std::io::Error>(text_parts.join(" "))
Ok::<_, anyhow::Error>(text_parts.join(" "))
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
debug!(message = "Could not process epub", ?error, key = ?self.item.key());
return Ok(self.output.get_or_init(HashMap::new));
}
Err(error) => match error.downcast::<std::io::Error>() {
Ok(x) => return Err(x),
Err(error) => {
trace!(message = "Could not process epub", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new));
}
},
};
#[expect(clippy::unwrap_used)]
@@ -92,7 +91,20 @@ fn strip_html(html: &str) -> String {
#[async_trait::async_trait]
impl ObjectExtractor for EpubTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/epub+zip" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -0,0 +1,109 @@
use pile_config::Label;
use std::sync::Arc;
mod epub_cover;
pub use epub_cover::*;
mod epub_meta;
pub use epub_meta::*;
mod epub_text;
pub use epub_text::*;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
/// Composite EPUB extractor exposing `text`, `meta`, and `cover` fields,
/// each backed by its own lazily-caching sub-extractor.
pub struct EpubExtractor {
    // Full-text extraction of the EPUB's documents.
    text: Arc<EpubTextExtractor>,
    // OPF metadata (title, author, ...) as a nested object.
    meta: Arc<EpubMetaExtractor>,
    // Cover image extraction.
    cover: Arc<EpubCoverExtractor>,
}
impl EpubExtractor {
    /// Build the composite extractor; every sub-extractor shares the same
    /// underlying binary value and caches its own results independently.
    pub fn new(item: &BinaryPileValue) -> Self {
        let text = Arc::new(EpubTextExtractor::new(item));
        let meta = Arc::new(EpubMetaExtractor::new(item));
        let cover = Arc::new(EpubCoverExtractor::new(item));
        Self { text, meta, cover }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for EpubExtractor {
    /// Dispatch to the sub-extractor for `text`, `meta`, or `cover`.
    ///
    /// `text` forwards any arguments and maps a missing value to `Null`;
    /// `meta` is exposed as a nested object extractor; `cover` yields the
    /// cover image (or `None`). Unknown fields return `None`.
    async fn field(
        &self,
        state: &ExtractState,
        name: &pile_config::Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        match (name.as_str(), args) {
            ("text", args) => Ok(Some(
                self.text
                    .field(state, name, args)
                    .await
                    .map(|x| x.unwrap_or(PileValue::Null))?,
            )),
            ("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
            ("cover", None) => self.cover.get(state).await,
            _ => Ok(None),
        }
    }

    /// The fixed set of fields this extractor provides.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(vec![
            Label::new("text").unwrap(),
            Label::new("meta").unwrap(),
            Label::new("cover").unwrap(),
        ])
    }

    /// JSON summary of all fields. `text` and `cover` are abbreviated to
    /// short descriptors instead of inlining the full payload.
    async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let keys = self.fields().await?;
        let mut map = serde_json::Map::new();
        for k in &keys {
            let v = match self.field(state, k, None).await? {
                Some(x) => x,
                None => continue,
            };
            if k.as_str() == "text" {
                map.insert(
                    k.to_string(),
                    serde_json::Value::String(format!(
                        // BUG FIX: summary was missing its closing ">",
                        // inconsistent with the "<Blob ...>"/"<File ...>"
                        // forms used for `cover` below.
                        "<String ({} bytes)>",
                        match v {
                            PileValue::String(x) => x.len(),
                            _ => 0,
                        }
                    )),
                );
                continue;
            }
            if k.as_str() == "cover" {
                let summary = match &v {
                    PileValue::Binary(BinaryPileValue::Blob { mime, bytes }) => {
                        format!("<Blob ({mime}, {} bytes)>", bytes.0.len())
                    }
                    PileValue::Binary(BinaryPileValue::File { mime, .. }) => {
                        format!("<File ({mime})>")
                    }
                    PileValue::Null => "<null>".to_owned(),
                    _ => "<cover>".to_owned(),
                };
                map.insert(k.to_string(), serde_json::Value::String(summary));
                continue;
            }
            // Everything else serializes through its own JSON conversion.
            map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
        }
        Ok(serde_json::Value::Object(map))
    }
}

View File

@@ -1,4 +1,5 @@
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
io::BufReader,
@@ -6,15 +7,18 @@ use std::{
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ObjectExtractor};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
pub struct ExifExtractor {
item: Item,
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl ExifExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -29,9 +33,7 @@ impl ExifExtractor {
let reader = SyncReadBridge::new_current(self.item.read().await?);
let raw_fields = tokio::task::spawn_blocking(move || {
let mut br = BufReader::new(reader);
let exif = exif::Reader::new()
.read_from_container(&mut br)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let exif = exif::Reader::new().read_from_container(&mut br)?;
let fields: Vec<(String, String)> = exif
.fields()
@@ -43,15 +45,15 @@ impl ExifExtractor {
})
.collect();
Ok::<_, std::io::Error>(fields)
Ok::<_, exif::Error>(fields)
})
.await
.map_err(std::io::Error::other)?;
.await?;
let raw_fields = match raw_fields {
Ok(x) => x,
Err(exif::Error::Io(x)) => return Err(x),
Err(error) => {
trace!(message = "Could not process exif", ?error, key = ?self.item.key());
trace!(message = "Could not process exif", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new));
}
};
@@ -62,6 +64,7 @@ impl ExifExtractor {
let Some(label) = tag_to_label(&tag_name) else {
continue;
};
// First occurrence wins (PRIMARY IFD comes before THUMBNAIL)
output
.entry(label)
@@ -83,7 +86,26 @@ fn tag_to_label(tag: &str) -> Option<Label> {
#[async_trait::async_trait]
impl ObjectExtractor for ExifExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
item = ?self.item,
"Getting field {name:?} from ExifExtractor",
);
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().type_() != mime::IMAGE {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -0,0 +1,236 @@
use mime::Mime;
use pile_config::Label;
use pile_flac::{FlacBlock, FlacDecodeError, FlacReader};
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::{ArcBytes, BinaryPileValue, PileValue},
};
/// Lazy list extractor over the PICTURE metadata blocks of a FLAC stream.
pub struct FlacImagesExtractor {
    // The FLAC file/blob to read from.
    item: BinaryPileValue,
    // Cached picture count (populated by `len`).
    cached_count: OnceLock<usize>,
}
impl FlacImagesExtractor {
    /// Wrap a binary value; the FLAC stream is only read on demand.
    pub fn new(item: &BinaryPileValue) -> Self {
        Self {
            item: item.clone(),
            cached_count: OnceLock::new(),
        }
    }

    /// Count PICTURE metadata blocks by scanning the stream head.
    ///
    /// Scanning stops at the first audio frame (metadata precedes audio).
    /// Non-I/O decode errors are treated as "no pictures" (0). Caching of
    /// the result is handled by the caller (`len`).
    async fn get_count(&self) -> Result<usize, std::io::Error> {
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let count = tokio::task::spawn_blocking(move || {
            let reader = FlacReader::new(BufReader::new(reader));
            let mut count = 0usize;
            for block in reader {
                match block {
                    Ok(FlacBlock::AudioFrame(_)) => break,
                    Ok(FlacBlock::Picture(_)) => count += 1,
                    Err(FlacDecodeError::IoError(err)) => return Err(err),
                    Err(_) => return Ok(0),
                    _ => {}
                }
            }
            Ok::<_, std::io::Error>(count)
        })
        .await??;
        return Ok(count);
    }

    /// Whether the item should be treated as FLAC (or MIME gating is off).
    fn mime_ok(&self, state: &ExtractState) -> bool {
        if state.ignore_mime {
            return true;
        }
        let essence = self.item.mime().essence_str();
        essence == "audio/flac" || essence == "audio/x-flac"
    }
}
#[async_trait::async_trait]
impl ListExtractor for FlacImagesExtractor {
    /// Return the `idx`-th embedded picture as a binary blob, or `None` if
    /// out of range, not a FLAC, or the stream fails to parse.
    async fn get(
        &self,
        state: &ExtractState,
        mut idx: usize,
    ) -> Result<Option<PileValue>, std::io::Error> {
        trace!(
            item = ?self.item,
            "Getting index {idx} from FlacImagesExtractor",
        );
        if !self.mime_ok(state) {
            return Ok(None);
        }
        let item = self.item.clone();
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let image = tokio::task::spawn_blocking(move || {
            let reader = FlacReader::new(BufReader::new(reader));
            let mut out: Option<(Mime, Vec<u8>)> = None;
            // Walk metadata blocks, counting `idx` down past earlier
            // pictures; stop once audio frames begin.
            'blocks: for block in reader {
                match block {
                    Ok(FlacBlock::AudioFrame(_)) => break,
                    Ok(FlacBlock::Picture(picture)) => {
                        if idx > 0 {
                            idx -= 1;
                            continue;
                        }
                        out = Some((picture.mime, picture.img_data));
                        break 'blocks;
                    }
                    Err(FlacDecodeError::IoError(err)) => return Err(err),
                    Err(error) => {
                        trace!(message = "Could not parse FLAC images", ?item, ?error);
                        return Ok(None);
                    }
                    _ => {}
                }
            }
            Ok::<_, std::io::Error>(out)
        })
        .await
        .map_err(std::io::Error::other)??;
        Ok(image.map(|(mime, data)| {
            PileValue::Binary(BinaryPileValue::Blob {
                mime,
                bytes: ArcBytes(Arc::new(data)),
            })
        }))
    }

    /// Number of embedded pictures; computed once and then cached.
    async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
        if !self.mime_ok(state) {
            return Ok(0);
        }
        if let Some(x) = self.cached_count.get() {
            return Ok(*x);
        }
        let count = self.get_count().await?;
        return Ok(*self.cached_count.get_or_init(|| count));
    }
}
/// Extractor for FLAC Vorbis comments, with embedded pictures exposed as a
/// nested `images` list.
pub struct FlacExtractor {
    // The FLAC file/blob to read from.
    item: BinaryPileValue,
    // Cached Vorbis-comment fields (label -> array of string values).
    output: OnceLock<HashMap<Label, PileValue>>,
    // Lazy list extractor for embedded PICTURE blocks.
    images: PileValue,
}
impl FlacExtractor {
    /// Wrap a binary value; `images` is exposed as a lazy list extractor.
    pub fn new(item: &BinaryPileValue) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
            images: PileValue::ListExtractor(Arc::new(FlacImagesExtractor::new(item))),
        }
    }

    /// Read and cache the Vorbis comments from the FLAC metadata blocks.
    ///
    /// Scanning stops at the first audio frame. Comment keys are lowercased
    /// into labels (keys that are not valid labels are dropped); repeated
    /// keys accumulate into arrays. Non-I/O decode errors yield an empty map.
    async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }
        trace!(message = "Reading FLAC tags", item = ?self.item);
        let item = self.item.clone();
        let reader = SyncReadBridge::new_current(self.item.read().await?);
        let output = tokio::task::spawn_blocking(move || {
            let reader = FlacReader::new(BufReader::new(reader));
            let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
            for block in reader {
                match block {
                    Ok(FlacBlock::AudioFrame(_)) => break,
                    Ok(FlacBlock::VorbisComment(comment)) => {
                        for (k, v) in comment.comment.comments {
                            if let Some(label) = Label::new(k.to_string().to_lowercase()) {
                                output
                                    .entry(label)
                                    .or_default()
                                    .push(PileValue::String(Arc::new(v)));
                            }
                        }
                    }
                    Err(FlacDecodeError::IoError(err)) => return Err(err),
                    Err(error) => {
                        trace!(message = "Could not parse FLAC metadata", ?item, ?error);
                        return Ok(HashMap::new());
                    }
                    _ => {}
                }
            }
            // Every key maps to an array, even single-valued ones.
            let output: HashMap<Label, PileValue> = output
                .into_iter()
                .map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
                .collect();
            Ok::<HashMap<Label, PileValue>, std::io::Error>(output)
        })
        .await??;
        return Ok(self.output.get_or_init(|| output));
    }

    /// Whether the item should be treated as FLAC (or MIME gating is off).
    fn mime_ok(&self, state: &ExtractState) -> bool {
        if state.ignore_mime {
            return true;
        }
        let essence = self.item.mime().essence_str();
        essence == "audio/flac" || essence == "audio/x-flac"
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for FlacExtractor {
    /// Look up a single Vorbis-comment field, or the synthetic `images` list.
    ///
    /// Arguments are not supported, and non-FLAC items (unless `ignore_mime`
    /// is set) always yield `None`.
    async fn field(
        &self,
        state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        if args.is_some() || !self.mime_ok(state) {
            return Ok(None);
        }
        match name.as_str() {
            "images" => Ok(Some(self.images.clone())),
            _ => Ok(self.get_inner().await?.get(name).cloned()),
        }
    }

    /// All available labels: every Vorbis-comment key plus `images`.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let mut labels: Vec<Label> = self.get_inner().await?.keys().cloned().collect();
        labels.push(Label::new("images").unwrap());
        Ok(labels)
    }
}

View File

@@ -0,0 +1,113 @@
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
use pile_config::Label;
use std::{
collections::HashMap,
path::Component,
sync::{Arc, OnceLock},
};
/// Extractor for filesystem-derived metadata (`extension`, `path`,
/// `segments`, `name`) of a file-backed binary value.
pub struct FsExtractor {
    // The value to inspect; only the `File` variant yields any fields.
    item: BinaryPileValue,
    // Lazily computed field map.
    output: OnceLock<HashMap<Label, PileValue>>,
}
impl FsExtractor {
    /// Create an extractor over a binary value; only `File`-backed values
    /// yield any fields.
    pub fn new(item: &BinaryPileValue) -> Self {
        Self {
            item: item.clone(),
            output: OnceLock::new(),
        }
    }

    /// Compute (once) the filesystem-derived fields: `extension`, `path`,
    /// `segments`, and `name`. Non-file values produce an empty map; any
    /// non-UTF-8 component makes the path-derived fields `Null`.
    fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
        if let Some(x) = self.output.get() {
            return Ok(x);
        }
        let path = match &self.item {
            BinaryPileValue::File { path, .. } => path,
            _ => return Ok(self.output.get_or_init(HashMap::new)),
        };
        let mut root = false;
        // BUG FIX: the root component used to be emitted as `Some(None)`,
        // which made the `collect::<Option<Vec<_>>>()` below short-circuit
        // to `None` for every absolute path, so `path`/`segments`/`name`
        // were always Null and the `root` prefix logic never ran. Record
        // the root in the flag and skip the component instead.
        let components = path
            .components()
            .filter_map(|x| match x {
                Component::CurDir => None,
                Component::Normal(x) => Some(x.to_str().map(|x| x.to_owned())),
                Component::ParentDir => Some(Some("..".to_owned())),
                Component::RootDir => {
                    root = true;
                    None
                }
                Component::Prefix(x) => Some(x.as_os_str().to_str().map(|x| x.to_owned())),
            })
            .collect::<Option<Vec<_>>>();
        let mut path_str = components.as_ref().map(|x| x.join("/"));
        if root {
            // Re-attach the leading separator skipped above.
            path_str = path_str.map(|x| format!("/{x}"));
        }
        #[expect(clippy::unwrap_used)]
        let output = HashMap::from([
            (
                Label::new("extension").unwrap(),
                path.extension()
                    .and_then(|x| x.to_str())
                    .map(|x| PileValue::String(Arc::new(x.into())))
                    .unwrap_or(PileValue::Null),
            ),
            (
                Label::new("path").unwrap(),
                path_str
                    .map(|x| PileValue::String(Arc::new(x.into())))
                    .unwrap_or(PileValue::Null),
            ),
            (
                Label::new("segments").unwrap(),
                components
                    .clone()
                    .map(|x| {
                        PileValue::Array(Arc::new(
                            x.iter()
                                .map(|x| PileValue::String(Arc::new(x.into())))
                                .collect(),
                        ))
                    })
                    .unwrap_or(PileValue::Null),
            ),
            (
                Label::new("name").unwrap(),
                components
                    .and_then(|x| x.last().map(|x| PileValue::String(Arc::new(x.into()))))
                    .unwrap_or(PileValue::Null),
            ),
        ]);
        return Ok(self.output.get_or_init(|| output));
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for FsExtractor {
    /// Fetch one filesystem-derived field by label; arguments are rejected.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        if args.is_some() {
            return Ok(None);
        }
        let fields = self.get_inner()?;
        Ok(fields.get(name).cloned())
    }

    /// List every label this extractor can produce.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let labels = self.get_inner()?.keys().cloned().collect();
        Ok(labels)
    }
}

View File

@@ -0,0 +1,111 @@
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{io::Read, sync::Arc};
use tokio::sync::OnceCell;
/// Render a byte slice as lowercase hexadecimal.
fn to_hex(bytes: &[u8]) -> String {
    use std::fmt::Write;
    let mut out = String::with_capacity(bytes.len() * 2);
    for byte in bytes {
        // Writing into a String cannot fail; ignore the fmt::Result.
        let _ = write!(out, "{byte:02x}");
    }
    out
}
// Generates `HashExtractor` with one lazily-computed cache cell per hash
// algorithm, plus the static `LABELS` list of the corresponding field names.
macro_rules! hash_algos {
    ($($name:ident),* $(,)?) => {
        /// Computes cryptographic digests of an item's raw bytes on demand,
        /// memoizing each algorithm's hex digest independently.
        pub struct HashExtractor {
            item: BinaryPileValue,
            $($name: OnceCell<String>,)*
        }
        impl HashExtractor {
            /// Wrap a binary value; no digest is computed until requested.
            pub fn new(item: &BinaryPileValue) -> Self {
                Self {
                    item: item.clone(),
                    $($name: OnceCell::new(),)*
                }
            }
        }
        // One label per algorithm, in declaration order.
        static LABELS: std::sync::LazyLock<Vec<Label>> = std::sync::LazyLock::new(|| {
            vec![$(Label::new(stringify!($name)).unwrap()),*]
        });
    };
}
hash_algos!(blake3, md5, sha1, sha224, sha256, sha384, sha512);
impl HashExtractor {
    /// Compute (and memoize) the digest whose label matches `name`, reading
    /// the item's full contents into memory once per algorithm.
    /// Returns `Ok(None)` for names that are not a known algorithm.
    async fn compute(&self, name: &Label) -> Result<Option<String>, std::io::Error> {
        let name_str = name.as_ref();
        // Expands to: if the requested name matches this algorithm, lazily
        // read the bytes on a blocking thread, hash them with `$compute`,
        // and cache the hex digest in the per-algorithm OnceCell.
        macro_rules! algo {
            ($cell:ident, $compute:expr) => {
                if name_str == stringify!($cell) {
                    return Ok(Some(
                        self.$cell
                            .get_or_try_init(|| async {
                                let read = self.item.read().await?;
                                let mut read = SyncReadBridge::new_current(read);
                                tokio::task::spawn_blocking(move || {
                                    let mut bytes = Vec::new();
                                    read.read_to_end(&mut bytes)?;
                                    Ok::<String, std::io::Error>($compute(&bytes))
                                })
                                .await?
                            })
                            .await?
                            .clone(),
                    ));
                }
            };
        }
        algo!(blake3, |b: &Vec<u8>| blake3::hash(b).to_hex().to_string());
        algo!(md5, |b: &Vec<u8>| format!("{:x}", md5::compute(b)));
        algo!(sha1, |b: &Vec<u8>| {
            use sha1::Digest;
            to_hex(sha1::Sha1::digest(b).as_ref())
        });
        algo!(sha224, |b: &Vec<u8>| {
            use sha2::Digest;
            to_hex(sha2::Sha224::digest(b).as_ref())
        });
        algo!(sha256, |b: &Vec<u8>| {
            use sha2::Digest;
            to_hex(sha2::Sha256::digest(b).as_ref())
        });
        algo!(sha384, |b: &Vec<u8>| {
            use sha2::Digest;
            to_hex(sha2::Sha384::digest(b).as_ref())
        });
        algo!(sha512, |b: &Vec<u8>| {
            use sha2::Digest;
            to_hex(sha2::Sha512::digest(b).as_ref())
        });
        Ok(None)
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for HashExtractor {
    /// Compute the digest named by `name` (e.g. `sha256`) over the item's
    /// bytes; unknown names and any arguments yield `None`.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        if args.is_some() {
            return Ok(None);
        }
        let digest = self.compute(name).await?;
        Ok(digest.map(|hex| PileValue::String(Arc::new(hex.into()))))
    }

    /// All supported digest labels, as generated by `hash_algos!`.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        Ok(LABELS.clone())
    }
}

View File

@@ -0,0 +1,241 @@
use id3::Tag;
use mime::Mime;
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
borrow::Cow,
collections::HashMap,
io::BufReader,
sync::{Arc, OnceLock},
};
use tracing::trace;
use crate::{
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::{ArcBytes, BinaryPileValue, PileValue},
};
pub struct Id3ImagesExtractor {
item: BinaryPileValue,
cached_count: OnceLock<usize>,
}
impl Id3ImagesExtractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
cached_count: OnceLock::new(),
}
}
async fn read_tag(&self) -> Result<Option<Tag>, std::io::Error> {
let item = self.item.clone();
let reader = SyncReadBridge::new_current(self.item.read().await?);
tokio::task::spawn_blocking(move || match Tag::read_from2(BufReader::new(reader)) {
Ok(tag) => Ok(Some(tag)),
Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
}) => Err(e),
Err(error) => {
trace!(message = "Could not parse id3 tags", ?item, ?error);
Ok(None)
}
})
.await
.map_err(std::io::Error::other)?
}
fn mime_ok(&self, state: &ExtractState) -> bool {
state.ignore_mime || self.item.mime().essence_str() == "audio/mpeg"
}
}
#[async_trait::async_trait]
impl ListExtractor for Id3ImagesExtractor {
async fn get(
&self,
state: &ExtractState,
idx: usize,
) -> Result<Option<PileValue>, std::io::Error> {
if !self.mime_ok(state) {
return Ok(None);
}
let Some(tag) = self.read_tag().await? else {
return Ok(None);
};
let Some(picture) = tag.pictures().nth(idx) else {
return Ok(None);
};
let mime: Mime = picture
.mime_type
.parse()
.unwrap_or(mime::APPLICATION_OCTET_STREAM);
let data = picture.data.clone();
Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime,
bytes: ArcBytes(Arc::new(data)),
})))
}
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
if !self.mime_ok(state) {
return Ok(0);
}
if let Some(x) = self.cached_count.get() {
return Ok(*x);
}
let count = match self.read_tag().await? {
Some(tag) => tag.pictures().count(),
None => 0,
};
Ok(*self.cached_count.get_or_init(|| count))
}
}
pub struct Id3Extractor {
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
images: PileValue,
}
impl Id3Extractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
images: PileValue::ListExtractor(Arc::new(Id3ImagesExtractor::new(item))),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
trace!(message = "Reading id3 tags", key = ?self.item);
let item = self.item.clone();
let reader = SyncReadBridge::new_current(self.item.read().await?);
let tag = match tokio::task::spawn_blocking(move || Tag::read_from2(BufReader::new(reader)))
.await
{
Ok(Ok(tag)) => tag,
Err(e) => return Err(e.into()),
Ok(Err(id3::Error {
kind: id3::ErrorKind::Io(e),
..
})) => return Err(e),
Ok(Err(error)) => {
trace!(message = "Could not parse id3 tags", ?item, ?error);
return Ok(self.output.get_or_init(HashMap::new));
}
};
let mut output: HashMap<Label, Vec<PileValue>> = HashMap::new();
for frame in tag.frames() {
if let Some(texts) = frame.content().text_values() {
let name = frame_id_to_field(frame.id());
if let Some(key) = Label::new(name) {
for text in texts {
output
.entry(key.clone())
.or_default()
.push(PileValue::String(Arc::new(text.into())));
}
}
}
}
let output = output
.into_iter()
.map(|(k, v)| (k, PileValue::Array(Arc::new(v))))
.collect();
return Ok(self.output.get_or_init(|| output));
}
}
/// Map an ID3 frame ID to the equivalent Vorbis Comment field name.
/// Falls back to the lowercased frame ID if no mapping exists.
fn frame_id_to_field(id: &str) -> Cow<'static, str> {
match id {
// spell:off
"TIT2" => Cow::Borrowed("title"),
"TIT1" => Cow::Borrowed("grouping"),
"TIT3" => Cow::Borrowed("subtitle"),
"TPE1" => Cow::Borrowed("artist"),
"TPE2" => Cow::Borrowed("albumartist"),
"TPE3" => Cow::Borrowed("conductor"),
"TOPE" => Cow::Borrowed("originalartist"),
"TALB" => Cow::Borrowed("album"),
"TOAL" => Cow::Borrowed("originalalbum"),
"TRCK" => Cow::Borrowed("tracknumber"),
"TPOS" => Cow::Borrowed("discnumber"),
"TSST" => Cow::Borrowed("discsubtitle"),
"TDRC" | "TYER" => Cow::Borrowed("date"),
"TDOR" | "TORY" => Cow::Borrowed("originaldate"),
"TCON" => Cow::Borrowed("genre"),
"TCOM" => Cow::Borrowed("composer"),
"TEXT" => Cow::Borrowed("lyricist"),
"TPUB" => Cow::Borrowed("label"),
"TSRC" => Cow::Borrowed("isrc"),
"TBPM" => Cow::Borrowed("bpm"),
"TLAN" => Cow::Borrowed("language"),
"TMED" => Cow::Borrowed("media"),
"TMOO" => Cow::Borrowed("mood"),
"TCOP" => Cow::Borrowed("copyright"),
"TENC" => Cow::Borrowed("encodedby"),
"TSSE" => Cow::Borrowed("encodersettings"),
"TSOA" => Cow::Borrowed("albumsort"),
"TSOP" => Cow::Borrowed("artistsort"),
"TSOT" => Cow::Borrowed("titlesort"),
"MVNM" => Cow::Borrowed("movement"),
"MVIN" => Cow::Borrowed("movementnumber"),
_ => Cow::Owned(id.to_lowercase()),
// spell:on
}
}
#[async_trait::async_trait]
impl ObjectExtractor for Id3Extractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "audio/mpeg" {
return Ok(None);
}
if name.as_str() == "images" {
return Ok(Some(self.images.clone()));
}
Ok(self.get_inner().await?.get(name).cloned())
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self
.get_inner()
.await?
.keys()
.cloned()
.chain([Label::new("images").unwrap()])
.collect::<Vec<_>>())
}
}

View File

@@ -0,0 +1,100 @@
use image::ImageFormat;
use mime::Mime;
use pile_config::Label;
use pile_io::AsyncReader;
use std::{io::Cursor, str::FromStr, sync::Arc};
use tracing::trace;
use transform::{CropTransformer, ImageTransformer, MaxDimTransformer};
mod transform;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{ArcBytes, BinaryPileValue, PileValue},
};
pub struct ImageExtractor {
item: BinaryPileValue,
}
impl ImageExtractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self { item: item.clone() }
}
async fn apply<T: ImageTransformer + Send + 'static>(
&self,
args: &str,
) -> Result<Option<PileValue>, std::io::Error> {
let transformer = match T::parse_args(args) {
Ok(t) => t,
Err(_) => return Ok(None),
};
let mime = self.item.mime().clone();
let bytes = self.item.read().await?.read_to_end().await?;
let Some(format) = ImageFormat::from_mime_type(&mime) else {
return Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime,
bytes: ArcBytes(Arc::new(bytes)),
})));
};
let bytes_for_closure = bytes.clone();
let result = tokio::task::spawn_blocking(move || {
let mut img = image::load_from_memory_with_format(&bytes_for_closure, format)?;
transformer.transform(&mut img);
let mut out = Cursor::new(Vec::new());
img.write_to(&mut out, format)?;
let out_mime =
Mime::from_str(format.to_mime_type()).unwrap_or(mime::APPLICATION_OCTET_STREAM);
Ok::<_, image::ImageError>((out_mime, out.into_inner()))
})
.await?;
match result {
Ok((out_mime, out_bytes)) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime: out_mime,
bytes: ArcBytes(Arc::new(out_bytes)),
}))),
Err(_) => Ok(Some(PileValue::Binary(BinaryPileValue::Blob {
mime,
bytes: ArcBytes(Arc::new(bytes)),
}))),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for ImageExtractor {
async fn field(
&self,
_state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
let Some(args) = args else {
return Ok(None);
};
trace!(?args, "Getting field {name:?} from ImageExtractor",);
match name.as_str() {
"maxdim" => self.apply::<MaxDimTransformer>(args).await,
"crop" => self.apply::<CropTransformer>(args).await,
_ => Ok(None),
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("maxdim").unwrap(),
Label::new("crop").unwrap(),
])
}
}

View File

@@ -0,0 +1,4 @@
mod pixeldim;
pub mod transformers;
pub use transformers::{CropTransformer, ImageTransformer, MaxDimTransformer};

View File

@@ -0,0 +1,68 @@
use serde::{Deserialize, Deserializer};
use std::fmt;
use std::str::FromStr;
// TODO: parse -, + (100vw - 10px)
// TODO: parse 100vw [min] 10
// TODO: parse 100vw [max] 10
#[derive(Debug, Clone, PartialEq)]
pub enum PixelDim {
Pixels(u32),
WidthPercent(f32),
HeightPercent(f32),
}
impl FromStr for PixelDim {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let numeric_end = s.find(|c: char| !c.is_ascii_digit() && c != '.');
let (quantity, unit) = numeric_end.map(|x| s.split_at(x)).unwrap_or((s, "px"));
let quantity = quantity.trim();
let unit = unit.trim();
match unit {
"vw" => Ok(PixelDim::WidthPercent(
quantity
.parse()
.map_err(|_err| format!("invalid quantity {quantity}"))?,
)),
"vh" => Ok(PixelDim::HeightPercent(
quantity
.parse()
.map_err(|_err| format!("invalid quantity {quantity}"))?,
)),
"px" => Ok(PixelDim::Pixels(
quantity
.parse()
.map_err(|_err| format!("invalid quantity {quantity}"))?,
)),
_ => Err(format!("invalid unit {unit}")),
}
}
}
impl<'de> Deserialize<'de> for PixelDim {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
FromStr::from_str(&s).map_err(serde::de::Error::custom)
}
}
impl fmt::Display for PixelDim {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PixelDim::Pixels(px) => write!(f, "{px}"),
PixelDim::WidthPercent(p) => write!(f, "{p:.2}vw"),
PixelDim::HeightPercent(p) => write!(f, "{p:.2}vh"),
}
}
}

View File

@@ -0,0 +1,188 @@
use image::DynamicImage;
use serde::{Deserialize, Serialize};
use std::{fmt::Display, str::FromStr};
use strum::{Display, EnumString};
use super::super::{pixeldim::PixelDim, transformers::ImageTransformer};
#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumString, Serialize, Deserialize, Display)]
pub enum Direction {
#[serde(rename = "n")]
#[strum(to_string = "n")]
#[strum(serialize = "north")]
North,
#[serde(rename = "e")]
#[strum(serialize = "e")]
#[strum(serialize = "east")]
East,
#[serde(rename = "s")]
#[strum(serialize = "s")]
#[strum(serialize = "south")]
South,
#[serde(rename = "w")]
#[strum(to_string = "w")]
#[strum(serialize = "west")]
West,
#[serde(rename = "c")]
#[strum(serialize = "c")]
#[strum(serialize = "center")]
Center,
#[serde(rename = "ne")]
#[strum(serialize = "ne")]
#[strum(serialize = "northeast")]
NorthEast,
#[serde(rename = "se")]
#[strum(serialize = "se")]
#[strum(serialize = "southeast")]
SouthEast,
#[serde(rename = "nw")]
#[strum(serialize = "nw")]
#[strum(serialize = "northwest")]
NorthWest,
#[serde(rename = "sw")]
#[strum(serialize = "sw")]
#[strum(serialize = "southwest")]
SouthWest,
}
/// Crop an image to (at most) the given size.
/// See [Self::new] for details.
#[derive(Debug, Clone, PartialEq)]
pub struct CropTransformer {
w: PixelDim,
h: PixelDim,
float: Direction,
}
impl CropTransformer {
/// Create a new [CropTransformer] with the given parameters.
///
/// A [CropTransformer] creates an image of size `w x h`, but...
/// - does not reduce width if `w` is greater than image width
/// - does not reduce height if `h` is greater than image height
/// - does nothing if `w` or `h` is less than or equal to zero.
#[expect(dead_code)]
pub fn new(w: PixelDim, h: PixelDim, float: Direction) -> Self {
Self { w, h, float }
}
fn crop_dim(&self, img_width: u32, img_height: u32) -> (u32, u32) {
let crop_width = match self.w {
PixelDim::Pixels(w) => w,
PixelDim::WidthPercent(pct) => ((img_width as f32) * pct / 100.0) as u32,
PixelDim::HeightPercent(pct) => ((img_height as f32) * pct / 100.0) as u32,
};
let crop_height = match self.h {
PixelDim::Pixels(h) => h,
PixelDim::WidthPercent(pct) => ((img_width as f32) * pct / 100.0) as u32,
PixelDim::HeightPercent(pct) => ((img_height as f32) * pct / 100.0) as u32,
};
(crop_width, crop_height)
}
#[expect(clippy::integer_division)]
fn crop_pos(
&self,
img_width: u32,
img_height: u32,
crop_width: u32,
crop_height: u32,
) -> (u32, u32) {
match self.float {
Direction::North => {
let x = (img_width - crop_width) / 2;
let y = 0;
(x, y)
}
Direction::East => {
let x = img_width - crop_width;
let y = (img_height - crop_height) / 2;
(x, y)
}
Direction::South => {
let x = (img_width - crop_width) / 2;
let y = img_height - crop_height;
(x, y)
}
Direction::West => {
let x = 0;
let y = (img_height - crop_height) / 2;
(x, y)
}
Direction::Center => {
let x = (img_width - crop_width) / 2;
let y = (img_height - crop_height) / 2;
(x, y)
}
Direction::NorthEast => {
let x = img_width - crop_width;
let y = 0;
(x, y)
}
Direction::SouthEast => {
let x = img_width - crop_width;
let y = img_height - crop_height;
(x, y)
}
Direction::NorthWest => {
let x = 0;
let y = 0;
(x, y)
}
Direction::SouthWest => {
let x = 0;
let y = img_height - crop_height;
(x, y)
}
}
}
}
impl Display for CropTransformer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "crop({},{},{})", self.w, self.h, self.float)
}
}
impl ImageTransformer for CropTransformer {
fn parse_args(args: &str) -> Result<Self, String> {
let args: Vec<&str> = args.split(",").collect();
if args.len() != 3 {
return Err(format!("expected 3 args, got {}", args.len()));
}
let w = args[0].trim().parse::<PixelDim>()?;
let h = args[1].trim().parse::<PixelDim>()?;
let direction = args[2].trim();
let direction = Direction::from_str(direction)
.map_err(|_err| format!("invalid direction {direction}"))?;
Ok(Self {
w,
h,
float: direction,
})
}
fn transform(&self, input: &mut DynamicImage) {
let (img_width, img_height) = (input.width(), input.height());
let (crop_width, crop_height) = self.crop_dim(img_width, img_height);
if (crop_width < img_width || crop_height < img_height) && crop_width > 0 && crop_height > 0
{
let (x, y) = self.crop_pos(img_width, img_height, crop_width, crop_height);
*input = input.crop(x, y, crop_width, crop_height);
}
}
}

View File

@@ -0,0 +1,87 @@
use image::{DynamicImage, imageops::FilterType};
use std::fmt::Display;
use super::super::{pixeldim::PixelDim, transformers::ImageTransformer};
/// Scale an image until it fits in a configured bounding box.
#[derive(Debug, Clone, PartialEq)]
pub struct MaxDimTransformer {
w: PixelDim,
h: PixelDim,
}
impl MaxDimTransformer {
/// Create a new [MaxDimTransformer] that scales an image down
/// until it fits in a box of dimension `w x h`.
///
/// Images are never scaled up.
#[expect(dead_code)]
pub fn new(w: PixelDim, h: PixelDim) -> Self {
Self { w, h }
}
fn target_dim(&self, img_width: u32, img_height: u32) -> (u32, u32) {
let max_width = match self.w {
PixelDim::Pixels(w) => Some(w),
PixelDim::WidthPercent(pct) => Some(((img_width as f32) * pct / 100.0) as u32),
PixelDim::HeightPercent(_) => None,
};
let max_height = match self.h {
PixelDim::Pixels(h) => Some(h),
PixelDim::HeightPercent(pct) => Some(((img_height as f32) * pct / 100.0) as u32),
PixelDim::WidthPercent(_) => None,
};
if max_width.map(|x| img_width <= x).unwrap_or(true)
&& max_height.map(|x| img_height <= x).unwrap_or(true)
{
return (img_width, img_height);
}
let width_ratio = max_width
.map(|x| x as f32 / img_width as f32)
.unwrap_or(1.0);
let height_ratio = max_height
.map(|x| x as f32 / img_height as f32)
.unwrap_or(1.0);
let ratio = width_ratio.min(height_ratio);
(
(img_width as f32 * ratio) as u32,
(img_height as f32 * ratio) as u32,
)
}
}
impl Display for MaxDimTransformer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "maxdim({},{})", self.w, self.h)
}
}
impl ImageTransformer for MaxDimTransformer {
fn parse_args(args: &str) -> Result<Self, String> {
let args: Vec<&str> = args.split(",").collect();
if args.len() != 2 {
return Err(format!("expected 2 args, got {}", args.len()));
}
let w = args[0].parse::<PixelDim>()?;
let h = args[1].parse::<PixelDim>()?;
Ok(Self { w, h })
}
fn transform(&self, input: &mut DynamicImage) {
let (img_width, img_height) = (input.width(), input.height());
let (target_width, target_height) = self.target_dim(img_width, img_height);
// Only resize if needed
if target_width != img_width || target_height != img_height {
*input = input.resize(target_width, target_height, FilterType::Lanczos3);
}
}
}

View File

@@ -0,0 +1,26 @@
//! Defines all transformation steps we can apply to an image
use image::DynamicImage;
use std::fmt::{Debug, Display};
mod crop;
pub use crop::*;
mod maxdim;
pub use maxdim::*;
/// A single transformation that may be applied to an image.
pub trait ImageTransformer
where
Self: PartialEq,
Self: Sized + Clone,
Self: Display + Debug,
{
/// Transform the given image in place
fn transform(&self, input: &mut DynamicImage);
/// Parse an arg string.
///
/// `name({arg_string})`
fn parse_args(args: &str) -> Result<Self, String>;
}

View File

@@ -0,0 +1,88 @@
use pile_config::Label;
use pile_io::AsyncReader;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
fn json_to_pile(value: serde_json::Value) -> PileValue {
match value {
serde_json::Value::Null => PileValue::Null,
serde_json::Value::Bool(b) => PileValue::String(Arc::new(b.to_string().into())),
serde_json::Value::Number(n) => PileValue::String(Arc::new(n.to_string().into())),
serde_json::Value::String(s) => PileValue::String(Arc::new(s.into())),
serde_json::Value::Array(a) => {
PileValue::Array(Arc::new(a.into_iter().map(json_to_pile).collect()))
}
serde_json::Value::Object(_) => PileValue::Null,
}
}
pub struct JsonExtractor {
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl JsonExtractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
async fn get_inner(&self) -> Result<&HashMap<Label, PileValue>, std::io::Error> {
if let Some(x) = self.output.get() {
return Ok(x);
}
let mut reader = self.item.read().await?;
let bytes = reader.read_to_end().await?;
let json: serde_json::Value = match serde_json::from_slice(&bytes) {
Ok(x) => x,
Err(_) => return Ok(self.output.get_or_init(HashMap::new)),
};
let output: HashMap<Label, PileValue> = match json {
serde_json::Value::Object(map) => map
.into_iter()
.filter_map(|(k, v)| Label::new(&k).map(|label| (label, json_to_pile(v))))
.collect(),
_ => HashMap::new(),
};
return Ok(self.output.get_or_init(|| output));
}
}
#[async_trait::async_trait]
impl ObjectExtractor for JsonExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime
&& (self.item.mime().type_() != mime::APPLICATION
&& self.item.mime().type_() != mime::TEXT)
{
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(self.get_inner().await?.keys().cloned().collect())
}
}

View File

@@ -0,0 +1,125 @@
mod flac;
use std::{collections::HashMap, sync::Arc};
pub use flac::*;
mod id3;
pub use id3::*;
mod fs;
pub use fs::*;
mod epub;
pub use epub::*;
mod exif;
pub use exif::*;
mod pdf;
pub use pdf::*;
mod json;
pub use json::*;
mod toml;
use pile_config::Label;
pub use toml::*;
mod text;
pub use text::*;
mod image;
pub use image::*;
mod hash;
pub use hash::*;
use crate::{
extract::{
misc::MapExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::{BinaryPileValue, PileValue},
};
pub struct BinaryExtractor {
inner: MapExtractor,
image: Arc<ImageExtractor>,
}
impl BinaryExtractor {
#[expect(clippy::unwrap_used)]
pub fn new(item: &BinaryPileValue) -> Self {
let inner = MapExtractor {
inner: HashMap::from([
(
Label::new("flac").unwrap(),
PileValue::ObjectExtractor(Arc::new(FlacExtractor::new(item))),
),
(
Label::new("id3").unwrap(),
PileValue::ObjectExtractor(Arc::new(Id3Extractor::new(item))),
),
(
Label::new("fs").unwrap(),
PileValue::ObjectExtractor(Arc::new(FsExtractor::new(item))),
),
(
Label::new("epub").unwrap(),
PileValue::ObjectExtractor(Arc::new(EpubExtractor::new(item))),
),
(
Label::new("exif").unwrap(),
PileValue::ObjectExtractor(Arc::new(ExifExtractor::new(item))),
),
(
Label::new("pdf").unwrap(),
PileValue::ObjectExtractor(Arc::new(PdfExtractor::new(item))),
),
(
Label::new("json").unwrap(),
PileValue::ObjectExtractor(Arc::new(JsonExtractor::new(item))),
),
(
Label::new("toml").unwrap(),
PileValue::ObjectExtractor(Arc::new(TomlExtractor::new(item))),
),
(
Label::new("text").unwrap(),
PileValue::ObjectExtractor(Arc::new(TextExtractor::new(item))),
),
(
Label::new("hash").unwrap(),
PileValue::ObjectExtractor(Arc::new(HashExtractor::new(item))),
),
]),
};
Self {
inner,
image: Arc::new(ImageExtractor::new(item)),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for BinaryExtractor {
async fn field(
&self,
state: &ExtractState,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if self.image.fields().await?.contains(name) {
self.image.field(state, name, args).await
} else {
self.inner.field(state, name, args).await
}
}
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
let mut fields = self.inner.fields().await?;
fields.extend(self.image.fields().await?);
Ok(fields)
}
}

View File

@@ -0,0 +1,100 @@
use pile_config::Label;
use std::sync::Arc;
use tracing::trace;
#[cfg(feature = "pdfium")]
mod pdf_pages;
#[cfg(feature = "pdfium")]
pub use pdf_pages::*;
mod pdf_meta;
pub use pdf_meta::*;
mod pdf_text;
pub use pdf_text::*;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
pub struct PdfExtractor {
text: Arc<PdfTextExtractor>,
meta: Arc<PdfMetaExtractor>,
#[cfg(feature = "pdfium")]
pages: Arc<PdfPagesExtractor>,
}
impl PdfExtractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
text: Arc::new(PdfTextExtractor::new(item)),
meta: Arc::new(PdfMetaExtractor::new(item)),
#[cfg(feature = "pdfium")]
pages: Arc::new(PdfPagesExtractor::new(item)),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for PdfExtractor {
async fn field(
&self,
state: &ExtractState,
name: &pile_config::Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
?args,
item = ?self.text.item,
"Getting field {name:?} from PdfExtractor",
);
match (name.as_str(), args) {
("text", args) => self.text.field(state, name, args).await,
("meta", None) => Ok(Some(PileValue::ObjectExtractor(self.meta.clone()))),
#[cfg(feature = "pdfium")]
("pages", None) => Ok(Some(PileValue::ListExtractor(self.pages.clone()))),
_ => Ok(None),
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![
Label::new("text").unwrap(),
Label::new("meta").unwrap(),
#[cfg(feature = "pdfium")]
Label::new("pages").unwrap(),
])
}
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
let keys = self.fields().await?;
let mut map = serde_json::Map::new();
for k in &keys {
let v = match self.field(state, k, None).await? {
Some(x) => x,
None => continue,
};
if k.as_str() == "text" {
map.insert(
k.to_string(),
serde_json::Value::String(format!(
"<String ({} bytes)",
match v {
PileValue::String(x) => x.len(),
_ => 0,
}
)),
);
continue;
}
map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
}
Ok(serde_json::Value::Object(map))
}
}

View File

@@ -1,6 +1,7 @@
use pdf::file::FileOptions;
use pdf::primitive::{Date, TimeRel};
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
io::BufReader,
@@ -8,16 +9,19 @@ use std::{
};
use tracing::trace;
use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
use crate::value::BinaryPileValue;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::PileValue,
};
pub struct PdfMetaExtractor {
item: Item,
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfMetaExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -79,7 +83,7 @@ impl PdfMetaExtractor {
let (page_count, raw_meta) = match raw_meta {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
trace!(message = "Could not process pdf", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new));
}
};
@@ -120,7 +124,20 @@ fn format_date(d: &Date) -> String {
#[async_trait::async_trait]
impl ObjectExtractor for PdfMetaExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -1,19 +1,23 @@
use image::ImageFormat;
use pdfium_render::prelude::*;
use pile_io::SyncReadBridge;
use std::{
io::{BufReader, Cursor},
sync::Arc,
};
use tracing::trace;
use crate::{Item, PileValue, SyncReadBridge, extract::ListExtractor};
use crate::{
extract::traits::{ExtractState, ListExtractor},
value::{ArcBytes, BinaryPileValue, PileValue},
};
pub struct PdfPagesExtractor {
item: Item,
item: BinaryPileValue,
}
impl PdfPagesExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self { item: item.clone() }
}
@@ -31,7 +35,20 @@ impl PdfPagesExtractor {
#[async_trait::async_trait]
impl ListExtractor for PdfPagesExtractor {
async fn get(&self, idx: usize) -> Result<Option<PileValue>, std::io::Error> {
async fn get(
&self,
state: &ExtractState,
idx: usize,
) -> Result<Option<PileValue>, std::io::Error> {
trace!(
item = ?self.item,
"Getting index {idx} from PdfPagesExtractor",
);
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(None);
}
let bytes = self.get_bytes().await?;
let png = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
@@ -61,19 +78,23 @@ impl ListExtractor for PdfPagesExtractor {
let value = match png {
Ok(None) => return Ok(None),
Ok(Some(bytes)) => PileValue::Blob {
Ok(Some(bytes)) => PileValue::Binary(BinaryPileValue::Blob {
mime: mime::IMAGE_PNG,
bytes: Arc::new(bytes),
},
bytes: ArcBytes(Arc::new(bytes)),
}),
Err(error) => {
trace!(message = "Could not render pdf page", ?error, idx, key = ?self.item.key());
trace!(message = "Could not render pdf page", ?error, idx, item = ?self.item);
PileValue::Null
}
};
Ok(Some(value))
}
async fn len(&self) -> Result<usize, std::io::Error> {
async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error> {
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(0);
}
let bytes = self.get_bytes().await?;
let count = tokio::task::spawn_blocking(move || {
let pdfium = Pdfium::default();
@@ -87,7 +108,7 @@ impl ListExtractor for PdfPagesExtractor {
match count {
Ok(n) => Ok(n),
Err(error) => {
trace!(message = "Could not read pdf page count", ?error, key = ?self.item.key());
trace!(message = "Could not read pdf page count", ?error, item = ?self.item);
Ok(0)
}
}
@@ -95,10 +116,10 @@ impl ListExtractor for PdfPagesExtractor {
// Override, extracting all pages is very slow,
// and we can't display binary in json anyway
async fn to_json(&self) -> Result<serde_json::Value, std::io::Error> {
async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
Ok(serde_json::Value::String(format!(
"<PdfPages ({} pages)>",
self.len().await?
self.len(state).await?
)))
}
}

View File

@@ -1,6 +1,7 @@
use pdf::content::{Op, TextDrawAdjusted};
use pdf::file::FileOptions;
use pile_config::Label;
use pile_io::SyncReadBridge;
use std::{
collections::HashMap,
io::BufReader,
@@ -8,16 +9,19 @@ use std::{
};
use tracing::trace;
use crate::extract::ObjectExtractor;
use crate::{Item, PileValue, SyncReadBridge};
use crate::value::BinaryPileValue;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::PileValue,
};
pub struct PdfTextExtractor {
item: Item,
pub(super) item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl PdfTextExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -83,7 +87,7 @@ impl PdfTextExtractor {
let raw_text = match raw_text {
Ok(x) => x,
Err(error) => {
trace!(message = "Could not process pdf", ?error, key = ?self.item.key());
trace!(message = "Could not process pdf", ?error, item = ?self.item);
return Ok(self.output.get_or_init(HashMap::new));
}
};
@@ -100,7 +104,20 @@ impl PdfTextExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for PdfTextExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().essence_str() != "application/pdf" {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -0,0 +1,68 @@
use pile_config::Label;
use pile_io::AsyncReader;
use std::sync::{Arc, OnceLock};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
pub struct TextExtractor {
item: BinaryPileValue,
output: OnceLock<PileValue>,
}
impl TextExtractor {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
}
}
}
#[async_trait::async_trait]
impl ObjectExtractor for TextExtractor {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime
&& (self.item.mime().type_() != mime::TEXT
&& self.item.mime().type_() != mime::APPLICATION)
{
return Ok(None);
}
if name.as_str() != "text" {
return Ok(None);
}
{
if let Some(x) = self.output.get() {
return Ok(Some(x.clone()));
}
let mut reader = self.item.read().await?;
let bytes = reader.read_to_end().await?;
let string = String::from_utf8(bytes).ok();
let value = match string {
Some(x) => PileValue::String(Arc::new(x.into())),
None => PileValue::Null,
};
return Ok(Some(self.output.get_or_init(|| value).clone()));
}
}
#[expect(clippy::unwrap_used)]
async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
Ok(vec![Label::new("text").unwrap()])
}
}

View File

@@ -1,10 +1,14 @@
use pile_config::Label;
use pile_io::AsyncReader;
use std::{
collections::HashMap,
sync::{Arc, OnceLock},
};
use crate::{AsyncReader, Item, PileValue, extract::ObjectExtractor};
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::{BinaryPileValue, PileValue},
};
fn toml_to_pile(value: toml::Value) -> PileValue {
match value {
@@ -21,12 +25,12 @@ fn toml_to_pile(value: toml::Value) -> PileValue {
}
pub struct TomlExtractor {
item: Item,
item: BinaryPileValue,
output: OnceLock<HashMap<Label, PileValue>>,
}
impl TomlExtractor {
pub fn new(item: &Item) -> Self {
pub fn new(item: &BinaryPileValue) -> Self {
Self {
item: item.clone(),
output: OnceLock::new(),
@@ -38,13 +42,7 @@ impl TomlExtractor {
return Ok(x);
}
let mut reader = match self.item.read().await {
Ok(r) => r,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
return Ok(self.output.get_or_init(HashMap::new));
}
Err(e) => return Err(e),
};
let mut reader = self.item.read().await?;
let bytes = reader.read_to_end().await?;
let toml: toml::Value = match toml::from_slice(&bytes) {
Ok(x) => x,
@@ -65,7 +63,20 @@ impl TomlExtractor {
#[async_trait::async_trait]
impl ObjectExtractor for TomlExtractor {
async fn field(&self, name: &Label) -> Result<Option<PileValue>, std::io::Error> {
async fn field(
&self,
state: &ExtractState,
name: &Label,
args: Option<&str>,
) -> Result<Option<PileValue>, std::io::Error> {
if args.is_some() {
return Ok(None);
}
if !state.ignore_mime && self.item.mime().type_() != mime::TEXT {
return Ok(None);
}
Ok(self.get_inner().await?.get(name).cloned())
}

View File

@@ -0,0 +1,58 @@
use std::{collections::HashMap, sync::Arc};
use pile_config::Label;
use crate::{
extract::{
misc::MapExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::{Item, PileValue},
};
/// Exposes an [Item]'s metadata (its attached files and its key) as object fields.
pub struct ItemExtractor {
    // Pre-built map holding the "files" and "key" fields.
    inner: MapExtractor,
}
impl ItemExtractor {
    /// Build an extractor over `item`, snapshotting its file map and key.
    pub fn new(item: &Item) -> Self {
        // Wrap the item's attached files in a nested MapExtractor, exposed
        // under the "files" field.
        let files = {
            // Item currently has a single variant, so this pattern is irrefutable.
            let Item::File { files, .. } = &item;
            let mut inner = HashMap::new();
            for f in files {
                inner.insert(f.0.clone(), f.1.clone());
            }
            PileValue::ObjectExtractor(Arc::new(MapExtractor { inner }))
        };
        // "files" and "key" are literal labels, so unwrap cannot fail here.
        #[expect(clippy::unwrap_used)]
        let inner = MapExtractor {
            inner: HashMap::from([
                (Label::new("files").unwrap(), files),
                (
                    Label::new("key").unwrap(),
                    PileValue::String(Arc::new(item.key())),
                ),
            ]),
        };
        Self { inner }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for ItemExtractor {
    /// Delegate field lookup to the pre-built {"files", "key"} map.
    async fn field(
        &self,
        state: &ExtractState,
        name: &pile_config::Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        self.inner.field(state, name, args).await
    }

    /// Delegate field enumeration to the pre-built map.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        self.inner.fields().await
    }
}

View File

@@ -0,0 +1,31 @@
use std::sync::Arc;
use crate::{
extract::traits::{ExtractState, ListExtractor},
value::PileValue,
};
/// A [ListExtractor] backed by a shared, immutable array of values.
pub struct ArrayExtractor {
    inner: Arc<Vec<PileValue>>,
}

impl ArrayExtractor {
    /// Wrap an existing shared array; clones the Arc, not the data.
    pub fn new(inner: Arc<Vec<PileValue>>) -> Self {
        Self { inner }
    }
}
#[async_trait::async_trait]
impl ListExtractor for ArrayExtractor {
    /// Clone of the element at `idx`, or `None` when out of range.
    async fn get(
        &self,
        _state: &ExtractState,
        idx: usize,
    ) -> Result<Option<PileValue>, std::io::Error> {
        let slot = self.inner.get(idx);
        Ok(slot.map(Clone::clone))
    }

    /// Number of elements in the backing array.
    async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
        let n = self.inner.len();
        Ok(n)
    }
}

View File

@@ -0,0 +1,32 @@
use pile_config::Label;
use std::collections::HashMap;
use crate::{
extract::traits::{ExtractState, ObjectExtractor},
value::PileValue,
};
/// An [ObjectExtractor] backed by an eagerly-built map of values.
#[derive(Default)]
pub struct MapExtractor {
    pub inner: HashMap<Label, PileValue>,
}
#[async_trait::async_trait]
impl ObjectExtractor for MapExtractor {
    /// Look up `name` in the backing map; plain map fields take no arguments.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        let value = match args {
            // Arguments are never valid on a plain map field.
            Some(_) => None,
            None => self.inner.get(name).map(Clone::clone),
        };
        Ok(value)
    }

    /// Every key of the backing map is a valid field.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        let mut out = Vec::with_capacity(self.inner.len());
        out.extend(self.inner.keys().cloned());
        Ok(out)
    }
}

View File

@@ -0,0 +1,8 @@
mod list;
pub use list::*;
mod vec;
pub use vec::*;
mod map;
pub use map::*;

View File

@@ -0,0 +1,24 @@
use crate::{
extract::traits::{ExtractState, ListExtractor},
value::PileValue,
};
/// A [ListExtractor] backed by an owned vector of values.
#[derive(Default)]
pub struct VecExtractor {
    pub inner: Vec<PileValue>,
}
#[async_trait::async_trait]
impl ListExtractor for VecExtractor {
    /// Clone of the element at `idx`, or `None` when out of range.
    async fn get(
        &self,
        _state: &ExtractState,
        idx: usize,
    ) -> Result<Option<PileValue>, std::io::Error> {
        let slot = self.inner.get(idx);
        Ok(slot.map(Clone::clone))
    }

    /// Number of elements in the backing vector.
    async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
        let n = self.inner.len();
        Ok(n)
    }
}

View File

@@ -0,0 +1,6 @@
pub mod blob;
pub mod item;
pub mod misc;
pub mod regex;
pub mod string;
pub mod traits;

View File

@@ -0,0 +1,104 @@
use std::sync::Arc;
use pile_config::Label;
use regex::Regex;
use smartstring::{LazyCompact, SmartString};
use crate::{
extract::traits::{ExtractState, ListExtractor, ObjectExtractor},
value::PileValue,
};
/// A compiled regex together with the capture results of one match.
struct RegexData {
    regex: Arc<Regex>,
    /// Captured substrings indexed by group index (0 = whole match).
    captures: Vec<Option<Arc<SmartString<LazyCompact>>>>,
}

impl RegexData {
    /// Run `regex` once against `input`; returns `None` if it does not match.
    fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
        let caps = regex.captures(input)?;
        // Groups that did not participate in the match stay `None`.
        let captures = caps
            .iter()
            .map(|m| m.map(|m| Arc::new(m.as_str().into())))
            .collect();
        Some(Self { regex, captures })
    }
}
/// Exposes named capture groups as object fields.
///
/// Also provides a positional list view over groups via [ObjectExtractor::as_list].
pub struct RegexExtractor(Arc<RegexData>);

impl RegexExtractor {
    /// Run `regex` against `input`. Returns `None` if there is no match.
    pub fn new(regex: Arc<Regex>, input: &str) -> Option<Self> {
        Some(Self(Arc::new(RegexData::new(regex, input)?)))
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for RegexExtractor {
    /// Look up a named capture group; unnamed groups are only reachable
    /// through the list view ([ObjectExtractor::as_list]).
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // Capture groups take no field arguments.
        if args.is_some() {
            return Ok(None);
        }
        // capture_names() yields groups in index order, so the position of
        // `name` is also its index into the captured-substring vector.
        let Some(idx) = self
            .0
            .regex
            .capture_names()
            .position(|n| n == Some(name.as_str()))
        else {
            return Ok(None);
        };
        // A named group that exists but did not participate in this match
        // yields Null ("valid field, no value").
        Ok(Some(
            match self.0.captures.get(idx).and_then(|v| v.as_ref()) {
                Some(s) => PileValue::String(s.clone()),
                None => PileValue::Null,
            },
        ))
    }

    /// All named capture groups in the pattern.
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        // NOTE(review): assumes every regex group name is a valid Label —
        // confirm, otherwise this unwrap can panic on exotic group names.
        #[expect(clippy::unwrap_used)]
        Ok(self
            .0
            .regex
            .capture_names()
            .flatten()
            .map(|n| Label::new(n).unwrap())
            .collect())
    }

    /// Positional view over capture groups (cheap: shares the same RegexData).
    fn as_list(&self) -> Option<Arc<dyn ListExtractor>> {
        Some(Arc::new(RegexExtractor(self.0.clone())))
    }
}
#[async_trait::async_trait]
impl ListExtractor for RegexExtractor {
    /// Index capture groups positionally, zero-based over groups 1..;
    /// group 0 (the whole match) is excluded from the list view.
    async fn get(
        &self,
        _state: &ExtractState,
        idx: usize,
    ) -> Result<Option<PileValue>, std::io::Error> {
        // Shift past group 0 so list index 0 is capture group 1.
        let raw_idx = idx + 1;
        let Some(slot) = self.0.captures.get(raw_idx) else {
            return Ok(None);
        };
        // A group that did not participate in the match yields Null.
        Ok(Some(match slot {
            Some(s) => PileValue::String(s.clone()),
            None => PileValue::Null,
        }))
    }

    /// Number of capture groups, excluding group 0.
    async fn len(&self, _state: &ExtractState) -> Result<usize, std::io::Error> {
        Ok(self.0.captures.len().saturating_sub(1))
    }
}

View File

@@ -0,0 +1,236 @@
use pile_config::Label;
use regex::Regex;
use smartstring::{LazyCompact, SmartString};
use std::sync::Arc;
use crate::{
extract::{
regex::RegexExtractor,
traits::{ExtractState, ObjectExtractor},
},
value::PileValue,
};
/// Exposes string transformations (trim, case, split, regex, ...) as object fields.
pub struct StringExtractor {
    item: Arc<SmartString<LazyCompact>>,
}

impl StringExtractor {
    /// Wrap a shared string; only the Arc is cloned.
    pub fn new(item: &Arc<SmartString<LazyCompact>>) -> Self {
        Self { item: item.clone() }
    }
}
#[async_trait::async_trait]
impl ObjectExtractor for StringExtractor {
    /// String transformation fields.
    ///
    /// Fields taking no argument: `trim`, `upper`, `lower`, `nonempty`.
    /// Fields requiring an argument: `trimprefix`, `trimsuffix`, `split`, `regex`.
    /// A wrong arity, or an unknown field, yields `Ok(None)`.
    async fn field(
        &self,
        _state: &ExtractState,
        name: &Label,
        args: Option<&str>,
    ) -> Result<Option<PileValue>, std::io::Error> {
        Ok(match (name.as_str(), args) {
            ("trim", None) => Some(PileValue::String(Arc::new(
                self.item.as_str().trim().into(),
            ))),
            // BUG FIX: `upper` and `lower` had their implementations swapped
            // (`upper` lowercased the string and vice versa).
            ("upper", None) => Some(PileValue::String(Arc::new(
                self.item.as_str().to_uppercase().into(),
            ))),
            ("lower", None) => Some(PileValue::String(Arc::new(
                self.item.as_str().to_lowercase().into(),
            ))),
            // Null for "", otherwise the string itself (cheap Arc clone).
            ("nonempty", None) => Some(match self.item.is_empty() {
                true => PileValue::Null,
                false => PileValue::String(self.item.clone()),
            }),
            // strip_* return the original string unchanged when the
            // prefix/suffix is absent.
            ("trimprefix", Some(prefix)) => Some(PileValue::String(Arc::new(
                self.item
                    .as_str()
                    .strip_prefix(prefix)
                    .unwrap_or(self.item.as_str())
                    .into(),
            ))),
            ("trimsuffix", Some(suffix)) => Some(PileValue::String(Arc::new(
                self.item
                    .as_str()
                    .strip_suffix(suffix)
                    .unwrap_or(self.item.as_str())
                    .into(),
            ))),
            ("split", Some(by)) => Some(PileValue::Array(Arc::new(
                self.item
                    .as_str()
                    .split(by)
                    .map(|s| PileValue::String(Arc::new(s.into())))
                    .collect(),
            ))),
            ("regex", Some(pattern)) => {
                // An invalid pattern is treated as "not a valid field".
                let Ok(re) = Regex::new(pattern) else {
                    return Ok(None);
                };
                // No match -> Null ("valid field, no value").
                Some(
                    match RegexExtractor::new(Arc::new(re), self.item.as_str()) {
                        Some(ext) => PileValue::ObjectExtractor(Arc::new(ext)),
                        None => PileValue::Null,
                    },
                )
            }
            _ => None,
        })
    }

    /// All supported transformation fields.
    #[expect(clippy::unwrap_used)]
    async fn fields(&self) -> Result<Vec<Label>, std::io::Error> {
        // Literal labels; unwrap cannot fail.
        Ok(vec![
            Label::new("trim").unwrap(),
            Label::new("upper").unwrap(),
            Label::new("lower").unwrap(),
            Label::new("nonempty").unwrap(),
            Label::new("trimprefix").unwrap(),
            Label::new("trimsuffix").unwrap(),
            Label::new("split").unwrap(),
            // CONSISTENCY FIX: `regex` is served by `field` above and so must
            // be listed here per the ObjectExtractor contract ("field must
            // return Some for all these keys").
            Label::new("regex").unwrap(),
        ])
    }
}
// Unit tests for StringExtractor's field transformations.
#[cfg(test)]
#[expect(clippy::expect_used)]
mod tests {
    use super::*;

    /// Build an extractor over a literal string.
    fn extractor(s: &str) -> StringExtractor {
        StringExtractor::new(&Arc::new(s.into()))
    }

    /// Query one field; unwraps the (for strings, infallible) I/O result.
    #[expect(clippy::unwrap_used)]
    async fn field(ext: &StringExtractor, name: &str, args: Option<&str>) -> Option<PileValue> {
        let state = ExtractState { ignore_mime: false };
        ext.field(&state, &Label::new(name).unwrap(), args)
            .await
            .unwrap()
    }

    /// Unwrap a string value; panics on any other variant.
    fn string(v: Option<PileValue>) -> Option<String> {
        match v? {
            PileValue::String(s) => Some(s.as_str().to_owned()),
            _ => panic!("expected string"),
        }
    }

    /// Unwrap an array of strings; panics on any other shape.
    fn array(v: Option<PileValue>) -> Vec<String> {
        match v.expect("expected Some") {
            PileValue::Array(arr) => arr
                .iter()
                .map(|v| match v {
                    PileValue::String(s) => s.as_str().to_owned(),
                    _ => panic!("expected string element"),
                })
                .collect(),
            _ => panic!("expected array"),
        }
    }

    #[tokio::test]
    async fn trim() {
        assert_eq!(
            string(field(&extractor(" hi "), "trim", None).await),
            Some("hi".into())
        );
    }

    // NOTE(review): the name says "no_args" but this passes Some("foo");
    // it actually checks that `trim` rejects unexpected arguments.
    #[tokio::test]
    async fn trim_no_args() {
        assert!(field(&extractor("x"), "trim", Some("foo")).await.is_none());
    }

    #[tokio::test]
    async fn nonempty_with_content() {
        assert!(matches!(
            field(&extractor("hello"), "nonempty", None).await,
            Some(PileValue::String(_))
        ));
    }

    #[tokio::test]
    async fn nonempty_empty_string() {
        assert!(matches!(
            field(&extractor(""), "nonempty", None).await,
            Some(PileValue::Null)
        ));
    }

    #[tokio::test]
    async fn trimprefix_present() {
        assert_eq!(
            string(field(&extractor("foobar"), "trimprefix", Some("foo")).await),
            Some("bar".into())
        );
    }

    // Absent prefix leaves the string unchanged.
    #[tokio::test]
    async fn trimprefix_absent() {
        assert_eq!(
            string(field(&extractor("foobar"), "trimprefix", Some("baz")).await),
            Some("foobar".into())
        );
    }

    #[tokio::test]
    async fn trimprefix_no_args() {
        assert!(
            field(&extractor("foobar"), "trimprefix", None)
                .await
                .is_none()
        );
    }

    #[tokio::test]
    async fn trimsuffix_present() {
        assert_eq!(
            string(field(&extractor("foobar"), "trimsuffix", Some("bar")).await),
            Some("foo".into())
        );
    }

    #[tokio::test]
    async fn trimsuffix_absent() {
        assert_eq!(
            string(field(&extractor("foobar"), "trimsuffix", Some("baz")).await),
            Some("foobar".into())
        );
    }

    #[tokio::test]
    async fn split_basic() {
        assert_eq!(
            array(field(&extractor("a,b,c"), "split", Some(",")).await),
            vec!["a", "b", "c"]
        );
    }

    // No separator present: the whole string comes back as one element.
    #[tokio::test]
    async fn split_no_match() {
        assert_eq!(
            array(field(&extractor("abc"), "split", Some(",")).await),
            vec!["abc"]
        );
    }

    #[tokio::test]
    async fn split_no_args() {
        assert!(field(&extractor("abc"), "split", None).await.is_none());
    }

    #[tokio::test]
    async fn unknown_field() {
        assert!(field(&extractor("abc"), "bogus", None).await.is_none());
    }
}

View File

@@ -0,0 +1,91 @@
/// Options controlling how extractors probe items.
#[derive(Debug, Clone)]
pub struct ExtractState {
    /// If true, extract all fields from all items.
    /// Do not pre-filter using mime type.
    ///
    /// This may detect additional fields, but
    /// makes extraction take much longer.
    pub ignore_mime: bool,
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable map of {label: value},
/// much like a json object.
#[async_trait::async_trait]
pub trait ObjectExtractor: Send + Sync {
    /// Get the field at `name` from `item`.
    /// - returns `None` if `name` is not a valid field
    /// - returns `Some(Null)` if `name` is not available
    ///
    /// For extractors that parse binary, this fn should return
    /// an error only if we failed to obtain the data we need (permission denied, etc).
    ///
    /// If the underlying data has an invalid format (e.g, running a pdf extractor on a non-pdf file),
    /// this fn should return `Ok(Some(PileValue::Null))`.
    async fn field(
        &self,
        state: &ExtractState,
        name: &pile_config::Label,
        args: Option<&str>,
    ) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// Return all fields in this extractor.
    /// `Self::field` must return [Some] for all these keys
    /// and [None] for all others.
    async fn fields(&self) -> Result<Vec<pile_config::Label>, std::io::Error>;

    /// Return a list view of this extractor, if supported.
    fn as_list(&self) -> Option<std::sync::Arc<dyn ListExtractor>> {
        None
    }

    /// Convert this to a JSON value.
    ///
    /// Fields whose argument-less lookup returns `None` are omitted from
    /// the output object.
    async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let keys = self.fields().await?;
        let mut map = serde_json::Map::new();
        for k in &keys {
            let v = match self.field(state, k, None).await? {
                Some(x) => x,
                None => continue,
            };
            // Box::pin: to_json recurses through nested values, so the
            // future must be boxed to give it a known size.
            map.insert(k.to_string(), Box::pin(v.to_json(state)).await?);
        }
        Ok(serde_json::Value::Object(map))
    }
}
/// An attachment that extracts metadata from an [Item].
///
/// Metadata is exposed as an immutable list of values.
#[async_trait::async_trait]
pub trait ListExtractor: Send + Sync {
    /// Get the item at index `idx`.
    /// Indices start at zero, and must be consecutive.
    /// - returns `None` if `idx` is out of range
    /// - returns `Some(Null)` if `None` is at `idx`
    async fn get(
        &self,
        state: &ExtractState,
        idx: usize,
    ) -> Result<Option<crate::value::PileValue>, std::io::Error>;

    /// Number of items; `get` must return [Some] for exactly `0..len`.
    async fn len(&self, state: &ExtractState) -> Result<usize, std::io::Error>;

    /// Convert this list to a JSON value.
    async fn to_json(&self, state: &ExtractState) -> Result<serde_json::Value, std::io::Error> {
        let len = self.len(state).await?;
        let mut list = Vec::with_capacity(len);
        for i in 0..len {
            // Relies on the get/len contract above: every index below len
            // must yield a value.
            #[expect(clippy::expect_used)]
            let v = self
                .get(state, i)
                .await?
                .expect("value must be present according to length");
            // Box::pin: recursive async call needs a sized future.
            list.push(Box::pin(v.to_json(state)).await?);
        }
        Ok(serde_json::Value::Array(list))
    }
}

View File

@@ -0,0 +1,3 @@
pub mod extract;
pub mod source;
pub mod value;

View File

@@ -0,0 +1,137 @@
use chrono::{DateTime, Utc};
use pile_config::Label;
use regex::Regex;
use smartstring::{LazyCompact, SmartString};
use std::{
collections::{BTreeMap, HashMap},
path::PathBuf,
sync::{Arc, OnceLock},
};
use walkdir::WalkDir;
use crate::{
source::{DataSource, misc::path_ts_latest},
value::{BinaryPileValue, Item, PileValue},
};
/// A data source backed by a directory tree on the local filesystem.
#[derive(Debug)]
pub struct DirDataSource {
    /// Source name, as configured.
    pub name: Label,
    /// Root directory scanned for items.
    pub dir: PathBuf,
    /// Pattern matched against relative paths; capture group 1 is the item key.
    pub base_pattern: Regex,
    /// Label -> path template (containing a `{base}` placeholder) for item files.
    pub files: HashMap<Label, String>,
    /// Key -> item index, populated exactly once in [DirDataSource::new].
    pub index: OnceLock<BTreeMap<SmartString<LazyCompact>, Item>>,
}
impl DirDataSource {
    /// Scan `dir` recursively and build the key -> [Item] index.
    ///
    /// Relative paths matching `base_pattern` define items; capture group 1
    /// is the item's key ("base"). For each entry in `files`, the template's
    /// `{base}` placeholder is substituted and the resulting file attached
    /// (with a guessed mime type) when it exists on disk.
    ///
    /// # Errors
    /// Fails if directory traversal fails. Non-UTF-8 names, non-matching
    /// paths, and duplicate keys are silently skipped instead.
    pub async fn new(
        name: &Label,
        dir: PathBuf,
        base_pattern: Regex,
        files: HashMap<Label, String>,
    ) -> Result<Arc<Self>, std::io::Error> {
        // The Arc is created first so each Item can hold a back-pointer;
        // the index is published into the OnceLock at the end.
        let source = Arc::new(Self {
            name: name.clone(),
            dir,
            base_pattern,
            files,
            index: OnceLock::new(),
        });
        let mut index = BTreeMap::new();
        'entry: for entry in WalkDir::new(&source.dir) {
            let entry = match entry {
                Err(e) => {
                    // Format first: into_io_error() consumes the walkdir error.
                    let msg = format!("walkdir error: {e:?}");
                    let err = e.into_io_error().unwrap_or(std::io::Error::other(msg));
                    return Err(err);
                }
                Ok(e) => e,
            };
            if entry.file_type().is_dir() {
                continue;
            }
            let path = entry.into_path();
            // Match against the path relative to the source root.
            let rel_path = match path.strip_prefix(&source.dir) {
                Ok(p) => p,
                Err(_) => continue 'entry,
            };
            let path_str = match rel_path.to_str() {
                Some(x) => x,
                None => continue 'entry,
            };
            let captures = match source.base_pattern.captures(path_str) {
                Some(c) => c,
                None => continue 'entry,
            };
            // Capture group 1 is the item key.
            let base = match captures.get(1) {
                Some(m) => m.as_str(),
                None => continue 'entry,
            };
            let key: SmartString<LazyCompact> = base.into();
            // First match wins; later paths with the same base are ignored.
            if index.contains_key(&key) {
                continue 'entry;
            }
            // Attach every configured file that exists for this base.
            let mut item_files = HashMap::new();
            for (label, template) in &source.files {
                let file_path = source.dir.join(template.replace("{base}", base));
                if file_path.exists() {
                    let mime = mime_guess::from_path(&file_path).first_or_octet_stream();
                    item_files.insert(
                        label.clone(),
                        PileValue::Binary(BinaryPileValue::File {
                            mime,
                            path: file_path,
                        }),
                    );
                }
            }
            index.insert(
                key.clone(),
                Item::File {
                    key,
                    source: Arc::clone(&source),
                    files: item_files,
                },
            );
        }
        // `source` was just created above, so this is the first (and only)
        // initialization of the OnceLock.
        source.index.get_or_init(|| index);
        Ok(source)
    }
}
impl DataSource for Arc<DirDataSource> {
    /// Number of items in the (already initialized) index.
    #[expect(clippy::expect_used)]
    fn len(&self) -> usize {
        let index = self.index.get().expect("index should be initialized");
        index.len()
    }

    /// Fetch a single item by key.
    #[expect(clippy::expect_used)]
    async fn get(&self, key: &str) -> Result<Option<Item>, std::io::Error> {
        let index = self.index.get().expect("index should be initialized");
        Ok(index.get(key).cloned())
    }

    /// Iterate items in sorted key order (BTreeMap iteration order).
    #[expect(clippy::expect_used)]
    fn iter(&self) -> impl Iterator<Item = &Item> {
        let index = self.index.get().expect("index should be initialized");
        index.values()
    }

    /// Latest modification timestamp under the source directory.
    async fn latest_change(&self) -> Result<Option<DateTime<Utc>>, std::io::Error> {
        path_ts_latest(&self.dir)
    }
}

View File

@@ -0,0 +1,29 @@
mod dir;
pub use dir::*;
pub mod misc;
/// A read-only set of [Item]s.
pub trait DataSource {
    /// Get the number of items in this source
    fn len(&self) -> usize;

    /// Returns true if this source contains no items.
    ///
    /// Default implementation in terms of [Self::len]; added so the trait
    /// follows the standard `len`/`is_empty` pairing (clippy
    /// `len_without_is_empty`).
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Get an item from this datasource
    fn get(
        &self,
        key: &str,
    ) -> impl Future<Output = Result<Option<crate::value::Item>, std::io::Error>> + Send;

    /// Iterate over all items in this source in sorted key order
    fn iter(&self) -> impl Iterator<Item = &crate::value::Item>;

    /// Iterate over a page of items, sorted by key
    fn iter_page(&self, offset: usize, limit: usize) -> impl Iterator<Item = &crate::value::Item> {
        self.iter().skip(offset).take(limit)
    }

    /// Return the time of the latest change to the data in this source
    fn latest_change(
        &self,
    ) -> impl Future<Output = Result<Option<chrono::DateTime<chrono::Utc>>, std::io::Error>> + Send;
}

View File

@@ -0,0 +1,45 @@
use pile_config::Label;
use smartstring::{LazyCompact, SmartString};
use std::{collections::HashMap, sync::Arc};
use crate::{source::DirDataSource, value::PileValue};
//
// MARK: item
//
/// A cheaply-cloneable pointer to an item in a dataset
#[derive(Clone)]
pub enum Item {
    /// An item backed by files under a [DirDataSource] directory.
    File {
        /// Unique key of this item within its source.
        key: SmartString<LazyCompact>,
        /// Back-pointer to the owning source.
        source: Arc<DirDataSource>,
        /// Attached files, keyed by configured label.
        files: HashMap<Label, PileValue>,
    },
}
impl std::fmt::Debug for Item {
    /// Manual Debug: prints the key and only the file *labels*; the full
    /// file values and the source back-pointer are omitted to keep output
    /// compact.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::File { key, files, .. } => f
                .debug_struct("Item::File")
                .field("key", key)
                .field("files", &files.keys().collect::<Vec<_>>())
                .finish(),
        }
    }
}
impl Item {
    /// Name of the data source this item belongs to.
    pub fn source_name(&self) -> &pile_config::Label {
        // Item currently has a single variant, so the pattern is irrefutable.
        let Self::File { source, .. } = self;
        &source.name
    }

    /// This item's unique key within its source (cheap SmartString clone).
    pub fn key(&self) -> SmartString<LazyCompact> {
        let Self::File { key, .. } = self;
        key.clone()
    }
}

View File

@@ -0,0 +1,9 @@
mod item;
pub use item::*;
mod readers;
pub use readers::*;
#[expect(clippy::module_inception)]
mod value;
pub use value::*;

View File

@@ -0,0 +1,34 @@
use pile_io::{AsyncReader, AsyncSeekReader};
use std::{
fs::File,
io::{Cursor, Seek},
};
use crate::value::ArcBytes;
//
// MARK: itemreader
//
/// A reader over an item's bytes: either an on-disk file or an in-memory blob.
pub enum ItemReader {
    File(File),
    Vec(Cursor<ArcBytes>),
}
impl AsyncReader for ItemReader {
    /// Read into `buf`, delegating to blocking [std::io::Read].
    ///
    /// NOTE(review): both arms perform blocking I/O inside an async fn;
    /// confirm callers expect short local reads, or move to spawn_blocking.
    async fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        match self {
            Self::File(x) => std::io::Read::read(x, buf),
            Self::Vec(x) => std::io::Read::read(x, buf),
        }
    }
}
impl AsyncSeekReader for ItemReader {
    /// Seek, delegating to the blocking [std::io::Seek] of each variant.
    async fn seek(&mut self, pos: std::io::SeekFrom) -> Result<u64, std::io::Error> {
        match self {
            Self::File(x) => x.seek(pos),
            Self::Vec(x) => x.seek(pos),
        }
    }
}

View File

@@ -0,0 +1,298 @@
use mime::Mime;
use pile_config::objectpath::{ObjectPath, PathSegment};
use serde_json::{Map, Value};
use smartstring::{LazyCompact, SmartString};
use std::{fmt::Debug, fs::File, io::Cursor, path::PathBuf, sync::Arc};
use crate::{
extract::{
blob::BinaryExtractor,
item::ItemExtractor,
misc::{ArrayExtractor, MapExtractor, VecExtractor},
string::StringExtractor,
traits::{ExtractState, ListExtractor, ObjectExtractor},
},
value::{Item, ItemReader},
};
/// A cheaply-cloneable byte buffer (shared via Arc).
#[derive(Clone)]
pub struct ArcBytes(pub Arc<Vec<u8>>);

impl Debug for ArcBytes {
    /// Print only the length; dumping raw bytes would flood debug output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ArcBytes")
            .field("len()", &self.0.len())
            .finish()
    }
}

impl AsRef<[u8]> for ArcBytes {
    fn as_ref(&self) -> &[u8] {
        &self.0
    }
}
/// Binary payload of a value: either held in memory or referenced on disk.
#[derive(Debug, Clone)]
pub enum BinaryPileValue {
    /// A binary blob
    Blob { mime: Mime, bytes: ArcBytes },
    /// A pointer to a file
    File { mime: Mime, path: PathBuf },
}
impl BinaryPileValue {
    /// Open the item for reading.
    ///
    /// # Errors
    /// Fails if a [Self::File] path cannot be opened; blobs never fail.
    pub async fn read(&self) -> Result<ItemReader, std::io::Error> {
        match self {
            Self::File { path, .. } => Ok(ItemReader::File(File::open(path)?)),
            Self::Blob { bytes, .. } => Ok(ItemReader::Vec(Cursor::new(bytes.clone()))),
        }
    }

    /// The mime type of this binary value.
    pub fn mime(&self) -> &Mime {
        match self {
            Self::Blob { mime, .. } => mime,
            Self::File { mime, .. } => mime,
        }
    }
}
/// An immutable, cheaply-cloneable, lazily-computed value.
/// Very similar to [serde_json::Value].
pub enum PileValue {
    Null,
    U64(u64),
    I64(i64),
    /// A string
    String(Arc<SmartString<LazyCompact>>),
    /// An array of values
    Array(Arc<Vec<PileValue>>),
    /// A lazily-computed map of {label: value}
    ObjectExtractor(Arc<dyn ObjectExtractor>),
    /// A lazily-computed array
    ListExtractor(Arc<dyn ListExtractor>),
    /// A pointer to an item in this dataset
    Item(Item),
    /// Binary data
    Binary(BinaryPileValue),
}
impl Clone for PileValue {
fn clone(&self) -> Self {
match self {
Self::Null => Self::Null,
Self::U64(x) => Self::U64(*x),
Self::I64(x) => Self::I64(*x),
Self::String(x) => Self::String(x.clone()),
Self::Array(x) => Self::Array(x.clone()),
Self::ObjectExtractor(x) => Self::ObjectExtractor(x.clone()),
Self::ListExtractor(x) => Self::ListExtractor(x.clone()),
Self::Item(i) => Self::Item(i.clone()),
Self::Binary(b) => Self::Binary(b.clone()),
}
}
}
impl PileValue {
    /// View this value as an object.
    ///
    /// Values without object semantics get an empty [MapExtractor] view, so
    /// any field lookup on them yields `None` rather than an error.
    pub fn object_extractor(&self) -> Arc<dyn ObjectExtractor> {
        match self {
            Self::Null => Arc::new(MapExtractor::default()),
            Self::U64(_) => Arc::new(MapExtractor::default()),
            Self::I64(_) => Arc::new(MapExtractor::default()),
            Self::Array(_) => Arc::new(MapExtractor::default()),
            Self::String(s) => Arc::new(StringExtractor::new(s)),
            Self::ListExtractor(_) => Arc::new(MapExtractor::default()),
            Self::ObjectExtractor(e) => e.clone(),
            Self::Item(i) => Arc::new(ItemExtractor::new(i)),
            Self::Binary(b) => Arc::new(BinaryExtractor::new(b)),
        }
    }

    /// View this value as a list.
    ///
    /// Values without list semantics get an empty [VecExtractor] view;
    /// objects may opt in via [ObjectExtractor::as_list].
    pub fn list_extractor(&self) -> Arc<dyn ListExtractor> {
        match self {
            Self::Null => Arc::new(VecExtractor::default()),
            Self::U64(_) => Arc::new(VecExtractor::default()),
            Self::I64(_) => Arc::new(VecExtractor::default()),
            Self::Array(a) => Arc::new(ArrayExtractor::new(a.clone())),
            Self::String(_) => Arc::new(VecExtractor::default()),
            Self::ListExtractor(e) => e.clone(),
            Self::ObjectExtractor(e) => e
                .as_list()
                .unwrap_or_else(|| Arc::new(VecExtractor::default())),
            Self::Item(_) => Arc::new(VecExtractor::default()),
            Self::Binary(_) => Arc::new(VecExtractor::default()),
        }
    }

    /// Walk `query` segment-by-segment starting from this value.
    ///
    /// Each segment narrows the cursor; once a segment fails to resolve,
    /// the cursor stays `None` for the remaining segments (unless a `Root`
    /// segment resets it).
    pub async fn query(
        &self,
        state: &ExtractState,
        query: &ObjectPath,
    ) -> Result<Option<Self>, std::io::Error> {
        let mut out: Option<PileValue> = Some(self.clone());
        for s in &query.segments {
            match s {
                // Root resets the cursor back to the query's starting value.
                PathSegment::Root => out = Some(self.clone()),
                PathSegment::Field { name, args } => {
                    let e = match out.map(|x| x.object_extractor()) {
                        Some(e) => e,
                        None => {
                            out = None;
                            continue;
                        }
                    };
                    out = e.field(state, name, args.as_deref()).await?;
                }
                PathSegment::Index(idx) => {
                    let e = match out.map(|x| x.list_extractor()) {
                        Some(e) => e,
                        None => {
                            out = None;
                            continue;
                        }
                    };
                    let idx = if *idx >= 0 {
                        usize::try_from(*idx).ok()
                    } else {
                        // BUG FIX: negative indices count back from the end
                        // (-1 is the last element), so the offset is
                        // `len + idx`. This previously computed `len - idx`,
                        // which always pointed past the end of the list.
                        // The Range branch below already uses `len + start`.
                        usize::try_from(e.len(state).await? as i64 + *idx).ok()
                    };
                    let idx = match idx {
                        Some(idx) => idx,
                        None => {
                            out = None;
                            continue;
                        }
                    };
                    out = e.get(state, idx).await?;
                }
                PathSegment::Range {
                    start,
                    end,
                    inclusive,
                } => {
                    let e = match out.map(|x| x.list_extractor()) {
                        Some(e) => e,
                        None => {
                            out = None;
                            continue;
                        }
                    };
                    let len = e.len(state).await? as i64;
                    // Negative bounds count back from the end, like Index.
                    let start_idx = if *start >= 0 { *start } else { len + start };
                    let end_idx = if *end >= 0 { *end } else { len + end };
                    let end_idx = if *inclusive { end_idx + 1 } else { end_idx };
                    let start_idx = start_idx.max(0) as usize;
                    let end_idx = (end_idx.max(0) as usize).min(len as usize);
                    let mut items = Vec::new();
                    for i in start_idx..end_idx {
                        match e.get(state, i).await? {
                            Some(v) => items.push(v),
                            None => break,
                        }
                    }
                    // TODO: lazy view?
                    out = Some(PileValue::Array(Arc::new(items)));
                }
            }
        }
        // `out` is owned here; no clone needed.
        Ok(out)
    }

    /// Like `to_json`, but counts populated fields instead of collecting values.
    ///
    /// - Leaf values (non-null scalars, arrays, blobs) contribute `Some(1)`.
    /// - `Null` contributes `None`.
    /// - `ObjectExtractor` is recursed into; returns `Some(Object(map))` with
    ///   only the fields that had data, or `None` if all fields were absent.
    /// - `Array` / `ListExtractor` are treated as opaque leaf values (not descended into).
    pub async fn count_fields(
        &self,
        state: &ExtractState,
    ) -> Result<Option<Value>, std::io::Error> {
        Ok(match self {
            Self::Null => None,
            Self::U64(_)
            | Self::I64(_)
            | Self::String(_)
            | Self::Binary(BinaryPileValue::Blob { .. }) => Some(Value::Number(1u64.into())),
            // Empty arrays/lists count as absent.
            Self::Array(x) => (!x.is_empty()).then(|| Value::Number(1u64.into())),
            Self::ListExtractor(x) => (x.len(state).await? > 0).then(|| Value::Number(1u64.into())),
            Self::ObjectExtractor(_)
            | Self::Item(_)
            | Self::Binary(BinaryPileValue::File { .. }) => {
                let e = self.object_extractor();
                let keys = e.fields().await?;
                let mut map = Map::new();
                for k in &keys {
                    let v = match e.field(state, k, None).await? {
                        Some(x) => x,
                        None => continue,
                    };
                    // Box::pin: recursive async call needs a sized future.
                    if let Some(counted) = Box::pin(v.count_fields(state)).await? {
                        map.insert(k.to_string(), counted);
                    }
                }
                if map.is_empty() {
                    None
                } else {
                    Some(Value::Object(map))
                }
            }
        })
    }

    /// Borrow the inner string, if this is a [Self::String].
    pub fn as_str(&self) -> Option<&str> {
        match self {
            Self::String(x) => Some(x),
            _ => None,
        }
    }

    /// Convert this value (recursively) to JSON.
    pub async fn to_json(&self, state: &ExtractState) -> Result<Value, std::io::Error> {
        Ok(match self {
            Self::Null => Value::Null,
            Self::U64(x) => Value::Number((*x).into()),
            Self::I64(x) => Value::Number((*x).into()),
            Self::String(x) => Value::String(x.to_string()),
            // TODO: replace with something meaningful?
            Self::Binary(BinaryPileValue::Blob { mime, bytes }) => {
                Value::String(format!("<Blob ({mime}, {} bytes)>", bytes.0.len()))
            }
            Self::Array(_) | Self::ListExtractor(_) => {
                let e = self.list_extractor();
                return e.to_json(state).await;
            }
            Self::ObjectExtractor(_)
            | Self::Item(_)
            | Self::Binary(BinaryPileValue::File { .. }) => {
                let e = self.object_extractor();
                return e.to_json(state).await;
            }
        })
    }
}

View File

@@ -9,13 +9,14 @@ workspace = true
[dependencies]
pile-toolbox = { workspace = true }
pile-dataset = { workspace = true, features = ["axum", "pdfium"] }
pile-dataset = { workspace = true }
pile-serve = { workspace = true }
pile-value = { workspace = true }
pile-config = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
clap = { workspace = true }
#clap_complete = { workspace = true }
serde = { workspace = true }
@@ -26,3 +27,15 @@ anstyle = { workspace = true }
toml = { workspace = true }
serde_json = { workspace = true }
axum = { workspace = true }
utoipa = { workspace = true }
utoipa-swagger-ui = { workspace = true }
url = { workspace = true }
tracing-loki = { workspace = true }
base64 = { workspace = true }
dotenvy = { workspace = true }
envy = { workspace = true }
thiserror = { workspace = true }
[features]
default = ["pdfium"]
pdfium = ["pile-dataset/pdfium", "pile-serve/pdfium", "pile-value/pdfium"]

View File

@@ -1,106 +0,0 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_config::{Label, Source};
use pile_dataset::index::DbFtsIndex;
use pile_dataset::source::DirDataSource;
use pile_dataset::{DataSource, Datasets, Item, PileValue, extract::MetaExtractor};
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use std::{path::PathBuf, sync::Arc};
use tokio_stream::StreamExt;
use tracing::{info, warn};
use crate::{CliCmd, GlobalContext};
#[derive(Debug, Args)]
pub struct AnnotateCommand {
/// The schema field to read (must be defined in pile.toml)
field: String,
/// Sidecar path to write to (e.g. meta.title)
dest: String,
/// Path to dataset config
#[arg(long, short = 'c', default_value = "./pile.toml")]
config: PathBuf,
}
impl AnnotateCommand {
fn parse_dest(dest: &str) -> Result<Vec<Label>> {
dest.split('.')
.map(|s| {
Label::new(s).ok_or_else(|| anyhow::anyhow!("invalid label {s:?} in dest path"))
})
.collect()
}
}
impl CliCmd for AnnotateCommand {
async fn run(
self,
_ctx: GlobalContext,
_flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let field = Label::new(&self.field)
.ok_or_else(|| anyhow::anyhow!("invalid field name {:?}", self.field))?;
let dest_path = Self::parse_dest(&self.dest)?;
let ds = Datasets::open(&self.config)
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
if !ds.config.schema.contains_key(&field) {
return Err(anyhow::anyhow!("field {:?} is not defined in schema", self.field).into());
}
let index = DbFtsIndex::new(&ds.path_workdir, &ds.config);
let count = 0u64;
for (name, source) in &ds.config.dataset.source {
match source {
Source::Filesystem { path, sidecars } => {
if !sidecars {
warn!("Source {name} does not have sidecars enabled, skipping");
continue;
}
let source = Arc::new(DirDataSource::new(name, path.clone(), *sidecars));
let mut stream = source.iter();
while let Some(res) = stream.next().await {
let item = res.with_context(|| format!("while reading source {name}"))?;
let Item::File { path, .. } = &item else {
continue;
};
let meta = MetaExtractor::new(&item);
let extractor = PileValue::ObjectExtractor(Arc::new(meta));
let Some(value) =
index.get_field(&extractor, &field).await.with_context(|| {
format!("while extracting field from {}", path.display())
})?
else {
continue;
};
// TODO: implement sidecar writing
let _ = (&dest_path, &value);
todo!("write_sidecar not yet implemented");
#[expect(unreachable_code)]
{
count += 1;
}
}
}
Source::S3 { .. } => {
warn!("Source {name} is an S3 source; sidecar annotation is not yet supported");
}
}
}
info!("Annotated {count} items");
return Ok(0);
}
}

View File

@@ -13,6 +13,10 @@ pub struct CheckCommand {
/// Path to dataset config
#[arg(long, short = 'c', default_value = "./pile.toml")]
config: PathBuf,
/// Working directory root
#[arg(long, default_value = "./.pile")]
workdir: PathBuf,
}
impl CliCmd for CheckCommand {
@@ -43,7 +47,8 @@ impl CliCmd for CheckCommand {
}
}
let ds = Datasets::open(&self.config)
let ds = Datasets::open(&self.config, self.workdir)
.await
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let ts_fts = ds.ts_fts().context("while determining fts age")?;

View File

@@ -1,11 +1,12 @@
use anyhow::{Context, Result};
use clap::Args;
use pile_dataset::{Datasets, PileValue, extract::MetaExtractor};
use pile_config::objectpath::ObjectPath;
use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::{extract::traits::ExtractState, value::PileValue};
use serde_json::{Map, Value};
use std::{path::PathBuf, sync::Arc, time::Instant};
use std::{path::PathBuf, time::Instant};
use tokio::task::JoinSet;
use tokio_stream::StreamExt;
use tracing::info;
use crate::{CliCmd, GlobalContext};
@@ -40,9 +41,17 @@ pub struct FieldsCommand {
#[arg(long)]
max_percent: Option<f64>,
/// Print counts of non-null schema fields instead of raw fields
#[arg(long)]
schema: bool,
/// Restrict to these sources (all sources if empty)
#[arg(long, short = 's')]
source: Vec<String>,
/// Working directory root
#[arg(long, default_value = "./.pile")]
workdir: PathBuf,
}
impl CliCmd for FieldsCommand {
@@ -53,13 +62,26 @@ impl CliCmd for FieldsCommand {
_ctx: GlobalContext,
flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Datasets::open(&self.config)
let ds = Datasets::open(&self.config, &self.workdir)
.await
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
let start = Instant::now();
let mut total_counts: Map<String, Value> = Map::new();
let mut total_items = 0u64;
let jobs = self.jobs.max(1);
let state = ExtractState { ignore_mime: false };
// Pre-collect schema fields for the --schema mode
let schema_fields: Vec<(String, Vec<ObjectPath>)> = if self.schema {
ds.config
.schema
.iter()
.map(|(name, spec)| (name.to_string(), spec.path.clone()))
.collect()
} else {
Vec::new()
};
for (name, dataset) in ds.sources.iter().filter(|(name, _)| {
self.source.is_empty() || self.source.iter().any(|s| s == name.as_str())
@@ -86,26 +108,56 @@ impl CliCmd for FieldsCommand {
return Err(CancelableTaskError::Cancelled);
}
match stream.next().await {
match stream.next() {
None => break,
Some(item_result) => {
let item =
item_result.with_context(|| format!("while reading source {name}"))?;
Some(item) => {
let item = item.clone();
let name = name.clone();
join_set.spawn(async move {
let meta = MetaExtractor::new(&item);
let value = PileValue::ObjectExtractor(Arc::new(meta));
let result = value.count_fields().await.with_context(|| {
format!("while counting fields in source {name}")
})?;
Ok(result.and_then(|v| {
if let Value::Object(m) = v {
Some(m)
} else {
None
let state = state.clone();
if self.schema {
let schema_fields = schema_fields.clone();
join_set.spawn(async move {
let pv = PileValue::Item(item);
let mut counts = Map::new();
for (field_name, paths) in &schema_fields {
let mut present = false;
for path in paths {
let v =
pv.query(&state, path).await.with_context(|| {
format!(
"while extracting field {field_name} in source {name}"
)
})?;
if let Some(v) = v
&& !matches!(v, PileValue::Null)
{
present = true;
break;
}
}
counts.insert(
field_name.clone(),
Value::Number((present as u64).into()),
);
}
}))
});
Ok(Some(counts))
});
} else {
join_set.spawn(async move {
let item = PileValue::Item(item);
let result =
item.count_fields(&state).await.with_context(|| {
format!("while counting fields in source {name}")
})?;
Ok(result.and_then(|v| {
if let Value::Object(m) = v {
Some(m)
} else {
None
}
}))
});
}
}
}
}

View File

@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
use clap::Args;
use pile_dataset::Datasets;
use pile_toolbox::cancelabletask::{CancelFlag, CancelableTaskError};
use pile_value::extract::traits::ExtractState;
use std::{fmt::Debug, path::PathBuf};
use crate::{CliCmd, GlobalContext};
@@ -15,6 +16,10 @@ pub struct IndexCommand {
/// Number of threads to use for indexing
#[arg(long, short = 'j', default_value = "3")]
jobs: usize,
/// Working directory root
#[arg(long, default_value = "./.pile")]
workdir: PathBuf,
}
impl CliCmd for IndexCommand {
@@ -23,17 +28,21 @@ impl CliCmd for IndexCommand {
_ctx: GlobalContext,
flag: CancelFlag,
) -> Result<i32, CancelableTaskError<anyhow::Error>> {
let ds = Datasets::open(&self.config)
let ds = Datasets::open(&self.config, &self.workdir)
.await
.with_context(|| format!("while opening dataset for {}", self.config.display()))?;
ds.fts_refresh(self.jobs, Some(flag)).await.map_err(|x| {
x.map_err(|x| {
anyhow::Error::from(x).context(format!(
"while refreshing fts for {}",
self.config.display()
))
})
})?;
let state = ExtractState { ignore_mime: false };
ds.fts_refresh(&state, self.jobs, Some(flag))
.await
.map_err(|x| {
x.map_err(|x| {
anyhow::Error::from(x).context(format!(
"while refreshing fts for {}",
self.config.display()
))
})
})?;
return Ok(0);
}

Some files were not shown because too many files have changed in this diff Show More