Extractor rewrite
This commit is contained in:
158
crates/pile-value/src/source/s3reader.rs
Normal file
158
crates/pile-value/src/source/s3reader.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
use aws_sdk_s3::{error::SdkError, operation::get_object::GetObjectError};
|
||||
use mime::Mime;
|
||||
use std::io::{Error as IoError, Seek, SeekFrom, Write};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::S3Client;
|
||||
use crate::retry;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[expect(clippy::large_enum_variant)]
|
||||
pub enum S3ReaderError {
|
||||
#[error("sdk error")]
|
||||
SdkError(#[from] SdkError<GetObjectError>),
|
||||
|
||||
#[error("byte stream error")]
|
||||
ByteStreamError(#[from] aws_sdk_s3::primitives::ByteStreamError),
|
||||
|
||||
#[error("i/o error")]
|
||||
IoError(#[from] IoError),
|
||||
}
|
||||
|
||||
/// Provides a [`std::io::Read`]-like interface to an S3 object. \
|
||||
/// This doesn't actually implement [`std::io::Read`] because Read isn't async.
|
||||
///
|
||||
/// Also implements [`std::io::Seek`]
|
||||
pub struct S3Reader {
|
||||
pub(super) client: S3Client,
|
||||
pub(super) bucket: String,
|
||||
pub(super) key: String,
|
||||
|
||||
pub(super) cursor: u64,
|
||||
pub(super) size: u64,
|
||||
pub(super) mime: Mime,
|
||||
}
|
||||
|
||||
impl S3Reader {
|
||||
pub async fn read(&mut self, mut buf: &mut [u8]) -> Result<usize, S3ReaderError> {
|
||||
let len_left = self.size - self.cursor;
|
||||
if len_left == 0 || buf.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
#[expect(clippy::unwrap_used)] // TODO: probably fits?
|
||||
let start_byte = usize::try_from(self.cursor).unwrap();
|
||||
|
||||
#[expect(clippy::unwrap_used)] // usize fits in u64
|
||||
let len_to_read = u64::try_from(buf.len()).unwrap().min(len_left);
|
||||
|
||||
#[expect(clippy::unwrap_used)] // must fit, we called min()
|
||||
let len_to_read = usize::try_from(len_to_read).unwrap();
|
||||
|
||||
let end_byte = start_byte + len_to_read - 1;
|
||||
|
||||
let b = retry!(
|
||||
self.client.retries,
|
||||
self.client
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(self.bucket.as_str())
|
||||
.key(self.key.as_str())
|
||||
.range(format!("bytes={start_byte}-{end_byte}"))
|
||||
.send()
|
||||
.await
|
||||
)?;
|
||||
|
||||
// Looks like `bytes 31000000-31999999/33921176``
|
||||
// println!("{:?}", b.content_range);
|
||||
|
||||
let mut bytes = b.body.collect().await?.into_bytes();
|
||||
bytes.truncate(len_to_read);
|
||||
let l = bytes.len();
|
||||
|
||||
// Memory to memory writes are infallible
|
||||
#[expect(clippy::unwrap_used)]
|
||||
buf.write_all(&bytes).unwrap();
|
||||
|
||||
// Cannot fail, usize should always fit into u64
|
||||
#[expect(clippy::unwrap_used)]
|
||||
{
|
||||
self.cursor += u64::try_from(l).unwrap();
|
||||
}
|
||||
|
||||
return Ok(len_to_read);
|
||||
}
|
||||
|
||||
pub fn is_done(&self) -> bool {
|
||||
return self.cursor == self.size;
|
||||
}
|
||||
|
||||
pub fn mime(&self) -> &Mime {
|
||||
&self.mime
|
||||
}
|
||||
|
||||
/// Write the entire contents of this reader to `r`.
|
||||
///
|
||||
/// This method always downloads the whole object,
|
||||
/// and always preserves `self.cursor`.
|
||||
pub async fn download<W: Write>(&mut self, r: &mut W) -> Result<(), S3ReaderError> {
|
||||
let pos = self.stream_position()?;
|
||||
|
||||
const BUF_LEN: usize = 10_000_000;
|
||||
#[expect(clippy::unwrap_used)] // Cannot fail
|
||||
let mut buf: Box<[u8; BUF_LEN]> = vec![0u8; BUF_LEN].try_into().unwrap();
|
||||
|
||||
while !self.is_done() {
|
||||
let b = self.read(&mut buf[..]).await?;
|
||||
r.write_all(&buf[0..b])?;
|
||||
}
|
||||
|
||||
self.seek(SeekFrom::Start(pos))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for S3Reader {
|
||||
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
||||
match pos {
|
||||
SeekFrom::Start(x) => self.cursor = x.min(self.size - 1),
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::Current(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.cursor {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
self.cursor -= u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
self.cursor += u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Cannot panic, we handle all cases
|
||||
#[expect(clippy::unwrap_used)]
|
||||
SeekFrom::End(x) => {
|
||||
if x < 0 {
|
||||
if u64::try_from(x.abs()).unwrap() > self.size {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"cannot seek past start",
|
||||
));
|
||||
}
|
||||
// Cannot fail, is abs
|
||||
self.cursor = self.size - u64::try_from(x.abs()).unwrap();
|
||||
} else {
|
||||
// Cannot fail, is positive
|
||||
self.cursor = self.size + u64::try_from(x).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.cursor = self.cursor.min(self.size - 1);
|
||||
return Ok(self.cursor);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user