cleanup
This commit is contained in:
parent
b6235368bd
commit
17ef2a1abd
140
src/main.rs
140
src/main.rs
@ -1,28 +1,19 @@
|
|||||||
|
#![feature(portable_simd)]
|
||||||
|
#![feature(stdarch_arm_neon_intrinsics)]
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use clap::Parser;
|
use clap::{Parser, ValueEnum};
|
||||||
use std::{
|
use std::{
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{BufRead, BufReader, Write},
|
io::{BufRead, BufReader, Read, Write},
|
||||||
net::{TcpListener, TcpStream},
|
net::{TcpListener, TcpStream},
|
||||||
process::Command,
|
process::Command,
|
||||||
sync::mpsc::channel,
|
time::{Duration, Instant},
|
||||||
thread,
|
|
||||||
time::Duration,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
mod reduce;
|
||||||
mod rmframes;
|
mod rmframes;
|
||||||
mod streamlsb;
|
|
||||||
use rmframes::RMFrames;
|
use rmframes::RMFrames;
|
||||||
use streamlsb::StreamLSB;
|
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
|
||||||
#[command(author, version)]
|
|
||||||
pub struct Opts {
|
|
||||||
/// Listen for an (unsecure) TCP connection to send the data to.
|
|
||||||
/// TODO: implement basic encryption
|
|
||||||
#[arg(long, name = "port", short = 'l')]
|
|
||||||
listen: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parameters only for RM2, firmware version >= 3.7.0.1930
|
// Parameters only for RM2, firmware version >= 3.7.0.1930
|
||||||
|
|
||||||
@ -33,8 +24,99 @@ const WIDTH: usize = 1872;
|
|||||||
/// How many bytes represent one pixel in the framebuffer.
|
/// How many bytes represent one pixel in the framebuffer.
|
||||||
const BYTES_PER_PIXEL: usize = 2;
|
const BYTES_PER_PIXEL: usize = 2;
|
||||||
|
|
||||||
|
const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
#[command(author, version)]
|
||||||
|
pub struct Opts {
|
||||||
|
#[arg(long, name = "port", short = 'p')]
|
||||||
|
port: usize,
|
||||||
|
|
||||||
|
#[arg(long, name = "format", short = 'f')]
|
||||||
|
format: Format,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(ValueEnum, Debug, Clone, Copy)]
|
||||||
|
pub enum Format {
|
||||||
|
Mono,
|
||||||
|
MonoSimd,
|
||||||
|
Gray8,
|
||||||
|
Gray8Simd,
|
||||||
|
Gray16be,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Format {
|
||||||
|
fn run<W: Write>(&self, mut src: RMFrames, mut tgt: W) -> Result<()> {
|
||||||
|
const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
|
||||||
|
|
||||||
|
match self {
|
||||||
|
Self::Gray16be => {
|
||||||
|
let mut buf = [0u8; IN_SIZE];
|
||||||
|
loop {
|
||||||
|
let now = Instant::now();
|
||||||
|
src.read_exact(&mut buf)?;
|
||||||
|
tgt.write_all(&buf[0..IN_SIZE])?;
|
||||||
|
println!("{:?}", now.elapsed().as_millis());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::Gray8 => {
|
||||||
|
const OUT: usize = reduce::gray8::out_size(IN_SIZE);
|
||||||
|
let mut buf = [0u8; IN_SIZE];
|
||||||
|
loop {
|
||||||
|
let now = Instant::now();
|
||||||
|
src.read_exact(&mut buf)?;
|
||||||
|
|
||||||
|
reduce::gray8::run(&mut buf);
|
||||||
|
tgt.write_all(&buf[0..OUT])?;
|
||||||
|
println!("{:?}", now.elapsed().as_millis());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::Gray8Simd => {
|
||||||
|
const OUT: usize = reduce::gray8_simd::out_size(IN_SIZE);
|
||||||
|
let mut buf = [0u8; IN_SIZE];
|
||||||
|
loop {
|
||||||
|
let now = Instant::now();
|
||||||
|
src.read_exact(&mut buf)?;
|
||||||
|
|
||||||
|
reduce::gray8_simd::run(&mut buf);
|
||||||
|
tgt.write_all(&buf[0..OUT])?;
|
||||||
|
println!("{:?}", now.elapsed().as_millis());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::Mono => {
|
||||||
|
const OUT: usize = reduce::mono::out_size(IN_SIZE);
|
||||||
|
let mut buf = [0u8; IN_SIZE];
|
||||||
|
loop {
|
||||||
|
let now = Instant::now();
|
||||||
|
src.read_exact(&mut buf)?;
|
||||||
|
|
||||||
|
reduce::mono::run(&mut buf);
|
||||||
|
tgt.write_all(&buf[0..OUT])?;
|
||||||
|
println!("{:?}", now.elapsed().as_millis());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::MonoSimd => {
|
||||||
|
const OUT: usize = reduce::mono_simd::out_size(IN_SIZE);
|
||||||
|
let mut buf = [0u8; IN_SIZE];
|
||||||
|
loop {
|
||||||
|
let now = Instant::now();
|
||||||
|
src.read_exact(&mut buf)?;
|
||||||
|
|
||||||
|
reduce::mono_simd::run(&mut buf);
|
||||||
|
tgt.write_all(&buf[0..OUT])?;
|
||||||
|
println!("{:?}", now.elapsed().as_millis());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let ref opts: Opts = Opts::parse();
|
let opts: Opts = Opts::parse();
|
||||||
|
|
||||||
let (file, offset) = {
|
let (file, offset) = {
|
||||||
let pid = xochitl_pid()?;
|
let pid = xochitl_pid()?;
|
||||||
@ -43,15 +125,12 @@ fn main() -> Result<()> {
|
|||||||
(mem, offset)
|
(mem, offset)
|
||||||
};
|
};
|
||||||
|
|
||||||
let src = RMFrames::init(&file, offset, WIDTH * HEIGHT * BYTES_PER_PIXEL)?;
|
let src = RMFrames::init(&file, offset, IN_SIZE)?;
|
||||||
let src = StreamLSB::new(src);
|
let tgt = listen_timeout(opts.port, Duration::from_secs(60))?;
|
||||||
let mut src = src;
|
|
||||||
|
|
||||||
let mut tgt: Box<dyn Write> = Box::new(listen_timeout(opts.listen, Duration::from_secs(60))?);
|
opts.format.run(src, tgt)?;
|
||||||
|
|
||||||
std::io::copy(&mut src, &mut tgt).unwrap();
|
Ok(())
|
||||||
|
|
||||||
return Ok(());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
|
fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
|
||||||
@ -59,14 +138,8 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
|
|||||||
let listen = TcpListener::bind(&listen_addr)?;
|
let listen = TcpListener::bind(&listen_addr)?;
|
||||||
eprintln!("[rM] listening for a TCP connection on {}", listen_addr);
|
eprintln!("[rM] listening for a TCP connection on {}", listen_addr);
|
||||||
|
|
||||||
let (tx, rx) = channel();
|
let (conn, conn_addr) = listen.accept().unwrap();
|
||||||
thread::spawn(move || {
|
|
||||||
tx.send(listen.accept()).unwrap();
|
|
||||||
});
|
|
||||||
|
|
||||||
let (conn, conn_addr) = rx
|
|
||||||
.recv_timeout(timeout)
|
|
||||||
.context("Timeout while waiting for host to connect to reMarkable")??;
|
|
||||||
eprintln!("[rM] connection received from {}", conn_addr);
|
eprintln!("[rM] connection received from {}", conn_addr);
|
||||||
conn.set_write_timeout(Some(timeout))?;
|
conn.set_write_timeout(Some(timeout))?;
|
||||||
Ok(conn)
|
Ok(conn)
|
||||||
@ -74,7 +147,7 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
|
|||||||
|
|
||||||
fn xochitl_pid() -> Result<usize> {
|
fn xochitl_pid() -> Result<usize> {
|
||||||
let output = Command::new("/bin/pidof")
|
let output = Command::new("/bin/pidof")
|
||||||
.args(&["xochitl"])
|
.args(["xochitl"])
|
||||||
.output()
|
.output()
|
||||||
.context("Failed to run `/bin/pidof xochitl`")?;
|
.context("Failed to run `/bin/pidof xochitl`")?;
|
||||||
if output.status.success() {
|
if output.status.success() {
|
||||||
@ -95,8 +168,7 @@ fn rm2_fb_offset(pid: usize) -> Result<usize> {
|
|||||||
let line = BufReader::new(file)
|
let line = BufReader::new(file)
|
||||||
.lines()
|
.lines()
|
||||||
.skip_while(|line| matches!(line, Ok(l) if !l.ends_with("/dev/fb0")))
|
.skip_while(|line| matches!(line, Ok(l) if !l.ends_with("/dev/fb0")))
|
||||||
.skip(1)
|
.nth(1)
|
||||||
.next()
|
|
||||||
.with_context(|| format!("No line containing /dev/fb0 in /proc/{}/maps file", pid))?
|
.with_context(|| format!("No line containing /dev/fb0 in /proc/{}/maps file", pid))?
|
||||||
.with_context(|| format!("Error reading file /proc/{}/maps", pid))?;
|
.with_context(|| format!("Error reading file /proc/{}/maps", pid))?;
|
||||||
|
|
||||||
|
16
src/reduce/gray8.rs
Normal file
16
src/reduce/gray8.rs
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
const GROUP_BY: usize = 2;
|
||||||
|
pub const fn out_size(in_size: usize) -> usize {
|
||||||
|
in_size / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(buf: &mut [u8]) {
|
||||||
|
let n_raw = buf.len();
|
||||||
|
let mut in_cursor = 0;
|
||||||
|
let mut out_cursor = 0;
|
||||||
|
|
||||||
|
while in_cursor + GROUP_BY <= n_raw {
|
||||||
|
buf[out_cursor] = buf[in_cursor];
|
||||||
|
out_cursor += 1;
|
||||||
|
in_cursor += GROUP_BY;
|
||||||
|
}
|
||||||
|
}
|
33
src/reduce/gray8_simd.rs
Normal file
33
src/reduce/gray8_simd.rs
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
use std::{
|
||||||
|
arch::arm::{vld1q_u8, vst1q_u8, vuzpq_u8},
|
||||||
|
convert::TryInto,
|
||||||
|
};
|
||||||
|
|
||||||
|
const GROUP_BY: usize = 32;
|
||||||
|
pub const fn out_size(in_size: usize) -> usize {
|
||||||
|
in_size / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(buf: &mut [u8]) {
|
||||||
|
let n_raw = buf.len();
|
||||||
|
let mut in_cursor = 0;
|
||||||
|
let mut out_cursor = 0;
|
||||||
|
|
||||||
|
let mut res = [0u8; 16];
|
||||||
|
|
||||||
|
while in_cursor + GROUP_BY <= n_raw {
|
||||||
|
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
|
||||||
|
let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let a = vld1q_u8(a as *const u8);
|
||||||
|
let b = vld1q_u8(b as *const u8);
|
||||||
|
let z = vuzpq_u8(a, b);
|
||||||
|
vst1q_u8(&mut res as *mut u8, z.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[out_cursor..out_cursor + 16].copy_from_slice(&res);
|
||||||
|
out_cursor += 16;
|
||||||
|
in_cursor += GROUP_BY;
|
||||||
|
}
|
||||||
|
}
|
4
src/reduce/mod.rs
Normal file
4
src/reduce/mod.rs
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
pub mod gray8;
|
||||||
|
pub mod gray8_simd;
|
||||||
|
pub mod mono;
|
||||||
|
pub mod mono_simd;
|
47
src/reduce/mono.rs
Normal file
47
src/reduce/mono.rs
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
const GROUP_BY: usize = 16;
|
||||||
|
pub const fn out_size(in_size: usize) -> usize {
|
||||||
|
in_size / 16
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(buf: &mut [u8]) {
|
||||||
|
let n_raw = buf.len();
|
||||||
|
let mut in_cursor = 0;
|
||||||
|
let mut out_cursor = 0;
|
||||||
|
|
||||||
|
while in_cursor + GROUP_BY <= n_raw {
|
||||||
|
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
|
||||||
|
|
||||||
|
let mut out = 0u8;
|
||||||
|
|
||||||
|
if a[0] == 0x1E {
|
||||||
|
out |= 0b10000000;
|
||||||
|
}
|
||||||
|
if a[2] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 1;
|
||||||
|
}
|
||||||
|
if a[4] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 2;
|
||||||
|
}
|
||||||
|
if a[6] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 3;
|
||||||
|
}
|
||||||
|
if a[8] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 4;
|
||||||
|
}
|
||||||
|
if a[10] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 5;
|
||||||
|
}
|
||||||
|
if a[12] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 6;
|
||||||
|
}
|
||||||
|
if a[14] == 0x1E {
|
||||||
|
out |= 0b10000000 >> 7;
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[out_cursor] = out;
|
||||||
|
out_cursor += 1;
|
||||||
|
in_cursor += GROUP_BY;
|
||||||
|
}
|
||||||
|
}
|
69
src/reduce/mono_simd.rs
Normal file
69
src/reduce/mono_simd.rs
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
use std::{
|
||||||
|
arch::arm::{
|
||||||
|
vandq_u8, vgetq_lane_u64, vld1q_s8, vld1q_u8, vpaddlq_u16, vpaddlq_u32, vpaddlq_u8,
|
||||||
|
vshlq_u8, vshrq_n_u8, vuzpq_u8,
|
||||||
|
},
|
||||||
|
convert::TryInto,
|
||||||
|
};
|
||||||
|
|
||||||
|
const GROUP_BY: usize = 32;
|
||||||
|
pub const fn out_size(in_size: usize) -> usize {
|
||||||
|
in_size / 16
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(buf: &mut [u8]) {
|
||||||
|
let n_raw = buf.len();
|
||||||
|
let mut in_cursor = 0;
|
||||||
|
let mut out_cursor = 0;
|
||||||
|
|
||||||
|
let m = unsafe {
|
||||||
|
let mask = &[0x01u8; 16];
|
||||||
|
vld1q_u8(mask as *const u8)
|
||||||
|
};
|
||||||
|
|
||||||
|
let h = unsafe {
|
||||||
|
let mask = &[
|
||||||
|
0x07i8, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
|
||||||
|
0x01, 0x00,
|
||||||
|
];
|
||||||
|
vld1q_s8(mask as *const i8)
|
||||||
|
};
|
||||||
|
|
||||||
|
while in_cursor + GROUP_BY <= n_raw {
|
||||||
|
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
|
||||||
|
let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
|
||||||
|
|
||||||
|
let res: (u8, u8) = unsafe {
|
||||||
|
// Load 32 bytes
|
||||||
|
let a = vld1q_u8(a as *const u8);
|
||||||
|
let b = vld1q_u8(b as *const u8);
|
||||||
|
|
||||||
|
// Unzip and get first byte of each pair
|
||||||
|
let a = vuzpq_u8(a, b).0;
|
||||||
|
|
||||||
|
// White = 0b1110, so >> 4.
|
||||||
|
let a = vshrq_n_u8::<4>(a);
|
||||||
|
|
||||||
|
// and with 0x01 mask
|
||||||
|
let a = vandq_u8(a, m);
|
||||||
|
|
||||||
|
// shift each bit left by an appropriate amount
|
||||||
|
// (h is [0x07, 0x06, .., 0x00, 0x07, .., 0x00])
|
||||||
|
let a = vshlq_u8(a, h);
|
||||||
|
|
||||||
|
// Sum everything
|
||||||
|
let s = vpaddlq_u8(a);
|
||||||
|
let s = vpaddlq_u16(s);
|
||||||
|
let s = vpaddlq_u32(s);
|
||||||
|
(
|
||||||
|
vgetq_lane_u64(s, 0).try_into().unwrap(),
|
||||||
|
vgetq_lane_u64(s, 1).try_into().unwrap(),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
buf[out_cursor] = res.0;
|
||||||
|
buf[out_cursor + 1] = res.1;
|
||||||
|
out_cursor += 2;
|
||||||
|
in_cursor += GROUP_BY;
|
||||||
|
}
|
||||||
|
}
|
@ -68,62 +68,3 @@ impl<R: Read> Read for StreamLSB<R> {
|
|||||||
return Ok(out_cursor);
|
return Ok(out_cursor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: make this faster with SIMD
|
|
||||||
|
|
||||||
/*
|
|
||||||
SORKIN DEEPSEEK
|
|
||||||
|
|
||||||
On an ARMv7 processor with NEON SIMD support, you can significantly speed up the process of extracting and packing the most significant bits (MSBs) of every other byte from 16 consecutive bytes. NEON allows you to process multiple bytes in parallel using wide registers and specialized instructions.Here’s how you can achieve this using NEON intrinsics:### Steps:
|
|
||||||
|
|
||||||
Load the 16 bytes into a NEON register.
|
|
||||||
Extract the MSB of every byte using a right shift (`vshrq_u8`).
|
|
||||||
Mask out every other byte using a bitwise AND (`vandq_u8`).
|
|
||||||
Pack the MSBs into a single byte using NEON's horizontal reduction.
|
|
||||||
|
|
||||||
### Code Example:
|
|
||||||
```c
|
|
||||||
#include <arm_neon.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdio.h>uint8_t pack_msbs_neon(const uint8_t *data) {
|
|
||||||
// Load 16 bytes into a NEON register
|
|
||||||
uint8x16_t vec = vld1q_u8(data); // Shift each byte right by 7 to isolate the MSB
|
|
||||||
uint8x16_t msbs = vshrq_n_u8(vec, 7); // Create a mask to select every other byte (0x80, 0x00, 0x80, 0x00, ...)
|
|
||||||
uint8x16_t mask = {0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
|
|
||||||
0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00}; // Apply the mask to keep only every other MSB
|
|
||||||
uint8x16_t masked_msbs = vandq_u8(msbs, mask); // Horizontally add the bytes to pack the MSBs into a single byte
|
|
||||||
uint8x8_t sum = vpadd_u8(vget_low_u8(masked_msbs), vget_high_u8(masked_msbs));
|
|
||||||
sum = vpadd_u8(sum, sum);
|
|
||||||
sum = vpadd_u8(sum, sum); // Extract the final packed byte
|
|
||||||
return vget_lane_u8(sum, 0);
|
|
||||||
}int main() {
|
|
||||||
uint8_t data[16] = {
|
|
||||||
0b10000001, 0b00000000, 0b11000011, 0b00000000,
|
|
||||||
0b10101010, 0b00000000, 0b11110000, 0b00000000,
|
|
||||||
0b10000001, 0b00000000, 0b11000011, 0b00000000,
|
|
||||||
0b10101010, 0b00000000, 0b11110000, 0b00000000
|
|
||||||
}; uint8_t packed = pack_msbs_neon(data); printf("Packed MSBs: 0x%02X\n", packed); return 0;
|
|
||||||
}
|
|
||||||
```### Explanation:
|
|
||||||
|
|
||||||
**`vld1q_u8(data)`**: Loads 16 bytes from `data` into a NEON register (`uint8x16_t`).
|
|
||||||
**`vshrq_n_u8(vec, 7)`**: Shifts each byte right by 7, isolating the MSB.
|
|
||||||
**Masking**: A mask (`0x80` for every other byte) is applied using `vandq_u8` to keep only the MSBs of every other byte.
|
|
||||||
**Horizontal Addition**: The `vpadd_u8` instruction is used to horizontally add pairs of bytes, effectively packing the MSBs into a single byte.
|
|
||||||
**Final Extraction**: The result is extracted using `vget_lane_u8`.
|
|
||||||
|
|
||||||
### Performance:
|
|
||||||
|
|
||||||
This approach processes 16 bytes in parallel using NEON, making it much faster than a scalar implementation.
|
|
||||||
The use of horizontal addition (`vpadd_u8`) efficiently reduces the NEON register to a single byte.
|
|
||||||
|
|
||||||
### Example Output:
|
|
||||||
For the given `data` array, the output will be:
|
|
||||||
```
|
|
||||||
Packed MSBs: 0xAA
|
|
||||||
```This matches the expected result, where the MSBs of every other byte are packed into a single byte.### Compilation:
|
|
||||||
Make sure to enable NEON support when compiling:
|
|
||||||
```bash
|
|
||||||
gcc -o neon_example neon_example.c -mfpu=neon -march=armv7-a
|
|
||||||
```This code is optimized for ARMv7 with NEON and should provide a significant performance boost over scalar implementations.
|
|
||||||
*/
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user