This commit is contained in:
Mark 2025-01-26 10:02:28 -08:00
parent b6235368bd
commit 17ef2a1abd
Signed by: Mark
GPG Key ID: C6D63995FE72FD80
7 changed files with 275 additions and 93 deletions

View File

@ -1,28 +1,19 @@
#![feature(portable_simd)]
#![feature(stdarch_arm_neon_intrinsics)]
use anyhow::{anyhow, Context, Result};
use clap::Parser;
use clap::{Parser, ValueEnum};
use std::{
fs::File,
io::{BufRead, BufReader, Write},
io::{BufRead, BufReader, Read, Write},
net::{TcpListener, TcpStream},
process::Command,
sync::mpsc::channel,
thread,
time::Duration,
time::{Duration, Instant},
};
mod reduce;
mod rmframes;
mod streamlsb;
use rmframes::RMFrames;
use streamlsb::StreamLSB;
#[derive(Parser, Debug)]
#[command(author, version)]
pub struct Opts {
/// Listen for an (unsecure) TCP connection to send the data to.
/// TODO: implement basic encryption
#[arg(long, name = "port", short = 'l')]
listen: usize,
}
// Parameters only for RM2, firmware version >= 3.7.0.1930
@ -33,8 +24,99 @@ const WIDTH: usize = 1872;
/// How many bytes represent one pixel in the framebuffer.
const BYTES_PER_PIXEL: usize = 2;
const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
#[derive(Parser, Debug)]
#[command(author, version)]
pub struct Opts {
#[arg(long, name = "port", short = 'p')]
port: usize,
#[arg(long, name = "format", short = 'f')]
format: Format,
}
#[derive(ValueEnum, Debug, Clone, Copy)]
pub enum Format {
Mono,
MonoSimd,
Gray8,
Gray8Simd,
Gray16be,
}
impl Format {
fn run<W: Write>(&self, mut src: RMFrames, mut tgt: W) -> Result<()> {
const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
match self {
Self::Gray16be => {
let mut buf = [0u8; IN_SIZE];
loop {
let now = Instant::now();
src.read_exact(&mut buf)?;
tgt.write_all(&buf[0..IN_SIZE])?;
println!("{:?}", now.elapsed().as_millis());
}
}
Self::Gray8 => {
const OUT: usize = reduce::gray8::out_size(IN_SIZE);
let mut buf = [0u8; IN_SIZE];
loop {
let now = Instant::now();
src.read_exact(&mut buf)?;
reduce::gray8::run(&mut buf);
tgt.write_all(&buf[0..OUT])?;
println!("{:?}", now.elapsed().as_millis());
}
}
Self::Gray8Simd => {
const OUT: usize = reduce::gray8_simd::out_size(IN_SIZE);
let mut buf = [0u8; IN_SIZE];
loop {
let now = Instant::now();
src.read_exact(&mut buf)?;
reduce::gray8_simd::run(&mut buf);
tgt.write_all(&buf[0..OUT])?;
println!("{:?}", now.elapsed().as_millis());
}
}
Self::Mono => {
const OUT: usize = reduce::mono::out_size(IN_SIZE);
let mut buf = [0u8; IN_SIZE];
loop {
let now = Instant::now();
src.read_exact(&mut buf)?;
reduce::mono::run(&mut buf);
tgt.write_all(&buf[0..OUT])?;
println!("{:?}", now.elapsed().as_millis());
}
}
Self::MonoSimd => {
const OUT: usize = reduce::mono_simd::out_size(IN_SIZE);
let mut buf = [0u8; IN_SIZE];
loop {
let now = Instant::now();
src.read_exact(&mut buf)?;
reduce::mono_simd::run(&mut buf);
tgt.write_all(&buf[0..OUT])?;
println!("{:?}", now.elapsed().as_millis());
}
}
}
}
}
fn main() -> Result<()> {
let ref opts: Opts = Opts::parse();
let opts: Opts = Opts::parse();
let (file, offset) = {
let pid = xochitl_pid()?;
@ -43,15 +125,12 @@ fn main() -> Result<()> {
(mem, offset)
};
let src = RMFrames::init(&file, offset, WIDTH * HEIGHT * BYTES_PER_PIXEL)?;
let src = StreamLSB::new(src);
let mut src = src;
let src = RMFrames::init(&file, offset, IN_SIZE)?;
let tgt = listen_timeout(opts.port, Duration::from_secs(60))?;
let mut tgt: Box<dyn Write> = Box::new(listen_timeout(opts.listen, Duration::from_secs(60))?);
opts.format.run(src, tgt)?;
std::io::copy(&mut src, &mut tgt).unwrap();
return Ok(());
Ok(())
}
fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
@ -59,14 +138,8 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
let listen = TcpListener::bind(&listen_addr)?;
eprintln!("[rM] listening for a TCP connection on {}", listen_addr);
let (tx, rx) = channel();
thread::spawn(move || {
tx.send(listen.accept()).unwrap();
});
let (conn, conn_addr) = listen.accept().unwrap();
let (conn, conn_addr) = rx
.recv_timeout(timeout)
.context("Timeout while waiting for host to connect to reMarkable")??;
eprintln!("[rM] connection received from {}", conn_addr);
conn.set_write_timeout(Some(timeout))?;
Ok(conn)
@ -74,7 +147,7 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
fn xochitl_pid() -> Result<usize> {
let output = Command::new("/bin/pidof")
.args(&["xochitl"])
.args(["xochitl"])
.output()
.context("Failed to run `/bin/pidof xochitl`")?;
if output.status.success() {
@ -95,8 +168,7 @@ fn rm2_fb_offset(pid: usize) -> Result<usize> {
let line = BufReader::new(file)
.lines()
.skip_while(|line| matches!(line, Ok(l) if !l.ends_with("/dev/fb0")))
.skip(1)
.next()
.nth(1)
.with_context(|| format!("No line containing /dev/fb0 in /proc/{}/maps file", pid))?
.with_context(|| format!("Error reading file /proc/{}/maps", pid))?;

16
src/reduce/gray8.rs Normal file
View File

@ -0,0 +1,16 @@
const GROUP_BY: usize = 2;
pub const fn out_size(in_size: usize) -> usize {
in_size / 2
}
pub fn run(buf: &mut [u8]) {
let n_raw = buf.len();
let mut in_cursor = 0;
let mut out_cursor = 0;
while in_cursor + GROUP_BY <= n_raw {
buf[out_cursor] = buf[in_cursor];
out_cursor += 1;
in_cursor += GROUP_BY;
}
}

33
src/reduce/gray8_simd.rs Normal file
View File

@ -0,0 +1,33 @@
use std::{
arch::arm::{vld1q_u8, vst1q_u8, vuzpq_u8},
convert::TryInto,
};
const GROUP_BY: usize = 32;
pub const fn out_size(in_size: usize) -> usize {
in_size / 2
}
pub fn run(buf: &mut [u8]) {
let n_raw = buf.len();
let mut in_cursor = 0;
let mut out_cursor = 0;
let mut res = [0u8; 16];
while in_cursor + GROUP_BY <= n_raw {
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
unsafe {
let a = vld1q_u8(a as *const u8);
let b = vld1q_u8(b as *const u8);
let z = vuzpq_u8(a, b);
vst1q_u8(&mut res as *mut u8, z.0);
}
buf[out_cursor..out_cursor + 16].copy_from_slice(&res);
out_cursor += 16;
in_cursor += GROUP_BY;
}
}

4
src/reduce/mod.rs Normal file
View File

@ -0,0 +1,4 @@
pub mod gray8;
pub mod gray8_simd;
pub mod mono;
pub mod mono_simd;

47
src/reduce/mono.rs Normal file
View File

@ -0,0 +1,47 @@
use std::convert::TryInto;
const GROUP_BY: usize = 16;
pub const fn out_size(in_size: usize) -> usize {
in_size / 16
}
pub fn run(buf: &mut [u8]) {
let n_raw = buf.len();
let mut in_cursor = 0;
let mut out_cursor = 0;
while in_cursor + GROUP_BY <= n_raw {
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
let mut out = 0u8;
if a[0] == 0x1E {
out |= 0b10000000;
}
if a[2] == 0x1E {
out |= 0b10000000 >> 1;
}
if a[4] == 0x1E {
out |= 0b10000000 >> 2;
}
if a[6] == 0x1E {
out |= 0b10000000 >> 3;
}
if a[8] == 0x1E {
out |= 0b10000000 >> 4;
}
if a[10] == 0x1E {
out |= 0b10000000 >> 5;
}
if a[12] == 0x1E {
out |= 0b10000000 >> 6;
}
if a[14] == 0x1E {
out |= 0b10000000 >> 7;
}
buf[out_cursor] = out;
out_cursor += 1;
in_cursor += GROUP_BY;
}
}

69
src/reduce/mono_simd.rs Normal file
View File

@ -0,0 +1,69 @@
use std::{
arch::arm::{
vandq_u8, vgetq_lane_u64, vld1q_s8, vld1q_u8, vpaddlq_u16, vpaddlq_u32, vpaddlq_u8,
vshlq_u8, vshrq_n_u8, vuzpq_u8,
},
convert::TryInto,
};
const GROUP_BY: usize = 32;
pub const fn out_size(in_size: usize) -> usize {
in_size / 16
}
pub fn run(buf: &mut [u8]) {
let n_raw = buf.len();
let mut in_cursor = 0;
let mut out_cursor = 0;
let m = unsafe {
let mask = &[0x01u8; 16];
vld1q_u8(mask as *const u8)
};
let h = unsafe {
let mask = &[
0x07i8, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
0x01, 0x00,
];
vld1q_s8(mask as *const i8)
};
while in_cursor + GROUP_BY <= n_raw {
let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
let res: (u8, u8) = unsafe {
// Load 32 bytes
let a = vld1q_u8(a as *const u8);
let b = vld1q_u8(b as *const u8);
// Unzip and get first byte of each pair
let a = vuzpq_u8(a, b).0;
// White = 0b1110, so >> 4.
let a = vshrq_n_u8::<4>(a);
// and with 0x01 mask
let a = vandq_u8(a, m);
// shift each bit left by an appropriate amount
// (h is [0x07, 0x06, .., 0x00, 0x07, .., 0x00])
let a = vshlq_u8(a, h);
// Sum everything
let s = vpaddlq_u8(a);
let s = vpaddlq_u16(s);
let s = vpaddlq_u32(s);
(
vgetq_lane_u64(s, 0).try_into().unwrap(),
vgetq_lane_u64(s, 1).try_into().unwrap(),
)
};
buf[out_cursor] = res.0;
buf[out_cursor + 1] = res.1;
out_cursor += 2;
in_cursor += GROUP_BY;
}
}

View File

@ -68,62 +68,3 @@ impl<R: Read> Read for StreamLSB<R> {
return Ok(out_cursor);
}
}
// TODO: make this faster with SIMD
/*
SORKIN DEEPSEEK
On an ARMv7 processor with NEON SIMD support, you can significantly speed up the process of extracting and packing the most significant bits (MSBs) of every other byte from 16 consecutive bytes. NEON allows you to process multiple bytes in parallel using wide registers and specialized instructions.Heres how you can achieve this using NEON intrinsics:### Steps:
Load the 16 bytes into a NEON register.
Extract the MSB of every byte using a right shift (`vshrq_u8`).
Mask out every other byte using a bitwise AND (`vandq_u8`).
Pack the MSBs into a single byte using NEON's horizontal reduction.
### Code Example:
```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>uint8_t pack_msbs_neon(const uint8_t *data) {
// Load 16 bytes into a NEON register
uint8x16_t vec = vld1q_u8(data); // Shift each byte right by 7 to isolate the MSB
uint8x16_t msbs = vshrq_n_u8(vec, 7); // Create a mask to select every other byte (0x80, 0x00, 0x80, 0x00, ...)
uint8x16_t mask = {0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00}; // Apply the mask to keep only every other MSB
uint8x16_t masked_msbs = vandq_u8(msbs, mask); // Horizontally add the bytes to pack the MSBs into a single byte
uint8x8_t sum = vpadd_u8(vget_low_u8(masked_msbs), vget_high_u8(masked_msbs));
sum = vpadd_u8(sum, sum);
sum = vpadd_u8(sum, sum); // Extract the final packed byte
return vget_lane_u8(sum, 0);
}int main() {
uint8_t data[16] = {
0b10000001, 0b00000000, 0b11000011, 0b00000000,
0b10101010, 0b00000000, 0b11110000, 0b00000000,
0b10000001, 0b00000000, 0b11000011, 0b00000000,
0b10101010, 0b00000000, 0b11110000, 0b00000000
}; uint8_t packed = pack_msbs_neon(data); printf("Packed MSBs: 0x%02X\n", packed); return 0;
}
```### Explanation:
**`vld1q_u8(data)`**: Loads 16 bytes from `data` into a NEON register (`uint8x16_t`).
**`vshrq_n_u8(vec, 7)`**: Shifts each byte right by 7, isolating the MSB.
**Masking**: A mask (`0x80` for every other byte) is applied using `vandq_u8` to keep only the MSBs of every other byte.
**Horizontal Addition**: The `vpadd_u8` instruction is used to horizontally add pairs of bytes, effectively packing the MSBs into a single byte.
**Final Extraction**: The result is extracted using `vget_lane_u8`.
### Performance:
This approach processes 16 bytes in parallel using NEON, making it much faster than a scalar implementation.
The use of horizontal addition (`vpadd_u8`) efficiently reduces the NEON register to a single byte.
### Example Output:
For the given `data` array, the output will be:
```
Packed MSBs: 0xAA
```This matches the expected result, where the MSBs of every other byte are packed into a single byte.### Compilation:
Make sure to enable NEON support when compiling:
```bash
gcc -o neon_example neon_example.c -mfpu=neon -march=armv7-a
```This code is optimized for ARMv7 with NEON and should provide a significant performance boost over scalar implementations.
*/