From c5c55de2d5678567ee0b8701c81d0584f30c80ee Mon Sep 17 00:00:00 2001
From: Mark <mark@betalupi.com>
Date: Sat, 25 Jan 2025 21:26:57 -0800
Subject: [PATCH] Refactor restream

---
 src/main.rs      | 124 ++++++++++++---------------------------------
 src/rmframes.rs  |  54 ++++++++++++++++++++
 src/streamlsb.rs | 129 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 214 insertions(+), 93 deletions(-)
 create mode 100644 src/rmframes.rs
 create mode 100644 src/streamlsb.rs
diff --git a/src/main.rs b/src/main.rs
index 6a31f9c..f219060 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,61 +1,55 @@
 use anyhow::{anyhow, Context, Result};
 use clap::Parser;
+use std::{
+    fs::File,
+    io::{BufRead, BufReader, Write},
+    net::{TcpListener, TcpStream},
+    process::Command,
+    sync::mpsc::channel,
+    thread,
+    time::Duration,
+};
 
-use std::fs::File;
-use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write};
-use std::net::{TcpListener, TcpStream};
-use std::process::Command;
-use std::sync::mpsc::channel;
-use std::thread;
-use std::time::Duration;
+mod rmframes;
+mod streamlsb;
+use rmframes::RMFrames;
+use streamlsb::StreamLSB;
 
 #[derive(Parser, Debug)]
 #[command(author, version)]
 pub struct Opts {
-    /// Listen for an (unsecure) TCP connection to send the data to which reduces some load on the reMarkable and improves fps.
+    /// Listen for an (unsecure) TCP connection to send the data to.
+    /// TODO: implement basic encryption
     #[arg(long, name = "port", short = 'l')]
-    listen: Option<usize>,
-
-    /// Height (in pixels) of the framebuffer.
-    #[arg(long, name = "height", short = 'h')]
-    height: usize,
-
-    /// Width (in pixels) of the framebuffer.
-    #[arg(long, name = "width", short = 'w')]
-    width: usize,
-
-    /// How many bytes represent one pixel in the framebuffer.
-    #[arg(long, name = "bytes", short = 'b')]
-    bytes_per_pixel: usize,
-
-    /// File containing the framebuffer data. If this equals the string ':mem:' it will try to read the framebuffer from xochitl's process memory (rM2 only).
-    #[arg(long, name = "path", short = 'f')]
-    file: String,
+    listen: usize,
 }
 
+// Parameters only for RM2, firmware version >= 3.7.0.1930
+
+/// Height (in pixels) of the framebuffer.
+const HEIGHT: usize = 1404;
+/// Width (in pixels) of the framebuffer.
+const WIDTH: usize = 1872;
+/// How many bytes represent one pixel in the framebuffer.
+const BYTES_PER_PIXEL: usize = 2;
+
 fn main() -> Result<()> {
     let ref opts: Opts = Opts::parse();
 
-    let (file, offset) = if opts.file == ":mem:" {
+    let (file, offset) = {
         let pid = xochitl_pid()?;
         let offset = rm2_fb_offset(pid)?;
         let mem = format!("/proc/{}/mem", pid);
         (mem, offset)
-    } else {
-        (opts.file.to_owned(), 0)
     };
 
-    let mut streamer =
-        ReStreamer::init(&file, offset, opts.width, opts.height, opts.bytes_per_pixel)?;
+    let src = RMFrames::init(&file, offset, WIDTH * HEIGHT * BYTES_PER_PIXEL)?;
+    let src = StreamLSB::new(src);
+    let mut src = src;
 
-    let mut data_target: Box<dyn Write> = if let Some(port) = opts.listen {
-        Box::new(listen_timeout(port, Duration::from_secs(60))?)
-    } else {
-        let stdout = std::io::stdout();
-        Box::new(stdout.lock())
-    };
+    let mut tgt: Box<dyn Write> = Box::new(listen_timeout(opts.listen, Duration::from_secs(60))?);
 
-    std::io::copy(&mut streamer, &mut data_target).unwrap();
+    std::io::copy(&mut src, &mut tgt).unwrap();
 
     return Ok(());
 }
@@ -114,59 +108,3 @@ fn rm2_fb_offset(pid: usize) -> Result<usize> {
     let address = usize::from_str_radix(addr, 16).context("Error parsing framebuffer address")?;
     Ok(address + 8)
 }
-
-pub struct ReStreamer {
-    file: File,
-    start: u64,
-    cursor: usize,
-    size: usize,
-}
-
-impl ReStreamer {
-    pub fn init(
-        path: &str,
-        offset: usize,
-        width: usize,
-        height: usize,
-        bytes_per_pixel: usize,
-    ) -> Result<Self> {
-        let start = offset as u64;
-        let size = width * height * bytes_per_pixel;
-        let cursor = 0;
-        let file = File::open(path)?;
-
-        let mut streamer = ReStreamer {
-            file,
-            start,
-            cursor,
-            size,
-        };
-        streamer.next_frame()?;
-        Ok(streamer)
-    }
-
-    pub fn next_frame(&mut self) -> std::io::Result<()> {
-        self.file.seek(SeekFrom::Start(self.start))?;
-        self.cursor = 0;
-        Ok(())
-    }
-}
-
-impl Read for ReStreamer {
-    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        let requested = buf.len();
-        let bytes_read = if self.cursor + requested < self.size {
-            self.file.read(buf)?
-        } else {
-            let rest = self.size - self.cursor;
-            self.file.read(&mut buf[0..rest])?
-        };
-
-        self.cursor += bytes_read;
-        if self.cursor == self.size {
-            self.next_frame()?;
-        }
-
-        Ok(bytes_read)
-    }
-}
diff --git a/src/rmframes.rs b/src/rmframes.rs
new file mode 100644
index 0000000..5c03dfb
--- /dev/null
+++ b/src/rmframes.rs
@@ -0,0 +1,54 @@
+use anyhow::Result;
+use std::{
+    fs::File,
+    io::{Read, Seek, SeekFrom},
+};
+
+pub struct RMFrames {
+    file: File,
+    start: u64,
+    cursor: usize,
+    framebuffer_size: usize,
+}
+
+impl RMFrames {
+    pub fn init(path: &str, offset: usize, framebuffer_size: usize) -> Result<Self> {
+        let start = offset as u64;
+        let cursor = 0;
+        let file = File::open(path)?;
+
+        let mut streamer = RMFrames {
+            file,
+            start,
+            cursor,
+            framebuffer_size,
+        };
+        streamer.next_frame()?;
+        Ok(streamer)
+    }
+
+    pub fn next_frame(&mut self) -> std::io::Result<()> {
+        self.file.seek(SeekFrom::Start(self.start))?;
+        self.cursor = 0;
+        Ok(())
+    }
+}
+
+impl Read for RMFrames {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let requested = buf.len();
+        let bytes_read = if self.cursor + requested < self.framebuffer_size {
+            self.file.read(buf)?
+        } else {
+            let rest = self.framebuffer_size - self.cursor;
+            self.file.read(&mut buf[0..rest])?
+        };
+
+        self.cursor += bytes_read;
+        if self.cursor == self.framebuffer_size {
+            self.next_frame()?;
+        }
+
+        Ok(bytes_read)
+    }
+}
diff --git a/src/streamlsb.rs b/src/streamlsb.rs
new file mode 100644
index 0000000..8e4a1f6
--- /dev/null
+++ b/src/streamlsb.rs
@@ -0,0 +1,129 @@
+use std::io::Read;
+
+const BYTE_FACTOR: usize = 2;
+
+/// Read every other byte from `R`.
+///
+/// If `R` is a stream of u16be, this should return the least-significant byte of each u16.
+pub struct StreamLSB<R: Read> {
+    /// The source to read from
+    src: R,
+
+    /// Number of filled bits in fragbuf
+    arity: usize,
+
+    /// Hold a partial u16, in case we read half of one
+    fragbuf: [u8; BYTE_FACTOR],
+}
+
+impl<R: Read> StreamLSB<R> {
+    pub fn new(src: R) -> Self {
+        Self {
+            src,
+            arity: 0,
+            fragbuf: [0; BYTE_FACTOR],
+        }
+    }
+}
+
+impl<R: Read> Read for StreamLSB<R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let n_raw = self.src.read(buf)?;
+
+        let mut in_cursor = 0;
+        let mut out_cursor = 0;
+
+        if self.arity != 0 {
+            // Number of bytes we need to read
+            let need_read = BYTE_FACTOR - self.arity;
+            let can_read = need_read.min(buf.len());
+            self.fragbuf[self.arity..self.arity + can_read].copy_from_slice(&buf[0..can_read]);
+
+            in_cursor += can_read;
+            self.arity += can_read;
+            self.arity %= BYTE_FACTOR;
+
+            if self.arity != 0 {
+                // We did not read a full set of bytes,
+                // wait for more data.
+                return Ok(0);
+            } else {
+                // Extract least-significant byte
+                buf[0] = self.fragbuf[0];
+                out_cursor += 1;
+            }
+        }
+
+        for _ in in_cursor..(n_raw / BYTE_FACTOR) {
+            // Extract least-significant byte
+            buf[out_cursor] = buf[in_cursor];
+
+            in_cursor += BYTE_FACTOR;
+            out_cursor += 1;
+        }
+
+        self.arity = n_raw - in_cursor;
+        self.fragbuf[0..self.arity].copy_from_slice(&buf[in_cursor..n_raw]);
+
+        return Ok(out_cursor);
+    }
+}
+
+// TODO: make this faster with SIMD
+
+/*
+SORKIN DEEPSEEK
+
+On an ARMv7 processor with NEON SIMD support, you can significantly speed up the process of extracting and packing the most significant bits (MSBs) of every other byte from 16 consecutive bytes. NEON allows you to process multiple bytes in parallel using wide registers and specialized instructions.Here’s how you can achieve this using NEON intrinsics:### Steps:
+
+    Load the 16 bytes into a NEON register.
+    Extract the MSB of every byte using a right shift (`vshrq_u8`).
+    Mask out every other byte using a bitwise AND (`vandq_u8`).
+    Pack the MSBs into a single byte using NEON's horizontal reduction.
+
+### Code Example:
+```c
+#include <arm_neon.h>
+#include <stdint.h>
+#include <stdio.h>uint8_t pack_msbs_neon(const uint8_t *data) {
+    // Load 16 bytes into a NEON register
+    uint8x16_t vec = vld1q_u8(data);    // Shift each byte right by 7 to isolate the MSB
+    uint8x16_t msbs = vshrq_n_u8(vec, 7);    // Create a mask to select every other byte (0x80, 0x00, 0x80, 0x00, ...)
+    uint8x16_t mask = {0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+                       0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00};    // Apply the mask to keep only every other MSB
+    uint8x16_t masked_msbs = vandq_u8(msbs, mask);    // Horizontally add the bytes to pack the MSBs into a single byte
+    uint8x8_t sum = vpadd_u8(vget_low_u8(masked_msbs), vget_high_u8(masked_msbs));
+    sum = vpadd_u8(sum, sum);
+    sum = vpadd_u8(sum, sum);    // Extract the final packed byte
+    return vget_lane_u8(sum, 0);
+}int main() {
+    uint8_t data[16] = {
+        0b10000001, 0b00000000, 0b11000011, 0b00000000,
+        0b10101010, 0b00000000, 0b11110000, 0b00000000,
+        0b10000001, 0b00000000, 0b11000011, 0b00000000,
+        0b10101010, 0b00000000, 0b11110000, 0b00000000
+    };    uint8_t packed = pack_msbs_neon(data);    printf("Packed MSBs: 0x%02X\n", packed);    return 0;
+}
+```### Explanation:
+
+    **`vld1q_u8(data)`**: Loads 16 bytes from `data` into a NEON register (`uint8x16_t`).
+    **`vshrq_n_u8(vec, 7)`**: Shifts each byte right by 7, isolating the MSB.
+    **Masking**: A mask (`0x80` for every other byte) is applied using `vandq_u8` to keep only the MSBs of every other byte.
+    **Horizontal Addition**: The `vpadd_u8` instruction is used to horizontally add pairs of bytes, effectively packing the MSBs into a single byte.
+    **Final Extraction**: The result is extracted using `vget_lane_u8`.
+
+### Performance:
+
+    This approach processes 16 bytes in parallel using NEON, making it much faster than a scalar implementation.
+    The use of horizontal addition (`vpadd_u8`) efficiently reduces the NEON register to a single byte.
+
+### Example Output:
+For the given `data` array, the output will be:
+```
+Packed MSBs: 0xAA
+```This matches the expected result, where the MSBs of every other byte are packed into a single byte.### Compilation:
+Make sure to enable NEON support when compiling:
+```bash
+gcc -o neon_example neon_example.c -mfpu=neon -march=armv7-a
+```This code is optimized for ARMv7 with NEON and should provide a significant performance boost over scalar implementations.
+*/