diff --git a/src/main.rs b/src/main.rs
index f219060..9ac9baf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,28 +1,19 @@
+#![feature(portable_simd)]
+#![feature(stdarch_arm_neon_intrinsics)]
+
 use anyhow::{anyhow, Context, Result};
-use clap::Parser;
+use clap::{Parser, ValueEnum};
 use std::{
     fs::File,
-    io::{BufRead, BufReader, Write},
+    io::{BufRead, BufReader, Read, Write},
     net::{TcpListener, TcpStream},
     process::Command,
-    sync::mpsc::channel,
-    thread,
-    time::Duration,
+    time::{Duration, Instant},
 };
 
+mod reduce;
 mod rmframes;
-mod streamlsb;
 use rmframes::RMFrames;
-use streamlsb::StreamLSB;
-
-#[derive(Parser, Debug)]
-#[command(author, version)]
-pub struct Opts {
-    /// Listen for an (unsecure) TCP connection to send the data to.
-    /// TODO: implement basic encryption
-    #[arg(long, name = "port", short = 'l')]
-    listen: usize,
-}
 
 // Parameters only for RM2, firmware version >= 3.7.0.1930
 
@@ -33,8 +24,99 @@ const WIDTH: usize = 1872;
 /// How many bytes represent one pixel in the framebuffer.
 const BYTES_PER_PIXEL: usize = 2;
 
+const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
+
+#[derive(Parser, Debug)]
+#[command(author, version)]
+pub struct Opts {
+    #[arg(long, name = "port", short = 'p')]
+    port: usize,
+
+    #[arg(long, name = "format", short = 'f')]
+    format: Format,
+}
+
+#[derive(ValueEnum, Debug, Clone, Copy)]
+pub enum Format {
+    Mono,
+    MonoSimd,
+    Gray8,
+    Gray8Simd,
+    Gray16be,
+}
+
+impl Format {
+    fn run<W: Write>(&self, mut src: RMFrames, mut tgt: W) -> Result<()> {
+        const IN_SIZE: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
+
+        match self {
+            Self::Gray16be => {
+                let mut buf = [0u8; IN_SIZE];
+                loop {
+                    let now = Instant::now();
+                    src.read_exact(&mut buf)?;
+                    tgt.write_all(&buf[0..IN_SIZE])?;
+                    println!("{:?}", now.elapsed().as_millis());
+                }
+            }
+
+            Self::Gray8 => {
+                const OUT: usize = reduce::gray8::out_size(IN_SIZE);
+                let mut buf = [0u8; IN_SIZE];
+                loop {
+                    let now = Instant::now();
+                    src.read_exact(&mut buf)?;
+
+                    reduce::gray8::run(&mut buf);
+                    tgt.write_all(&buf[0..OUT])?;
+                    println!("{:?}", now.elapsed().as_millis());
+                }
+            }
+
+            Self::Gray8Simd => {
+                const OUT: usize = reduce::gray8_simd::out_size(IN_SIZE);
+                let mut buf = [0u8; IN_SIZE];
+                loop {
+                    let now = Instant::now();
+                    src.read_exact(&mut buf)?;
+
+                    reduce::gray8_simd::run(&mut buf);
+                    tgt.write_all(&buf[0..OUT])?;
+                    println!("{:?}", now.elapsed().as_millis());
+                }
+            }
+
+            Self::Mono => {
+                const OUT: usize = reduce::mono::out_size(IN_SIZE);
+                let mut buf = [0u8; IN_SIZE];
+                loop {
+                    let now = Instant::now();
+                    src.read_exact(&mut buf)?;
+
+                    reduce::mono::run(&mut buf);
+                    tgt.write_all(&buf[0..OUT])?;
+                    println!("{:?}", now.elapsed().as_millis());
+                }
+            }
+
+            Self::MonoSimd => {
+                const OUT: usize = reduce::mono_simd::out_size(IN_SIZE);
+                let mut buf = [0u8; IN_SIZE];
+                loop {
+                    let now = Instant::now();
+                    src.read_exact(&mut buf)?;
+
+                    reduce::mono_simd::run(&mut buf);
+                    tgt.write_all(&buf[0..OUT])?;
+                    println!("{:?}", now.elapsed().as_millis());
+                }
+            }
+        }
+    }
+}
+
 fn main() -> Result<()> {
-    let ref opts: Opts = Opts::parse();
+    let opts: Opts = Opts::parse();
 
     let (file, offset) = {
         let pid = xochitl_pid()?;
@@ -43,15 +125,12 @@ fn main() -> Result<()> {
         (mem, offset)
     };
 
-    let src = RMFrames::init(&file, offset, WIDTH * HEIGHT * BYTES_PER_PIXEL)?;
-    let src = StreamLSB::new(src);
-    let mut src = src;
+    let src = RMFrames::init(&file, offset, IN_SIZE)?;
+    let tgt = listen_timeout(opts.port, Duration::from_secs(60))?;
 
-    let mut tgt: Box<dyn Write> = Box::new(listen_timeout(opts.listen, Duration::from_secs(60))?);
+    opts.format.run(src, tgt)?;
 
-    std::io::copy(&mut src, &mut tgt).unwrap();
-
-    return Ok(());
+    Ok(())
 }
 
 fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
@@ -59,14 +138,8 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
     let listen = TcpListener::bind(&listen_addr)?;
     eprintln!("[rM] listening for a TCP connection on {}", listen_addr);
 
-    let (tx, rx) = channel();
-    thread::spawn(move || {
-        tx.send(listen.accept()).unwrap();
-    });
+    let (conn, conn_addr) = listen.accept().unwrap();
 
-    let (conn, conn_addr) = rx
-        .recv_timeout(timeout)
-        .context("Timeout while waiting for host to connect to reMarkable")??;
     eprintln!("[rM] connection received from {}", conn_addr);
     conn.set_write_timeout(Some(timeout))?;
     Ok(conn)
@@ -74,7 +147,7 @@ fn listen_timeout(port: usize, timeout: Duration) -> Result<TcpStream> {
 
 fn xochitl_pid() -> Result<usize> {
     let output = Command::new("/bin/pidof")
-        .args(&["xochitl"])
+        .args(["xochitl"])
         .output()
         .context("Failed to run `/bin/pidof xochitl`")?;
     if output.status.success() {
@@ -95,8 +168,7 @@ fn rm2_fb_offset(pid: usize) -> Result<usize> {
     let line = BufReader::new(file)
         .lines()
         .skip_while(|line| matches!(line, Ok(l) if !l.ends_with("/dev/fb0")))
-        .skip(1)
-        .next()
+        .nth(1)
         .with_context(|| format!("No line containing /dev/fb0 in /proc/{}/maps file", pid))?
         .with_context(|| format!("Error reading file /proc/{}/maps", pid))?;
 
diff --git a/src/reduce/gray8.rs b/src/reduce/gray8.rs
new file mode 100644
index 0000000..eaac2db
--- /dev/null
+++ b/src/reduce/gray8.rs
@@ -0,0 +1,16 @@
+const GROUP_BY: usize = 2;
+pub const fn out_size(in_size: usize) -> usize {
+    in_size / 2
+}
+
+pub fn run(buf: &mut [u8]) {
+    let n_raw = buf.len();
+    let mut in_cursor = 0;
+    let mut out_cursor = 0;
+
+    while in_cursor + GROUP_BY <= n_raw {
+        buf[out_cursor] = buf[in_cursor];
+        out_cursor += 1;
+        in_cursor += GROUP_BY;
+    }
+}
diff --git a/src/reduce/gray8_simd.rs b/src/reduce/gray8_simd.rs
new file mode 100644
index 0000000..5000057
--- /dev/null
+++ b/src/reduce/gray8_simd.rs
@@ -0,0 +1,33 @@
+use std::{
+    arch::arm::{vld1q_u8, vst1q_u8, vuzpq_u8},
+    convert::TryInto,
+};
+
+const GROUP_BY: usize = 32;
+pub const fn out_size(in_size: usize) -> usize {
+    in_size / 2
+}
+
+pub fn run(buf: &mut [u8]) {
+    let n_raw = buf.len();
+    let mut in_cursor = 0;
+    let mut out_cursor = 0;
+
+    let mut res = [0u8; 16];
+
+    while in_cursor + GROUP_BY <= n_raw {
+        let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
+        let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
+
+        unsafe {
+            let a = vld1q_u8(a as *const u8);
+            let b = vld1q_u8(b as *const u8);
+            let z = vuzpq_u8(a, b);
+            vst1q_u8(&mut res as *mut u8, z.0);
+        }
+
+        buf[out_cursor..out_cursor + 16].copy_from_slice(&res);
+        out_cursor += 16;
+        in_cursor += GROUP_BY;
+    }
+}
diff --git a/src/reduce/mod.rs b/src/reduce/mod.rs
new file mode 100644
index 0000000..29930a4
--- /dev/null
+++ b/src/reduce/mod.rs
@@ -0,0 +1,4 @@
+pub mod gray8;
+pub mod gray8_simd;
+pub mod mono;
+pub mod mono_simd;
diff --git a/src/reduce/mono.rs b/src/reduce/mono.rs
new file mode 100644
index 0000000..87260d3
--- /dev/null
+++ b/src/reduce/mono.rs
@@ -0,0 +1,47 @@
+use std::convert::TryInto;
+
+const GROUP_BY: usize = 16;
+pub const fn out_size(in_size: usize) -> usize {
+    in_size / 16
+}
+
+pub fn run(buf: &mut [u8]) {
+    let n_raw = buf.len();
+    let mut in_cursor = 0;
+    let mut out_cursor = 0;
+
+    while in_cursor + GROUP_BY <= n_raw {
+        let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
+
+        let mut out = 0u8;
+
+        if a[0] == 0x1E {
+            out |= 0b10000000;
+        }
+        if a[2] == 0x1E {
+            out |= 0b10000000 >> 1;
+        }
+        if a[4] == 0x1E {
+            out |= 0b10000000 >> 2;
+        }
+        if a[6] == 0x1E {
+            out |= 0b10000000 >> 3;
+        }
+        if a[8] == 0x1E {
+            out |= 0b10000000 >> 4;
+        }
+        if a[10] == 0x1E {
+            out |= 0b10000000 >> 5;
+        }
+        if a[12] == 0x1E {
+            out |= 0b10000000 >> 6;
+        }
+        if a[14] == 0x1E {
+            out |= 0b10000000 >> 7;
+        }
+
+        buf[out_cursor] = out;
+        out_cursor += 1;
+        in_cursor += GROUP_BY;
+    }
+}
diff --git a/src/reduce/mono_simd.rs b/src/reduce/mono_simd.rs
new file mode 100644
index 0000000..36de67c
--- /dev/null
+++ b/src/reduce/mono_simd.rs
@@ -0,0 +1,69 @@
+use std::{
+    arch::arm::{
+        vandq_u8, vgetq_lane_u64, vld1q_s8, vld1q_u8, vpaddlq_u16, vpaddlq_u32, vpaddlq_u8,
+        vshlq_u8, vshrq_n_u8, vuzpq_u8,
+    },
+    convert::TryInto,
+};
+
+const GROUP_BY: usize = 32;
+pub const fn out_size(in_size: usize) -> usize {
+    in_size / 16
+}
+
+pub fn run(buf: &mut [u8]) {
+    let n_raw = buf.len();
+    let mut in_cursor = 0;
+    let mut out_cursor = 0;
+
+    let m = unsafe {
+        let mask = &[0x01u8; 16];
+        vld1q_u8(mask as *const u8)
+    };
+
+    let h = unsafe {
+        let mask = &[
+            0x07i8, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
+            0x01, 0x00,
+        ];
+        vld1q_s8(mask as *const i8)
+    };
+
+    while in_cursor + GROUP_BY <= n_raw {
+        let a: &[u8; 16] = buf[in_cursor..in_cursor + 16].try_into().unwrap();
+        let b: &[u8; 16] = buf[in_cursor + 16..in_cursor + 32].try_into().unwrap();
+
+        let res: (u8, u8) = unsafe {
+            // Load 32 bytes
+            let a = vld1q_u8(a as *const u8);
+            let b = vld1q_u8(b as *const u8);
+
+            // Unzip and get first byte of each pair
+            let a = vuzpq_u8(a, b).0;
+
+            // White = 0b1110, so >> 4.
+            let a = vshrq_n_u8::<4>(a);
+
+            // and with 0x01 mask
+            let a = vandq_u8(a, m);
+
+            // shift each bit left by an appropriate amount
+            // (h is [0x07, 0x06, .., 0x00, 0x07, .., 0x00])
+            let a = vshlq_u8(a, h);
+
+            // Sum everything
+            let s = vpaddlq_u8(a);
+            let s = vpaddlq_u16(s);
+            let s = vpaddlq_u32(s);
+            (
+                vgetq_lane_u64(s, 0).try_into().unwrap(),
+                vgetq_lane_u64(s, 1).try_into().unwrap(),
+            )
+        };
+
+        buf[out_cursor] = res.0;
+        buf[out_cursor + 1] = res.1;
+        out_cursor += 2;
+        in_cursor += GROUP_BY;
+    }
+}
diff --git a/src/streamlsb.rs b/src/streamlsb.rs
index 8e4a1f6..e066daa 100644
--- a/src/streamlsb.rs
+++ b/src/streamlsb.rs
@@ -68,62 +68,3 @@ impl<R: Read> Read for StreamLSB<R> {
         return Ok(out_cursor);
     }
 }
-
-// TODO: make this faster with SIMD
-
-/*
-SORKIN DEEPSEEK
-
-On an ARMv7 processor with NEON SIMD support, you can significantly speed up the process of extracting and packing the most significant bits (MSBs) of every other byte from 16 consecutive bytes. NEON allows you to process multiple bytes in parallel using wide registers and specialized instructions.Here’s how you can achieve this using NEON intrinsics:### Steps:
-
-    Load the 16 bytes into a NEON register.
-    Extract the MSB of every byte using a right shift (`vshrq_u8`).
-    Mask out every other byte using a bitwise AND (`vandq_u8`).
-    Pack the MSBs into a single byte using NEON's horizontal reduction.
-
-### Code Example:
-```c
-#include <arm_neon.h>
-#include <stdint.h>
-#include <stdio.h>uint8_t pack_msbs_neon(const uint8_t *data) {
-    // Load 16 bytes into a NEON register
-    uint8x16_t vec = vld1q_u8(data);    // Shift each byte right by 7 to isolate the MSB
-    uint8x16_t msbs = vshrq_n_u8(vec, 7);    // Create a mask to select every other byte (0x80, 0x00, 0x80, 0x00, ...)
-    uint8x16_t mask = {0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                       0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00};    // Apply the mask to keep only every other MSB
-    uint8x16_t masked_msbs = vandq_u8(msbs, mask);    // Horizontally add the bytes to pack the MSBs into a single byte
-    uint8x8_t sum = vpadd_u8(vget_low_u8(masked_msbs), vget_high_u8(masked_msbs));
-    sum = vpadd_u8(sum, sum);
-    sum = vpadd_u8(sum, sum);    // Extract the final packed byte
-    return vget_lane_u8(sum, 0);
-}int main() {
-    uint8_t data[16] = {
-        0b10000001, 0b00000000, 0b11000011, 0b00000000,
-        0b10101010, 0b00000000, 0b11110000, 0b00000000,
-        0b10000001, 0b00000000, 0b11000011, 0b00000000,
-        0b10101010, 0b00000000, 0b11110000, 0b00000000
-    };    uint8_t packed = pack_msbs_neon(data);    printf("Packed MSBs: 0x%02X\n", packed);    return 0;
-}
-```### Explanation:
-
-    **`vld1q_u8(data)`**: Loads 16 bytes from `data` into a NEON register (`uint8x16_t`).
-    **`vshrq_n_u8(vec, 7)`**: Shifts each byte right by 7, isolating the MSB.
-    **Masking**: A mask (`0x80` for every other byte) is applied using `vandq_u8` to keep only the MSBs of every other byte.
-    **Horizontal Addition**: The `vpadd_u8` instruction is used to horizontally add pairs of bytes, effectively packing the MSBs into a single byte.
-    **Final Extraction**: The result is extracted using `vget_lane_u8`.
-
-### Performance:
-
-    This approach processes 16 bytes in parallel using NEON, making it much faster than a scalar implementation.
-    The use of horizontal addition (`vpadd_u8`) efficiently reduces the NEON register to a single byte.
-
-### Example Output:
-For the given `data` array, the output will be:
-```
-Packed MSBs: 0xAA
-```This matches the expected result, where the MSBs of every other byte are packed into a single byte.### Compilation:
-Make sure to enable NEON support when compiling:
-```bash
-gcc -o neon_example neon_example.c -mfpu=neon -march=armv7-a
-```This code is optimized for ARMv7 with NEON and should provide a significant performance boost over scalar implementations.
-*/