#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::{vld1q_u8, vst1q_u8, vuzpq_u8};
#[cfg(target_arch = "arm")]
use std::arch::arm::{vld1q_u8, vst1q_u8, vuzpq_u8};
use std::convert::TryInto;

/// Number of input bytes consumed per vectorised step (two 16-byte loads).
const GROUP_BY: usize = 32;

/// Returns how many output bytes [`run`] produces for `in_size` input bytes.
pub const fn out_size(in_size: usize) -> usize {
    in_size / 2
}

/// Extracts the 16 even-indexed bytes of a 32-byte group using NEON.
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[inline]
fn even_bytes(group: &[u8; GROUP_BY]) -> [u8; GROUP_BY / 2] {
    let mut out = [0u8; GROUP_BY / 2];
    // SAFETY: `group` is valid for reads of 32 bytes, so the two 16-byte loads
    // at offsets 0 and 16 stay in bounds; `out` is valid for a 16-byte store.
    // The byte-element load/store intrinsics need no extra alignment.
    // NOTE(review): on 32-bit ARM this assumes the CPU supports NEON, as the
    // original code did — confirm the build configuration guarantees it.
    unsafe {
        let lo = vld1q_u8(group.as_ptr());
        let hi = vld1q_u8(group.as_ptr().add(GROUP_BY / 2));
        // `.0` of the unzip result holds the even-indexed lanes of `lo ++ hi`.
        vst1q_u8(out.as_mut_ptr(), vuzpq_u8(lo, hi).0);
    }
    out
}

/// Extracts the 16 even-indexed bytes of a 32-byte group (portable fallback
/// for targets without the NEON intrinsics).
#[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
#[inline]
fn even_bytes(group: &[u8; GROUP_BY]) -> [u8; GROUP_BY / 2] {
    let mut out = [0u8; GROUP_BY / 2];
    for (dst, src) in out.iter_mut().zip(group.iter().step_by(2)) {
        *dst = *src;
    }
    out
}

/// Compacts `buf` in place so that its first `out_size(buf.len())` bytes are
/// the bytes that were at even indices (`0, 2, 4, ...`) of the input.
///
/// Whole 32-byte groups go through [`even_bytes`]; any remaining bytes are
/// handled by a scalar loop so the result always matches [`out_size`].
/// Bytes past the output region are left in an unspecified state.
pub fn run(buf: &mut [u8]) {
    let out_len = out_size(buf.len());
    let mut in_cursor = 0;
    let mut out_cursor = 0;

    // The write position never overtakes the read position, so compacting in
    // place cannot clobber input that has not been consumed yet.
    while in_cursor + GROUP_BY <= buf.len() {
        let group: &[u8; GROUP_BY] = buf[in_cursor..in_cursor + GROUP_BY]
            .try_into()
            .expect("slice is exactly GROUP_BY bytes long");
        let evens = even_bytes(group);
        buf[out_cursor..out_cursor + evens.len()].copy_from_slice(&evens);
        out_cursor += evens.len();
        in_cursor += GROUP_BY;
    }

    // Scalar tail for lengths that are not a multiple of GROUP_BY. Reads are
    // at `2 * i >= i`, i.e. never behind the position being written.
    for i in out_cursor..out_len {
        buf[i] = buf[2 * i];
    }
}