//! moi 0.1.1
//!
//! Encoder and decoder implementations for the MOI image compression format.
use crate::consts::*;
use std::simd::{u32x8,ToBitMask,SimdPartialEq};

// Compile-time switch: `true` selects the vectorised / raw-pointer code paths
// in `MoiHistory`, `false` selects the plain slice-based fallbacks.
const SIMD: bool = true;
// Size of the history's backing array: the `HISTORY_SIZE`-entry window plus
// 256 slots of slack that `MoiHistory::push` uses up (by lowering its offset)
// before the window has to be copied back to the top of the array.
const BUFFER_LEN: usize = HISTORY_SIZE + 256;

/// Window of the most recently seen pixels, newest first.
///
/// Pixels are stored as native-endian `u32`s. The live window is
/// `buffer[offset..offset + HISTORY_SIZE]`; pushing lowers `offset` by one
/// instead of shifting the whole window, and the window is copied back to the
/// top of the buffer once the slack below it has been used up.
#[repr(align(32))]
#[derive(Debug,Copy,Clone,Eq,PartialEq)]
pub struct MoiHistory {
	buffer: [u32; BUFFER_LEN],
	offset: usize,
}

impl MoiHistory {
	/// Creates a history whose window is filled with all-zero pixels.
	pub fn new() -> Self {
		Self {
			offset: BUFFER_LEN - HISTORY_SIZE,
			buffer: [0; BUFFER_LEN],
		}
	}

	/// Returns the pixel at the front of the window (index 0).
	#[inline(always)]
	pub fn first(&self) -> [u8; 4] {
		// SAFETY: `offset` starts at `BUFFER_LEN - HISTORY_SIZE` and `push` keeps
		// it in `1..=BUFFER_LEN - HISTORY_SIZE`, which is inside the buffer as
		// long as `HISTORY_SIZE > 0`.
		unsafe {
			self.buffer.get_unchecked(self.offset).to_ne_bytes()
		}
	}

	/// Moves the pixel at window index `i` to the front, shifting the `i`
	/// entries that were in front of it back by one slot.
	///
	/// `offset + i` must lie inside the buffer; this is only verified in debug
	/// builds.
	#[inline]
	pub fn move_to_front(&mut self, i: u8) {
		let i = i as usize;
		if i == 0 { return; }
		let o = self.offset;
		debug_assert!(o + i < BUFFER_LEN, "history index out of range");
		if SIMD { //TODO simd instead of memcpy?
			// SAFETY: the caller guarantees `o + i < BUFFER_LEN` (see the debug
			// assertion above), so every access below stays inside `buffer`.
			// All pointers are derived from one whole-array pointer, so the
			// overlapping `copy` (memmove semantics) is within its provenance.
			unsafe {
				let base = self.buffer.as_mut_ptr().add(o);
				let moved = *base.add(i);
				std::ptr::copy(base as *const u32, base.add(1), i);
				*base = moved;
			}
		} else {
			self.buffer[o..=(o + i)].rotate_right(1);
		}
	}

	/// Searches the window for `pixel` and returns its window-relative index.
	///
	/// The scalar path searches exactly `start..end`. The vector path works on
	/// whole groups of 8 entries, so it begins at `start` rounded down to a
	/// multiple of 8 and may therefore report a match slightly before `start`;
	/// entries between the last whole group and `end` are compared one by one.
	///
	/// Returns `None` when the pixel is not found. `offset + end` must not
	/// exceed the buffer length.
	#[inline]
	pub fn find_index(&self, pixel: [u8; 4], start: usize, end: usize) -> Option<usize> {
		// Lane count of `u32x8`.
		const LANES: usize = 8;
		let pixel = u32::from_ne_bytes(pixel);
		if SIMD {
			debug_assert!(self.offset + end <= BUFFER_LEN, "search range exceeds the history buffer");
			let px: u32x8 = u32x8::splat(pixel);
			for i in (start / LANES)..(end / LANES) {
				let i = i * LANES;
				// SAFETY: `i + LANES <= end` and the caller guarantees
				// `offset + end <= BUFFER_LEN`, so the 8 entries read here are
				// inside `buffer`. `[u32; 8]` has the same alignment as `u32`.
				let arr = unsafe {
					*(self.buffer.as_ptr().add(self.offset + i) as *const [u32; LANES])
				};
				let chunk = u32x8::from_array(arr);
				let mask = px.simd_eq(chunk);
				if mask.any() {
					return Some(i + mask.to_bitmask().trailing_zeros() as usize);
				}
			}
			// Entries after the last whole group of 8, up to `end`. The result
			// is window-relative, like the indices returned by the loop above.
			let tail = start.max(end / LANES * LANES);
			if tail >= end {
				return None;
			}
			self.buffer[(self.offset + tail)..(self.offset + end)].iter().position(|&p| p == pixel).map(|i| i + tail)
		} else {
			self.buffer[(self.offset + start)..(self.offset + end)].iter().position(|&p| p == pixel).map(|i| i + start)
		}
	}

	/// Inserts `pixel` at the front of the window; the oldest entry falls out.
	#[inline(always)]
	pub fn push(&mut self, pixel: [u8; 4]) {
		self.offset -= 1;
		if self.offset == 0 {
			// Slack exhausted: move the window back to the top of the buffer.
			// `copy_within` has memmove semantics, so this stays correct even
			// if the source and destination ranges overlap.
			self.offset = BUFFER_LEN - HISTORY_SIZE;
			self.buffer.copy_within(0..HISTORY_SIZE, self.offset);
		}
		// SAFETY: `offset` is in `1..=BUFFER_LEN - HISTORY_SIZE` here, which is
		// inside the buffer as long as `HISTORY_SIZE > 0`.
		unsafe {
			*self.buffer.get_unchecked_mut(self.offset) = u32::from_ne_bytes(pixel);
		}
	}
}

/// Builds the encoded byte stream, batching literal pixels so that several of
/// them can share one opcode.
pub struct MoiWriter {
	/// Number of literal RGB pixels waiting in `rgb_cache`, or `usize::MAX`
	/// while a single RGBA pixel is waiting in `rgb_cache[0]`.
	pub unwritten_rgb: usize,
	/// Literal pixels that have been queued but not yet written to `output`.
	pub rgb_cache: [[u8; 4]; RGB_MAX],
	/// Encoded bytes produced so far (header first).
	pub output: Vec<u8>,
}

impl MoiWriter {
	/// Value of `unwritten_rgb` while one RGBA literal is pending.
	const PENDING_RGBA: usize = usize::MAX;

	/// Creates a writer whose output starts with the encoded `header`.
	pub fn new(header: crate::MoiHeader) -> Self {
		let mut r = Self {
			unwritten_rgb: 0,
			rgb_cache: [[0; 4]; RGB_MAX],
			output: Vec::new(),
		};
		header.encode(|byte| r.output.push(byte));
		r
	}

	/// Writes any pending literal pixels, then appends `byte`.
	#[inline]
	pub fn write(&mut self, byte: u8) {
		self.hot_write(byte)
	}

	/// Same as [`write`](Self::write), but always inlined.
	#[inline(always)]
	pub fn hot_write(&mut self, byte: u8) {
		self.flush_pending();
		self.output.push(byte);
	}

	/// Emits the queued literal pixels (if any) and clears the queue.
	#[inline(always)]
	fn flush_pending(&mut self) {
		let pending = self.unwritten_rgb;
		if pending == 0 {
			return;
		}
		if pending == Self::PENDING_RGBA {
			// One RGBA literal: opcode followed by all four channels.
			self.output.push(RGBA);
			self.output.extend_from_slice(&self.rgb_cache[0]);
		} else {
			// `pending` RGB literals: the count is folded into the opcode and
			// only the first three channels of each pixel are stored.
			self.output.push(RGB + pending as u8 - 1);
			for pixel in &self.rgb_cache[..pending] {
				self.output.extend_from_slice(&pixel[..3]);
			}
		}
		self.unwritten_rgb = 0;
	}

	/// Queues `pixel` as a literal RGB pixel (its fourth byte is not written).
	///
	/// If the queue is full, or an RGBA literal is pending, the queue is
	/// written out first.
	#[inline]
	pub fn add_rgb(&mut self, pixel: [u8; 4]) {
		// `PENDING_RGBA` is `usize::MAX`, so this also covers a pending RGBA.
		if self.unwritten_rgb >= RGB_MAX {
			self.flush_pending();
		}
		self.rgb_cache[self.unwritten_rgb] = pixel;
		self.unwritten_rgb += 1;
	}

	/// Queues `pixel` as a literal RGBA pixel.
	///
	/// Two consecutive RGBA literals are written together under the `RGBA2`
	/// opcode; otherwise the pixel stays pending until the next write.
	#[inline]
	pub fn add_rgba(&mut self, pixel: [u8; 4]) {
		if self.unwritten_rgb == Self::PENDING_RGBA {
			self.unwritten_rgb = 0;
			self.output.push(RGBA2);
			self.output.extend_from_slice(&self.rgb_cache[0]);
			self.output.extend_from_slice(&pixel);
		} else {
			self.flush_pending();
			self.unwritten_rgb = Self::PENDING_RGBA;
			self.rgb_cache[0] = pixel;
		}
	}

	/// Encodes a run of `len` repeats, choosing the shortest available form.
	///
	/// Lengths 1 to 4 have dedicated opcodes, lengths below `RUN_MAX` use a
	/// short run opcode, lengths below `LONG_RUN_MIN` are split into two short
	/// encodings, and anything longer uses the `UTIL` long-run forms. Runs too
	/// long for one long-run record are continued recursively.
	#[inline]
	pub fn encode_run(&mut self, len: usize) {
		match len {
			0 => {},
			1 => self.write(INDEX),
			2 => self.write(DOUBLE),
			3 => self.write(TRIPLE),
			4 => self.write(QUAD),
			5..RUN_MAX => self.write(RUN + len as u8 - 5),
			RUN_MAX..LONG_RUN_MIN => { //encode two short runs (or short run then double or whatever)
				self.encode_run(RUN_MAX - 1);
				self.encode_run(len - RUN_MAX + 1);
			},
			_ => { //too large for two short runs
				self.write(UTIL);
				if len < LONG_RUN_MAX {
					self.write((len - LONG_RUN_MIN) as u8);
				} else if len < LONG_RUN_MAX + 256 {
					self.write(INDEX_MAX as u8 - 2);
					self.write((len - LONG_RUN_MAX) as u8);
				} else {
					self.write(INDEX_MAX as u8 - 1);
					// 16-bit little-endian extension; whatever does not fit is
					// encoded as a further run.
					let extra = len - LONG_RUN_MAX - 256;
					let l = extra.min(65535) as u16;
					let [lo, hi] = l.to_le_bytes();
					self.write(lo);
					self.write(hi);
					self.encode_run(extra - l as usize);
				}
			},
		}
	}

	/// Tries to write the channel values as a luma-style record of `ty` bytes
	/// (`ty` must be 1, 2 or 3).
	///
	/// The red and blue values are taken relative to green and biased; the
	/// record is written only if they and the biased green value fit the
	/// limits of the chosen form. Returns `true` if a record was written and
	/// `false` (writing nothing) otherwise.
	///
	/// NOTE(review): `r`, `g`, `b` are presumably wrapping differences from the
	/// previous pixel (values above 127 are treated as negative) - confirm
	/// against the caller.
	#[inline]
	pub fn try_luma(&mut self, [r,g,b]: [u8; 3], ty: usize) -> bool {
		let ty = ty - 1;
		let gbias = [LUMA_BIAS,LUMA2_BIAS,0];
		let bias = [1,7,31];
		let threshold = [[4,LUMA_RANGE-1],[16,LUMA2_RANGE-1],[64,255]];
		let bias = if g > 127 { bias[ty] } else { bias[ty] + 1 };
		let dg = g.wrapping_add(gbias[ty]);
		let dr = r.wrapping_sub(g).wrapping_add(bias);
		let db = b.wrapping_sub(g).wrapping_add(bias);

		let fits = dr.max(db) < threshold[ty][0] && dg <= threshold[ty][1];
		if !fits {
			return false;
		}
		match ty {
			0 => self.write(LUMA + dg * 16 + dr * 4 + db),
			1 => {
				self.write(LUMA2 + dg);
				self.write(dr << 4 | db);
			},
			_ => {
				self.write(LUMA3 + (dr >> 2)); //don't use bitwise or here because LUMA3 might not be a multiple of 16
				self.write(dr << 6 | db);
				self.write(dg);
			},
		}
		true
	}

	/// Writes any pending literal pixels and returns the finished byte stream.
	#[inline(never)]
	pub fn finish(mut self) -> Vec<u8> {
		self.flush_pending();
		self.output
	}
}

/// The 12-byte file header: three magic bytes, a version byte, then the image
/// width and height as little-endian `u32`s.
pub struct MoiHeader {
	/// Three-byte format signature.
	pub magic: [u8; 3],
	/// Format version byte.
	pub version: u8,
	/// Image width, serialised as four little-endian bytes.
	pub width: u32,
	/// Image height, serialised as four little-endian bytes.
	pub height: u32,
}

impl MoiHeader {
	/// Passes the 12 header bytes to `output`, one at a time, in file order:
	/// magic, version, width, height.
	#[inline]
	pub fn encode(self, mut output: impl FnMut(u8)) {
		let width_bytes = self.width.to_le_bytes();
		let height_bytes = self.height.to_le_bytes();
		self.magic
			.iter()
			.chain(std::iter::once(&self.version))
			.chain(width_bytes.iter())
			.chain(height_bytes.iter())
			.for_each(|&byte| output(byte));
	}

	/// Rebuilds a header from its 12 serialised bytes. No validation of the
	/// magic or version is performed.
	#[inline]
	pub fn decode(input: [u8; 12]) -> Self {
		let [m0, m1, m2, version, w0, w1, w2, w3, h0, h1, h2, h3] = input;
		Self {
			magic: [m0, m1, m2],
			version,
			width: u32::from_le_bytes([w0, w1, w2, w3]),
			height: u32::from_le_bytes([h0, h1, h2, h3]),
		}
	}
}