#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Multiply 16 GF(2^8) elements of `a` by the corresponding elements
/// of `b`, reducing by the polynomial whose low 8 bits are `poly`
/// (e.g. `poly = 0x1b` for the AES field 0x11b).
///
/// SIMD "Russian peasant" multiplication: for each of the 8 bits of
/// `b`, conditionally accumulate `a` into the product, then double `a`
/// in the field (byte-wise shift left, xor in `poly` on carry-out).
///
/// # Safety
/// The running CPU must support SSE2 and SSE4.1 (`_mm_blendv_epi8`).
#[inline(always)]
pub unsafe fn vmul_p8x16(mut a : __m128i, b : __m128i, poly : u8) -> __m128i {
    // 0x01 in every byte lane: the bit of `b` currently being tested.
    let mut bit = _mm_set1_epi8(1);
    // 0x80 in every byte lane: detects carry-out when doubling `a`.
    let high = _mm_slli_epi32(bit, 7);
    let poly = _mm_set1_epi8(poly as i8);
    let mut acc = _mm_setzero_si128();
    for _ in 0..8 {
        // Lanes where the current bit of `b` is set absorb `a`.
        let sel = _mm_cmpeq_epi8(_mm_and_si128(b, bit), bit);
        acc = _mm_blendv_epi8(acc, _mm_xor_si128(a, acc), sel);
        bit = _mm_slli_epi32(bit, 1);
        // Double `a` in the field: lanes that carry out of bit 7 are
        // reduced by xoring in the low bits of the polynomial.
        let carry = _mm_cmpeq_epi8(_mm_and_si128(a, high), high);
        a = _mm_add_epi8(a, a);
        a = _mm_blendv_epi8(a, _mm_xor_si128(a, poly), carry);
    }
    acc
}
/// Multiply each byte of `av` by the corresponding byte of `bv` in
/// GF(2^8) (reduction polynomial low bits `poly`), writing the
/// products to `dest`. All three slices must have equal length, which
/// must be a multiple of 16 bytes.
///
/// Loads and stores are unaligned (`lddqu`/`storeu`), so the slices
/// need no particular alignment.
///
/// # Panics
/// Panics if the buffer length is not a multiple of 16.
///
/// # Safety
/// The running CPU must support SSE2 and SSE4.1.
pub unsafe fn vmul_p8_buffer(dest : &mut [u8], av : &[u8], bv : &[u8], poly : u8)
{
    debug_assert_eq!(av.len(), bv.len());
    debug_assert_eq!(bv.len(), dest.len());
    let bytes = av.len();
    if bytes & 15 != 0 {
        panic!("Buffer length not a multiple of 16");
    }
    let mut times = bytes >> 4;
    // Use the cfg-selected `__m128i` (std::arch::x86 or x86_64) rather
    // than hard-coding the x86_64 path, so the 32-bit x86 target also
    // compiles.
    let mut dest = dest.as_mut_ptr() as *mut __m128i;
    let mut av = av.as_ptr() as *const __m128i;
    let mut bv = bv.as_ptr() as *const __m128i;
    while times > 0 {
        times -= 1;
        let a = _mm_lddqu_si128(av);
        let b = _mm_lddqu_si128(bv);
        av = av.offset(1);
        bv = bv.offset(1);
        let res = vmul_p8x16(a, b, poly);
        _mm_storeu_si128(dest, res);
        dest = dest.offset(1);
    }
}
/// Cube each GF(2^8) element of `a`: computes `a * a * a` in the field
/// defined by `poly`.
///
/// # Safety
/// The running CPU must support SSE2 and SSE4.1.
pub unsafe fn vector_cube_p8x16(a : __m128i, poly : u8) -> __m128i {
    vmul_p8x16(vmul_p8x16(a, a, poly), a, poly)
}
use super::{Simd,SimdMatrix};
/// A SIMD vector of 16 GF(2^8) elements over the polynomial 0x11b,
/// backed by a single 128-bit SSE register.
#[derive(Clone,Copy,Debug)]
pub struct X86u8x16Long0x11b {
// the 16 packed field elements
vec : __m128i,
}
impl X86u8x16Long0x11b {
    /// Move the register's bytes up by `bytes` positions (towards
    /// higher byte indices), zero-filling the vacated low positions.
    ///
    /// Implemented as a `pshufb` whose control mask is a 16-byte
    /// window into SHUFFLE_MASK: the 0xff entries before the identity
    /// run make `_mm_shuffle_epi8` emit zero bytes.
    unsafe fn left_shift(reg : Self, bytes : usize) -> Self {
        debug_assert!(bytes > 0);
        debug_assert!(bytes < 16);
        // SHUFFLE_MASK[16..32] is the identity shuffle 0..15.
        let no_shuffle_addr : *const u8 = SHUFFLE_MASK.as_ptr().offset(16);
        // `.sub(bytes)` replaces `.offset(bytes as isize * -1)`.
        let lsh_addr = no_shuffle_addr.sub(bytes);
        // Use the cfg-selected `__m128i` (not a hard-coded x86_64
        // path) so the 32-bit x86 target also compiles.
        let mask = _mm_lddqu_si128(lsh_addr as *const __m128i);
        Self { vec : _mm_shuffle_epi8(reg.vec, mask) }
    }

    /// Move the register's bytes down by `bytes` positions (towards
    /// byte index 0), zero-filling the vacated high positions.
    unsafe fn right_shift(reg : Self, bytes : usize) -> Self {
        debug_assert!(bytes > 0);
        debug_assert!(bytes < 16);
        let no_shuffle_addr : *const u8 = SHUFFLE_MASK.as_ptr().offset(16);
        let rsh_addr = no_shuffle_addr.add(bytes);
        let mask = _mm_lddqu_si128(rsh_addr as *const __m128i);
        Self { vec : _mm_shuffle_epi8(reg.vec, mask) }
    }

    /// OR `r0` with `r1` shifted up by `bytes` positions — splices
    /// r1's low bytes in above r0's low `bytes` bytes (assumes r0's
    /// upper bytes are zero; callers guarantee this).
    unsafe fn combine_bytes(r0 : Self, r1: Self, bytes : usize) -> Self {
        debug_assert!(bytes != 0);
        debug_assert!(bytes != 16);
        Self { vec : _mm_or_si128 (r0.vec, Self::left_shift(r1, bytes).vec) }
    }

    /// Bring the top `bytes` bytes of `r0` down to the bottom of the
    /// register (used to stash read-ahead bytes for the next read).
    unsafe fn future_bytes(r0 : Self, bytes : usize) -> Self {
        debug_assert!(bytes != 0);
        debug_assert!(bytes != 16);
        Self::right_shift(r0, 16 - bytes)
    }
}
// Manual scratch helper (invoked from the test module): loads two
// known 16-byte patterns and prints the result of `_mm_alignr_epi8`
// with shift 31, i.e. the 32-byte concatenation hi:lo shifted right
// by 31 bytes, leaving only hi's top byte in lane 0.
#[allow(dead_code)]
unsafe fn test_alignr() {
let av = [ 0u8, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15 ];
let bv = [ 16u8, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31];
let lo = _mm_lddqu_si128(av.as_ptr() as *const std::arch::x86_64::__m128i);
let hi = _mm_lddqu_si128(bv.as_ptr() as *const std::arch::x86_64::__m128i);
let c = _mm_alignr_epi8 (hi, lo, 31);
eprintln!("got c = {:?}", c);
}
impl Simd for X86u8x16Long0x11b {
// Element type: one GF(2^8) byte.
type E = u8;
// Underlying vector register type.
type V = __m128i;
const SIMD_BYTES : usize = 16;
#[inline(always)]
fn zero_element() -> Self::E { 0u8.into() }
// Field addition is xor.
#[inline(always)]
fn add_elements(a : Self::E, b : Self::E) -> Self::E { (a ^ b).into() }
#[inline(always)]
fn zero_vector() -> Self {
unsafe {
X86u8x16Long0x11b { vec :_mm_setzero_si128() }
}
}
// Unaligned 16-byte load; caller must guarantee 16 readable bytes.
unsafe fn from_ptr(ptr: *const Self::E) -> Self {
X86u8x16Long0x11b {
vec : _mm_lddqu_si128(ptr as *const std::arch::x86_64::__m128i)
}
}
// Byte-wise field multiply of two equal-length slices (length must
// be a multiple of 16; see vmul_p8_buffer). Uses the AES poly 0x1b.
fn cross_product_slices(dest: &mut [u8],
av : &[u8], bv : &[u8]) {
assert_eq!(dest.len(), av.len());
assert_eq!(dest.len(), bv.len());
unsafe {
vmul_p8_buffer(&mut dest[..], &av[..], &bv[..], 0x1b);
}
}
// Lane-wise field multiply of two registers over poly 0x11b.
fn cross_product(a : Self, b : Self) -> Self {
unsafe {
Self { vec : vmul_p8x16(a.vec, b.vec, 0x1b) }
}
}
// XOR-sum `n` consecutive bytes of the logical 32-byte buffer
// lo‖hi starting at byte offset `off` (see the tests below for the
// expected values). Also returns `hi` when the summed span reaches
// byte 16 or beyond, else `lo` — presumably the register the caller
// should continue from; confirm against callers.
unsafe fn sum_across_n(lo : Self, hi : Self, n : usize, off : usize)
-> (Self::E, Self) {
debug_assert!((off < 16) && (n > 0) && (n <= 16));
let m = if off + n >= 16 { hi } else { lo };
let mut temp = lo;
// Drop the `off` bytes below the span.
if off != 0 {
temp = Self::right_shift(temp, off);
}
// Span ends inside `lo`: shift the unwanted high bytes out.
if off + n < 16 {
let shift_amount = 16 - n;
temp = Self::left_shift(temp, shift_amount);
}
// Span spills into `hi`: align hi's contribution and xor it in.
if off + n > 16 {
let shift_amount = 32 - (off + n);
let temp_hi = Self::left_shift(hi, shift_amount);
temp = Self { vec : _mm_xor_si128(
temp.vec,
temp_hi.vec
) };
}
// Horizontal xor-fold of the 16 lanes down into byte 0.
let mut temp = temp.vec;
temp = _mm_xor_si128(temp, _mm_srli_si128(temp, 8));
temp = _mm_xor_si128(temp, _mm_srli_si128(temp, 4));
temp = _mm_xor_si128(temp, _mm_srli_si128(temp, 2));
temp = _mm_xor_si128(temp, _mm_srli_si128(temp, 1));
let extracted : u8 = (_mm_extract_epi8(temp, 0) & 255) as u8;
return (extracted, m);
}
// Stream 16 bytes from `array` treated as an endlessly repeating
// buffer of `size` bytes. `mod_index` tracks the logical position
// modulo `size`; `array_index` is the raw read offset (reads may
// run into the tail padding); `ra`/`ra_size` carry read-ahead bytes
// left over from the previous call. NOTE(review): relies on the
// backing allocation having at least 15 bytes of tail padding (as
// X86Matrix::new provides) — raw 16-byte loads can cross `size`.
unsafe fn read_next(mod_index : &mut usize,
array_index : &mut usize,
array : &[Self::E],
size : usize,
ra_size : &mut usize,
ra : &mut Self)
-> Self {
// Bytes saved from the previous read (count is *ra_size).
let reg0 = *ra;
let mut reg1 : X86u8x16Long0x11b;
let ret : X86u8x16Long0x11b;
let array_size = size;
let mods = *mod_index;
let mut new_mods = mods + 16;
debug_assert!(mods < array_size);
let addr_ptr = array.as_ptr()
.offset(*array_index as isize)
as *const std::arch::x86_64::__m128i;
reg1 = X86u8x16Long0x11b { vec :_mm_lddqu_si128(addr_ptr) };
*array_index += 16;
// Bytes of this raw read that fell past the live data even though
// the logical position has not wrapped yet.
let mut deficit = 0;
if *array_index >= array_size && new_mods < array_size {
deficit = *array_index - array_size;
}
let old_offset = *ra_size;
let will_wrap_around : bool = new_mods >= array_size;
let had_readahead : bool = old_offset != 0;
if will_wrap_around {
// Logical position wraps: splice end-of-buffer bytes with a
// fresh read from the start.
new_mods -= array_size;
let want_bytes = 16 - old_offset;
let from_new = if want_bytes < new_mods {
want_bytes
} else {
new_mods
};
let from_end = want_bytes - from_new;
if old_offset == 0 {
} else {
// Prepend the read-ahead bytes below this read's bytes.
reg1 = X86u8x16Long0x11b
::combine_bytes(reg0, reg1, old_offset);
}
let have_bytes = old_offset + from_end;
// Restart raw reads from the top of the buffer.
*array_index = 0;
if have_bytes != 16 {
// Still short: read another 16 bytes from the start and
// splice in the missing portion.
let missing = 16 - old_offset - from_end;
let addr_ptr = array
.as_ptr()
.offset(*array_index as isize)
as *const std::arch::x86_64::__m128i;
let new = X86u8x16Long0x11b {
vec : _mm_lddqu_si128(addr_ptr) };
*array_index += 16;
if have_bytes == 0 {
reg1 = new
} else {
reg1 = X86u8x16Long0x11b
::combine_bytes(reg1, new, have_bytes);
}
// Unconsumed top bytes of the new read become read-ahead.
let future_bytes = 16 - missing;
if future_bytes != 0 {
*ra = X86u8x16Long0x11b::future_bytes(new, future_bytes);
}
*ra_size = future_bytes;
} else {
*ra_size = 0
}
*mod_index = new_mods;
ret = reg1
} else {
if had_readahead {
// No wrap, but stored bytes must be prepended; the top
// `old_offset` bytes of this read roll into the next call.
ret = X86u8x16Long0x11b::combine_bytes(reg0, reg1, old_offset);
let future_bytes;
if deficit != 0 {
// Part of the raw read was padding: restart raw reads
// and keep only the valid read-ahead bytes.
future_bytes = old_offset - deficit;
*array_index = 0;
} else {
future_bytes = old_offset;
}
*ra_size = future_bytes;
reg1 = X86u8x16Long0x11b::future_bytes(reg1, old_offset);
} else {
// Fast path: aligned streaming, return the raw read as-is.
ret = reg1;
}
*mod_index += 16;
debug_assert!(*mod_index < array_size);
*ra = reg1;
}
ret
}
}
/// Matrix of GF(2^8) elements stored in a flat byte vector
/// (`new` allocates 15 extra tail bytes so 16-byte SIMD reads near
/// the end stay within the allocation).
pub struct X86Matrix<S : Simd> {
// pins the Simd implementation type; not otherwise read
_zero: S,
rows : usize,
cols : usize,
pub array : Vec<u8>,
// true when elements are laid out row-by-row
is_rowwise : bool,
}
impl X86Matrix<X86u8x16Long0x11b> {
    /// Construct a zero-filled rows x cols matrix. The backing vector
    /// carries 15 bytes of tail padding so that full 16-byte SIMD
    /// reads near the end remain in bounds.
    ///
    /// # Panics
    /// Panics when rows * cols < 16.
    pub fn new(rows : usize, cols : usize, is_rowwise : bool) -> Self {
        let size = rows * cols;
        if size < 16 {
            panic!("This matrix can't handle rows * cols < 16 bytes");
        }
        Self {
            rows,
            cols,
            is_rowwise,
            array : vec![0u8; size + 15],
            _zero : X86u8x16Long0x11b::zero_vector(),
        }
    }

    /// Copy exactly rows * cols bytes of `data` into the matrix.
    ///
    /// # Panics
    /// Panics when `data.len()` differs from the matrix size.
    pub fn fill(&mut self, data : &[u8]) {
        let size = self.size();
        if data.len() != size {
            panic!("Supplied {} data bytes != matrix size {}",
                   data.len(), size);
        }
        self.array[0..size].copy_from_slice(data);
    }

    /// Convenience constructor: `new` followed by `fill`.
    pub fn new_with_data(rows : usize, cols : usize, is_rowwise : bool,
                         data : &[u8]) -> Self {
        let mut mat = Self::new(rows, cols, is_rowwise);
        mat.fill(data);
        mat
    }
}
// 48-byte pshufb control table. The middle 16 bytes (offsets 16..32)
// are the identity shuffle 0..15; the 16 bytes of 0xff on either side
// make _mm_shuffle_epi8 emit zero (high bit set). A 16-byte window
// taken at offset 16 - n or 16 + n therefore acts as a byte-shift
// mask (see left_shift / right_shift).
const SHUFFLE_MASK : [u8; 48] = [
255u8, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255,
];
impl SimdMatrix<X86u8x16Long0x11b> for X86Matrix<X86u8x16Long0x11b> {
    /// Number of rows.
    #[inline(always)]
    fn rows(&self) -> usize { self.rows }

    /// Number of columns.
    #[inline(always)]
    fn cols(&self) -> usize { self.cols }

    /// Whether elements are laid out row-by-row.
    #[inline(always)]
    fn is_rowwise(&self) -> bool { self.is_rowwise }

    /// The rows * cols live bytes, excluding the padding tail.
    fn as_slice(&self) -> &[u8] {
        &self.array[..self.size()]
    }

    /// Store one element at a flat index in the backing array.
    #[inline(always)]
    fn indexed_write(&mut self, index : usize, elem : u8) {
        self.array[index] = elem;
    }

    /// Mutable view of the live bytes, excluding the padding tail.
    fn as_mut_slice(&mut self) -> &mut [u8] {
        let len = self.size();
        &mut self.array[..len]
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
use std::arch::x86_64::*;
// Known products in the 0x11b field, lane by lane; includes reducing
// cases such as 0x80 * 0x02 = 0x1b and the inverses 0x53 * 0xca = 1.
#[test]
fn test_vmul_p8x16() {
unsafe {
#[allow(overflowing_literals)]
let b : __m128i = _mm_set_epi32( 0xca000202, 0x000053ca, 0x00000000, 0x00000102 );
#[allow(overflowing_literals)]
let a : __m128i = _mm_set_epi32( 0x53004080, 0x0000ca53, 0x00000000, 0x00000201 );
let c : __m128i = _mm_set_epi32( 0x0100801b, 0x00000101, 0x00000000, 0x00000202 );
let result = vmul_p8x16(a, b, 0x1b);
assert_eq!(format!("{:?}", c), format!("{:?}", result))
}
}
// Whole-buffer multiply: 0x53 * 0xca = 1 in every byte.
#[test]
fn test_vmul_p8_buffer() {
unsafe {
let a = [0x53u8; 160];
let b = [0xcau8; 160];
let mut d = [0x00u8; 160];
let i = [0x01u8; 160];
vmul_p8_buffer(&mut d[..], &a, &b, 0x1b);
assert_eq!(format!("{:?}", d), format!("{:?}", i))
}
}
// Source slices start at an odd address: lddqu tolerates it.
#[test]
fn test_vmul_p8_buffer_unaligned_read() {
unsafe {
let a = [0x53u8; 161];
let b = [0xcau8; 161];
let mut d = [0x00u8; 160];
let i = [0x01u8; 160];
vmul_p8_buffer(&mut d[..], &a[1..], &b[1..], 0x1b);
assert_eq!(format!("{:?}", d), format!("{:?}", i))
}
}
// Destination slice starts at an odd address: storeu tolerates it.
#[test]
fn test_vmul_p8_buffer_unaligned_write() {
unsafe {
let a = [0x53u8; 160];
let b = [0xcau8; 160];
let mut d = [0u8; 161];
let i = [0x01u8; 160];
vmul_p8_buffer(&mut d[1..], &a[..], &b[..], 0x1b);
assert_eq!(format!("{:?}", &d[1..161]), format!("{:?}", &i[0..160]))
}
}
// Just exercises the eprintln scratch helper above.
#[test]
fn test_alignr_shr() {
unsafe { test_alignr() };
}
// xor-sums of byte spans across the lo‖hi 32-byte window, off = 0
// except where noted.
#[test]
fn test_sum_across_n() {
let av = [ 0u8, 1, 2, 4, 8, 16, 32, 64,
128, 0, 1, 2, 4, 8, 16, 32, ];
let bv = [ 1u8, 2, 4, 8, 16, 32, 64, 128,
0, 1, 2, 4, 8, 16, 32, 64,];
unsafe {
let lo = _mm_lddqu_si128(av.as_ptr() as *const std::arch::x86_64::__m128i);
let hi = _mm_lddqu_si128(bv.as_ptr() as *const std::arch::x86_64::__m128i);
let lo = X86u8x16Long0x11b { vec : lo };
let hi = X86u8x16Long0x11b { vec : hi };
let (sum,_new_m) = X86u8x16Long0x11b::sum_across_n(lo, hi, 16, 0);
let expect : u8 = 0b0111_1111 ^ 0b1011_1111;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
let (sum,_new_m) = X86u8x16Long0x11b::sum_across_n(lo, hi, 8, 0);
assert_eq!(sum, 0b0111_1111);
let (sum,_new_m) = X86u8x16Long0x11b::sum_across_n(lo, hi, 16, 1);
let expect : u8 = 0b1111_1111 ^ 0b0011_1110;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
let (sum,_new_m) = X86u8x16Long0x11b::sum_across_n(lo, hi, 1, 0);
assert_eq!(sum, 0b0000_0000);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 2, 0);
assert_eq!(sum, 0b0000_0001);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 3, 0);
assert_eq!(sum, 0b0000_0011);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 4, 0);
assert_eq!(sum, 0b0000_0111);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 5, 0);
assert_eq!(sum, 0b0000_1111);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 6, 0);
assert_eq!(sum, 0b0001_1111);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 7, 0);
assert_eq!(sum, 0b0011_1111);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 15, 0);
let expect : u8 = 0b0111_1111 ^ 0b1001_1111;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
}
}
// Same as above but with nonzero starting offsets.
#[test]
fn test_new_sum_across_n() {
let av = [ 0u8, 1, 2, 4, 8, 16, 32, 64,
128, 0, 1, 2, 4, 8, 16, 32, ];
let bv = [ 1u8, 2, 4, 8, 16, 32, 64, 128,
0, 1, 2, 4, 8, 16, 32, 64,];
unsafe {
let lo = _mm_lddqu_si128(
av.as_ptr() as *const std::arch::x86_64::__m128i);
let hi = _mm_lddqu_si128(
bv.as_ptr() as *const std::arch::x86_64::__m128i);
let lo = X86u8x16Long0x11b { vec : lo };
let hi = X86u8x16Long0x11b { vec : hi };
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 16, 3);
let expect : u8 = 0b1111_1101 ^ 0b0011_1001;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 1, 3);
let expect : u8 = 4;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
let (sum,_new_m)
= X86u8x16Long0x11b::sum_across_n(lo, hi, 2, 3);
let expect : u8 = 4 + 8;
eprintln!("expect {:x}", expect);
assert_eq!(sum, expect);
}
}
// 3 * 5 = 15 bytes < 16: constructor must panic.
#[test]
#[should_panic]
fn test_matrix_too_small() {
let _ = X86Matrix::<X86u8x16Long0x11b>::new(3, 5, true);
}
// Exactly 16 bytes in various shapes: all accepted.
#[test]
fn test_matrix_goldilocks() {
let _ = X86Matrix::<X86u8x16Long0x11b>::new(2, 8, true);
let _ = X86Matrix::<X86u8x16Long0x11b>::new(8, 2, true);
let _ = X86Matrix::<X86u8x16Long0x11b>::new(16, 1, true);
let _ = X86Matrix::<X86u8x16Long0x11b>::new(4, 4, true);
}
// A freshly constructed matrix streams zeros.
#[test]
fn test_matrix_read_pre_fill() {
let mat = X86Matrix::<X86u8x16Long0x11b>::new(4, 4, true);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
unsafe {
let zero : __m128i = _mm_set_epi32( 0, 0, 0, 0 );
let first_read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:?}",zero), format!("{:?}",first_read.vec))
}
}
// After fill(), the first read returns the filled bytes verbatim.
#[test]
fn test_matrix_read_post_fill() {
let mut mat = X86Matrix::<X86u8x16Long0x11b>::new(4, 4, true);
let identity = [ 1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1 ];
mat.fill(&identity[..]);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
unsafe {
let one : __m128i = _mm_set_epi32(
0x01000000, 0x00010000,
0x00000100,
0x00000001
);
let array_ptr = identity
.as_ptr() as *const std::arch::x86_64::__m128i;
let id_reg = _mm_lddqu_si128(array_ptr);
assert_eq!(format!("{:?}",one), format!("{:?}",id_reg));
let first_read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:?}",one),
format!("{:?}",first_read.vec));
let mut scratch = [0u8; 16];
let scratch_ptr = scratch
.as_mut_ptr() as *mut std::arch::x86_64::__m128i;
_mm_storeu_si128(scratch_ptr,first_read.vec);
assert_eq!(scratch, identity);
}
}
// 16-byte matrix read repeatedly: wrap-around lands exactly on the
// buffer boundary every time.
#[test]
fn test_matrix_easy_wraparound() {
let mut mat = X86Matrix::<X86u8x16Long0x11b>::new(4, 4, true);
let identity = [ 1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1 ];
mat.fill(&identity[..]);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
unsafe {
let one : __m128i = _mm_set_epi32(
0x01000000, 0x00010000,
0x00000100,
0x00000001
);
let array_ptr = identity.as_ptr() as *const std::arch::x86_64::__m128i;
let id_reg = _mm_lddqu_si128(array_ptr);
assert_eq!(format!("{:x?}",one), format!("{:x?}",id_reg));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",one), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",one), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",one), format!("{:x?}",read.vec));
}
}
// 48-byte matrix (three 16-byte chunks): reads cycle through the
// chunks and wrap back to the first.
#[test]
fn test_matrix_internal_read() {
let mut mat = X86Matrix::<X86u8x16Long0x11b>::new(16, 3, true);
mat.fill(&SHUFFLE_MASK[..]);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
unsafe {
let array_ptr = SHUFFLE_MASK.as_ptr();
let ff1_addr = array_ptr.offset( 0) as *const std::arch::x86_64::__m128i;
let inc_addr = array_ptr.offset(16) as *const std::arch::x86_64::__m128i;
let ff2_addr = array_ptr.offset(32) as *const std::arch::x86_64::__m128i;
let ff1 = _mm_lddqu_si128(ff1_addr);
let inc = _mm_lddqu_si128(inc_addr);
let ff2 = _mm_lddqu_si128(ff2_addr);
assert_eq!(format!("{:x?}",ff1), format!("{:x?}",ff2));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",ff1), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",inc), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",ff1), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",ff1), format!("{:x?}",read.vec));
let read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",inc), format!("{:x?}",read.vec));
}
}
// 21-byte matrix: 21 is coprime to 16, so every possible read offset
// within the buffer is exercised over 21*16 reads (done twice, +1).
#[test]
fn test_matrix_changing_read_offset() {
let stream = [0u8,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
0u8,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20];
let mut mat = X86Matrix::<X86u8x16Long0x11b>::new(7, 3, true);
mat.fill(&stream[0..21]);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
let array_ptr = stream.as_ptr();
let mut index = 0;
unsafe {
for _ in 0..21*16 * 2 + 1 {
let addr = array_ptr.offset(index)
as *const std::arch::x86_64::__m128i;
let expect = _mm_lddqu_si128(addr);
index += 16;
if index >= 21 { index -= 21 }
let mat_read = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
assert_eq!(format!("{:x?}",mat_read.vec),
format!("{:x?}",expect));
}
}
}
// Exhaustive sweep of matrix shapes: compare read_next() against a
// plain cycling reference iterator, accumulating (not aborting on)
// mismatches so every failing shape is reported.
#[test]
fn test_read_next_cycles() {
let mut errors = 0;
for rows in 4..18 {
for cols in 4..23 {
let size = rows * cols;
let mut mat = X86Matrix::<X86u8x16Long0x11b>
::new(rows, cols, true);
let fill_list = (1u8..=255).cycle().take(size);
let fill_vec : Vec<u8> = fill_list.collect();
eprintln!("filling matrix with {} bytes", fill_vec.len());
mat.fill(&fill_vec[..]);
let mut mat_mod_index = 0;
let mut mat_array_index = 0;
let mat_array = mat.as_slice();
let mat_size = mat.size();
let mut mat_ra_size = 0;
let mut mat_ra = X86u8x16Long0x11b::zero_vector();
let mut ref_list = (1u8..=255).cycle().take(size).cycle();
let mut ref_vec = [0u8; 16];
for i in 0 .. size {
unsafe {
let from_mat = X86u8x16Long0x11b::read_next(
&mut mat_mod_index,
&mut mat_array_index,
mat_array,
mat_size,
&mut mat_ra_size,
&mut mat_ra);
for i in 0..16 {
ref_vec[i] = ref_list.next().unwrap();
}
let addr = ref_vec.as_ptr()
as *const std::arch::x86_64::__m128i;
let expect = _mm_lddqu_si128(addr);
let fmt_ref = format!("{:x?}",expect);
let fmt_mat = format!("{:x?}",from_mat.vec);
if fmt_mat != fmt_ref {
eprintln!("read_next() failed");
eprintln!("Matrix {} rows x {} columns ",
rows, cols);
eprintln!("Got {} != ref {} at position {}",
fmt_mat, fmt_ref, i);
errors += 1;
}
}
}
}
}
if errors > 0 {
panic!("Failing test: {} errors", errors);
}
}
}