#[cfg(target_arch = "arm")]
use core::arch::arm::*;
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
use std::mem::transmute;
use crate::*;
/// SIMD engine wrapping one 64-bit NEON vector of eight `u8` lanes.
///
/// Its `Simd` implementation multiplies lanes with
/// `simd_mull_reduce_poly8x8` and adds elements with XOR.
#[derive(Debug,Copy,Clone)]
pub struct VmullEngine8x8 {
/// The wrapped eight-lane vector.
vec : uint8x8_t,
}
impl VmullEngine8x8 {
#[inline(always)]
unsafe fn read_simd(ptr: *const u8) -> Self {
vld1_p8(ptr).into()
}
/// XORs all eight lanes of `v` together and returns the one-byte result.
unsafe fn xor_across(v : Self) -> u8 {
    // View the vector as a single 64-bit word and fold it onto itself three
    // times (by 32, 16, then 8 bits); afterwards the lowest byte holds the
    // XOR of all eight original bytes.
    let whole: uint64x1_t = vreinterpret_u64_u8(v.vec);
    let halves = veor_u64(whole, vshr_n_u64::<32>(whole));
    let quarters = veor_u64(halves, vshr_n_u64::<16>(halves));
    let folded = veor_u64(quarters, vshr_n_u64::<8>(quarters));
    vget_lane_u8::<0>(vreinterpret_u8_u64(folded))
}
unsafe fn rotate_right(v : Self, amount : usize) -> Self {
let mut mask = transmute( [0u8,1,2,3,4,5,6,7] ); let add_amount = vmov_n_u8(amount as u8);
let range_mask = vmov_n_u8(0x07);
mask = vadd_u8(mask, add_amount);
mask = vand_u8(mask, range_mask);
vtbl1_u8(v.vec, mask).into()
}
/// Rotates the lanes of `v` by `amount` positions away from lane 0: the
/// lane at index `i` ends up at index `i + amount` (modulo 8).
///
/// Any `amount` is accepted; only its value modulo 8 matters, matching
/// `rotate_right`.
unsafe fn rotate_left(v : Self, amount : usize) -> Self {
    // A left rotation by `k` lanes equals a right rotation by `8 - k`.
    // Reducing `amount` modulo 8 first keeps the subtraction from
    // underflowing (and panicking under overflow checks) when `amount > 8`.
    let right_amount = (8 - (amount & 7)) & 7;
    Self::rotate_right(v, right_amount)
}
unsafe fn shift(v : Self, amount : isize) -> Self {
let mut mask = transmute( [0u8,1,2,3,4,5,6,7] ); let add_amount = vmov_n_s8(amount as i8);
mask = vadd_s8(mask, add_amount);
vreinterpret_u8_s8(vtbl1_s8(vreinterpret_s8_u8(v.vec), mask))
.into()
}
/// Moves lanes `amount` positions towards lane 7, zero-filling from lane 0.
unsafe fn shift_left(v : Self, amount : usize) -> Self {
    let towards_high = -(amount as isize);
    Self::shift(v, towards_high)
}
/// Moves lanes `amount` positions towards lane 0, zero-filling from lane 7.
unsafe fn shift_right(v : Self, amount : usize) -> Self {
    let towards_low = amount as isize;
    Self::shift(v, towards_low)
}
unsafe fn extract_from_offset(lo: &Self, hi : &Self, offset : usize)
-> Self {
debug_assert!(offset < 8);
let tbl2 = uint8x8x2_t ( lo.vec, hi.vec );
let mut mask = transmute( [0u8,1,2,3,4,5,6,7] ); let add_amount = vmov_n_u8(offset as u8);
mask = vadd_u8(mask, add_amount);
vtbl2_u8(tbl2, mask).into()
}
unsafe fn splat(elem : u8) -> Self {
vmov_n_u8(elem).into()
}
unsafe fn mask_start_elements(v : Self, count : usize) -> Self {
debug_assert!(count > 0);
let mask = Self::shift_right(Self::splat(0xff),
(8usize - count).into());
vand_u8(v.vec, mask.vec).into()
}
unsafe fn mask_end_elements(v : Self, count : usize) -> Self {
debug_assert!(count > 0);
let mask = Self::shift_left(Self::splat(0xff),
(8usize - count).into());
vand_u8(v.vec, mask.vec).into()
}
/// Reads one full vector at `read_ptr`, unless that read would extend past
/// `beyond`.
///
/// Returns `None` when `read_ptr + Self::SIMD_BYTES` lies after `beyond`,
/// otherwise `Some` of the vector loaded from `read_ptr`.
///
/// # Safety
/// Whenever the result is `Some`, `read_ptr` must be valid for reading
/// `Self::SIMD_BYTES` bytes.
unsafe fn non_wrapping_read(read_ptr : *const u8,
                            beyond : *const u8
) -> Option<Self> {
    // `wrapping_add` instead of `offset`: in the `None` case the end of the
    // would-be read can fall outside the underlying buffer, and `offset`
    // must never be used to form such a pointer. The address is only
    // compared, never dereferenced.
    let read_end = read_ptr.wrapping_add(Self::SIMD_BYTES);
    if read_end > beyond {
        None
    } else {
        Some(Self::read_simd(read_ptr))
    }
}
/// Reads a vector at `read_ptr`, wrapping around to `restart` when the
/// 8-byte read runs past `beyond`.
///
/// Returns the eight bytes as they appear when the buffer is treated as
/// circular. When a wrap happened, the second element is the vector read
/// at `restart` with the lanes already used in the first result zeroed;
/// otherwise it is `None`.
unsafe fn wrapping_read(read_ptr : *const u8,
                        beyond : *const u8,
                        restart : *const u8
) -> (Self, Option<Self>) {
    // Number of bytes by which an 8-byte read at `read_ptr` overshoots
    // `beyond`.
    let read_end = read_ptr.offset(Self::SIMD_BYTES as isize);
    let missing: isize = read_end.offset_from(beyond);
    debug_assert!(missing >= 0);

    let tail = Self::read_simd(read_ptr);
    if missing == 0 {
        return (tail, None);
    }

    let head = Self::read_simd(restart);
    // Move the valid tail bytes to the top of the vector so the two-vector
    // extract below yields them followed by the first bytes of `head`.
    let tail = Self::shift_left(tail, missing as usize);
    let combined = Self::extract_from_offset(&tail, &head, 8 - (missing as usize));
    let unread = Self::mask_end_elements(head, 8 - (missing as usize));
    (combined, Some(unread))
}
}
impl From<uint8x8_t> for VmullEngine8x8 {
fn from(other : uint8x8_t) -> Self {
Self { vec : other }
}
}
/// Wraps a polynomial vector by reinterpreting its lanes as `u8`; the bit
/// pattern is unchanged.
impl From<poly8x8_t> for VmullEngine8x8 {
    fn from(other : poly8x8_t) -> Self {
        let vec = unsafe { vreinterpret_u8_p8(other) };
        Self { vec }
    }
}
impl Simd for VmullEngine8x8 {
/// Underlying vector type: eight `u8` lanes.
type V = uint8x8_t;
/// Type of a single element (one lane).
type E = u8;
/// Number of bytes held by one vector.
const SIMD_BYTES : usize = 8;
/// Returns the zero element, the identity for `add_elements`.
#[inline(always)]
fn zero_element() -> Self::E {
    0u8
}
/// Adds two elements; addition here is bitwise XOR.
#[inline(always)]
fn add_elements(a : Self::E, b : Self::E) -> Self::E {
    a ^ b
}
#[inline(always)]
fn zero_vector() -> Self {
unsafe { vmov_n_u8(0).into() }
}
fn cross_product(a : Self, b : Self) -> Self {
unsafe {
simd_mull_reduce_poly8x8(&vreinterpret_p8_u8(a.vec),
&vreinterpret_p8_u8(b.vec)).into()
}
}
/// Loads one vector's worth of elements starting at `ptr`.
///
/// # Safety
/// `ptr` must be valid for reading `Self::SIMD_BYTES` bytes.
unsafe fn from_ptr(ptr: *const Self::E) -> Self {
    let start: *const u8 = ptr;
    Self::read_simd(start)
}
fn cross_product_slices(dest: &mut [Self::E],
av : &[Self::E], bv : &[Self::E]) {
debug_assert_eq!(av.len(), bv.len());
debug_assert_eq!(bv.len(), dest.len());
let bytes = av.len();
if bytes & 7 != 0 {
panic!("Buffer length not a multiple of 8");
}
let mut times = bytes >> 3;
let mut dest = dest.as_mut_ptr();
let mut av = av.as_ptr();
let mut bv = bv.as_ptr();
while times > 0 {
times -= 1;
let a : Self;
let b : Self;
let res : Self;
unsafe {
a = Self::read_simd(av); b = Self::read_simd(bv); av = av.offset(1); bv = bv.offset(1);
}
res = Self::cross_product(a, b);
unsafe {
vst1_u8(dest, res.vec);
dest = dest.offset(1);
}
}
}
/// Returns the next eight bytes of `array[..size]`, treating it as a
/// circular buffer, and updates the caller-held cursor state.
///
/// * `mod_index` - advanced by 8, modulo `size`, on every call.
/// * `array_index` - offset in `array` of the next vector load; reset when
///   the load reaches or passes `size`.
/// * `array` - the data. A full 8-byte vector is loaded at `array_index`
///   even when fewer than 8 bytes remain before `size`, so the storage
///   behind `array` must stay readable for up to 7 bytes past `size`.
/// * `size` - number of meaningful bytes in `array`.
/// * `ra_size` / `ra` - read-ahead carried between calls: the last
///   `ra_size` lanes of `ra` are bytes that were loaded but not yet
///   returned.
///
/// # Safety
/// `array_index` must be below `size`, and the loads described above must
/// stay within readable memory.
#[inline(always)]
unsafe fn read_next(mod_index : &mut usize,
array_index : &mut usize,
array : &[Self::E],
size : usize,
ra_size : &mut usize,
ra : &mut Self)
-> Self {
let mut new_ra : Self; let mut new_mod_index = *mod_index;
let mut new_ra_size = *ra_size;
// Bytes left between the load position and `size`, and the total number of
// bytes on hand once the read-ahead is included.
let available_at_end = size - *array_index;
let available = *ra_size + available_at_end;
debug_assert!(available_at_end > 0);
let read_ptr = array.as_ptr().offset((*array_index) as isize);
let mut r0 : Self = Self::read_simd(read_ptr as *const u8).into();
*array_index += 8;
let result;
// NOTE(review): `have_r1`, `array_bool` and `avail_bool` are never read.
let mut have_r1 = false;
let mut r1 = r0;
let array_bool = *array_index >= size;
let avail_bool = available_at_end <= 8;
if *array_index >= size {
// This load reached or passed the end of the data.
if available < 8 {
// Not enough bytes on hand for a full result: top up from the start of
// the array. `available` lanes of that fresh vector stay unread.
let read_ptr = array.as_ptr().offset(0);
r1 = Self::read_simd(read_ptr as *const u8);
*array_index = 8;
new_ra_size = available;
if *ra_size > 0 {
// Join read-ahead and tail bytes, move them to the top of the vector,
// then append the leading bytes of `r1`.
r0 = Self::extract_from_offset(&ra, &r0, 8 - *ra_size);
r0 = Self::shift_left(r0, 8 - available);
result = Self::extract_from_offset(&r0, &r1, 8 - available);
new_ra = r1;
} else {
// Same as above, without read-ahead to merge in first.
r0 = Self::shift_left(r0, 8 - available);
result = Self::extract_from_offset(&r0, &r1, 8 - available);
new_ra = r1;
}
} else {
// Read-ahead plus tail bytes fill a whole result; whatever is left over
// becomes the new read-ahead, moved to the top lanes of the vector.
new_ra_size = available - 8;
if new_ra_size > 0 {
new_ra = Self::shift_left(r0,8 - available_at_end);
} else {
new_ra = r0; }
if *ra_size > 0 {
result = Self::extract_from_offset(&ra, &r0, 8 - *ra_size);
} else {
result = r0;
}
}
} else {
// Load lies entirely inside the data: emit the read-ahead lanes followed
// by the leading lanes of `r0`; `r0` becomes the new read-ahead and
// `ra_size` is unchanged.
if *ra_size > 0 {
result = Self::extract_from_offset(&ra, &r0, 8 - *ra_size);
} else {
result = r0;
}
new_ra = r0;
}
*ra_size = new_ra_size;
*ra = new_ra;
new_mod_index += 8;
if new_mod_index >= size { new_mod_index -= size }
*mod_index = new_mod_index;
// A load position at or past `size` restarts from the beginning.
if *array_index >= size {
*array_index = 0;
}
return result;
}
/// XORs together `n` consecutive bytes that start `off` bytes into the
/// concatenation of `lo` and `hi`.
///
/// Returns that sum together with `hi` when `off + n` is 8 or more, and
/// with `lo` otherwise.
unsafe fn sum_across_n(lo : Self, hi : Self, n : usize, off : usize)
    -> (Self::E, Self) {
    let carried = if off + n >= 8 { hi } else { lo };
    // Line the wanted bytes up at lane 0, blank everything after the first
    // `n` lanes, then fold the lanes together.
    let window = Self::extract_from_offset(&lo, &hi, off);
    let kept = Self::mask_start_elements(window, n);
    let total = Self::xor_across(kept);
    (total, carried)
}
}
pub fn simd_mull_reduce_poly8x8(a : &poly8x8_t, b: &poly8x8_t)
-> poly8x8_t {
unsafe {
let mut working : poly16x8_t = vmull_p8(*a, *b);
let mut top_nibble : uint16x8_t = vshrq_n_u16 (vreinterpretq_u16_p16(working), 12);
let tbl_1 : uint8x8_t = transmute([0x00u8, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, ]);
let tbl_2 : uint8x8_t = transmute([0xd8u8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99, ]);
let u4_0x11b_mod_table = uint8x8x2_t ( tbl_1, tbl_2 );
let mut reduced : uint8x8_t = vmovn_u16(top_nibble);
let mut lut : uint8x8_t = vtbl2_u8(u4_0x11b_mod_table, reduced);
let mut widened : poly16x8_t = vreinterpretq_p16_u16(vmovl_u8(lut));
widened = vreinterpretq_p16_u16(vshlq_n_u16(vreinterpretq_u16_p16(widened), 4));
working = vreinterpretq_p16_u16(veorq_u16(
vreinterpretq_u16_p16(working),
vreinterpretq_u16_p16(widened)));
top_nibble = vshlq_n_u16 (vreinterpretq_u16_p16(working), 4);
top_nibble = vshrq_n_u16 (top_nibble, 12);
reduced = vmovn_u16(top_nibble);
lut = vtbl2_u8(u4_0x11b_mod_table, reduced);
widened = vreinterpretq_p16_u16(vmovl_u8(lut));
working = vreinterpretq_p16_u16(veorq_u16(
vreinterpretq_u16_p16(working),
vreinterpretq_u16_p16(widened)));
working = vreinterpretq_p16_u16(vshlq_n_u16 (vreinterpretq_u16_p16(working), 8));
working = vreinterpretq_p16_u16(vshrq_n_u16 (vreinterpretq_u16_p16(working), 8));
let narrowed : uint8x8_t = vmovn_u16(vreinterpretq_u16_p16(working));
vreinterpret_p8_u8(narrowed)
}
}
/// Matrix of `u8` elements kept in one flat byte buffer, parameterised by
/// the SIMD engine type `S`.
pub struct ArmMatrix<S : Simd> {
/// A value of the engine type. `new` sets it to the engine's zero vector;
/// nothing in this file reads it.
_zero: S,
/// Number of rows.
rows : usize,
/// Number of columns.
cols : usize,
/// Backing storage. `new` allocates `rows * cols + 7` zeroed bytes, so the
/// buffer is 7 bytes longer than the matrix data itself.
pub array : Vec<u8>,
/// Layout flag supplied at construction and reported by `is_rowwise()`.
is_rowwise : bool,
}
impl ArmMatrix<VmullEngine8x8> {
    /// Creates a zero-filled matrix with the given dimensions and layout
    /// flag.
    ///
    /// # Panics
    /// Panics if `rows * cols` is less than 8.
    pub fn new(rows : usize, cols : usize, is_rowwise : bool) -> Self {
        let size = rows * cols;
        if size < 8 {
            panic!("This matrix can't handle rows * cols < 8 bytes");
        }
        // Seven spare bytes follow the data, presumably so that an 8-byte
        // vector load starting at any element stays inside the allocation.
        // TODO confirm against the readers of `array`.
        let padded_len = size + 7;
        Self {
            _zero: VmullEngine8x8::zero_vector(),
            rows,
            cols,
            array: vec![0u8; padded_len],
            is_rowwise,
        }
    }

    /// Copies `data` into the matrix storage.
    ///
    /// # Panics
    /// Panics if `data.len()` differs from `self.size()`.
    pub fn fill(&mut self, data : &[u8]) {
        let expected = self.size();
        let supplied = data.len();
        if supplied != expected {
            panic!("Supplied {} data bytes != matrix size {}",
                   supplied, expected);
        }
        self.array[..expected].copy_from_slice(data);
    }

    /// Creates a matrix with `new` and immediately fills it with `data`.
    ///
    /// # Panics
    /// Panics under the same conditions as `new` and `fill`.
    pub fn new_with_data(rows : usize, cols : usize, is_rowwise : bool,
                         data : &[u8]) -> Self {
        let mut matrix = Self::new(rows, cols, is_rowwise);
        matrix.fill(data);
        matrix
    }
}
impl SimdMatrix<VmullEngine8x8> for ArmMatrix<VmullEngine8x8> {
    /// Number of rows.
    #[inline(always)]
    fn rows(&self) -> usize {
        self.rows
    }

    /// Number of columns.
    #[inline(always)]
    fn cols(&self) -> usize {
        self.cols
    }

    /// Layout flag given at construction.
    #[inline(always)]
    fn is_rowwise(&self) -> bool {
        self.is_rowwise
    }

    /// Borrows the first `self.size()` bytes of the backing array, leaving
    /// out the padding that follows them.
    fn as_slice(&self) -> &[u8] {
        let len = self.size();
        &self.array[..len]
    }

    /// Stores `elem` at position `index` of the backing array.
    ///
    /// # Panics
    /// Panics if `index` is outside the backing array.
    #[inline(always)]
    fn indexed_write(&mut self, index : usize, elem : u8) {
        let slot = &mut self.array[index];
        *slot = elem;
    }

    /// Mutably borrows the first `self.size()` bytes of the backing array.
    fn as_mut_slice(&mut self) -> &mut [u8] {
        let len = self.size();
        &mut self.array[..len]
    }
}
#[cfg(test)]
mod tests {
use super::*;
use guff::{GaloisField,new_gf8};
// Checks the SIMD multiply against the scalar field implementation.
#[test]
fn test_mull_reduce_poly8x8() {
    let a_array = [0u8,10,20,30,40,50,60,70];
    let b_array = [8u8,9,10,11,12,13,14,15];
    let field = new_gf8(0x11b, 0x1b);

    let mut products = [0u8; 8];
    unsafe {
        let a : poly8x8_t = transmute(a_array);
        let b : poly8x8_t = transmute(b_array);
        vst1_p8(products.as_mut_ptr(), simd_mull_reduce_poly8x8(&a, &b));
    }

    for (i, &got) in products.iter().enumerate() {
        let expect = field.mul(a_array[i], b_array[i]);
        assert_eq!(got, expect);
    }
}
// Rotating right by one moves every lane one step towards lane 0.
#[test]
fn test_rotate_right_1() {
    let input = [1u8,10,20,30,40,50,60,70];
    let rotated = [10u8,20,30,40,50,60,70,1];
    unsafe {
        let data : uint8x8_t = transmute(input);
        let expect : uint8x8_t = transmute(rotated);
        let got = VmullEngine8x8::rotate_right(data.into(), 1);
        assert_eq!(format!("{:x?}", expect),
                   format!("{:x?}", got.vec));
    }
}
// Rotating left by one moves every lane one step towards lane 7.
#[test]
fn test_rotate_left_1() {
    let input = [1u8,10,20,30,40,50,60,70];
    let rotated = [70u8,1,10,20,30,40,50,60];
    unsafe {
        let data : uint8x8_t = transmute(input);
        let expect : uint8x8_t = transmute(rotated);
        let got = VmullEngine8x8::rotate_left(data.into(), 1);
        assert_eq!(format!("{:x?}", expect),
                   format!("{:x?}", got.vec));
    }
}
// Shifting right by one drops lane 0 and zero-fills lane 7.
#[test]
fn test_shift_right_1() {
    let input = [1u8,10,20,30,40,50,60,70];
    let shifted = [10u8,20,30,40,50,60,70,0];
    unsafe {
        let data : uint8x8_t = transmute(input);
        let expect : uint8x8_t = transmute(shifted);
        let got = VmullEngine8x8::shift_right(data.into(), 1);
        assert_eq!(format!("{:x?}", expect),
                   format!("{:x?}", got.vec));
    }
}
// Shifting left by one drops lane 7 and zero-fills lane 0.
#[test]
fn test_shift_left_1() {
    let input = [1u8,10,20,30,40,50,60,70];
    let shifted = [0u8,1,10,20,30,40,50,60];
    unsafe {
        let data : uint8x8_t = transmute(input);
        let expect : uint8x8_t = transmute(shifted);
        let got = VmullEngine8x8::shift_left(data.into(), 1);
        assert_eq!(format!("{:x?}", expect),
                   format!("{:x?}", got.vec));
    }
}
// Each input has distinct single-bit lanes, so the XOR is simply their OR.
#[test]
fn test_xor_across() {
    let cases : [([u8; 8], u8); 2] = [
        ([1, 2, 4, 8, 16, 32, 64, 128], 255),
        ([0, 1, 2, 4, 8, 16, 32, 64], 0x7f),
    ];
    for &(lanes, expect) in cases.iter() {
        unsafe {
            let data : uint8x8_t = transmute(lanes);
            let got = VmullEngine8x8::xor_across(data.into());
            assert_eq!(expect, got);
        }
    }
}
// Offset 0 must return `lo` unchanged; offset 1 pulls in the first byte of
// `hi` as the last lane.
#[test]
fn test_extract_from_offset() {
    unsafe {
        let lo : uint8x8_t = transmute([1u8, 2,4,8,16,32,64,128]);
        let hi : uint8x8_t = transmute([1u8, 2,3,4,5,6,7,8]);
        let expect_off_1 : uint8x8_t = transmute([2u8,4,8,16,32,64,128,1]);

        let at_0 = VmullEngine8x8::extract_from_offset(&lo.into(), &hi.into(), 0);
        assert_eq!(format!("{:x?}", lo),
                   format!("{:x?}", at_0.vec));

        let at_1 = VmullEngine8x8::extract_from_offset(&lo.into(), &hi.into(), 1);
        assert_eq!(format!("{:x?}", expect_off_1),
                   format!("{:x?}", at_1.vec));
    }
}
// `splat` must copy the element into every lane.
#[test]
fn test_splat() {
    let lanes = [42u8; 8];
    unsafe {
        let expect : uint8x8_t = transmute(lanes);
        let got = VmullEngine8x8::splat(42);
        assert_eq!(format!("{:x?}", expect),
                   format!("{:x?}", got.vec));
    }
}
// Keeping the first `count` lanes must zero everything after them.
#[test]
fn test_mask_start_elements() {
    let cases : [(usize, [u8; 8]); 5] = [
        (1, [42,0 ,0 ,0 , 0 ,0 ,0 ,0 ]),
        (2, [42,42,0 ,0 , 0 ,0 ,0 ,0 ]),
        (3, [42,42,42,0 , 0 ,0 ,0 ,0 ]),
        (7, [42,42,42,42, 42,42,42,0 ]),
        (8, [42,42,42,42, 42,42,42,42]),
    ];
    unsafe {
        let input : uint8x8_t = transmute([42u8; 8]);
        for &(count, lanes) in cases.iter() {
            let expect : uint8x8_t = transmute(lanes);
            let got = VmullEngine8x8::mask_start_elements(input.into(), count);
            assert_eq!(format!("{:x?}", expect),
                       format!("{:x?}", got.vec));
        }
    }
}
// Keeping the last `count` lanes must zero everything before them.
#[test]
fn test_mask_end_elements() {
    let cases : [(usize, [u8; 8]); 5] = [
        (1, [0 ,0 ,0 ,0 , 0 ,0 ,0 ,42]),
        (2, [0 ,0 ,0 ,0 , 0 ,0 ,42,42]),
        (3, [0 ,0 ,0 ,0 , 0 ,42,42,42]),
        (7, [0 ,42,42,42, 42,42,42,42]),
        (8, [42,42,42,42, 42,42,42,42]),
    ];
    unsafe {
        let input : uint8x8_t = transmute([42u8; 8]);
        for &(count, lanes) in cases.iter() {
            let expect : uint8x8_t = transmute(lanes);
            let got = VmullEngine8x8::mask_end_elements(input.into(), count);
            assert_eq!(format!("{:x?}", expect),
                       format!("{:x?}", got.vec));
        }
    }
}
// With 20 bytes of data, 8-byte reads at offsets 0 and 8 fit, while a read
// at offset 16 would run past the end and must be refused.
#[test]
fn test_non_wrapping_read() {
    unsafe {
        let vector = vec![42u8,42,42,42, 42,42,42,42,
                          42u8,42,42,42, 42,42,42,42,
                          42u8,42,42,42, 0,0,0,0,
                          0,0,0,0 ,      0,0,0,0 ];
        let pointer = vector.as_ptr();
        let beyond = pointer.offset(20);

        let _ = VmullEngine8x8::non_wrapping_read(
            pointer, beyond).unwrap();
        let _ = VmullEngine8x8::non_wrapping_read(
            pointer.offset(8), beyond).unwrap();

        if VmullEngine8x8::non_wrapping_read(pointer.offset(16), beyond).is_some() {
            panic!("Should have got back None");
        }
    }
}
// A read at offset 16 of 20 data bytes must return the last four data bytes
// followed by the first four bytes from the restart position, plus the
// restart vector with those four consumed lanes zeroed.
#[test]
fn test_wrapping_read() {
    unsafe {
        let vector = vec![1u8, 2, 3, 4,  5, 6, 7, 8,
                          42u8,42,42,42, 42,42,42,42,
                          41u8,40,39,38, 0,0,0,0,
                          0,0,0,0 ,      0,0,0,0 ];
        let pointer = vector.as_ptr();
        let beyond = pointer.offset(20);

        // The non-wrapping reader behaves as before on the same buffer.
        let _ = VmullEngine8x8::non_wrapping_read(
            pointer, beyond).unwrap();
        let _ = VmullEngine8x8::non_wrapping_read(
            pointer.offset(8), beyond).unwrap();
        if VmullEngine8x8::non_wrapping_read(pointer.offset(16), beyond).is_some() {
            panic!("Should have got back None");
        }

        let (first, next) = VmullEngine8x8
            ::wrapping_read(pointer.offset(16), beyond, pointer);

        let expect_first : uint8x8_t = transmute([41u8,40,39,38, 1,2,3,4 ]);
        let expect_next : uint8x8_t = transmute([0u8,0,0,0, 5,6,7,8 ]);
        assert_eq!(format!("{:x?}", expect_first),
                   format!("{:x?}", first.vec));
        assert_eq!(format!("{:x?}", expect_next),
                   format!("{:x?}", next.unwrap().vec));
    }
}
// Streams a 24-byte buffer (a multiple of the vector size) through
// `read_next` and checks each result against a cycling reference sequence.
#[test]
fn test_read_next_simple() {
    let mut ra = VmullEngine8x8::zero_vector();
    let mut ra_size = 0;
    let mut mod_index = 0;
    let mut array_index = 0;
    let size = 24;
    // 24 data bytes followed by padding so full-vector loads stay in bounds.
    let array = [ 0u8,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,
                  0,0,0,0,0,0,0,0];
    let mut index = 0;
    let mut check = (0u8..24).cycle();
    let mut check_vec = [0u8; 8];

    for _ in 0..42 {
        for slot in check_vec.iter_mut() {
            *slot = check.next().unwrap();
        }
        index += 8;
        unsafe {
            let got = VmullEngine8x8
                ::read_next(&mut mod_index,
                            &mut array_index,
                            &array[..],
                            size,
                            &mut ra_size,
                            &mut ra);
            assert_eq!(mod_index, index % size);
            // Take the pointer only now, after `check_vec` has been written
            // for this round; a pointer obtained before the writes must not
            // be read through afterwards.
            let v = VmullEngine8x8::read_simd(check_vec.as_ptr());
            assert_eq!(format!("{:x?}", got.vec),
                       format!("{:x?}", v.vec));
        }
    }
}
// Streams a 21-byte buffer (not a multiple of the vector size) through
// `read_next`, exercising the wrap-around and read-ahead paths, and checks
// each result against a cycling reference sequence.
#[test]
fn test_read_next() {
    let mut ra = VmullEngine8x8::zero_vector();
    let mut ra_size = 0;
    let mut mod_index = 0;
    let mut array_index = 0;
    let size = 21;
    // 21 data bytes followed by padding so full-vector loads stay in bounds.
    let array = [ 0u8,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
                  0,0,0,0,0,0,0,0];
    let mut index = 0;
    let mut check = (0u8..21).cycle();
    let mut check_vec = [0u8; 8];

    for _ in 0..42 {
        for slot in check_vec.iter_mut() {
            *slot = check.next().unwrap();
        }
        eprintln!("\nAbsolute index {}", index);
        index += 8;
        unsafe {
            let got = VmullEngine8x8
                ::read_next(&mut mod_index,
                            &mut array_index,
                            &array[..],
                            size,
                            &mut ra_size,
                            &mut ra);
            assert_eq!(mod_index, index % size);
            // Take the pointer only now, after `check_vec` has been written
            // for this round; a pointer obtained before the writes must not
            // be read through afterwards.
            let v = VmullEngine8x8::read_simd(check_vec.as_ptr());
            assert_eq!(format!("{:x?}", got.vec),
                       format!("{:x?}", v.vec));
        }
    }
}
// Summing all eight lanes of the low vector (offset 0) XORs its seven
// distinct single-bit lanes together.
#[test]
fn test_sum_across_n() {
    let lo_bytes = [ 0u8, 1, 2, 4, 8, 16, 32, 64, ];
    let hi_bytes = [ 128u8, 0, 1, 2, 4, 8, 16, 32, ];
    unsafe {
        let lo = VmullEngine8x8::read_simd(lo_bytes.as_ptr());
        let hi = VmullEngine8x8::read_simd(hi_bytes.as_ptr());
        let (sum,_new_m) = VmullEngine8x8::sum_across_n(lo, hi, 8, 0);
        let expect : u8 = 0b0111_1111;
        eprintln!("expect {:x}", expect);
        assert_eq!(sum, expect);
    }
}
// Placeholder: loads two vectors but makes no assertions yet.
#[test]
fn test_new_sum_across_n() {
    let lo_bytes = [ 0u8, 1, 2, 4, 8, 16, 32, 64, ];
    let hi_bytes = [ 128u8, 0, 1, 2, 4, 8, 16, 32, ];
    unsafe {
        let _lo = VmullEngine8x8::read_simd(lo_bytes.as_ptr());
        let _hi = VmullEngine8x8::read_simd(hi_bytes.as_ptr());
    }
}
}