Skip to main content

tequel/
avx2_inline.rs

1use std::arch::x86_64::*;
2
3
/// Loads 256 bits of integer data from `src` without requiring alignment.
///
/// Thin wrapper around [`_mm256_loadu_si256`].
///
/// # Safety
///
/// `src` must be valid for a 32-byte read, and the running CPU must support
/// the instruction-set extension the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn loadu(src: *const __m256i) -> __m256i {
    // SAFETY: the caller guarantees the pointer and CPU-feature requirements
    // documented above. `src` already has the pointer type the intrinsic
    // takes, so it is forwarded unchanged.
    unsafe { _mm256_loadu_si256(src) }
}
8
9
/// Stores the 256 bits held in `src` to `dest` without requiring alignment.
///
/// Thin wrapper around [`_mm256_storeu_si256`].
///
/// # Safety
///
/// `dest` must be valid for a 32-byte write, and the running CPU must support
/// the instruction-set extension the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn storeu(dest: *mut __m256i, src: __m256i) {
    // SAFETY: the caller guarantees `dest` is writable for 32 bytes and that
    // the required CPU feature is present.
    unsafe {
        _mm256_storeu_si256(dest, src);
    }
}
14
15
/// Adds `a` and `b` lane by lane, treating each vector as eight 32-bit
/// integers. Each lane wraps on overflow.
///
/// Thin wrapper around [`_mm256_add_epi32`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_add_epi32(a, b) }
}
20
21
/// Adds `a` and `b` lane by lane, treating each vector as thirty-two 8-bit
/// integers. Each lane wraps on overflow.
///
/// Thin wrapper around [`_mm256_add_epi8`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn add_i8(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_add_epi8(a, b) }
}
26
27
/// Subtracts `b` from `a` lane by lane, treating each vector as eight 32-bit
/// integers. Each lane wraps on overflow.
///
/// Thin wrapper around [`_mm256_sub_epi32`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn sub(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_sub_epi32(a, b) }
}
32
33
/// Subtracts `b` from `a` lane by lane, treating each vector as thirty-two
/// 8-bit integers. Each lane wraps on overflow.
///
/// Thin wrapper around [`_mm256_sub_epi8`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn sub_i8(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_sub_epi8(a, b) }
}
38
39
/// Computes the bitwise exclusive-or of the full 256 bits of `a` and `b`.
///
/// Thin wrapper around [`_mm256_xor_si256`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_xor_si256(a, b) }
}
44
45
/// Computes the bitwise inclusive-or of the full 256 bits of `a` and `b`.
///
/// Thin wrapper around [`_mm256_or_si256`].
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn or(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory.
    unsafe { _mm256_or_si256(a, b) }
}
50
51
/// Returns a 256-bit integer vector with every bit cleared.
///
/// Thin wrapper around [`_mm256_setzero_si256`].
///
/// # Safety
///
/// The running CPU must support the instruction-set extension the wrapped
/// intrinsic requires.
#[inline(always)]
pub unsafe fn setzero() -> __m256i {
    // SAFETY: the caller guarantees the required CPU feature is present; no
    // memory is accessed.
    unsafe { _mm256_setzero_si256() }
}
56
57
/// Broadcasts the byte `v` into all thirty-two 8-bit lanes of a vector.
///
/// Thin wrapper around [`_mm256_set1_epi8`].
///
/// # Safety
///
/// The running CPU must support the instruction-set extension the wrapped
/// intrinsic requires.
#[inline(always)]
pub unsafe fn setone_i8(v: i8) -> __m256i {
    // SAFETY: the caller guarantees the required CPU feature is present; no
    // memory is accessed.
    unsafe { _mm256_set1_epi8(v) }
}
62
/// Broadcasts the value `v` into all eight 32-bit lanes of a vector.
///
/// Thin wrapper around [`_mm256_set1_epi32`].
///
/// # Safety
///
/// The running CPU must support the instruction-set extension the wrapped
/// intrinsic requires.
#[inline(always)]
pub unsafe fn setone_i32(v: i32) -> __m256i {
    // SAFETY: the caller guarantees the required CPU feature is present; no
    // memory is accessed.
    unsafe { _mm256_set1_epi32(v) }
}
67
68
/// Shifts each of the eight 32-bit lanes of `c` left by `IMM8` bits, filling
/// the vacated low bits with zeros.
///
/// Thin wrapper around [`_mm256_slli_epi32`].
///
/// NOTE(review): despite what the name may suggest, this is a plain logical
/// shift, not a rotate. Bits shifted out of the top of a lane are discarded.
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn rota_lf<const IMM8: i32>(c: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory. The shift count is passed as an explicit const
    // generic argument.
    unsafe { _mm256_slli_epi32::<IMM8>(c) }
}
73
74
/// Shifts each of the eight 32-bit lanes of `c` right by `IMM8` bits, filling
/// the vacated high bits with zeros.
///
/// Thin wrapper around [`_mm256_srli_epi32`].
///
/// NOTE(review): despite what the name may suggest, this is a plain logical
/// shift, not a rotate. Bits shifted out of the bottom of a lane are discarded.
///
/// # Safety
///
/// The running CPU must support AVX2, which the wrapped intrinsic requires.
#[inline(always)]
pub unsafe fn rota_rg<const IMM8: i32>(c: __m256i) -> __m256i {
    // SAFETY: the caller guarantees AVX2 is available; the operation itself
    // touches no memory. The shift count is passed as an explicit const
    // generic argument.
    unsafe { _mm256_srli_epi32::<IMM8>(c) }
}
79
80
81
82#[inline(always)]
83pub unsafe fn horiz_add_avx2(v: __m256i) -> u32 {
84    let mut arr = [0u32; 8];
85    
86    unsafe { storeu(arr.as_mut_ptr() as *mut __m256i, v) };          
87
88    arr.iter().fold(0, |acc, &x| acc.wrapping_add(x))
89}