use super::*;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::mem;

/// AVX2 backend for the [`Simd`] abstraction: 256-bit registers holding
/// eight `f32` (`__m256`) or eight `i32` (`__m256i`) lanes.
///
/// All methods are thin `#[inline(always)]` wrappers around the
/// corresponding `_mm256_*` intrinsics; they are `unsafe` because the
/// caller must ensure the CPU supports AVX2 (and FMA for the fused
/// multiply-add wrappers) before invoking them.
pub struct Avx2;

impl Simd for Avx2 {
    type Vi32 = __m256i;
    type Vf32 = __m256;

    /// 256-bit vectors: 8 lanes x 4 bytes.
    const WIDTH_BYTES: usize = 8 * 4;

    #[inline(always)]
    unsafe fn set_lane_ps(a: &mut Self::Vf32, value: f32, i: usize) {
        debug_assert!(i < Self::WIDTH_BYTES / 4);
        // Reinterpret the vector as a lane array. Sound: identical size,
        // and `[f32; 8]` has a weaker alignment requirement than `__m256`.
        let arr = mem::transmute::<&mut __m256, &mut [f32; 8]>(a);
        arr[i] = value;
    }

    #[inline(always)]
    unsafe fn set_lane_epi32(a: &mut Self::Vi32, value: i32, i: usize) {
        debug_assert!(i < Self::WIDTH_BYTES / 4);
        // Same size, weaker alignment — sound reinterpretation.
        let arr = mem::transmute::<&mut __m256i, &mut [i32; 8]>(a);
        arr[i] = value;
    }

    #[inline(always)]
    unsafe fn get_lane_ps(a: Self::Vf32, i: usize) -> f32 {
        debug_assert!(i < Self::WIDTH_BYTES / 4);
        let arr = mem::transmute::<__m256, [f32; 8]>(a);
        arr[i]
    }

    #[inline(always)]
    unsafe fn get_lane_epi32(a: Self::Vi32, i: usize) -> i32 {
        debug_assert!(i < Self::WIDTH_BYTES / 4);
        let arr = mem::transmute::<__m256i, [i32; 8]>(a);
        arr[i]
    }

    #[inline(always)]
    unsafe fn abs_ps(a: Self::Vf32) -> Self::Vf32 {
        // Clear the IEEE-754 sign bit of every lane.
        let b = _mm256_set1_epi32(0x7fffffff);
        _mm256_and_ps(a, _mm256_castsi256_ps(b))
    }

    #[inline(always)]
    unsafe fn add_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_add_epi32(a, b)
    }

    #[inline(always)]
    unsafe fn add_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_add_ps(a, b)
    }

    #[inline(always)]
    unsafe fn and_si(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_and_si256(a, b)
    }

    #[inline(always)]
    unsafe fn andnot_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_andnot_ps(a, b)
    }

    #[inline(always)]
    unsafe fn andnot_si(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_andnot_si256(a, b)
    }

    #[inline(always)]
    unsafe fn blendv_epi32(a: Self::Vi32, b: Self::Vi32, mask: Self::Vi32) -> Self::Vi32 {
        // No integer blendv-by-sign-bit intrinsic at 32-bit granularity;
        // route through the float domain (bit pattern is preserved by the casts).
        _mm256_castps_si256(_mm256_blendv_ps(
            _mm256_castsi256_ps(a),
            _mm256_castsi256_ps(b),
            _mm256_castsi256_ps(mask),
        ))
    }

    #[inline(always)]
    unsafe fn blendv_ps(a: Self::Vf32, b: Self::Vf32, mask: Self::Vf32) -> Self::Vf32 {
        _mm256_blendv_ps(a, b, mask)
    }

    #[inline(always)]
    unsafe fn castps_si(a: Self::Vf32) -> Self::Vi32 {
        _mm256_castps_si256(a)
    }

    #[inline(always)]
    unsafe fn castsi_ps(a: Self::Vi32) -> Self::Vf32 {
        _mm256_castsi256_ps(a)
    }

    #[inline(always)]
    unsafe fn cmpeq_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_cmpeq_epi32(a, b)
    }

    // Float comparisons use the ordered, non-signalling (_OQ) predicates:
    // NaN operands compare false and raise no exception.
    #[inline(always)]
    unsafe fn cmpge_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_cmp_ps(a, b, _CMP_GE_OQ)
    }

    #[inline(always)]
    unsafe fn cmpgt_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_cmpgt_epi32(a, b)
    }

    #[inline(always)]
    unsafe fn cmpgt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_cmp_ps(a, b, _CMP_GT_OQ)
    }

    #[inline(always)]
    unsafe fn cmplt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_cmp_ps(a, b, _CMP_LT_OQ)
    }

    #[inline(always)]
    unsafe fn cvtepi32_ps(a: Self::Vi32) -> Self::Vf32 {
        _mm256_cvtepi32_ps(a)
    }

    #[inline(always)]
    unsafe fn cvtps_epi32(a: Self::Vf32) -> Self::Vi32 {
        // Rounds to nearest (per MXCSR default), unlike a truncating `as` cast.
        _mm256_cvtps_epi32(a)
    }

    #[inline(always)]
    unsafe fn ceil_ps(a: Self::Vf32) -> Self::Vf32 {
        _mm256_ceil_ps(a)
    }

    #[inline(always)]
    unsafe fn floor_ps(a: Self::Vf32) -> Self::Vf32 {
        _mm256_floor_ps(a)
    }

    #[inline(always)]
    unsafe fn fastfloor_ps(a: Self::Vf32) -> Self::Vf32 {
        // With AVX the hardware floor is already a single instruction,
        // so the "fast" variant needs no approximation.
        _mm256_floor_ps(a)
    }

    #[inline(always)]
    unsafe fn fmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32 {
        // a * b + c, single rounding. Requires the FMA target feature.
        _mm256_fmadd_ps(a, b, c)
    }

    #[inline(always)]
    unsafe fn fnmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32 {
        // -(a * b) + c, single rounding. Requires the FMA target feature.
        _mm256_fnmadd_ps(a, b, c)
    }

    // NOTE: the gathers index `arr` with *unchecked* hardware loads; every
    // lane of `index` must be in bounds. `&arr[0]` (rather than `as_ptr()`)
    // at least panics on an empty slice.
    #[inline(always)]
    unsafe fn i32gather_epi32(arr: &[i32], index: Self::Vi32) -> Self::Vi32 {
        _mm256_i32gather_epi32(&arr[0] as *const i32, index, 4)
    }

    #[inline(always)]
    unsafe fn i32gather_ps(arr: &[f32], index: Self::Vi32) -> Self::Vf32 {
        _mm256_i32gather_ps(&arr[0] as *const f32, index, 4)
    }

    #[inline(always)]
    unsafe fn loadu_ps(a: &f32) -> Self::Vf32 {
        // `a` must be the first of at least 8 readable, contiguous f32s.
        _mm256_loadu_ps(a as *const f32)
    }

    #[inline(always)]
    unsafe fn loadu_si(a: &i32) -> Self::Vi32 {
        // Cast through raw pointers instead of transmuting to `&__m256i`:
        // `a` is only guaranteed 4-byte aligned, and materializing an
        // under-aligned reference is UB even if never dereferenced directly.
        // The unaligned-load intrinsic accepts any pointer alignment.
        // `a` must be the first of at least 8 readable, contiguous i32s.
        _mm256_loadu_si256(a as *const i32 as *const __m256i)
    }

    #[inline(always)]
    unsafe fn storeu_ps(a: &mut f32, b: Self::Vf32) {
        // `a` must be the first of at least 8 writable, contiguous f32s.
        _mm256_storeu_ps(a as *mut f32, b)
    }

    #[inline(always)]
    unsafe fn max_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_max_ps(a, b)
    }

    #[inline(always)]
    unsafe fn min_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_min_ps(a, b)
    }

    #[inline(always)]
    unsafe fn mul_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_mul_ps(a, b)
    }

    #[inline(always)]
    unsafe fn div_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_div_ps(a, b)
    }

    #[inline(always)]
    unsafe fn mullo_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        // Low 32 bits of each 32x32 lane product.
        _mm256_mullo_epi32(a, b)
    }

    #[inline(always)]
    unsafe fn or_si(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_or_si256(a, b)
    }

    #[inline(always)]
    unsafe fn round_ps(a: Self::Vf32) -> Self::Vf32 {
        // Round half to even, suppressing precision exceptions.
        _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
    }

    #[inline(always)]
    unsafe fn set1_epi32(a: i32) -> Self::Vi32 {
        _mm256_set1_epi32(a)
    }

    #[inline(always)]
    unsafe fn set1_ps(a: f32) -> Self::Vf32 {
        _mm256_set1_ps(a)
    }

    #[inline(always)]
    unsafe fn setzero_ps() -> Self::Vf32 {
        _mm256_setzero_ps()
    }

    #[inline(always)]
    unsafe fn setzero_si() -> Self::Vi32 {
        _mm256_setzero_si256()
    }

    #[inline(always)]
    unsafe fn srai_epi32(a: Self::Vi32, b: i32) -> Self::Vi32 {
        // Arithmetic (sign-extending) right shift of each lane.
        _mm256_srai_epi32(a, b)
    }

    #[inline(always)]
    unsafe fn sub_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_sub_epi32(a, b)
    }

    #[inline(always)]
    unsafe fn sub_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 {
        _mm256_sub_ps(a, b)
    }

    #[inline(always)]
    unsafe fn sqrt_ps(a: Self::Vf32) -> Self::Vf32 {
        _mm256_sqrt_ps(a)
    }

    #[inline(always)]
    unsafe fn rsqrt_ps(a: Self::Vf32) -> Self::Vf32 {
        // Hardware approximation of 1/sqrt(x) (~12-bit precision),
        // not a full-precision reciprocal square root.
        _mm256_rsqrt_ps(a)
    }

    #[inline(always)]
    unsafe fn xor_si(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
        _mm256_xor_si256(a, b)
    }
}