use super::*;

impl_op! {
    fn add<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_add_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_add_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_add_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_add(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vaddq_s64(a, b)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_add(a, b)
        }
    }
}

impl_op! {
    fn sub<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_sub_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_sub_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_sub_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_sub(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vsubq_s64(a, b)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_sub(a, b)
        }
    }
}

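// There is no 64-bit lane-wise multiply on SSE2/SSE4.1/AVX2 or NEON, so those
// backends fall back to per-lane scalar `wrapping_mul` through a transmute.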
impl_op! {
    fn mul<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let a_arr = core::mem::transmute::<__m256i, [i64; 4]>(a);
            let b_arr = core::mem::transmute::<__m256i, [i64; 4]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
                a_arr[2].wrapping_mul(b_arr[2]),
                a_arr[3].wrapping_mul(b_arr[3]),
            ];
            core::mem::transmute::<_, __m256i>(c_arr)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let a_arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<__m128i, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, __m128i>(c_arr)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let a_arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<__m128i, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, __m128i>(c_arr)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_mul(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let a_arr = core::mem::transmute::<int64x2_t, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<int64x2_t, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, int64x2_t>(c_arr)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_mul(a, b)
        }
    }
}

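// x86 gains a native 64-bit integer min only with AVX-512 and NEON has no
// `vminq_s64`, so min is built from a greater-than compare and a bitwise blend.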
impl_op! {
    fn min<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(a, b);
            _mm256_or_si256(_mm256_and_si256(mask, b), _mm256_andnot_si256(mask, a))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.min(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let mask = vreinterpretq_s64_u64(vcgtq_s64(a, b));
            let not_mask = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(mask)));
            vorrq_s64(vandq_s64(mask, b), vandq_s64(not_mask, a))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            let mask = i64x2_gt(a, b);
            v128_or(v128_and(mask, b), v128_andnot(a, mask))
        }
    }
}

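// Same approach as `min` above: compare greater-than, then blend the larger lane.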
impl_op! {
    fn max<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(a, b);
            _mm256_or_si256(_mm256_and_si256(mask, a), _mm256_andnot_si256(mask, b))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.max(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let mask = vreinterpretq_s64_u64(vcgtq_s64(a, b));
            let not_mask = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(mask)));
            vorrq_s64(vandq_s64(mask, a), vandq_s64(not_mask, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            let mask = i64x2_gt(a, b);
            v128_or(v128_and(mask, a), v128_andnot(b, mask))
        }
    }
}

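// Absolute value via the sign-mask trick on x86: `mask` is all ones for negative
// lanes and `(a ^ mask) - mask` negates exactly those lanes. NEON and WASM have
// native 64-bit abs.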
impl_op! {
    fn abs<i64> {
        for Avx2(a: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
            _mm256_sub_epi64(_mm256_xor_si256(a, mask), mask)
        }
        for Sse41(a: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(_mm_setzero_si128(), a);
            _mm_sub_epi64(_mm_xor_si128(a, mask), mask)
        }
        for Sse2(a: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(_mm_setzero_si128(), a);
            _mm_sub_epi64(_mm_xor_si128(a, mask), mask)
        }
        for Scalar(a: i64) -> i64 {
            a.abs()
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vabsq_s64(a)
        }
        for Wasm(a: v128) -> v128 {
            i64x2_abs(a)
        }
    }
}

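// Comparisons return per-lane masks: all ones (-1) for true, all zeros for false,
// including on the scalar backend.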
impl_op! {
    fn eq<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_cmpeq_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpeq_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpeq_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a == b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vceqq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_eq(a, b)
        }
    }
}

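// Not-equal is derived by inverting the equality mask on x86 and NEON.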
impl_op! {
    fn neq<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_xor_si256(eq, _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_xor_si128(eq, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_xor_si128(eq, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a != b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_s64(a, b))))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_ne(a, b)
        }
    }
}

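// x86 only provides `cmpgt`/`cmpeq` for 64-bit lanes, so less-than is computed as
// `!(a > b || a == b)`.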
impl_op! {
    fn lt<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_andnot_si256(_mm256_or_si256(gt, eq), _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_andnot_si128(_mm_or_si128(gt, eq), _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_andnot_si128(_mm_or_si128(gt, eq), _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a < b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcltq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_lt(a, b)
        }
    }
}

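// Less-than-or-equal is the complement of greater-than.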
impl_op! {
    fn lte<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            _mm256_xor_si256(gt, _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            _mm_xor_si128(gt, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            _mm_xor_si128(gt, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a <= b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcleq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_le(a, b)
        }
    }
}

impl_op! {
    fn gt<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_cmpgt_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpgt_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpgt_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a > b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcgtq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_gt(a, b)
        }
    }
}

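// Greater-than-or-equal is `(a > b) | (a == b)` on x86.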
impl_op! {
    fn gte<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_or_si256(gt, eq)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_or_si128(gt, eq)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_or_si128(gt, eq)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a >= b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcgeq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_ge(a, b)
        }
    }
}

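// `blendv_epi8` selects on the high bit of every byte, which is correct here because
// the mask is expected to be all ones or all zeros per 64-bit lane; the scalar
// backend treats any non-zero mask as "select b".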
impl_op! {
    fn blendv<i64> {
        for Avx2(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
            _mm256_blendv_epi8(a, b, mask)
        }
        for Sse41(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
            _mm_blendv_epi8(a, b, mask)
        }
        for Sse2(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Scalar(a: i64, b: i64, mask: i64) -> i64 {
            if mask == 0 {
                a
            } else {
                b
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t, mask: int64x2_t) -> int64x2_t {
            vbslq_s64(vreinterpretq_u64_s64(mask), b, a)
        }
        for Wasm(a: v128, b: v128, mask: v128) -> v128 {
            v128_or(v128_and(mask, b), v128_andnot(a, mask))
        }
    }
}

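// Left shift of every lane by the same runtime count.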
impl_op! {
    fn shl<i64> {
        for Avx2(a: __m256i, rhs: i32) -> __m256i {
            _mm256_sll_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse41(a: __m128i, b: i32) -> __m128i {
            _mm_sll_epi64(a, _mm_cvtsi32_si128(b))
        }
        for Sse2(a: __m128i, b: i32) -> __m128i {
            _mm_sll_epi64(a, _mm_cvtsi32_si128(b))
        }
        for Scalar(a: i64, b: i32) -> i64 {
            a << b
        }
        for Neon(a: int64x2_t, rhs: i32) -> int64x2_t {
            let rhs = Self::set1(rhs as i64);
            vshlq_s64(a, rhs)
        }
        for Wasm(a: v128, rhs: i32) -> v128 {
            i64x2_shl(a, rhs as u32)
        }
    }
}

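// Logical (zero-filling) right shift of every lane by the same runtime count; NEON
// expresses it as an unsigned left shift by the negated count.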
impl_op! {
    fn shr<i64> {
        for Avx2(a: __m256i, rhs: i32) -> __m256i {
            _mm256_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse41(a: __m128i, rhs: i32) -> __m128i {
            _mm_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse2(a: __m128i, rhs: i32) -> __m128i {
            _mm_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Scalar(a: i64, rhs: i32) -> i64 {
            ((a as u64) >> rhs) as i64
        }
        for Neon(a: int64x2_t, rhs: i32) -> int64x2_t {
            let rhs = Self::set1(-rhs as i64);
            vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), rhs))
        }
        for Wasm(a: v128, rhs: i32) -> v128 {
            u64x2_shr(a, rhs as u32)
        }
    }
}

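// Left shift by a compile-time constant count, forwarded to the immediate-form intrinsics.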
impl_imm8_op! {
    fn shl_const<i64, const BY: i32> {
        for Avx2(a: __m256i) -> __m256i {
            _mm256_slli_epi64(a, BY)
        }
        for Sse41(a: __m128i) -> __m128i {
            _mm_slli_epi64(a, BY)
        }
        for Sse2(a: __m128i) -> __m128i {
            _mm_slli_epi64(a, BY)
        }
        for Scalar(a: i64) -> i64 {
            a << BY
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vshlq_n_s64(a, BY)
        }
        for Wasm(a: v128) -> v128 {
            i64x2_shl(a, BY as u32)
        }
    }
}

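// Logical right shift by a compile-time constant count.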
impl_imm8_op! {
    fn shr_const<i64, const BY: i32> {
        for Avx2(a: __m256i) -> __m256i {
            _mm256_srli_epi64(a, BY)
        }
        for Sse41(a: __m128i) -> __m128i {
            _mm_srli_epi64(a, BY)
        }
        for Sse2(a: __m128i) -> __m128i {
            _mm_srli_epi64(a, BY)
        }
        for Scalar(a: i64) -> i64 {
            ((a as u64) >> BY) as i64
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), BY))
        }
        for Wasm(a: v128) -> v128 {
            u64x2_shr(a, BY as u32)
        }
    }
}

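// Numeric conversion of each i64 lane to f64. SSE/AVX2 and WASM have no vector
// i64 -> f64 conversion, so those lanes are converted one at a time; NEON uses
// `vcvtq_f64_s64` directly.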
impl_op! {
    fn cast_f64<i64> {
        for Avx2(a: __m256i) -> __m256d {
            let arr = core::mem::transmute::<__m256i, [i64; 4]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
                arr[2] as f64,
                arr[3] as f64,
            ];
            core::mem::transmute::<_, __m256d>(result)
        }
        for Sse41(a: __m128i) -> __m128d {
            let arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, __m128d>(result)
        }
        for Sse2(a: __m128i) -> __m128d {
            let arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, __m128d>(result)
        }
        for Scalar(a: i64) -> f64 {
            a as f64
        }
        for Neon(a: int64x2_t) -> float64x2_t {
            vcvtq_f64_s64(a)
        }
        for Wasm(a: v128) -> v128 {
            let arr = core::mem::transmute::<_, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, v128>(result)
        }
    }
}

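// Bitwise reinterpretation of the lanes as f64; no numeric conversion is performed.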
impl_op! {
    fn bitcast_f64<i64> {
        for Avx2(a: __m256i) -> __m256d {
            _mm256_castsi256_pd(a)
        }
        for Sse41(a: __m128i) -> __m128d {
            _mm_castsi128_pd(a)
        }
        for Sse2(a: __m128i) -> __m128d {
            _mm_castsi128_pd(a)
        }
        for Scalar(a: i64) -> f64 {
            f64::from_bits(a as u64)
        }
        for Neon(a: int64x2_t) -> float64x2_t {
            core::mem::transmute::<_, float64x2_t>(a)
        }
        for Wasm(a: v128) -> v128 {
            a
        }
    }
}

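// Sums every lane into a single i64 using wrapping addition.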
impl_op! {
    fn horizontal_add<i64> {
        for Avx2(val: __m256i) -> i64 {
            let a = val;
            let b = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
            let c = _mm256_add_epi64(a, b);
            let val1 = _mm256_extract_epi64(c, 0);
            let val2 = _mm256_extract_epi64(c, 1);
            val1.wrapping_add(val2)
        }
        for Sse41(val: __m128i) -> i64 {
            let first = _mm_cvtsi128_si64(val);
            let second = _mm_cvtsi128_si64(_mm_shuffle_epi32(val, 0b_01_00_11_10));
            first.wrapping_add(second)
        }
        for Sse2(val: __m128i) -> i64 {
            let first = _mm_cvtsi128_si64(val);
            let second = _mm_cvtsi128_si64(_mm_shuffle_epi32(val, 0b_01_00_11_10));
            first.wrapping_add(second)
        }
        for Scalar(val: i64) -> i64 {
            val
        }
        for Neon(val: int64x2_t) -> i64 {
            let a = val;
            let b = vcombine_s64(vget_high_s64(a), vget_low_s64(a));
            let c = vaddq_s64(a, b);
            vgetq_lane_s64(c, 0)
        }
        for Wasm(val: v128) -> i64 {
            let a = i64x2_extract_lane::<0>(val);
            let b = i64x2_extract_lane::<1>(val);
            a.wrapping_add(b)
        }
    }
}

impl_op! {
    fn zeroes<i64> {
        for Avx2() -> __m256i {
            _mm256_setzero_si256()
        }
        for Sse41() -> __m128i {
            _mm_setzero_si128()
        }
        for Sse2() -> __m128i {
            _mm_setzero_si128()
        }
        for Scalar() -> i64 {
            0
        }
        for Neon() -> int64x2_t {
            vdupq_n_s64(0)
        }
        for Wasm() -> v128 {
            i64x2_splat(0)
        }
    }
}

impl_op! {
    fn set1<i64> {
        for Avx2(val: i64) -> __m256i {
            _mm256_set1_epi64x(val)
        }
        for Sse41(val: i64) -> __m128i {
            _mm_set1_epi64x(val)
        }
        for Sse2(val: i64) -> __m128i {
            _mm_set1_epi64x(val)
        }
        for Scalar(val: i64) -> i64 {
            val
        }
        for Neon(val: i64) -> int64x2_t {
            vdupq_n_s64(val)
        }
        for Wasm(val: i64) -> v128 {
            i64x2_splat(val)
        }
    }
}

impl_op! {
    fn load_unaligned<i64> {
        for Avx2(ptr: *const i64) -> __m256i {
            _mm256_loadu_si256(ptr as *const __m256i)
        }
        for Sse41(ptr: *const i64) -> __m128i {
            _mm_loadu_si128(ptr as *const __m128i)
        }
        for Sse2(ptr: *const i64) -> __m128i {
            _mm_loadu_si128(ptr as *const __m128i)
        }
        for Scalar(ptr: *const i64) -> i64 {
            unsafe { *ptr }
        }
        for Neon(ptr: *const i64) -> int64x2_t {
            vld1q_s64(ptr)
        }
        for Wasm(ptr: *const i64) -> v128 {
            *(ptr as *const v128)
        }
    }
}

impl_op! {
    fn load_aligned<i64> {
        for Avx2(ptr: *const i64) -> __m256i {
            _mm256_load_si256(ptr as *const __m256i)
        }
        for Sse41(ptr: *const i64) -> __m128i {
            _mm_load_si128(ptr as *const __m128i)
        }
        for Sse2(ptr: *const i64) -> __m128i {
            _mm_load_si128(ptr as *const __m128i)
        }
        for Scalar(ptr: *const i64) -> i64 {
            unsafe { *ptr }
        }
        for Neon(ptr: *const i64) -> int64x2_t {
            vld1q_s64(ptr)
        }
        for Wasm(ptr: *const i64) -> v128 {
            *(ptr as *const v128)
        }
    }
}

impl_op! {
    fn store_unaligned<i64> {
        for Avx2(ptr: *mut i64, a: __m256i) {
            _mm256_storeu_si256(ptr as *mut __m256i, a)
        }
        for Sse41(ptr: *mut i64, a: __m128i) {
            _mm_storeu_si128(ptr as *mut __m128i, a)
        }
        for Sse2(ptr: *mut i64, a: __m128i) {
            _mm_storeu_si128(ptr as *mut __m128i, a)
        }
        for Scalar(ptr: *mut i64, a: i64) {
            unsafe { *ptr = a }
        }
        for Neon(ptr: *mut i64, a: int64x2_t) {
            vst1q_s64(ptr, a)
        }
        for Wasm(ptr: *mut i64, a: v128) {
            *(ptr as *mut v128) = a;
        }
    }
}

impl_op! {
    fn store_aligned<i64> {
        for Avx2(ptr: *mut i64, a: __m256i) {
            _mm256_store_si256(ptr as *mut __m256i, a)
        }
        for Sse41(ptr: *mut i64, a: __m128i) {
            _mm_store_si128(ptr as *mut __m128i, a)
        }
        for Sse2(ptr: *mut i64, a: __m128i) {
            _mm_store_si128(ptr as *mut __m128i, a)
        }
        for Scalar(ptr: *mut i64, a: i64) {
            unsafe { *ptr = a }
        }
        for Neon(ptr: *mut i64, a: int64x2_t) {
            vst1q_s64(ptr, a)
        }
        for Wasm(ptr: *mut i64, a: v128) {
            *(ptr as *mut v128) = a;
        }
    }
}