simdeez/ops/i64.rs
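// Per-lane i64 operations for each backend (AVX2, SSE4.1, SSE2, scalar, NEON,
// WASM). Comparison ops return lane masks of all ones (true) or all zeros
// (false), arithmetic wraps on overflow, and shifts use one count shared by
// every lane.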

use super::*;

impl_op! {
    fn add<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_add_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_add_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_add_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_add(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vaddq_s64(a, b)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_add(a, b)
        }
    }
}

impl_op! {
    fn sub<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_sub_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_sub_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_sub_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_sub(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vsubq_s64(a, b)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_sub(a, b)
        }
    }
}

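// There is no packed 64-bit multiply on these x86 targets (it only arrives with
// AVX-512DQ) or on NEON, so those paths multiply lane by lane through a scalar
// array; the Wasm backend has a native i64x2 multiply.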
impl_op! {
    fn mul<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let a_arr = core::mem::transmute::<__m256i, [i64; 4]>(a);
            let b_arr = core::mem::transmute::<__m256i, [i64; 4]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
                a_arr[2].wrapping_mul(b_arr[2]),
                a_arr[3].wrapping_mul(b_arr[3]),
            ];
            core::mem::transmute::<_, __m256i>(c_arr)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let a_arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<__m128i, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, __m128i>(c_arr)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let a_arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<__m128i, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, __m128i>(c_arr)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.wrapping_mul(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let a_arr = core::mem::transmute::<int64x2_t, [i64; 2]>(a);
            let b_arr = core::mem::transmute::<int64x2_t, [i64; 2]>(b);
            let c_arr = [
                a_arr[0].wrapping_mul(b_arr[0]),
                a_arr[1].wrapping_mul(b_arr[1]),
            ];
            core::mem::transmute::<_, int64x2_t>(c_arr)
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_mul(a, b)
        }
    }
}

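// None of these targets has a native packed 64-bit integer min/max (x86 only
// gains one with AVX-512), so both are built from a greater-than mask and a
// bitwise select: min = (mask & b) | (!mask & a), max with the operands swapped.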
impl_op! {
    fn min<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(a, b);
            _mm256_or_si256(_mm256_and_si256(mask, b), _mm256_andnot_si256(mask, a))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.min(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let mask = vreinterpretq_s64_u64(vcgtq_s64(a, b));
            let not_mask = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(mask)));
            vorrq_s64(vandq_s64(mask, b), vandq_s64(not_mask, a))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            let mask = i64x2_gt(a, b);
            v128_or(v128_and(mask, b), v128_andnot(a, mask))
        }
    }
}

impl_op! {
    fn max<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(a, b);
            _mm256_or_si256(_mm256_and_si256(mask, a), _mm256_andnot_si256(mask, b))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(a, b);
            _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            a.max(b)
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            let mask = vreinterpretq_s64_u64(vcgtq_s64(a, b));
            let not_mask = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(mask)));
            vorrq_s64(vandq_s64(mask, a), vandq_s64(not_mask, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            let mask = i64x2_gt(a, b);
            v128_or(v128_and(mask, a), v128_andnot(b, mask))
        }
    }
}

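// abs on x86 uses the sign-mask identity: mask is all ones where a < 0, so
// (a ^ mask) - mask performs two's-complement negation on those lanes and
// leaves non-negative lanes untouched. NEON and Wasm have native 64-bit abs.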
impl_op! {
    fn abs<i64> {
        for Avx2(a: __m256i) -> __m256i {
            let mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
            _mm256_sub_epi64(_mm256_xor_si256(a, mask), mask)
        }
        for Sse41(a: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(_mm_setzero_si128(), a);
            _mm_sub_epi64(_mm_xor_si128(a, mask), mask)
        }
        for Sse2(a: __m128i) -> __m128i {
            let mask = _mm_cmpgt_epi64(_mm_setzero_si128(), a);
            _mm_sub_epi64(_mm_xor_si128(a, mask), mask)
        }
        for Scalar(a: i64) -> i64 {
            a.abs()
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vabsq_s64(a)
        }
        for Wasm(a: v128) -> v128 {
            i64x2_abs(a)
        }
    }
}

impl_op! {
    fn eq<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_cmpeq_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpeq_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpeq_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a == b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vceqq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_eq(a, b)
        }
    }
}

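// On x86 the only 64-bit lane compares are `eq` and signed `gt`, so the
// remaining predicates below are derived with bitwise identities:
// neq = !eq, lt = !(gt | eq), lte = !gt, gte = gt | eq.
// NEON and Wasm use their native compare intrinsics instead.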
impl_op! {
    fn neq<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_xor_si256(eq, _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_xor_si128(eq, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_xor_si128(eq, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a != b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_s64(a, b))))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_ne(a, b)
        }
    }
}

impl_op! {
    fn lt<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_andnot_si256(_mm256_or_si256(gt, eq), _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_andnot_si128(_mm_or_si128(gt, eq), _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_andnot_si128(_mm_or_si128(gt, eq), _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a < b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcltq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_lt(a, b)
        }
    }
}

impl_op! {
    fn lte<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            _mm256_xor_si256(gt, _mm256_set1_epi64x(u64::MAX as i64))
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            _mm_xor_si128(gt, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            _mm_xor_si128(gt, _mm_set1_epi64x(u64::MAX as i64))
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a <= b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcleq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_le(a, b)
        }
    }
}

impl_op! {
    fn gt<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            _mm256_cmpgt_epi64(a, b)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpgt_epi64(a, b)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            _mm_cmpgt_epi64(a, b)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a > b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcgtq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_gt(a, b)
        }
    }
}

impl_op! {
    fn gte<i64> {
        for Avx2(a: __m256i, b: __m256i) -> __m256i {
            let gt = _mm256_cmpgt_epi64(a, b);
            let eq = _mm256_cmpeq_epi64(a, b);
            _mm256_or_si256(gt, eq)
        }
        for Sse41(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_or_si128(gt, eq)
        }
        for Sse2(a: __m128i, b: __m128i) -> __m128i {
            let gt = _mm_cmpgt_epi64(a, b);
            let eq = _mm_cmpeq_epi64(a, b);
            _mm_or_si128(gt, eq)
        }
        for Scalar(a: i64, b: i64) -> i64 {
            if a >= b {
                u64::MAX as i64
            } else {
                0
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vcgeq_s64(a, b))
        }
        for Wasm(a: v128, b: v128) -> v128 {
            i64x2_ge(a, b)
        }
    }
}

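// blendv selects b where the mask lane is set and a where it is clear; the mask
// is expected to be an all-ones/all-zeros lane mask like the ones produced by
// the comparison ops above. SSE2 predates _mm_blendv_epi8, so it falls back to
// an and/andnot/or select.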
impl_op! {
    fn blendv<i64> {
        for Avx2(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
            _mm256_blendv_epi8(a, b, mask)
        }
        for Sse41(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
            _mm_blendv_epi8(a, b, mask)
        }
        for Sse2(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
            _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
        }
        for Scalar(a: i64, b: i64, mask: i64) -> i64 {
            if mask == 0 {
                a
            } else {
                b
            }
        }
        for Neon(a: int64x2_t, b: int64x2_t, mask: int64x2_t) -> int64x2_t {
            vbslq_s64(vreinterpretq_u64_s64(mask), b, a)
        }
        for Wasm(a: v128, b: v128, mask: v128) -> v128 {
            v128_or(v128_and(mask, b), v128_andnot(a, mask))
        }
    }
}

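// shl/shr shift every lane by the same run-time count; shr is a logical
// (zero-filling) shift. NEON has no right-shift-by-register form, so the right
// shift is done as a left shift by the negated count (negative counts shift
// right). The *_const variants below take the count as a const generic and use
// the immediate forms where available.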
impl_op! {
    fn shl<i64> {
        for Avx2(a: __m256i, rhs: i32) -> __m256i {
            _mm256_sll_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse41(a: __m128i, b: i32) -> __m128i {
            _mm_sll_epi64(a, _mm_cvtsi32_si128(b))
        }
        for Sse2(a: __m128i, b: i32) -> __m128i {
            _mm_sll_epi64(a, _mm_cvtsi32_si128(b))
        }
        for Scalar(a: i64, b: i32) -> i64 {
            a << b
        }
        for Neon(a: int64x2_t, rhs: i32) -> int64x2_t {
            let rhs = Self::set1(rhs as i64);
            vshlq_s64(a, rhs)
        }
        for Wasm(a: v128, rhs: i32) -> v128 {
            i64x2_shl(a, rhs as u32)
        }
    }
}

impl_op! {
    fn shr<i64> {
        for Avx2(a: __m256i, rhs: i32) -> __m256i {
            _mm256_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse41(a: __m128i, rhs: i32) -> __m128i {
            _mm_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Sse2(a: __m128i, rhs: i32) -> __m128i {
            _mm_srl_epi64(a, _mm_cvtsi32_si128(rhs))
        }
        for Scalar(a: i64, rhs: i32) -> i64 {
            ((a as u64) >> rhs) as i64
        }
        for Neon(a: int64x2_t, rhs: i32) -> int64x2_t {
            let rhs = Self::set1(-rhs as i64);
            vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), rhs))
        }
        for Wasm(a: v128, rhs: i32) -> v128 {
            u64x2_shr(a, rhs as u32)
        }
    }
}

impl_imm8_op! {
    fn shl_const<i64, const BY: i32> {
        for Avx2(a: __m256i) -> __m256i {
            _mm256_slli_epi64(a, BY)
        }
        for Sse41(a: __m128i) -> __m128i {
            _mm_slli_epi64(a, BY)
        }
        for Sse2(a: __m128i) -> __m128i {
            _mm_slli_epi64(a, BY)
        }
        for Scalar(a: i64) -> i64 {
            a << BY
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vshlq_n_s64(a, BY)
        }
        for Wasm(a: v128) -> v128 {
            i64x2_shl(a, BY as u32)
        }
    }
}

impl_imm8_op! {
    fn shr_const<i64, const BY: i32> {
        for Avx2(a: __m256i) -> __m256i {
            _mm256_srli_epi64(a, BY)
        }
        for Sse41(a: __m128i) -> __m128i {
            _mm_srli_epi64(a, BY)
        }
        for Sse2(a: __m128i) -> __m128i {
            _mm_srli_epi64(a, BY)
        }
        for Scalar(a: i64) -> i64 {
            ((a as u64) >> BY) as i64
        }
        for Neon(a: int64x2_t) -> int64x2_t {
            vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), BY))
        }
        for Wasm(a: v128) -> v128 {
            u64x2_shr(a, BY as u32)
        }
    }
}

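// cast_f64 is a numeric i64 -> f64 conversion. x86 below AVX-512 has no packed
// conversion for 64-bit integers, so those lanes (and the Wasm path) go through
// scalar `as f64` casts; AArch64 converts natively with vcvtq_f64_s64.
// bitcast_f64 below reinterprets the bits instead of converting the value.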
impl_op! {
    fn cast_f64<i64> {
        for Avx2(a: __m256i) -> __m256d {
            let arr = core::mem::transmute::<__m256i, [i64; 4]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
                arr[2] as f64,
                arr[3] as f64,
            ];
            core::mem::transmute::<_, __m256d>(result)
        }
        for Sse41(a: __m128i) -> __m128d {
            let arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, __m128d>(result)
        }
        for Sse2(a: __m128i) -> __m128d {
            let arr = core::mem::transmute::<__m128i, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, __m128d>(result)
        }
        for Scalar(a: i64) -> f64 {
            a as f64
        }
        for Neon(a: int64x2_t) -> float64x2_t {
            vcvtq_f64_s64(a)
        }
        for Wasm(a: v128) -> v128 {
            let arr = core::mem::transmute::<_, [i64; 2]>(a);
            let result = [
                arr[0] as f64,
                arr[1] as f64,
            ];
            core::mem::transmute::<_, v128>(result)
        }
    }
}

impl_op! {
    fn bitcast_f64<i64> {
        for Avx2(a: __m256i) -> __m256d {
            _mm256_castsi256_pd(a)
        }
        for Sse41(a: __m128i) -> __m128d {
            _mm_castsi128_pd(a)
        }
        for Sse2(a: __m128i) -> __m128d {
            _mm_castsi128_pd(a)
        }
        for Scalar(a: i64) -> f64 {
            f64::from_bits(a as u64)
        }
        for Neon(a: int64x2_t) -> float64x2_t {
            core::mem::transmute::<_, float64x2_t>(a)
        }
        for Wasm(a: v128) -> v128 {
            a
        }
    }
}

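// horizontal_add reduces all lanes to a single i64 (wrapping on overflow). The
// AVX2 path adds the vector to its lane-reversed self so each half holds both
// partial sums, then adds the two low lanes; the SSE paths add the low and high
// 64-bit halves after a 32-bit shuffle; NEON swaps its two halves and adds.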
impl_op! {
    fn horizontal_add<i64> {
        for Avx2(val: __m256i) -> i64 {
            let a = val;
            let b = _mm256_permute4x64_epi64(a, 0b00_01_10_11); // reverse the lane order: [3, 2, 1, 0]
            let c = _mm256_add_epi64(a, b);
            let val1 = _mm256_extract_epi64(c, 0);
            let val2 = _mm256_extract_epi64(c, 1);
            val1.wrapping_add(val2)
        }
        for Sse41(val: __m128i) -> i64 {
            let first = _mm_cvtsi128_si64(val);
            let second = _mm_cvtsi128_si64(_mm_shuffle_epi32(val, 0b_01_00_11_10));
            first.wrapping_add(second)
        }
        for Sse2(val: __m128i) -> i64 {
            let first = _mm_cvtsi128_si64(val);
            let second = _mm_cvtsi128_si64(_mm_shuffle_epi32(val, 0b_01_00_11_10));
            first.wrapping_add(second)
        }
        for Scalar(val: i64) -> i64 {
            val
        }
        for Neon(val: int64x2_t) -> i64 {
            let a = val;
            let b = vcombine_s64(vget_high_s64(a), vget_low_s64(a));
            let c = vaddq_s64(a, b);
            vgetq_lane_s64(c, 0)
        }
        for Wasm(val: v128) -> i64 {
            let a = i64x2_extract_lane::<0>(val);
            let b = i64x2_extract_lane::<1>(val);
            a.wrapping_add(b)
        }
    }
}

impl_op! {
    fn zeroes<i64> {
        for Avx2() -> __m256i {
            _mm256_setzero_si256()
        }
        for Sse41() -> __m128i {
            _mm_setzero_si128()
        }
        for Sse2() -> __m128i {
            _mm_setzero_si128()
        }
        for Scalar() -> i64 {
            0
        }
        for Neon() -> int64x2_t {
            vdupq_n_s64(0)
        }
        for Wasm() -> v128 {
            i64x2_splat(0)
        }
    }
}

impl_op! {
    fn set1<i64> {
        for Avx2(val: i64) -> __m256i {
            _mm256_set1_epi64x(val)
        }
        for Sse41(val: i64) -> __m128i {
            _mm_set1_epi64x(val)
        }
        for Sse2(val: i64) -> __m128i {
            _mm_set1_epi64x(val)
        }
        for Scalar(val: i64) -> i64 {
            val
        }
        for Neon(val: i64) -> int64x2_t {
            vdupq_n_s64(val)
        }
        for Wasm(val: i64) -> v128 {
            i64x2_splat(val)
        }
    }
}

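// The *_aligned variants require ptr to be aligned to the vector width on x86;
// the other backends use the same instruction for both. The Wasm paths read and
// write through raw v128 pointers, which also assumes suitable alignment.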
impl_op! {
    fn load_unaligned<i64> {
        for Avx2(ptr: *const i64) -> __m256i {
            _mm256_loadu_si256(ptr as *const __m256i)
        }
        for Sse41(ptr: *const i64) -> __m128i {
            _mm_loadu_si128(ptr as *const __m128i)
        }
        for Sse2(ptr: *const i64) -> __m128i {
            _mm_loadu_si128(ptr as *const __m128i)
        }
        for Scalar(ptr: *const i64) -> i64 {
            unsafe { *ptr }
        }
        for Neon(ptr: *const i64) -> int64x2_t {
            vld1q_s64(ptr)
        }
        for Wasm(ptr: *const i64) -> v128 {
            *(ptr as *const v128)
        }
    }
}

impl_op! {
    fn load_aligned<i64> {
        for Avx2(ptr: *const i64) -> __m256i {
            _mm256_load_si256(ptr as *const __m256i)
        }
        for Sse41(ptr: *const i64) -> __m128i {
            _mm_load_si128(ptr as *const __m128i)
        }
        for Sse2(ptr: *const i64) -> __m128i {
            _mm_load_si128(ptr as *const __m128i)
        }
        for Scalar(ptr: *const i64) -> i64 {
            unsafe { *ptr }
        }
        for Neon(ptr: *const i64) -> int64x2_t {
            vld1q_s64(ptr)
        }
        for Wasm(ptr: *const i64) -> v128 {
            *(ptr as *const v128)
        }
    }
}

impl_op! {
    fn store_unaligned<i64> {
        for Avx2(ptr: *mut i64, a: __m256i) {
            _mm256_storeu_si256(ptr as *mut __m256i, a)
        }
        for Sse41(ptr: *mut i64, a: __m128i) {
            _mm_storeu_si128(ptr as *mut __m128i, a)
        }
        for Sse2(ptr: *mut i64, a: __m128i) {
            _mm_storeu_si128(ptr as *mut __m128i, a)
        }
        for Scalar(ptr: *mut i64, a: i64) {
            unsafe { *ptr = a }
        }
        for Neon(ptr: *mut i64, a: int64x2_t) {
            vst1q_s64(ptr, a)
        }
        for Wasm(ptr: *mut i64, a: v128) {
            *(ptr as *mut v128) = a;
        }
    }
}

impl_op! {
    fn store_aligned<i64> {
        for Avx2(ptr: *mut i64, a: __m256i) {
            _mm256_store_si256(ptr as *mut __m256i, a)
        }
        for Sse41(ptr: *mut i64, a: __m128i) {
            _mm_store_si128(ptr as *mut __m128i, a)
        }
        for Sse2(ptr: *mut i64, a: __m128i) {
            _mm_store_si128(ptr as *mut __m128i, a)
        }
        for Scalar(ptr: *mut i64, a: i64) {
            unsafe { *ptr = a }
        }
        for Neon(ptr: *mut i64, a: int64x2_t) {
            vst1q_s64(ptr, a)
        }
        for Wasm(ptr: *mut i64, a: v128) {
            *(ptr as *mut v128) = a;
        }
    }
}
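
// Not part of the original file: a minimal, self-contained sanity check of the
// compare-and-select idiom used by `min`/`max` above, written directly against
// the raw SSE intrinsics rather than the `impl_op!` machinery. The module and
// function names are illustrative only, and it assumes `std` (for the runtime
// feature check) is available in test builds.
#[cfg(all(test, target_arch = "x86_64"))]
mod i64_select_sketch {
    use core::arch::x86_64::*;

    // min: (mask & b) | (!mask & a) with mask = (a > b) picks the smaller lane.
    #[target_feature(enable = "sse4.2")]
    unsafe fn select_min_i64x2(a: __m128i, b: __m128i) -> __m128i {
        let mask = _mm_cmpgt_epi64(a, b);
        _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
    }

    #[test]
    fn select_min_matches_scalar_min() {
        if !is_x86_feature_detected!("sse4.2") {
            return;
        }
        let xs: [i64; 2] = [-7, i64::MAX];
        let ys: [i64; 2] = [3, i64::MIN];
        unsafe {
            let a = _mm_loadu_si128(xs.as_ptr() as *const __m128i);
            let b = _mm_loadu_si128(ys.as_ptr() as *const __m128i);
            let mut out = [0i64; 2];
            _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, select_min_i64x2(a, b));
            assert_eq!(out, [xs[0].min(ys[0]), xs[1].min(ys[1])]);
        }
    }
}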