1#![allow(unsafe_code)]
8
9use crate::simd::traits::{SimdOps, SimdOpsExt};
10use crate::simd::types::{I16x16, I16x8, I32x4, I32x8, U8x16, U8x32};
11
12#[cfg(target_arch = "x86_64")]
13use std::arch::x86_64::*;
14
/// Zero-sized marker selecting the SIMD backend that is enabled when the CPU
/// reports AVX-512 F, BW and DQ (see `Avx512Simd::is_available`).
///
/// NOTE(review): despite the name, the kernels in this file currently use
/// 128-bit (`_mm_*`) and 256-bit (`_mm256_*`) intrinsics only — TODO confirm
/// whether true 512-bit widening is planned.
#[derive(Clone, Copy, Debug)]
pub struct Avx512Simd;
22
impl Avx512Simd {
    /// Creates the backend marker value. Zero-sized and stateless.
    #[inline]
    #[must_use]
    pub const fn new() -> Self {
        Self
    }

    /// Runtime feature check: `true` only when the CPU reports AVX-512
    /// Foundation, Byte/Word and Doubleword/Quadword support.
    /// Always `false` on non-x86_64 targets.
    #[inline]
    #[must_use]
    pub fn is_available() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx512f")
                && is_x86_feature_detected!("avx512bw")
                && is_x86_feature_detected!("avx512dq")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }

    /// Sum of absolute differences over two 32-byte blocks.
    ///
    /// NOTE(review): despite the `_avx512` suffix this uses the AVX2
    /// `_mm256_sad_epu8` intrinsic. Executing it on a CPU without AVX2 is
    /// undefined behavior; callers must gate on `is_available()` first
    /// (presumably every AVX-512 CPU also has AVX2 — TODO confirm the
    /// dispatcher enforces this). Currently unused (`dead_code`).
    #[inline]
    #[cfg(target_arch = "x86_64")]
    #[allow(dead_code)]
    fn sad_u8x32_avx512(&self, a: &U8x32, b: &U8x32) -> u32 {
        unsafe {
            // SAFETY: `loadu` tolerates unaligned pointers; both sources are
            // 32-byte `U8x32` buffers. Requires AVX2 at runtime (see above).
            let a_vec = _mm256_loadu_si256(a.as_ptr().cast());
            let b_vec = _mm256_loadu_si256(b.as_ptr().cast());

            // Four 64-bit partial SADs, one per 8-byte group.
            let sad = _mm256_sad_epu8(a_vec, b_vec);

            // SAFETY: `__m256i` is 32 bytes with no invalid bit patterns, so
            // reinterpreting it as `[u64; 4]` is layout-compatible.
            let arr: [u64; 4] = std::mem::transmute(sad);
            // Max possible total is 32 * 255 = 8160, so the u32 cast is lossless.
            (arr[0] + arr[1] + arr[2] + arr[3]) as u32
        }
    }
}
72
73impl SimdOps for Avx512Simd {
74 #[inline]
75 fn name(&self) -> &'static str {
76 "avx512"
77 }
78
79 #[inline]
80 fn is_available(&self) -> bool {
81 Self::is_available()
82 }
83
84 #[inline]
88 #[cfg(target_arch = "x86_64")]
89 fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
90 unsafe {
92 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
93 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
94 let result = _mm_add_epi16(a_vec, b_vec);
95 let mut out = I16x8::zero();
96 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
97 out
98 }
99 }
100
101 #[inline]
102 #[cfg(not(target_arch = "x86_64"))]
103 fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
104 let mut result = I16x8::zero();
105 for i in 0..8 {
106 result[i] = a[i].wrapping_add(b[i]);
107 }
108 result
109 }
110
111 #[inline]
112 #[cfg(target_arch = "x86_64")]
113 fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
114 unsafe {
115 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
116 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
117 let result = _mm_sub_epi16(a_vec, b_vec);
118 let mut out = I16x8::zero();
119 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
120 out
121 }
122 }
123
124 #[inline]
125 #[cfg(not(target_arch = "x86_64"))]
126 fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
127 let mut result = I16x8::zero();
128 for i in 0..8 {
129 result[i] = a[i].wrapping_sub(b[i]);
130 }
131 result
132 }
133
134 #[inline]
135 #[cfg(target_arch = "x86_64")]
136 fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
137 unsafe {
138 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
139 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
140 let result = _mm_mullo_epi16(a_vec, b_vec);
141 let mut out = I16x8::zero();
142 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
143 out
144 }
145 }
146
147 #[inline]
148 #[cfg(not(target_arch = "x86_64"))]
149 fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
150 let mut result = I16x8::zero();
151 for i in 0..8 {
152 result[i] = a[i].wrapping_mul(b[i]);
153 }
154 result
155 }
156
157 #[inline]
158 #[cfg(target_arch = "x86_64")]
159 fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
160 unsafe {
161 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
162 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
163 let result = _mm_add_epi32(a_vec, b_vec);
164 let mut out = I32x4::zero();
165 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
166 out
167 }
168 }
169
170 #[inline]
171 #[cfg(not(target_arch = "x86_64"))]
172 fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
173 let mut result = I32x4::zero();
174 for i in 0..4 {
175 result[i] = a[i].wrapping_add(b[i]);
176 }
177 result
178 }
179
180 #[inline]
181 #[cfg(target_arch = "x86_64")]
182 fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
183 unsafe {
184 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
185 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
186 let result = _mm_sub_epi32(a_vec, b_vec);
187 let mut out = I32x4::zero();
188 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
189 out
190 }
191 }
192
193 #[inline]
194 #[cfg(not(target_arch = "x86_64"))]
195 fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
196 let mut result = I32x4::zero();
197 for i in 0..4 {
198 result[i] = a[i].wrapping_sub(b[i]);
199 }
200 result
201 }
202
203 #[inline]
204 #[cfg(target_arch = "x86_64")]
205 fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
206 unsafe {
207 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
208 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
209 let result = _mm_min_epi16(a_vec, b_vec);
210 let mut out = I16x8::zero();
211 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
212 out
213 }
214 }
215
216 #[inline]
217 #[cfg(not(target_arch = "x86_64"))]
218 fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
219 let mut result = I16x8::zero();
220 for i in 0..8 {
221 result[i] = a[i].min(b[i]);
222 }
223 result
224 }
225
226 #[inline]
227 #[cfg(target_arch = "x86_64")]
228 fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
229 unsafe {
230 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
231 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
232 let result = _mm_max_epi16(a_vec, b_vec);
233 let mut out = I16x8::zero();
234 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
235 out
236 }
237 }
238
239 #[inline]
240 #[cfg(not(target_arch = "x86_64"))]
241 fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
242 let mut result = I16x8::zero();
243 for i in 0..8 {
244 result[i] = a[i].max(b[i]);
245 }
246 result
247 }
248
249 #[inline]
250 fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
251 let min_vec = I16x8::splat(min);
252 let max_vec = I16x8::splat(max);
253 let clamped_min = self.max_i16x8(v, min_vec);
254 self.min_i16x8(clamped_min, max_vec)
255 }
256
257 #[inline]
258 #[cfg(target_arch = "x86_64")]
259 fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
260 unsafe {
261 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
262 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
263 let result = _mm_min_epu8(a_vec, b_vec);
264 let mut out = U8x16::zero();
265 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
266 out
267 }
268 }
269
270 #[inline]
271 #[cfg(not(target_arch = "x86_64"))]
272 fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
273 let mut result = U8x16::zero();
274 for i in 0..16 {
275 result[i] = a[i].min(b[i]);
276 }
277 result
278 }
279
280 #[inline]
281 #[cfg(target_arch = "x86_64")]
282 fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
283 unsafe {
284 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
285 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
286 let result = _mm_max_epu8(a_vec, b_vec);
287 let mut out = U8x16::zero();
288 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
289 out
290 }
291 }
292
293 #[inline]
294 #[cfg(not(target_arch = "x86_64"))]
295 fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
296 let mut result = U8x16::zero();
297 for i in 0..16 {
298 result[i] = a[i].max(b[i]);
299 }
300 result
301 }
302
303 #[inline]
304 fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
305 let min_vec = U8x16::splat(min);
306 let max_vec = U8x16::splat(max);
307 let clamped_min = self.max_u8x16(v, min_vec);
308 self.min_u8x16(clamped_min, max_vec)
309 }
310
311 #[inline]
312 #[cfg(target_arch = "x86_64")]
313 fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
314 unsafe {
315 let vec = _mm_loadu_si128(v.as_ptr().cast());
316 let sum1 = _mm_hadd_epi16(vec, vec);
317 let sum2 = _mm_hadd_epi16(sum1, sum1);
318 let sum3 = _mm_hadd_epi16(sum2, sum2);
319 _mm_extract_epi16(sum3, 0) as i16 as i32
320 }
321 }
322
323 #[inline]
324 #[cfg(not(target_arch = "x86_64"))]
325 fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
326 v.iter().map(|&x| i32::from(x)).sum()
327 }
328
329 #[inline]
330 #[cfg(target_arch = "x86_64")]
331 fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
332 unsafe {
333 let vec = _mm_loadu_si128(v.as_ptr().cast());
334 let sum1 = _mm_hadd_epi32(vec, vec);
335 let sum2 = _mm_hadd_epi32(sum1, sum1);
336 _mm_extract_epi32(sum2, 0)
337 }
338 }
339
340 #[inline]
341 #[cfg(not(target_arch = "x86_64"))]
342 fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
343 v.iter().sum()
344 }
345
346 #[inline]
347 #[cfg(target_arch = "x86_64")]
348 fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
349 unsafe {
350 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
351 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
352 let sad = _mm_sad_epu8(a_vec, b_vec);
353 let low = _mm_extract_epi64(sad, 0) as u32;
354 let high = _mm_extract_epi64(sad, 1) as u32;
355 low + high
356 }
357 }
358
359 #[inline]
360 #[cfg(not(target_arch = "x86_64"))]
361 fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
362 a.iter()
363 .zip(b.iter())
364 .map(|(&x, &y)| u32::from(x.abs_diff(y)))
365 .sum()
366 }
367
368 #[inline]
369 fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
370 assert!(a.len() >= 8 && b.len() >= 8);
371 a[..8]
372 .iter()
373 .zip(b[..8].iter())
374 .map(|(&x, &y)| u32::from(x.abs_diff(y)))
375 .sum()
376 }
377
378 #[inline]
379 fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
380 assert!(a.len() >= 16 && b.len() >= 16);
381 let mut a_vec = U8x16::zero();
382 let mut b_vec = U8x16::zero();
383 a_vec.copy_from_slice(&a[..16]);
384 b_vec.copy_from_slice(&b[..16]);
385 self.sad_u8x16(a_vec, b_vec)
386 }
387
388 #[inline]
389 #[cfg(target_arch = "x86_64")]
390 fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
391 unsafe {
392 let vec = _mm_loadu_si128(v.as_ptr().cast());
393 let zero = _mm_setzero_si128();
394 let result = _mm_unpacklo_epi8(vec, zero);
395 let mut out = I16x8::zero();
396 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
397 out
398 }
399 }
400
401 #[inline]
402 #[cfg(not(target_arch = "x86_64"))]
403 fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
404 let mut result = I16x8::zero();
405 for i in 0..8 {
406 result[i] = i16::from(v[i]);
407 }
408 result
409 }
410
411 #[inline]
412 #[cfg(target_arch = "x86_64")]
413 fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
414 unsafe {
415 let vec = _mm_loadu_si128(v.as_ptr().cast());
416 let zero = _mm_setzero_si128();
417 let result = _mm_unpackhi_epi8(vec, zero);
418 let mut out = I16x8::zero();
419 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
420 out
421 }
422 }
423
424 #[inline]
425 #[cfg(not(target_arch = "x86_64"))]
426 fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
427 let mut result = I16x8::zero();
428 for i in 0..8 {
429 result[i] = i16::from(v[i + 8]);
430 }
431 result
432 }
433
434 #[inline]
435 #[cfg(target_arch = "x86_64")]
436 fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
437 unsafe {
438 let low_vec = _mm_loadu_si128(low.as_ptr().cast());
439 let high_vec = _mm_loadu_si128(high.as_ptr().cast());
440 let result = _mm_packs_epi32(low_vec, high_vec);
441 let mut out = I16x8::zero();
442 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
443 out
444 }
445 }
446
447 #[inline]
448 #[cfg(not(target_arch = "x86_64"))]
449 fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
450 let mut result = I16x8::zero();
451 for i in 0..4 {
452 result[i] = low[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
453 result[i + 4] = high[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
454 }
455 result
456 }
457
458 #[inline]
459 fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
460 let prod = self.mul_i16x8(a, b);
461 self.add_i16x8(prod, c)
462 }
463
464 #[inline]
465 #[cfg(target_arch = "x86_64")]
466 fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
467 unsafe {
468 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
469 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
470 let result = _mm_madd_epi16(a_vec, b_vec);
471 let mut out = I32x4::zero();
472 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
473 out
474 }
475 }
476
477 #[inline]
478 #[cfg(not(target_arch = "x86_64"))]
479 fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
480 let mut result = I32x4::zero();
481 for i in 0..4 {
482 result[i] = i32::from(a[i * 2]) * i32::from(b[i * 2])
483 + i32::from(a[i * 2 + 1]) * i32::from(b[i * 2 + 1]);
484 }
485 result
486 }
487
488 #[inline]
489 #[cfg(target_arch = "x86_64")]
490 fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
491 unsafe {
492 let vec = _mm_loadu_si128(v.as_ptr().cast());
493 let shift_vec = _mm_cvtsi32_si128(shift as i32);
494 let result = _mm_sra_epi16(vec, shift_vec);
495 let mut out = I16x8::zero();
496 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
497 out
498 }
499 }
500
501 #[inline]
502 #[cfg(not(target_arch = "x86_64"))]
503 fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
504 let mut result = I16x8::zero();
505 for i in 0..8 {
506 result[i] = v[i] >> shift;
507 }
508 result
509 }
510
511 #[inline]
512 #[cfg(target_arch = "x86_64")]
513 fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
514 unsafe {
515 let vec = _mm_loadu_si128(v.as_ptr().cast());
516 let shift_vec = _mm_cvtsi32_si128(shift as i32);
517 let result = _mm_sll_epi16(vec, shift_vec);
518 let mut out = I16x8::zero();
519 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
520 out
521 }
522 }
523
524 #[inline]
525 #[cfg(not(target_arch = "x86_64"))]
526 fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
527 let mut result = I16x8::zero();
528 for i in 0..8 {
529 result[i] = v[i] << shift;
530 }
531 result
532 }
533
534 #[inline]
535 #[cfg(target_arch = "x86_64")]
536 fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
537 unsafe {
538 let vec = _mm_loadu_si128(v.as_ptr().cast());
539 let shift_vec = _mm_cvtsi32_si128(shift as i32);
540 let result = _mm_sra_epi32(vec, shift_vec);
541 let mut out = I32x4::zero();
542 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
543 out
544 }
545 }
546
547 #[inline]
548 #[cfg(not(target_arch = "x86_64"))]
549 fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
550 let mut result = I32x4::zero();
551 for i in 0..4 {
552 result[i] = v[i] >> shift;
553 }
554 result
555 }
556
557 #[inline]
558 #[cfg(target_arch = "x86_64")]
559 fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
560 unsafe {
561 let vec = _mm_loadu_si128(v.as_ptr().cast());
562 let shift_vec = _mm_cvtsi32_si128(shift as i32);
563 let result = _mm_sll_epi32(vec, shift_vec);
564 let mut out = I32x4::zero();
565 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
566 out
567 }
568 }
569
570 #[inline]
571 #[cfg(not(target_arch = "x86_64"))]
572 fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
573 let mut result = I32x4::zero();
574 for i in 0..4 {
575 result[i] = v[i] << shift;
576 }
577 result
578 }
579
580 #[inline]
581 #[cfg(target_arch = "x86_64")]
582 fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
583 unsafe {
584 let a_vec = _mm_loadu_si128(a.as_ptr().cast());
585 let b_vec = _mm_loadu_si128(b.as_ptr().cast());
586 let result = _mm_avg_epu8(a_vec, b_vec);
587 let mut out = U8x16::zero();
588 _mm_storeu_si128(out.as_mut_ptr().cast(), result);
589 out
590 }
591 }
592
593 #[inline]
594 #[cfg(not(target_arch = "x86_64"))]
595 fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
596 let mut result = U8x16::zero();
597 for i in 0..16 {
598 result[i] = ((u16::from(a[i]) + u16::from(b[i]) + 1) / 2) as u8;
599 }
600 result
601 }
602}
603
604impl SimdOpsExt for Avx512Simd {
605 #[inline]
606 fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
607 assert!(src.len() >= 4);
608 let mut result = I16x8::zero();
609 for i in 0..4 {
610 result[i] = i16::from(src[i]);
611 }
612 result
613 }
614
615 #[inline]
616 fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
617 assert!(src.len() >= 8);
618 let mut result = I16x8::zero();
619 for i in 0..8 {
620 result[i] = i16::from(src[i]);
621 }
622 result
623 }
624
625 #[inline]
626 fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
627 assert!(dst.len() >= 4);
628 for i in 0..4 {
629 dst[i] = v[i].clamp(0, 255) as u8;
630 }
631 }
632
633 #[inline]
634 fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
635 assert!(dst.len() >= 8);
636 for i in 0..8 {
637 dst[i] = v[i].clamp(0, 255) as u8;
638 }
639 }
640
641 #[inline]
642 fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
643 #[cfg(target_arch = "x86_64")]
644 {
645 unsafe {
646 let r0 = _mm_loadl_epi64(rows[0].as_ptr().cast());
647 let r1 = _mm_loadl_epi64(rows[1].as_ptr().cast());
648 let r2 = _mm_loadl_epi64(rows[2].as_ptr().cast());
649 let r3 = _mm_loadl_epi64(rows[3].as_ptr().cast());
650
651 let t0 = _mm_unpacklo_epi16(r0, r1);
652 let t1 = _mm_unpacklo_epi16(r2, r3);
653
654 let o0 = _mm_unpacklo_epi32(t0, t1);
655 let o1 = _mm_unpackhi_epi32(t0, t1);
656 let o2 = _mm_unpacklo_epi32(_mm_unpackhi_epi16(r0, r1), _mm_unpackhi_epi16(r2, r3));
657 let o3 = _mm_unpackhi_epi32(_mm_unpackhi_epi16(r0, r1), _mm_unpackhi_epi16(r2, r3));
658
659 let mut out = [I16x8::zero(); 4];
660 _mm_storeu_si128(out[0].as_mut_ptr().cast(), o0);
661 _mm_storeu_si128(out[1].as_mut_ptr().cast(), o1);
662 _mm_storeu_si128(out[2].as_mut_ptr().cast(), o2);
663 _mm_storeu_si128(out[3].as_mut_ptr().cast(), o3);
664 out
665 }
666 }
667 #[cfg(not(target_arch = "x86_64"))]
668 {
669 let mut out = [I16x8::zero(); 4];
670 for i in 0..4 {
671 for j in 0..4 {
672 out[i][j] = rows[j][i];
673 }
674 }
675 out
676 }
677 }
678
679 #[inline]
680 fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
681 #[cfg(target_arch = "x86_64")]
682 {
683 unsafe {
684 let r0 = _mm_loadu_si128(rows[0].as_ptr().cast());
685 let r1 = _mm_loadu_si128(rows[1].as_ptr().cast());
686 let r2 = _mm_loadu_si128(rows[2].as_ptr().cast());
687 let r3 = _mm_loadu_si128(rows[3].as_ptr().cast());
688 let r4 = _mm_loadu_si128(rows[4].as_ptr().cast());
689 let r5 = _mm_loadu_si128(rows[5].as_ptr().cast());
690 let r6 = _mm_loadu_si128(rows[6].as_ptr().cast());
691 let r7 = _mm_loadu_si128(rows[7].as_ptr().cast());
692
693 let t0 = _mm_unpacklo_epi16(r0, r1);
694 let t1 = _mm_unpackhi_epi16(r0, r1);
695 let t2 = _mm_unpacklo_epi16(r2, r3);
696 let t3 = _mm_unpackhi_epi16(r2, r3);
697 let t4 = _mm_unpacklo_epi16(r4, r5);
698 let t5 = _mm_unpackhi_epi16(r4, r5);
699 let t6 = _mm_unpacklo_epi16(r6, r7);
700 let t7 = _mm_unpackhi_epi16(r6, r7);
701
702 let u0 = _mm_unpacklo_epi32(t0, t2);
703 let u1 = _mm_unpackhi_epi32(t0, t2);
704 let u2 = _mm_unpacklo_epi32(t1, t3);
705 let u3 = _mm_unpackhi_epi32(t1, t3);
706 let u4 = _mm_unpacklo_epi32(t4, t6);
707 let u5 = _mm_unpackhi_epi32(t4, t6);
708 let u6 = _mm_unpacklo_epi32(t5, t7);
709 let u7 = _mm_unpackhi_epi32(t5, t7);
710
711 let o0 = _mm_unpacklo_epi64(u0, u4);
712 let o1 = _mm_unpackhi_epi64(u0, u4);
713 let o2 = _mm_unpacklo_epi64(u1, u5);
714 let o3 = _mm_unpackhi_epi64(u1, u5);
715 let o4 = _mm_unpacklo_epi64(u2, u6);
716 let o5 = _mm_unpackhi_epi64(u2, u6);
717 let o6 = _mm_unpacklo_epi64(u3, u7);
718 let o7 = _mm_unpackhi_epi64(u3, u7);
719
720 let mut out = [I16x8::zero(); 8];
721 _mm_storeu_si128(out[0].as_mut_ptr().cast(), o0);
722 _mm_storeu_si128(out[1].as_mut_ptr().cast(), o1);
723 _mm_storeu_si128(out[2].as_mut_ptr().cast(), o2);
724 _mm_storeu_si128(out[3].as_mut_ptr().cast(), o3);
725 _mm_storeu_si128(out[4].as_mut_ptr().cast(), o4);
726 _mm_storeu_si128(out[5].as_mut_ptr().cast(), o5);
727 _mm_storeu_si128(out[6].as_mut_ptr().cast(), o6);
728 _mm_storeu_si128(out[7].as_mut_ptr().cast(), o7);
729 out
730 }
731 }
732 #[cfg(not(target_arch = "x86_64"))]
733 {
734 let mut out = [I16x8::zero(); 8];
735 for i in 0..8 {
736 for j in 0..8 {
737 out[i][j] = rows[j][i];
738 }
739 }
740 out
741 }
742 }
743
744 #[inline]
745 fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
746 let sum = self.add_i16x8(a, b);
747 let diff = self.sub_i16x8(a, b);
748 (sum, diff)
749 }
750
751 #[inline]
752 fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
753 let sum = self.add_i32x4(a, b);
754 let diff = self.sub_i32x4(a, b);
755 (sum, diff)
756 }
757}