// oximedia_codec/simd/x86/avx512.rs
1//! AVX-512 SIMD implementation for x86_64.
2//!
3//! This module provides optimized implementations using AVX-512 instructions,
4//! available on Intel Skylake-X (2017) and later server processors, and
5//! Ice Lake (2019) and later client processors.
6
7#![allow(unsafe_code)]
8
9use crate::simd::traits::{SimdOps, SimdOpsExt};
10use crate::simd::types::{I16x16, I16x8, I32x4, I32x8, U8x16, U8x32};
11
12#[cfg(target_arch = "x86_64")]
13use std::arch::x86_64::*;
14
/// AVX-512 SIMD implementation.
///
/// Zero-sized marker type; the actual operations live in its `SimdOps` /
/// `SimdOpsExt` trait impls. Most of those currently delegate to 128-bit
/// (SSE) and 256-bit (AVX2) instructions, which every AVX-512-capable CPU
/// also supports — the wider 512-bit registers and masked operations are
/// the intended payoff for large blocks.
///
/// Construction is free; callers must gate actual SIMD use behind
/// `Avx512Simd::is_available()`.
#[derive(Clone, Copy, Debug)]
pub struct Avx512Simd;
22
23impl Avx512Simd {
24    /// Create a new AVX-512 SIMD instance.
25    ///
26    /// # Safety
27    ///
28    /// The caller must ensure that AVX-512F is available on the current CPU.
29    /// Use `is_available()` to check before calling SIMD operations.
30    #[inline]
31    #[must_use]
32    pub const fn new() -> Self {
33        Self
34    }
35
36    /// Check if AVX-512 is available at runtime.
37    #[inline]
38    #[must_use]
39    pub fn is_available() -> bool {
40        #[cfg(target_arch = "x86_64")]
41        {
42            // Check for AVX-512 Foundation (F) and some common extensions
43            is_x86_feature_detected!("avx512f")
44                && is_x86_feature_detected!("avx512bw")
45                && is_x86_feature_detected!("avx512dq")
46        }
47        #[cfg(not(target_arch = "x86_64"))]
48        {
49            false
50        }
51    }
52
53    /// Process 32 bytes at once using AVX-512.
54    #[inline]
55    #[cfg(target_arch = "x86_64")]
56    #[allow(dead_code)]
57    fn sad_u8x32_avx512(&self, a: &U8x32, b: &U8x32) -> u32 {
58        // SAFETY: AVX-512 is checked at runtime
59        unsafe {
60            let a_vec = _mm256_loadu_si256(a.as_ptr().cast());
61            let b_vec = _mm256_loadu_si256(b.as_ptr().cast());
62
63            // Use AVX2 sad instruction twice and combine
64            let sad = _mm256_sad_epu8(a_vec, b_vec);
65
66            // Extract and sum all 4 64-bit results
67            let arr: [u64; 4] = std::mem::transmute(sad);
68            (arr[0] + arr[1] + arr[2] + arr[3]) as u32
69        }
70    }
71}
72
73impl SimdOps for Avx512Simd {
74    #[inline]
75    fn name(&self) -> &'static str {
76        "avx512"
77    }
78
79    #[inline]
80    fn is_available(&self) -> bool {
81        Self::is_available()
82    }
83
84    // Most operations delegate to 128-bit or 256-bit instructions for compatibility
85    // AVX-512 shines in wider operations and masked operations
86
87    #[inline]
88    #[cfg(target_arch = "x86_64")]
89    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
90        // SAFETY: AVX-512 is checked at runtime
91        unsafe {
92            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
93            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
94            let result = _mm_add_epi16(a_vec, b_vec);
95            let mut out = I16x8::zero();
96            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
97            out
98        }
99    }
100
101    #[inline]
102    #[cfg(not(target_arch = "x86_64"))]
103    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
104        let mut result = I16x8::zero();
105        for i in 0..8 {
106            result[i] = a[i].wrapping_add(b[i]);
107        }
108        result
109    }
110
111    #[inline]
112    #[cfg(target_arch = "x86_64")]
113    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
114        unsafe {
115            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
116            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
117            let result = _mm_sub_epi16(a_vec, b_vec);
118            let mut out = I16x8::zero();
119            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
120            out
121        }
122    }
123
124    #[inline]
125    #[cfg(not(target_arch = "x86_64"))]
126    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
127        let mut result = I16x8::zero();
128        for i in 0..8 {
129            result[i] = a[i].wrapping_sub(b[i]);
130        }
131        result
132    }
133
134    #[inline]
135    #[cfg(target_arch = "x86_64")]
136    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
137        unsafe {
138            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
139            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
140            let result = _mm_mullo_epi16(a_vec, b_vec);
141            let mut out = I16x8::zero();
142            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
143            out
144        }
145    }
146
147    #[inline]
148    #[cfg(not(target_arch = "x86_64"))]
149    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
150        let mut result = I16x8::zero();
151        for i in 0..8 {
152            result[i] = a[i].wrapping_mul(b[i]);
153        }
154        result
155    }
156
157    #[inline]
158    #[cfg(target_arch = "x86_64")]
159    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
160        unsafe {
161            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
162            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
163            let result = _mm_add_epi32(a_vec, b_vec);
164            let mut out = I32x4::zero();
165            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
166            out
167        }
168    }
169
170    #[inline]
171    #[cfg(not(target_arch = "x86_64"))]
172    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
173        let mut result = I32x4::zero();
174        for i in 0..4 {
175            result[i] = a[i].wrapping_add(b[i]);
176        }
177        result
178    }
179
180    #[inline]
181    #[cfg(target_arch = "x86_64")]
182    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
183        unsafe {
184            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
185            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
186            let result = _mm_sub_epi32(a_vec, b_vec);
187            let mut out = I32x4::zero();
188            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
189            out
190        }
191    }
192
193    #[inline]
194    #[cfg(not(target_arch = "x86_64"))]
195    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
196        let mut result = I32x4::zero();
197        for i in 0..4 {
198            result[i] = a[i].wrapping_sub(b[i]);
199        }
200        result
201    }
202
203    #[inline]
204    #[cfg(target_arch = "x86_64")]
205    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
206        unsafe {
207            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
208            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
209            let result = _mm_min_epi16(a_vec, b_vec);
210            let mut out = I16x8::zero();
211            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
212            out
213        }
214    }
215
216    #[inline]
217    #[cfg(not(target_arch = "x86_64"))]
218    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
219        let mut result = I16x8::zero();
220        for i in 0..8 {
221            result[i] = a[i].min(b[i]);
222        }
223        result
224    }
225
226    #[inline]
227    #[cfg(target_arch = "x86_64")]
228    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
229        unsafe {
230            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
231            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
232            let result = _mm_max_epi16(a_vec, b_vec);
233            let mut out = I16x8::zero();
234            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
235            out
236        }
237    }
238
239    #[inline]
240    #[cfg(not(target_arch = "x86_64"))]
241    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
242        let mut result = I16x8::zero();
243        for i in 0..8 {
244            result[i] = a[i].max(b[i]);
245        }
246        result
247    }
248
249    #[inline]
250    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
251        let min_vec = I16x8::splat(min);
252        let max_vec = I16x8::splat(max);
253        let clamped_min = self.max_i16x8(v, min_vec);
254        self.min_i16x8(clamped_min, max_vec)
255    }
256
257    #[inline]
258    #[cfg(target_arch = "x86_64")]
259    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
260        unsafe {
261            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
262            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
263            let result = _mm_min_epu8(a_vec, b_vec);
264            let mut out = U8x16::zero();
265            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
266            out
267        }
268    }
269
270    #[inline]
271    #[cfg(not(target_arch = "x86_64"))]
272    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
273        let mut result = U8x16::zero();
274        for i in 0..16 {
275            result[i] = a[i].min(b[i]);
276        }
277        result
278    }
279
280    #[inline]
281    #[cfg(target_arch = "x86_64")]
282    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
283        unsafe {
284            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
285            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
286            let result = _mm_max_epu8(a_vec, b_vec);
287            let mut out = U8x16::zero();
288            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
289            out
290        }
291    }
292
293    #[inline]
294    #[cfg(not(target_arch = "x86_64"))]
295    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
296        let mut result = U8x16::zero();
297        for i in 0..16 {
298            result[i] = a[i].max(b[i]);
299        }
300        result
301    }
302
303    #[inline]
304    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
305        let min_vec = U8x16::splat(min);
306        let max_vec = U8x16::splat(max);
307        let clamped_min = self.max_u8x16(v, min_vec);
308        self.min_u8x16(clamped_min, max_vec)
309    }
310
311    #[inline]
312    #[cfg(target_arch = "x86_64")]
313    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
314        unsafe {
315            let vec = _mm_loadu_si128(v.as_ptr().cast());
316            let sum1 = _mm_hadd_epi16(vec, vec);
317            let sum2 = _mm_hadd_epi16(sum1, sum1);
318            let sum3 = _mm_hadd_epi16(sum2, sum2);
319            _mm_extract_epi16(sum3, 0) as i16 as i32
320        }
321    }
322
323    #[inline]
324    #[cfg(not(target_arch = "x86_64"))]
325    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
326        v.iter().map(|&x| i32::from(x)).sum()
327    }
328
329    #[inline]
330    #[cfg(target_arch = "x86_64")]
331    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
332        unsafe {
333            let vec = _mm_loadu_si128(v.as_ptr().cast());
334            let sum1 = _mm_hadd_epi32(vec, vec);
335            let sum2 = _mm_hadd_epi32(sum1, sum1);
336            _mm_extract_epi32(sum2, 0)
337        }
338    }
339
340    #[inline]
341    #[cfg(not(target_arch = "x86_64"))]
342    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
343        v.iter().sum()
344    }
345
346    #[inline]
347    #[cfg(target_arch = "x86_64")]
348    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
349        unsafe {
350            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
351            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
352            let sad = _mm_sad_epu8(a_vec, b_vec);
353            let low = _mm_extract_epi64(sad, 0) as u32;
354            let high = _mm_extract_epi64(sad, 1) as u32;
355            low + high
356        }
357    }
358
359    #[inline]
360    #[cfg(not(target_arch = "x86_64"))]
361    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
362        a.iter()
363            .zip(b.iter())
364            .map(|(&x, &y)| u32::from(x.abs_diff(y)))
365            .sum()
366    }
367
368    #[inline]
369    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
370        assert!(a.len() >= 8 && b.len() >= 8);
371        a[..8]
372            .iter()
373            .zip(b[..8].iter())
374            .map(|(&x, &y)| u32::from(x.abs_diff(y)))
375            .sum()
376    }
377
378    #[inline]
379    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
380        assert!(a.len() >= 16 && b.len() >= 16);
381        let mut a_vec = U8x16::zero();
382        let mut b_vec = U8x16::zero();
383        a_vec.copy_from_slice(&a[..16]);
384        b_vec.copy_from_slice(&b[..16]);
385        self.sad_u8x16(a_vec, b_vec)
386    }
387
388    #[inline]
389    #[cfg(target_arch = "x86_64")]
390    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
391        unsafe {
392            let vec = _mm_loadu_si128(v.as_ptr().cast());
393            let zero = _mm_setzero_si128();
394            let result = _mm_unpacklo_epi8(vec, zero);
395            let mut out = I16x8::zero();
396            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
397            out
398        }
399    }
400
401    #[inline]
402    #[cfg(not(target_arch = "x86_64"))]
403    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
404        let mut result = I16x8::zero();
405        for i in 0..8 {
406            result[i] = i16::from(v[i]);
407        }
408        result
409    }
410
411    #[inline]
412    #[cfg(target_arch = "x86_64")]
413    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
414        unsafe {
415            let vec = _mm_loadu_si128(v.as_ptr().cast());
416            let zero = _mm_setzero_si128();
417            let result = _mm_unpackhi_epi8(vec, zero);
418            let mut out = I16x8::zero();
419            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
420            out
421        }
422    }
423
424    #[inline]
425    #[cfg(not(target_arch = "x86_64"))]
426    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
427        let mut result = I16x8::zero();
428        for i in 0..8 {
429            result[i] = i16::from(v[i + 8]);
430        }
431        result
432    }
433
434    #[inline]
435    #[cfg(target_arch = "x86_64")]
436    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
437        unsafe {
438            let low_vec = _mm_loadu_si128(low.as_ptr().cast());
439            let high_vec = _mm_loadu_si128(high.as_ptr().cast());
440            let result = _mm_packs_epi32(low_vec, high_vec);
441            let mut out = I16x8::zero();
442            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
443            out
444        }
445    }
446
447    #[inline]
448    #[cfg(not(target_arch = "x86_64"))]
449    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
450        let mut result = I16x8::zero();
451        for i in 0..4 {
452            result[i] = low[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
453            result[i + 4] = high[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
454        }
455        result
456    }
457
458    #[inline]
459    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
460        let prod = self.mul_i16x8(a, b);
461        self.add_i16x8(prod, c)
462    }
463
464    #[inline]
465    #[cfg(target_arch = "x86_64")]
466    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
467        unsafe {
468            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
469            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
470            let result = _mm_madd_epi16(a_vec, b_vec);
471            let mut out = I32x4::zero();
472            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
473            out
474        }
475    }
476
477    #[inline]
478    #[cfg(not(target_arch = "x86_64"))]
479    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
480        let mut result = I32x4::zero();
481        for i in 0..4 {
482            result[i] = i32::from(a[i * 2]) * i32::from(b[i * 2])
483                + i32::from(a[i * 2 + 1]) * i32::from(b[i * 2 + 1]);
484        }
485        result
486    }
487
488    #[inline]
489    #[cfg(target_arch = "x86_64")]
490    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
491        unsafe {
492            let vec = _mm_loadu_si128(v.as_ptr().cast());
493            let shift_vec = _mm_cvtsi32_si128(shift as i32);
494            let result = _mm_sra_epi16(vec, shift_vec);
495            let mut out = I16x8::zero();
496            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
497            out
498        }
499    }
500
501    #[inline]
502    #[cfg(not(target_arch = "x86_64"))]
503    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
504        let mut result = I16x8::zero();
505        for i in 0..8 {
506            result[i] = v[i] >> shift;
507        }
508        result
509    }
510
511    #[inline]
512    #[cfg(target_arch = "x86_64")]
513    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
514        unsafe {
515            let vec = _mm_loadu_si128(v.as_ptr().cast());
516            let shift_vec = _mm_cvtsi32_si128(shift as i32);
517            let result = _mm_sll_epi16(vec, shift_vec);
518            let mut out = I16x8::zero();
519            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
520            out
521        }
522    }
523
524    #[inline]
525    #[cfg(not(target_arch = "x86_64"))]
526    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
527        let mut result = I16x8::zero();
528        for i in 0..8 {
529            result[i] = v[i] << shift;
530        }
531        result
532    }
533
534    #[inline]
535    #[cfg(target_arch = "x86_64")]
536    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
537        unsafe {
538            let vec = _mm_loadu_si128(v.as_ptr().cast());
539            let shift_vec = _mm_cvtsi32_si128(shift as i32);
540            let result = _mm_sra_epi32(vec, shift_vec);
541            let mut out = I32x4::zero();
542            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
543            out
544        }
545    }
546
547    #[inline]
548    #[cfg(not(target_arch = "x86_64"))]
549    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
550        let mut result = I32x4::zero();
551        for i in 0..4 {
552            result[i] = v[i] >> shift;
553        }
554        result
555    }
556
557    #[inline]
558    #[cfg(target_arch = "x86_64")]
559    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
560        unsafe {
561            let vec = _mm_loadu_si128(v.as_ptr().cast());
562            let shift_vec = _mm_cvtsi32_si128(shift as i32);
563            let result = _mm_sll_epi32(vec, shift_vec);
564            let mut out = I32x4::zero();
565            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
566            out
567        }
568    }
569
570    #[inline]
571    #[cfg(not(target_arch = "x86_64"))]
572    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
573        let mut result = I32x4::zero();
574        for i in 0..4 {
575            result[i] = v[i] << shift;
576        }
577        result
578    }
579
580    #[inline]
581    #[cfg(target_arch = "x86_64")]
582    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
583        unsafe {
584            let a_vec = _mm_loadu_si128(a.as_ptr().cast());
585            let b_vec = _mm_loadu_si128(b.as_ptr().cast());
586            let result = _mm_avg_epu8(a_vec, b_vec);
587            let mut out = U8x16::zero();
588            _mm_storeu_si128(out.as_mut_ptr().cast(), result);
589            out
590        }
591    }
592
593    #[inline]
594    #[cfg(not(target_arch = "x86_64"))]
595    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
596        let mut result = U8x16::zero();
597        for i in 0..16 {
598            result[i] = ((u16::from(a[i]) + u16::from(b[i]) + 1) / 2) as u8;
599        }
600        result
601    }
602}
603
604impl SimdOpsExt for Avx512Simd {
605    #[inline]
606    fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
607        assert!(src.len() >= 4);
608        let mut result = I16x8::zero();
609        for i in 0..4 {
610            result[i] = i16::from(src[i]);
611        }
612        result
613    }
614
615    #[inline]
616    fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
617        assert!(src.len() >= 8);
618        let mut result = I16x8::zero();
619        for i in 0..8 {
620            result[i] = i16::from(src[i]);
621        }
622        result
623    }
624
625    #[inline]
626    fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
627        assert!(dst.len() >= 4);
628        for i in 0..4 {
629            dst[i] = v[i].clamp(0, 255) as u8;
630        }
631    }
632
633    #[inline]
634    fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
635        assert!(dst.len() >= 8);
636        for i in 0..8 {
637            dst[i] = v[i].clamp(0, 255) as u8;
638        }
639    }
640
641    #[inline]
642    fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
643        #[cfg(target_arch = "x86_64")]
644        {
645            unsafe {
646                let r0 = _mm_loadl_epi64(rows[0].as_ptr().cast());
647                let r1 = _mm_loadl_epi64(rows[1].as_ptr().cast());
648                let r2 = _mm_loadl_epi64(rows[2].as_ptr().cast());
649                let r3 = _mm_loadl_epi64(rows[3].as_ptr().cast());
650
651                let t0 = _mm_unpacklo_epi16(r0, r1);
652                let t1 = _mm_unpacklo_epi16(r2, r3);
653
654                let o0 = _mm_unpacklo_epi32(t0, t1);
655                let o1 = _mm_unpackhi_epi32(t0, t1);
656                let o2 = _mm_unpacklo_epi32(_mm_unpackhi_epi16(r0, r1), _mm_unpackhi_epi16(r2, r3));
657                let o3 = _mm_unpackhi_epi32(_mm_unpackhi_epi16(r0, r1), _mm_unpackhi_epi16(r2, r3));
658
659                let mut out = [I16x8::zero(); 4];
660                _mm_storeu_si128(out[0].as_mut_ptr().cast(), o0);
661                _mm_storeu_si128(out[1].as_mut_ptr().cast(), o1);
662                _mm_storeu_si128(out[2].as_mut_ptr().cast(), o2);
663                _mm_storeu_si128(out[3].as_mut_ptr().cast(), o3);
664                out
665            }
666        }
667        #[cfg(not(target_arch = "x86_64"))]
668        {
669            let mut out = [I16x8::zero(); 4];
670            for i in 0..4 {
671                for j in 0..4 {
672                    out[i][j] = rows[j][i];
673                }
674            }
675            out
676        }
677    }
678
679    #[inline]
680    fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
681        #[cfg(target_arch = "x86_64")]
682        {
683            unsafe {
684                let r0 = _mm_loadu_si128(rows[0].as_ptr().cast());
685                let r1 = _mm_loadu_si128(rows[1].as_ptr().cast());
686                let r2 = _mm_loadu_si128(rows[2].as_ptr().cast());
687                let r3 = _mm_loadu_si128(rows[3].as_ptr().cast());
688                let r4 = _mm_loadu_si128(rows[4].as_ptr().cast());
689                let r5 = _mm_loadu_si128(rows[5].as_ptr().cast());
690                let r6 = _mm_loadu_si128(rows[6].as_ptr().cast());
691                let r7 = _mm_loadu_si128(rows[7].as_ptr().cast());
692
693                let t0 = _mm_unpacklo_epi16(r0, r1);
694                let t1 = _mm_unpackhi_epi16(r0, r1);
695                let t2 = _mm_unpacklo_epi16(r2, r3);
696                let t3 = _mm_unpackhi_epi16(r2, r3);
697                let t4 = _mm_unpacklo_epi16(r4, r5);
698                let t5 = _mm_unpackhi_epi16(r4, r5);
699                let t6 = _mm_unpacklo_epi16(r6, r7);
700                let t7 = _mm_unpackhi_epi16(r6, r7);
701
702                let u0 = _mm_unpacklo_epi32(t0, t2);
703                let u1 = _mm_unpackhi_epi32(t0, t2);
704                let u2 = _mm_unpacklo_epi32(t1, t3);
705                let u3 = _mm_unpackhi_epi32(t1, t3);
706                let u4 = _mm_unpacklo_epi32(t4, t6);
707                let u5 = _mm_unpackhi_epi32(t4, t6);
708                let u6 = _mm_unpacklo_epi32(t5, t7);
709                let u7 = _mm_unpackhi_epi32(t5, t7);
710
711                let o0 = _mm_unpacklo_epi64(u0, u4);
712                let o1 = _mm_unpackhi_epi64(u0, u4);
713                let o2 = _mm_unpacklo_epi64(u1, u5);
714                let o3 = _mm_unpackhi_epi64(u1, u5);
715                let o4 = _mm_unpacklo_epi64(u2, u6);
716                let o5 = _mm_unpackhi_epi64(u2, u6);
717                let o6 = _mm_unpacklo_epi64(u3, u7);
718                let o7 = _mm_unpackhi_epi64(u3, u7);
719
720                let mut out = [I16x8::zero(); 8];
721                _mm_storeu_si128(out[0].as_mut_ptr().cast(), o0);
722                _mm_storeu_si128(out[1].as_mut_ptr().cast(), o1);
723                _mm_storeu_si128(out[2].as_mut_ptr().cast(), o2);
724                _mm_storeu_si128(out[3].as_mut_ptr().cast(), o3);
725                _mm_storeu_si128(out[4].as_mut_ptr().cast(), o4);
726                _mm_storeu_si128(out[5].as_mut_ptr().cast(), o5);
727                _mm_storeu_si128(out[6].as_mut_ptr().cast(), o6);
728                _mm_storeu_si128(out[7].as_mut_ptr().cast(), o7);
729                out
730            }
731        }
732        #[cfg(not(target_arch = "x86_64"))]
733        {
734            let mut out = [I16x8::zero(); 8];
735            for i in 0..8 {
736                for j in 0..8 {
737                    out[i][j] = rows[j][i];
738                }
739            }
740            out
741        }
742    }
743
744    #[inline]
745    fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
746        let sum = self.add_i16x8(a, b);
747        let diff = self.sub_i16x8(a, b);
748        (sum, diff)
749    }
750
751    #[inline]
752    fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
753        let sum = self.add_i32x4(a, b);
754        let diff = self.sub_i32x4(a, b);
755        (sum, diff)
756    }
757}