Skip to main content

oximedia_codec/simd/arm/
neon.rs

1//! ARM NEON SIMD implementation.
2//!
3//! This module provides optimized implementations using ARM NEON instructions,
4//! available on all ARMv7-A with NEON (2008+) and all ARMv8-A/AArch64 (2011+) processors.
5
6#![allow(unsafe_code)]
7#![allow(
8    clippy::transmute_undefined_repr,
9    clippy::missing_transmute_annotations
10)]
11
12use crate::simd::traits::{SimdOps, SimdOpsExt};
13use crate::simd::types::{I16x8, I32x4, U8x16};
14
15#[cfg(target_arch = "aarch64")]
16use std::arch::aarch64::*;
17
/// ARM NEON SIMD implementation.
///
/// Zero-sized marker type: it carries no state, and all operations are
/// provided through its `SimdOps`/`SimdOpsExt` trait impls. Being a unit
/// struct, it is free to construct, copy, and pass by value.
#[derive(Clone, Copy, Debug)]
pub struct NeonSimd;
21
22impl NeonSimd {
23    /// Create a new NEON SIMD instance.
24    ///
25    /// # Safety
26    ///
27    /// On AArch64, NEON is always available. On ARMv7, the caller must
28    /// ensure NEON is available before calling SIMD operations.
29    #[inline]
30    #[must_use]
31    pub const fn new() -> Self {
32        Self
33    }
34
35    /// Check if NEON is available at runtime.
36    #[inline]
37    #[must_use]
38    pub fn is_available() -> bool {
39        #[cfg(target_arch = "aarch64")]
40        {
41            // On AArch64, NEON is always available
42            true
43        }
44        #[cfg(all(target_arch = "arm", target_feature = "neon"))]
45        {
46            true
47        }
48        #[cfg(not(any(
49            target_arch = "aarch64",
50            all(target_arch = "arm", target_feature = "neon")
51        )))]
52        {
53            false
54        }
55    }
56}
57
58impl SimdOps for NeonSimd {
59    #[inline]
60    fn name(&self) -> &'static str {
61        "neon"
62    }
63
64    #[inline]
65    fn is_available(&self) -> bool {
66        Self::is_available()
67    }
68
69    // ========================================================================
70    // Vector Arithmetic
71    // ========================================================================
72
73    #[inline]
74    #[cfg(target_arch = "aarch64")]
75    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
76        // SAFETY: NEON is always available on AArch64
77        unsafe {
78            let a_vec = vld1q_s16(a.as_ptr());
79            let b_vec = vld1q_s16(b.as_ptr());
80            let result = vaddq_s16(a_vec, b_vec);
81            let mut out = I16x8::zero();
82            vst1q_s16(out.as_mut_ptr(), result);
83            out
84        }
85    }
86
87    #[inline]
88    #[cfg(not(target_arch = "aarch64"))]
89    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
90        let mut result = I16x8::zero();
91        for i in 0..8 {
92            result[i] = a[i].wrapping_add(b[i]);
93        }
94        result
95    }
96
97    #[inline]
98    #[cfg(target_arch = "aarch64")]
99    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
100        unsafe {
101            let a_vec = vld1q_s16(a.as_ptr());
102            let b_vec = vld1q_s16(b.as_ptr());
103            let result = vsubq_s16(a_vec, b_vec);
104            let mut out = I16x8::zero();
105            vst1q_s16(out.as_mut_ptr(), result);
106            out
107        }
108    }
109
110    #[inline]
111    #[cfg(not(target_arch = "aarch64"))]
112    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
113        let mut result = I16x8::zero();
114        for i in 0..8 {
115            result[i] = a[i].wrapping_sub(b[i]);
116        }
117        result
118    }
119
120    #[inline]
121    #[cfg(target_arch = "aarch64")]
122    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
123        unsafe {
124            let a_vec = vld1q_s16(a.as_ptr());
125            let b_vec = vld1q_s16(b.as_ptr());
126            let result = vmulq_s16(a_vec, b_vec);
127            let mut out = I16x8::zero();
128            vst1q_s16(out.as_mut_ptr(), result);
129            out
130        }
131    }
132
133    #[inline]
134    #[cfg(not(target_arch = "aarch64"))]
135    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
136        let mut result = I16x8::zero();
137        for i in 0..8 {
138            result[i] = a[i].wrapping_mul(b[i]);
139        }
140        result
141    }
142
143    #[inline]
144    #[cfg(target_arch = "aarch64")]
145    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
146        unsafe {
147            let a_vec = vld1q_s32(a.as_ptr());
148            let b_vec = vld1q_s32(b.as_ptr());
149            let result = vaddq_s32(a_vec, b_vec);
150            let mut out = I32x4::zero();
151            vst1q_s32(out.as_mut_ptr(), result);
152            out
153        }
154    }
155
156    #[inline]
157    #[cfg(not(target_arch = "aarch64"))]
158    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
159        let mut result = I32x4::zero();
160        for i in 0..4 {
161            result[i] = a[i].wrapping_add(b[i]);
162        }
163        result
164    }
165
166    #[inline]
167    #[cfg(target_arch = "aarch64")]
168    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
169        unsafe {
170            let a_vec = vld1q_s32(a.as_ptr());
171            let b_vec = vld1q_s32(b.as_ptr());
172            let result = vsubq_s32(a_vec, b_vec);
173            let mut out = I32x4::zero();
174            vst1q_s32(out.as_mut_ptr(), result);
175            out
176        }
177    }
178
179    #[inline]
180    #[cfg(not(target_arch = "aarch64"))]
181    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
182        let mut result = I32x4::zero();
183        for i in 0..4 {
184            result[i] = a[i].wrapping_sub(b[i]);
185        }
186        result
187    }
188
189    // ========================================================================
190    // Min/Max/Clamp
191    // ========================================================================
192
193    #[inline]
194    #[cfg(target_arch = "aarch64")]
195    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
196        unsafe {
197            let a_vec = vld1q_s16(a.as_ptr());
198            let b_vec = vld1q_s16(b.as_ptr());
199            let result = vminq_s16(a_vec, b_vec);
200            let mut out = I16x8::zero();
201            vst1q_s16(out.as_mut_ptr(), result);
202            out
203        }
204    }
205
206    #[inline]
207    #[cfg(not(target_arch = "aarch64"))]
208    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
209        let mut result = I16x8::zero();
210        for i in 0..8 {
211            result[i] = a[i].min(b[i]);
212        }
213        result
214    }
215
216    #[inline]
217    #[cfg(target_arch = "aarch64")]
218    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
219        unsafe {
220            let a_vec = vld1q_s16(a.as_ptr());
221            let b_vec = vld1q_s16(b.as_ptr());
222            let result = vmaxq_s16(a_vec, b_vec);
223            let mut out = I16x8::zero();
224            vst1q_s16(out.as_mut_ptr(), result);
225            out
226        }
227    }
228
229    #[inline]
230    #[cfg(not(target_arch = "aarch64"))]
231    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
232        let mut result = I16x8::zero();
233        for i in 0..8 {
234            result[i] = a[i].max(b[i]);
235        }
236        result
237    }
238
239    #[inline]
240    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
241        let min_vec = I16x8::splat(min);
242        let max_vec = I16x8::splat(max);
243        let clamped_min = self.max_i16x8(v, min_vec);
244        self.min_i16x8(clamped_min, max_vec)
245    }
246
247    #[inline]
248    #[cfg(target_arch = "aarch64")]
249    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
250        unsafe {
251            let a_vec = vld1q_u8(a.as_ptr());
252            let b_vec = vld1q_u8(b.as_ptr());
253            let result = vminq_u8(a_vec, b_vec);
254            let mut out = U8x16::zero();
255            vst1q_u8(out.as_mut_ptr(), result);
256            out
257        }
258    }
259
260    #[inline]
261    #[cfg(not(target_arch = "aarch64"))]
262    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
263        let mut result = U8x16::zero();
264        for i in 0..16 {
265            result[i] = a[i].min(b[i]);
266        }
267        result
268    }
269
270    #[inline]
271    #[cfg(target_arch = "aarch64")]
272    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
273        unsafe {
274            let a_vec = vld1q_u8(a.as_ptr());
275            let b_vec = vld1q_u8(b.as_ptr());
276            let result = vmaxq_u8(a_vec, b_vec);
277            let mut out = U8x16::zero();
278            vst1q_u8(out.as_mut_ptr(), result);
279            out
280        }
281    }
282
283    #[inline]
284    #[cfg(not(target_arch = "aarch64"))]
285    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
286        let mut result = U8x16::zero();
287        for i in 0..16 {
288            result[i] = a[i].max(b[i]);
289        }
290        result
291    }
292
293    #[inline]
294    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
295        let min_vec = U8x16::splat(min);
296        let max_vec = U8x16::splat(max);
297        let clamped_min = self.max_u8x16(v, min_vec);
298        self.min_u8x16(clamped_min, max_vec)
299    }
300
301    // ========================================================================
302    // Horizontal Operations
303    // ========================================================================
304
305    #[inline]
306    #[cfg(target_arch = "aarch64")]
307    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
308        unsafe {
309            let vec = vld1q_s16(v.as_ptr());
310            // Add pairs: 8 elements -> 4 elements
311            let pair_sum = vpaddlq_s16(vec);
312            // Add pairs again: 4 elements -> 2 elements
313            let quad_sum = vpaddlq_s32(pair_sum);
314            // Final horizontal add
315            let arr: [i64; 2] = std::mem::transmute(quad_sum);
316            (arr[0] + arr[1]) as i32
317        }
318    }
319
320    #[inline]
321    #[cfg(not(target_arch = "aarch64"))]
322    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
323        v.iter().map(|&x| i32::from(x)).sum()
324    }
325
326    #[inline]
327    #[cfg(target_arch = "aarch64")]
328    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
329        unsafe {
330            let vec = vld1q_s32(v.as_ptr());
331            let pair_sum = vpaddlq_s32(vec);
332            let arr: [i64; 2] = std::mem::transmute(pair_sum);
333            (arr[0] + arr[1]) as i32
334        }
335    }
336
337    #[inline]
338    #[cfg(not(target_arch = "aarch64"))]
339    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
340        v.iter().sum()
341    }
342
343    // ========================================================================
344    // SAD (Sum of Absolute Differences)
345    // ========================================================================
346
347    #[inline]
348    #[cfg(target_arch = "aarch64")]
349    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
350        unsafe {
351            let a_vec = vld1q_u8(a.as_ptr());
352            let b_vec = vld1q_u8(b.as_ptr());
353
354            // Compute absolute difference
355            let diff = vabdq_u8(a_vec, b_vec);
356
357            // Sum all elements by repeated pairwise addition
358            let sum16 = vpaddlq_u8(diff); // 16xu8 -> 8xu16
359            let sum32 = vpaddlq_u16(sum16); // 8xu16 -> 4xu32
360            let sum64 = vpaddlq_u32(sum32); // 4xu32 -> 2xu64
361
362            let arr: [u64; 2] = std::mem::transmute(sum64);
363            (arr[0] + arr[1]) as u32
364        }
365    }
366
367    #[inline]
368    #[cfg(not(target_arch = "aarch64"))]
369    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
370        a.iter()
371            .zip(b.iter())
372            .map(|(&x, &y): (&u8, &u8)| u32::from(x.abs_diff(y)))
373            .sum()
374    }
375
376    #[inline]
377    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
378        assert!(a.len() >= 8 && b.len() >= 8);
379        a[..8]
380            .iter()
381            .zip(b[..8].iter())
382            .map(|(&x, &y)| u32::from(x.abs_diff(y)))
383            .sum()
384    }
385
386    #[inline]
387    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
388        assert!(a.len() >= 16 && b.len() >= 16);
389        let mut a_vec = U8x16::zero();
390        let mut b_vec = U8x16::zero();
391        a_vec.copy_from_slice(&a[..16]);
392        b_vec.copy_from_slice(&b[..16]);
393        self.sad_u8x16(a_vec, b_vec)
394    }
395
396    // ========================================================================
397    // Widening/Narrowing
398    // ========================================================================
399
400    #[inline]
401    #[cfg(target_arch = "aarch64")]
402    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
403        unsafe {
404            let vec = vld1q_u8(v.as_ptr());
405            let low = vget_low_u8(vec);
406            let widened = vmovl_u8(low);
407            let mut out = I16x8::zero();
408            vst1q_s16(out.as_mut_ptr(), std::mem::transmute(widened));
409            out
410        }
411    }
412
413    #[inline]
414    #[cfg(not(target_arch = "aarch64"))]
415    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
416        let mut result = I16x8::zero();
417        for i in 0..8 {
418            result[i] = i16::from(v[i]);
419        }
420        result
421    }
422
423    #[inline]
424    #[cfg(target_arch = "aarch64")]
425    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
426        unsafe {
427            let vec = vld1q_u8(v.as_ptr());
428            let high = vget_high_u8(vec);
429            let widened = vmovl_u8(high);
430            let mut out = I16x8::zero();
431            vst1q_s16(out.as_mut_ptr(), std::mem::transmute(widened));
432            out
433        }
434    }
435
436    #[inline]
437    #[cfg(not(target_arch = "aarch64"))]
438    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
439        let mut result = I16x8::zero();
440        for i in 0..8 {
441            result[i] = i16::from(v[i + 8]);
442        }
443        result
444    }
445
446    #[inline]
447    #[cfg(target_arch = "aarch64")]
448    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
449        unsafe {
450            let low_vec = vld1q_s32(low.as_ptr());
451            let high_vec = vld1q_s32(high.as_ptr());
452            let narrow_low = vqmovn_s32(low_vec);
453            let narrow_high = vqmovn_s32(high_vec);
454            let result = vcombine_s16(narrow_low, narrow_high);
455            let mut out = I16x8::zero();
456            vst1q_s16(out.as_mut_ptr(), result);
457            out
458        }
459    }
460
461    #[inline]
462    #[cfg(not(target_arch = "aarch64"))]
463    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
464        let mut result = I16x8::zero();
465        for i in 0..4 {
466            result[i] = low[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
467            result[i + 4] = high[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
468        }
469        result
470    }
471
472    // ========================================================================
473    // Multiply-Add
474    // ========================================================================
475
476    #[inline]
477    #[cfg(target_arch = "aarch64")]
478    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
479        unsafe {
480            let a_vec = vld1q_s16(a.as_ptr());
481            let b_vec = vld1q_s16(b.as_ptr());
482            let c_vec = vld1q_s16(c.as_ptr());
483            let result = vmlaq_s16(c_vec, a_vec, b_vec);
484            let mut out = I16x8::zero();
485            vst1q_s16(out.as_mut_ptr(), result);
486            out
487        }
488    }
489
490    #[inline]
491    #[cfg(not(target_arch = "aarch64"))]
492    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
493        let mut result = I16x8::zero();
494        for i in 0..8 {
495            result[i] = a[i].wrapping_mul(b[i]).wrapping_add(c[i]);
496        }
497        result
498    }
499
500    #[inline]
501    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
502        // NEON doesn't have a direct pmaddwd equivalent
503        // Emulate: multiply pairs and add adjacent results
504        let mut result = I32x4::zero();
505        for i in 0..4 {
506            result[i] = i32::from(a[i * 2]) * i32::from(b[i * 2])
507                + i32::from(a[i * 2 + 1]) * i32::from(b[i * 2 + 1]);
508        }
509        result
510    }
511
512    // ========================================================================
513    // Shift Operations
514    // ========================================================================
515
516    #[inline]
517    #[cfg(target_arch = "aarch64")]
518    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
519        unsafe {
520            let vec = vld1q_s16(v.as_ptr());
521            let shift_vec = vdupq_n_s16(-(shift as i16));
522            let result = vshlq_s16(vec, shift_vec);
523            let mut out = I16x8::zero();
524            vst1q_s16(out.as_mut_ptr(), result);
525            out
526        }
527    }
528
529    #[inline]
530    #[cfg(not(target_arch = "aarch64"))]
531    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
532        let mut result = I16x8::zero();
533        for i in 0..8 {
534            result[i] = v[i] >> shift;
535        }
536        result
537    }
538
539    #[inline]
540    #[cfg(target_arch = "aarch64")]
541    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
542        unsafe {
543            let vec = vld1q_s16(v.as_ptr());
544            let shift_vec = vdupq_n_s16(shift as i16);
545            let result = vshlq_s16(vec, shift_vec);
546            let mut out = I16x8::zero();
547            vst1q_s16(out.as_mut_ptr(), result);
548            out
549        }
550    }
551
552    #[inline]
553    #[cfg(not(target_arch = "aarch64"))]
554    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
555        let mut result = I16x8::zero();
556        for i in 0..8 {
557            result[i] = v[i] << shift;
558        }
559        result
560    }
561
562    #[inline]
563    #[cfg(target_arch = "aarch64")]
564    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
565        unsafe {
566            let vec = vld1q_s32(v.as_ptr());
567            let shift_vec = vdupq_n_s32(-(shift as i32));
568            let result = vshlq_s32(vec, shift_vec);
569            let mut out = I32x4::zero();
570            vst1q_s32(out.as_mut_ptr(), result);
571            out
572        }
573    }
574
575    #[inline]
576    #[cfg(not(target_arch = "aarch64"))]
577    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
578        let mut result = I32x4::zero();
579        for i in 0..4 {
580            result[i] = v[i] >> shift;
581        }
582        result
583    }
584
585    #[inline]
586    #[cfg(target_arch = "aarch64")]
587    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
588        unsafe {
589            let vec = vld1q_s32(v.as_ptr());
590            let shift_vec = vdupq_n_s32(shift as i32);
591            let result = vshlq_s32(vec, shift_vec);
592            let mut out = I32x4::zero();
593            vst1q_s32(out.as_mut_ptr(), result);
594            out
595        }
596    }
597
598    #[inline]
599    #[cfg(not(target_arch = "aarch64"))]
600    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
601        let mut result = I32x4::zero();
602        for i in 0..4 {
603            result[i] = v[i] << shift;
604        }
605        result
606    }
607
608    // ========================================================================
609    // Averaging
610    // ========================================================================
611
612    #[inline]
613    #[cfg(target_arch = "aarch64")]
614    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
615        unsafe {
616            let a_vec = vld1q_u8(a.as_ptr());
617            let b_vec = vld1q_u8(b.as_ptr());
618            let result = vrhaddq_u8(a_vec, b_vec); // Rounding halving add
619            let mut out = U8x16::zero();
620            vst1q_u8(out.as_mut_ptr(), result);
621            out
622        }
623    }
624
625    #[inline]
626    #[cfg(not(target_arch = "aarch64"))]
627    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
628        let mut result = U8x16::zero();
629        for i in 0..16 {
630            result[i] = ((u16::from(a[i]) + u16::from(b[i]) + 1) / 2) as u8;
631        }
632        result
633    }
634}
635
impl SimdOpsExt for NeonSimd {
    /// Load the first 4 bytes of `src` into lanes 0..4 of an `I16x8`,
    /// zero-extending each byte; lanes 4..8 remain zero.
    ///
    /// Kept scalar on all targets — presumably a 4-byte load is too small to
    /// benefit from NEON here; confirm with profiling before vectorizing.
    ///
    /// # Panics
    ///
    /// Panics if `src.len() < 4`.
    #[inline]
    fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        assert!(src.len() >= 4);
        let mut result = I16x8::zero();
        for i in 0..4 {
            result[i] = i16::from(src[i]);
        }
        result
    }

    /// Load the first 8 bytes of `src`, zero-extending each byte to fill
    /// all 8 lanes of an `I16x8`. Scalar on all targets.
    ///
    /// # Panics
    ///
    /// Panics if `src.len() < 8`.
    #[inline]
    fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        assert!(src.len() >= 8);
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = i16::from(src[i]);
        }
        result
    }

    /// Store lanes 0..4 of `v` into `dst`, clamping each value to the u8
    /// range `[0, 255]` before the cast (saturating store).
    ///
    /// # Panics
    ///
    /// Panics if `dst.len() < 4`.
    #[inline]
    fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        assert!(dst.len() >= 4);
        for i in 0..4 {
            dst[i] = v[i].clamp(0, 255) as u8;
        }
    }

    /// Store all 8 lanes of `v` into `dst`, clamping each value to
    /// `[0, 255]` before the cast (saturating store).
    ///
    /// # Panics
    ///
    /// Panics if `dst.len() < 8`.
    #[inline]
    fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        assert!(dst.len() >= 8);
        for i in 0..8 {
            dst[i] = v[i].clamp(0, 255) as u8;
        }
    }

    /// Transpose the 4x4 matrix held in lanes 0..4 of the four input rows.
    ///
    /// Only the first 4 lanes of each row participate; lanes 4..8 of every
    /// output row are zero (the outputs start zeroed and only a 64-bit
    /// half-register is stored back into each).
    #[inline]
    fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
        #[cfg(target_arch = "aarch64")]
        {
            // SAFETY: NEON is always available on AArch64; the transmutes
            // reinterpret same-size NEON register pairs (s16x4 <-> s32x2).
            unsafe {
                // Load 4x4 matrix (only first 4 elements of each row)
                let r0 = vld1_s16(rows[0].as_ptr());
                let r1 = vld1_s16(rows[1].as_ptr());
                let r2 = vld1_s16(rows[2].as_ptr());
                let r3 = vld1_s16(rows[3].as_ptr());

                // Transpose using interleaving:
                // stage 1 swaps 16-bit elements between row pairs,
                // stage 2 swaps 32-bit pairs between the intermediate rows.
                let t0 = vtrn_s16(r0, r1);
                let t1 = vtrn_s16(r2, r3);

                let t2 = vtrn_s32(std::mem::transmute(t0.0), std::mem::transmute(t1.0));
                let t3 = vtrn_s32(std::mem::transmute(t0.1), std::mem::transmute(t1.1));

                let mut out = [I16x8::zero(); 4];
                vst1_s16(out[0].as_mut_ptr(), std::mem::transmute(t2.0));
                vst1_s16(out[1].as_mut_ptr(), std::mem::transmute(t2.1));
                vst1_s16(out[2].as_mut_ptr(), std::mem::transmute(t3.0));
                vst1_s16(out[3].as_mut_ptr(), std::mem::transmute(t3.1));
                out
            }
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            // Scalar fallback: plain element-wise transpose of the 4x4
            // corner; lanes 4..8 of each output row stay zero.
            let mut out = [I16x8::zero(); 4];
            for i in 0..4 {
                for j in 0..4 {
                    out[i][j] = rows[j][i];
                }
            }
            out
        }
    }

    /// Transpose a full 8x8 matrix of i16 held in eight `I16x8` rows.
    ///
    /// The AArch64 path is the classic three-stage NEON transpose:
    /// vtrnq at 16-bit, then 32-bit granularity, then a final 64-bit
    /// shuffle built from vget_low/vget_high + vcombine.
    #[inline]
    fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
        #[cfg(target_arch = "aarch64")]
        {
            // SAFETY: NEON is always available on AArch64; the transmutes
            // reinterpret same-size NEON registers (s16x8 <-> s32x4).
            unsafe {
                // Load all 8 rows
                let r0 = vld1q_s16(rows[0].as_ptr());
                let r1 = vld1q_s16(rows[1].as_ptr());
                let r2 = vld1q_s16(rows[2].as_ptr());
                let r3 = vld1q_s16(rows[3].as_ptr());
                let r4 = vld1q_s16(rows[4].as_ptr());
                let r5 = vld1q_s16(rows[5].as_ptr());
                let r6 = vld1q_s16(rows[6].as_ptr());
                let r7 = vld1q_s16(rows[7].as_ptr());

                // First level of interleaving (16-bit)
                let t0 = vtrnq_s16(r0, r1);
                let t1 = vtrnq_s16(r2, r3);
                let t2 = vtrnq_s16(r4, r5);
                let t3 = vtrnq_s16(r6, r7);

                // Second level (32-bit)
                let u0 = vtrnq_s32(std::mem::transmute(t0.0), std::mem::transmute(t1.0));
                let u1 = vtrnq_s32(std::mem::transmute(t0.1), std::mem::transmute(t1.1));
                let u2 = vtrnq_s32(std::mem::transmute(t2.0), std::mem::transmute(t3.0));
                let u3 = vtrnq_s32(std::mem::transmute(t2.1), std::mem::transmute(t3.1));

                // Third level (64-bit) - using vtrn is limited, use manual construction
                let o0 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u0.0)),
                    vget_low_s16(std::mem::transmute(u2.0)),
                );
                let o1 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u0.1)),
                    vget_low_s16(std::mem::transmute(u2.1)),
                );
                let o2 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u1.0)),
                    vget_low_s16(std::mem::transmute(u3.0)),
                );
                let o3 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u1.1)),
                    vget_low_s16(std::mem::transmute(u3.1)),
                );
                let o4 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u0.0)),
                    vget_high_s16(std::mem::transmute(u2.0)),
                );
                let o5 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u0.1)),
                    vget_high_s16(std::mem::transmute(u2.1)),
                );
                let o6 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u1.0)),
                    vget_high_s16(std::mem::transmute(u3.0)),
                );
                let o7 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u1.1)),
                    vget_high_s16(std::mem::transmute(u3.1)),
                );

                let mut out = [I16x8::zero(); 8];
                vst1q_s16(out[0].as_mut_ptr(), o0);
                vst1q_s16(out[1].as_mut_ptr(), o1);
                vst1q_s16(out[2].as_mut_ptr(), o2);
                vst1q_s16(out[3].as_mut_ptr(), o3);
                vst1q_s16(out[4].as_mut_ptr(), o4);
                vst1q_s16(out[5].as_mut_ptr(), o5);
                vst1q_s16(out[6].as_mut_ptr(), o6);
                vst1q_s16(out[7].as_mut_ptr(), o7);
                out
            }
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            // Scalar fallback: plain element-wise transpose.
            let mut out = [I16x8::zero(); 8];
            for i in 0..8 {
                for j in 0..8 {
                    out[i][j] = rows[j][i];
                }
            }
            out
        }
    }

    /// Butterfly step: returns `(a + b, a - b)` lane-wise, reusing the
    /// trait's add/sub so the NEON path is picked up on AArch64.
    #[inline]
    fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
        let sum = self.add_i16x8(a, b);
        let diff = self.sub_i16x8(a, b);
        (sum, diff)
    }

    /// Butterfly step for i32x4: returns `(a + b, a - b)` lane-wise.
    #[inline]
    fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
        let sum = self.add_i32x4(a, b);
        let diff = self.sub_i32x4(a, b);
        (sum, diff)
    }
}