1#![allow(unsafe_code)]
7#![allow(
8 clippy::transmute_undefined_repr,
9 clippy::missing_transmute_annotations
10)]
11
12use crate::simd::traits::{SimdOps, SimdOpsExt};
13use crate::simd::types::{I16x8, I32x4, U8x16};
14
15#[cfg(target_arch = "aarch64")]
16use std::arch::aarch64::*;
17
/// Marker type implementing this crate's SIMD traits using ARM NEON
/// intrinsics on aarch64, with scalar fallbacks on every other target.
///
/// Zero-sized and freely copyable; construct via [`NeonSimd::new`] or
/// `NeonSimd::default()`.
// `Default` derived so the type satisfies clippy's `new_without_default`
// (it already exposes a `pub const fn new()`).
#[derive(Clone, Copy, Debug, Default)]
pub struct NeonSimd;
21
22impl NeonSimd {
23 #[inline]
30 #[must_use]
31 pub const fn new() -> Self {
32 Self
33 }
34
35 #[inline]
37 #[must_use]
38 pub fn is_available() -> bool {
39 #[cfg(target_arch = "aarch64")]
40 {
41 true
43 }
44 #[cfg(all(target_arch = "arm", target_feature = "neon"))]
45 {
46 true
47 }
48 #[cfg(not(any(
49 target_arch = "aarch64",
50 all(target_arch = "arm", target_feature = "neon")
51 )))]
52 {
53 false
54 }
55 }
56}
57
impl SimdOps for NeonSimd {
    // Pattern used by every aarch64 method below: load the operands into
    // NEON registers (vld1q_*), apply a single intrinsic, and write the
    // lanes back into a fresh zeroed value (vst1q_*).  Each such method has
    // a #[cfg(not(target_arch = "aarch64"))] scalar twin with matching
    // semantics so the trait works on all targets.
    //
    // SAFETY, for all unsafe blocks in this impl: the vld1q_*/vst1q_* calls
    // assume I16x8 / I32x4 / U8x16 store their lanes contiguously, so
    // as_ptr()/as_mut_ptr() is valid for a full 128-bit access — presumably
    // guaranteed by crate::simd::types (TODO confirm there).  NEON itself is
    // a baseline feature of aarch64, so no runtime detection is required.

    /// Backend identifier used for dispatch and diagnostics.
    #[inline]
    fn name(&self) -> &'static str {
        "neon"
    }

    /// Instance-method forwarder to the inherent [`NeonSimd::is_available`].
    #[inline]
    fn is_available(&self) -> bool {
        Self::is_available()
    }

    /// Lane-wise i16 addition.  `vaddq_s16` wraps on overflow, matching the
    /// scalar twin's `wrapping_add`.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let result = vaddq_s16(a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise wrapping i16 addition.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].wrapping_add(b[i]);
        }
        result
    }

    /// Lane-wise i16 subtraction (wrapping, like the scalar twin).
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let result = vsubq_s16(a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise wrapping i16 subtraction.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].wrapping_sub(b[i]);
        }
        result
    }

    /// Lane-wise i16 multiplication; `vmulq_s16` keeps the low 16 bits of
    /// each product, i.e. wrapping semantics like the scalar twin.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let result = vmulq_s16(a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise wrapping i16 multiplication.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].wrapping_mul(b[i]);
        }
        result
    }

    /// Lane-wise i32 addition (wrapping).
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        unsafe {
            let a_vec = vld1q_s32(a.as_ptr());
            let b_vec = vld1q_s32(b.as_ptr());
            let result = vaddq_s32(a_vec, b_vec);
            let mut out = I32x4::zero();
            vst1q_s32(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise wrapping i32 addition.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        let mut result = I32x4::zero();
        for i in 0..4 {
            result[i] = a[i].wrapping_add(b[i]);
        }
        result
    }

    /// Lane-wise i32 subtraction (wrapping).
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        unsafe {
            let a_vec = vld1q_s32(a.as_ptr());
            let b_vec = vld1q_s32(b.as_ptr());
            let result = vsubq_s32(a_vec, b_vec);
            let mut out = I32x4::zero();
            vst1q_s32(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise wrapping i32 subtraction.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        let mut result = I32x4::zero();
        for i in 0..4 {
            result[i] = a[i].wrapping_sub(b[i]);
        }
        result
    }

    /// Lane-wise signed i16 minimum.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let result = vminq_s16(a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise i16 minimum.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].min(b[i]);
        }
        result
    }

    /// Lane-wise signed i16 maximum.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let result = vmaxq_s16(a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise i16 maximum.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].max(b[i]);
        }
        result
    }

    /// Clamps each i16 lane to `[min, max]`, built from the max/min ops so
    /// both the NEON and scalar paths are reused.
    #[inline]
    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
        let min_vec = I16x8::splat(min);
        let max_vec = I16x8::splat(max);
        let clamped_min = self.max_i16x8(v, min_vec);
        self.min_i16x8(clamped_min, max_vec)
    }

    /// Lane-wise unsigned u8 minimum.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        unsafe {
            let a_vec = vld1q_u8(a.as_ptr());
            let b_vec = vld1q_u8(b.as_ptr());
            let result = vminq_u8(a_vec, b_vec);
            let mut out = U8x16::zero();
            vst1q_u8(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise u8 minimum.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        let mut result = U8x16::zero();
        for i in 0..16 {
            result[i] = a[i].min(b[i]);
        }
        result
    }

    /// Lane-wise unsigned u8 maximum.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        unsafe {
            let a_vec = vld1q_u8(a.as_ptr());
            let b_vec = vld1q_u8(b.as_ptr());
            let result = vmaxq_u8(a_vec, b_vec);
            let mut out = U8x16::zero();
            vst1q_u8(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise u8 maximum.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        let mut result = U8x16::zero();
        for i in 0..16 {
            result[i] = a[i].max(b[i]);
        }
        result
    }

    /// Clamps each u8 lane to `[min, max]` via the max/min ops above.
    #[inline]
    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
        let min_vec = U8x16::splat(min);
        let max_vec = U8x16::splat(max);
        let clamped_min = self.max_u8x16(v, min_vec);
        self.min_u8x16(clamped_min, max_vec)
    }

    /// Sums all eight i16 lanes into an i32.
    ///
    /// vpaddlq_s16 pairwise-widens to 4 x i32, vpaddlq_s32 to 2 x i64; the
    /// transmute exposes those two i64 lanes as an array.  The final
    /// `as i32` is exact: |sum of 8 i16| <= 8 * 32768, well within i32.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
        unsafe {
            let vec = vld1q_s16(v.as_ptr());
            let pair_sum = vpaddlq_s16(vec);
            let quad_sum = vpaddlq_s32(pair_sum);
            let arr: [i64; 2] = std::mem::transmute(quad_sum);
            (arr[0] + arr[1]) as i32
        }
    }

    /// Scalar fallback: widen each lane to i32 and sum.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
        v.iter().map(|&x| i32::from(x)).sum()
    }

    /// Sums all four i32 lanes into an i32.
    ///
    /// NOTE(review): the NEON path accumulates in i64 and then truncates
    /// with `as i32` (wrapping), while the scalar fallback sums directly in
    /// i32 (which panics on overflow in debug builds).  The two paths only
    /// agree when the true sum fits in i32 — confirm callers guarantee that.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
        unsafe {
            let vec = vld1q_s32(v.as_ptr());
            let pair_sum = vpaddlq_s32(vec);
            let arr: [i64; 2] = std::mem::transmute(pair_sum);
            (arr[0] + arr[1]) as i32
        }
    }

    /// Scalar fallback: direct i32 sum of the four lanes.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
        v.iter().sum()
    }

    /// Sum of absolute differences over 16 byte lanes.
    ///
    /// vabdq_u8 gives |a - b| per lane; the vpaddlq_* chain pairwise-widens
    /// u8 -> u16 -> u32 -> u64 while summing.  The result fits u32 exactly
    /// (max 16 * 255 = 4080).
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
        unsafe {
            let a_vec = vld1q_u8(a.as_ptr());
            let b_vec = vld1q_u8(b.as_ptr());

            let diff = vabdq_u8(a_vec, b_vec);

            let sum16 = vpaddlq_u8(diff);
            let sum32 = vpaddlq_u16(sum16);
            let sum64 = vpaddlq_u32(sum32);
            let arr: [u64; 2] = std::mem::transmute(sum64);
            (arr[0] + arr[1]) as u32
        }
    }

    /// Scalar fallback: sum of `abs_diff` over the 16 lane pairs.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
        a.iter()
            .zip(b.iter())
            .map(|(&x, &y): (&u8, &u8)| u32::from(x.abs_diff(y)))
            .sum()
    }

    /// SAD over the first 8 bytes of each slice.  Scalar on every target
    /// (only half a 128-bit vector's worth of data).
    ///
    /// # Panics
    /// Panics if either slice is shorter than 8 bytes.
    #[inline]
    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
        assert!(a.len() >= 8 && b.len() >= 8);
        a[..8]
            .iter()
            .zip(b[..8].iter())
            .map(|(&x, &y)| u32::from(x.abs_diff(y)))
            .sum()
    }

    /// SAD over the first 16 bytes of each slice; copies into `U8x16` and
    /// defers to [`Self::sad_u8x16`] so the NEON path is used when present.
    ///
    /// # Panics
    /// Panics if either slice is shorter than 16 bytes.
    #[inline]
    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
        assert!(a.len() >= 16 && b.len() >= 16);
        let mut a_vec = U8x16::zero();
        let mut b_vec = U8x16::zero();
        a_vec.copy_from_slice(&a[..16]);
        b_vec.copy_from_slice(&b[..16]);
        self.sad_u8x16(a_vec, b_vec)
    }

    /// Zero-extends the low 8 byte lanes (0..8) to i16.
    ///
    /// vmovl_u8 zero-extends u8 -> u16; since every value is <= 255, the
    /// transmute reinterpreting uint16x8_t as int16x8_t is value-preserving.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
        unsafe {
            let vec = vld1q_u8(v.as_ptr());
            let low = vget_low_u8(vec);
            let widened = vmovl_u8(low);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), std::mem::transmute(widened));
            out
        }
    }

    /// Scalar fallback: widen lanes 0..8.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = i16::from(v[i]);
        }
        result
    }

    /// Zero-extends the high 8 byte lanes (8..16) to i16.  Same transmute
    /// reasoning as [`Self::widen_low_u8_to_i16`].
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
        unsafe {
            let vec = vld1q_u8(v.as_ptr());
            let high = vget_high_u8(vec);
            let widened = vmovl_u8(high);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), std::mem::transmute(widened));
            out
        }
    }

    /// Scalar fallback: widen lanes 8..16.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = i16::from(v[i + 8]);
        }
        result
    }

    /// Narrows two i32x4 vectors into one i16x8 with saturation:
    /// vqmovn_s32 clamps each lane to [i16::MIN, i16::MAX], matching the
    /// scalar twin's explicit `clamp`.  `low` fills lanes 0..4, `high` 4..8.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
        unsafe {
            let low_vec = vld1q_s32(low.as_ptr());
            let high_vec = vld1q_s32(high.as_ptr());
            let narrow_low = vqmovn_s32(low_vec);
            let narrow_high = vqmovn_s32(high_vec);
            let result = vcombine_s16(narrow_low, narrow_high);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: saturating narrow via `clamp` then cast.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..4 {
            result[i] = low[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
            result[i + 4] = high[i].clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16;
        }
        result
    }

    /// Fused multiply-add per i16 lane: `a * b + c`, wrapping.
    /// vmlaq_s16 takes the addend first: vmlaq_s16(c, a, b) = c + a * b.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
        unsafe {
            let a_vec = vld1q_s16(a.as_ptr());
            let b_vec = vld1q_s16(b.as_ptr());
            let c_vec = vld1q_s16(c.as_ptr());
            let result = vmlaq_s16(c_vec, a_vec, b_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: wrapping `a * b + c` per lane.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = a[i].wrapping_mul(b[i]).wrapping_add(c[i]);
        }
        result
    }

    /// Multiplies adjacent i16 pairs and adds each pair into an i32 lane
    /// (SSE2 `pmaddwd` semantics).  Computed in widened i32, so no
    /// intermediate overflow.  Scalar on every target — no NEON path here.
    #[inline]
    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
        let mut result = I32x4::zero();
        for i in 0..4 {
            result[i] = i32::from(a[i * 2]) * i32::from(b[i * 2])
                + i32::from(a[i * 2 + 1]) * i32::from(b[i * 2 + 1]);
        }
        result
    }

    /// Arithmetic right shift of each i16 lane by `shift`.
    ///
    /// NEON has no variable right-shift intrinsic taking a scalar count:
    /// vshlq_s16 shifts left by a signed per-lane amount, and a negative
    /// amount shifts right — hence the negated splat.
    /// NOTE(review): for shift >= 16 the NEON and scalar paths may diverge
    /// (scalar `>>` panics in debug builds) — assumes callers keep shift < 16.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        unsafe {
            let vec = vld1q_s16(v.as_ptr());
            let shift_vec = vdupq_n_s16(-(shift as i16));
            let result = vshlq_s16(vec, shift_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: `>>` on i16 is an arithmetic shift in Rust,
    /// matching the signed NEON path.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = v[i] >> shift;
        }
        result
    }

    /// Left shift of each i16 lane by `shift` (positive splat into vshlq).
    /// Same shift-count caveat as [`Self::shr_i16x8`].
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        unsafe {
            let vec = vld1q_s16(v.as_ptr());
            let shift_vec = vdupq_n_s16(shift as i16);
            let result = vshlq_s16(vec, shift_vec);
            let mut out = I16x8::zero();
            vst1q_s16(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise `<<`.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = v[i] << shift;
        }
        result
    }

    /// Arithmetic right shift of each i32 lane (negated-count vshlq trick,
    /// as in [`Self::shr_i16x8`]).
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        unsafe {
            let vec = vld1q_s32(v.as_ptr());
            let shift_vec = vdupq_n_s32(-(shift as i32));
            let result = vshlq_s32(vec, shift_vec);
            let mut out = I32x4::zero();
            vst1q_s32(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: arithmetic `>>` per i32 lane.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        let mut result = I32x4::zero();
        for i in 0..4 {
            result[i] = v[i] >> shift;
        }
        result
    }

    /// Left shift of each i32 lane by `shift`.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        unsafe {
            let vec = vld1q_s32(v.as_ptr());
            let shift_vec = vdupq_n_s32(shift as i32);
            let result = vshlq_s32(vec, shift_vec);
            let mut out = I32x4::zero();
            vst1q_s32(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: lane-wise `<<` per i32 lane.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        let mut result = I32x4::zero();
        for i in 0..4 {
            result[i] = v[i] << shift;
        }
        result
    }

    /// Rounding average of each u8 lane pair: vrhaddq_u8 computes
    /// (a + b + 1) >> 1 without intermediate overflow, which is exactly
    /// what the scalar twin does via u16 widening.
    #[inline]
    #[cfg(target_arch = "aarch64")]
    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        unsafe {
            let a_vec = vld1q_u8(a.as_ptr());
            let b_vec = vld1q_u8(b.as_ptr());
            let result = vrhaddq_u8(a_vec, b_vec);
            let mut out = U8x16::zero();
            vst1q_u8(out.as_mut_ptr(), result);
            out
        }
    }

    /// Scalar fallback: rounding average computed in u16 to avoid overflow.
    #[inline]
    #[cfg(not(target_arch = "aarch64"))]
    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        let mut result = U8x16::zero();
        for i in 0..16 {
            result[i] = ((u16::from(a[i]) + u16::from(b[i]) + 1) / 2) as u8;
        }
        result
    }
}
635
impl SimdOpsExt for NeonSimd {
    /// Zero-extends `src[0..4]` into i16 lanes 0..4; lanes 4..8 stay zero.
    /// Scalar on every target.
    ///
    /// # Panics
    /// Panics if `src.len() < 4`.
    #[inline]
    fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        assert!(src.len() >= 4);
        let mut result = I16x8::zero();
        for i in 0..4 {
            result[i] = i16::from(src[i]);
        }
        result
    }

    /// Zero-extends `src[0..8]` into all eight i16 lanes.  Scalar on every
    /// target.
    ///
    /// # Panics
    /// Panics if `src.len() < 8`.
    #[inline]
    fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        assert!(src.len() >= 8);
        let mut result = I16x8::zero();
        for i in 0..8 {
            result[i] = i16::from(src[i]);
        }
        result
    }

    /// Stores lanes 0..4 into `dst`, saturating each i16 to the u8 range
    /// [0, 255] before the cast.
    ///
    /// # Panics
    /// Panics if `dst.len() < 4`.
    #[inline]
    fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        assert!(dst.len() >= 4);
        for i in 0..4 {
            dst[i] = v[i].clamp(0, 255) as u8;
        }
    }

    /// Stores all eight lanes into `dst` with the same [0, 255] saturation.
    ///
    /// # Panics
    /// Panics if `dst.len() < 8`.
    #[inline]
    fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        assert!(dst.len() >= 8);
        for i in 0..8 {
            dst[i] = v[i].clamp(0, 255) as u8;
        }
    }

    /// Transposes the 4x4 i16 block held in the low four lanes of each row.
    ///
    /// aarch64 path: vld1_s16 loads only the low 64 bits (four lanes) of
    /// each row, so the inputs' upper lanes are ignored, and because the
    /// outputs start zeroed and vst1_s16 writes back only 64 bits, the
    /// outputs' upper lanes are zero.  Two-stage transpose: vtrn_s16
    /// interleaves adjacent 16-bit lanes, then vtrn_s32 (the same registers
    /// reinterpreted as 2 x i32 via transmute) swaps 32-bit halves.
    ///
    /// SAFETY: relies on I16x8 exposing contiguous lanes via
    /// as_ptr()/as_mut_ptr() (see the note in the SimdOps impl).
    #[inline]
    fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                let r0 = vld1_s16(rows[0].as_ptr());
                let r1 = vld1_s16(rows[1].as_ptr());
                let r2 = vld1_s16(rows[2].as_ptr());
                let r3 = vld1_s16(rows[3].as_ptr());

                let t0 = vtrn_s16(r0, r1);
                let t1 = vtrn_s16(r2, r3);

                let t2 = vtrn_s32(std::mem::transmute(t0.0), std::mem::transmute(t1.0));
                let t3 = vtrn_s32(std::mem::transmute(t0.1), std::mem::transmute(t1.1));

                let mut out = [I16x8::zero(); 4];
                vst1_s16(out[0].as_mut_ptr(), std::mem::transmute(t2.0));
                vst1_s16(out[1].as_mut_ptr(), std::mem::transmute(t2.1));
                vst1_s16(out[2].as_mut_ptr(), std::mem::transmute(t3.0));
                vst1_s16(out[3].as_mut_ptr(), std::mem::transmute(t3.1));
                out
            }
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            // Scalar fallback: plain index swap over the 4x4 block; the
            // upper four lanes of each output row stay zero, matching the
            // NEON path.
            let mut out = [I16x8::zero(); 4];
            for i in 0..4 {
                for j in 0..4 {
                    out[i][j] = rows[j][i];
                }
            }
            out
        }
    }

    /// Transposes a full 8x8 i16 block.
    ///
    /// aarch64 path is the classic three-stage NEON transpose:
    /// 1. vtrnq_s16 interleaves adjacent 16-bit lanes of row pairs;
    /// 2. vtrnq_s32 (registers reinterpreted as 4 x i32) swaps 32-bit pairs;
    /// 3. vcombine_s16 of low/high 64-bit halves performs the final
    ///    64-bit exchange, yielding the transposed rows o0..o7.
    ///
    /// SAFETY: same contiguous-lane assumption as transpose_4x4_i16.
    #[inline]
    fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
        #[cfg(target_arch = "aarch64")]
        {
            unsafe {
                let r0 = vld1q_s16(rows[0].as_ptr());
                let r1 = vld1q_s16(rows[1].as_ptr());
                let r2 = vld1q_s16(rows[2].as_ptr());
                let r3 = vld1q_s16(rows[3].as_ptr());
                let r4 = vld1q_s16(rows[4].as_ptr());
                let r5 = vld1q_s16(rows[5].as_ptr());
                let r6 = vld1q_s16(rows[6].as_ptr());
                let r7 = vld1q_s16(rows[7].as_ptr());

                // Stage 1: 16-bit interleave of adjacent row pairs.
                let t0 = vtrnq_s16(r0, r1);
                let t1 = vtrnq_s16(r2, r3);
                let t2 = vtrnq_s16(r4, r5);
                let t3 = vtrnq_s16(r6, r7);

                // Stage 2: 32-bit interleave across pair results.
                let u0 = vtrnq_s32(std::mem::transmute(t0.0), std::mem::transmute(t1.0));
                let u1 = vtrnq_s32(std::mem::transmute(t0.1), std::mem::transmute(t1.1));
                let u2 = vtrnq_s32(std::mem::transmute(t2.0), std::mem::transmute(t3.0));
                let u3 = vtrnq_s32(std::mem::transmute(t2.1), std::mem::transmute(t3.1));

                // Stage 3: recombine 64-bit halves into the final rows.
                let o0 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u0.0)),
                    vget_low_s16(std::mem::transmute(u2.0)),
                );
                let o1 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u0.1)),
                    vget_low_s16(std::mem::transmute(u2.1)),
                );
                let o2 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u1.0)),
                    vget_low_s16(std::mem::transmute(u3.0)),
                );
                let o3 = vcombine_s16(
                    vget_low_s16(std::mem::transmute(u1.1)),
                    vget_low_s16(std::mem::transmute(u3.1)),
                );
                let o4 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u0.0)),
                    vget_high_s16(std::mem::transmute(u2.0)),
                );
                let o5 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u0.1)),
                    vget_high_s16(std::mem::transmute(u2.1)),
                );
                let o6 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u1.0)),
                    vget_high_s16(std::mem::transmute(u3.0)),
                );
                let o7 = vcombine_s16(
                    vget_high_s16(std::mem::transmute(u1.1)),
                    vget_high_s16(std::mem::transmute(u3.1)),
                );

                let mut out = [I16x8::zero(); 8];
                vst1q_s16(out[0].as_mut_ptr(), o0);
                vst1q_s16(out[1].as_mut_ptr(), o1);
                vst1q_s16(out[2].as_mut_ptr(), o2);
                vst1q_s16(out[3].as_mut_ptr(), o3);
                vst1q_s16(out[4].as_mut_ptr(), o4);
                vst1q_s16(out[5].as_mut_ptr(), o5);
                vst1q_s16(out[6].as_mut_ptr(), o6);
                vst1q_s16(out[7].as_mut_ptr(), o7);
                out
            }
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            // Scalar fallback: straightforward index-swap transpose.
            let mut out = [I16x8::zero(); 8];
            for i in 0..8 {
                for j in 0..8 {
                    out[i][j] = rows[j][i];
                }
            }
            out
        }
    }

    /// Butterfly step on i16 lanes: returns `(a + b, a - b)` using the
    /// wrapping SimdOps lane ops, so the NEON path is used when available.
    #[inline]
    fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
        let sum = self.add_i16x8(a, b);
        let diff = self.sub_i16x8(a, b);
        (sum, diff)
    }

    /// Butterfly step on i32 lanes: returns `(a + b, a - b)`.
    #[inline]
    fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
        let sum = self.add_i32x4(a, b);
        let diff = self.sub_i32x4(a, b);
        (sum, diff)
    }
}