1#![allow(unused)]
2
3use wide::u32x4;
4use wide::u32x8 as S;
5
6#[inline(always)]
10pub fn transpose(m: [S; 8]) -> [S; 8] {
11 _transpose(m)
12}
13
/// Packs four 2-bit lane selectors into a single shuffle immediate.
///
/// The layout follows the C `_MM_SHUFFLE(z, y, x, w)` macro: `z` lands in
/// bits 7..6, `y` in bits 5..4, `x` in bits 3..2 and `w` in bits 1..0.
/// Arguments are not masked, so a value above 3 spills into the neighbouring
/// field.
#[inline(always)]
const fn _mm_shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
    let upper_fields = (z << 6) | (y << 4);
    let lower_fields = (x << 2) | w;
    (upper_fields | lower_fields) as i32
}
22
/// AVX implementation of the 8×8 transpose.
///
/// The rows are reinterpreted as `__m256` registers and rearranged in three
/// stages: 32-bit interleave of adjacent row pairs, 64-bit pair selection
/// across those results, and finally recombination of 128-bit halves. Only
/// data-movement intrinsics are used, so treating the `u32` lanes as `f32`
/// lanes does not alter any bit pattern.
#[inline(always)]
#[cfg(target_feature = "avx")]
fn _transpose(m: [S; 8]) -> [S; 8] {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;
    use core::mem::transmute;

    // Shuffle immediates: take elements 0,1 (resp. 2,3) of each 128-bit lane
    // from the first operand, followed by the same elements of the second.
    const LOW_PAIRS: i32 = _mm_shuffle(1, 0, 1, 0);
    const HIGH_PAIRS: i32 = _mm_shuffle(3, 2, 3, 2);

    // SAFETY: this function is only compiled when the `avx` target feature is
    // enabled, so the AVX intrinsics below are available. Both transmutes are
    // between `[S; 8]` and `[__m256; 8]`; `transmute` rejects mismatched sizes
    // at compile time.
    unsafe {
        let [r0, r1, r2, r3, r4, r5, r6, r7]: [__m256; 8] = transmute(m);

        // Stage 1: interleave 32-bit elements of each adjacent pair of rows.
        let lo01 = _mm256_unpacklo_ps(r0, r1);
        let hi01 = _mm256_unpackhi_ps(r0, r1);
        let lo23 = _mm256_unpacklo_ps(r2, r3);
        let hi23 = _mm256_unpackhi_ps(r2, r3);
        let lo45 = _mm256_unpacklo_ps(r4, r5);
        let hi45 = _mm256_unpackhi_ps(r4, r5);
        let lo67 = _mm256_unpacklo_ps(r6, r7);
        let hi67 = _mm256_unpackhi_ps(r6, r7);

        // Stage 2: gather matching element pairs from rows 0-3 and rows 4-7.
        let q0 = _mm256_shuffle_ps(lo01, lo23, LOW_PAIRS);
        let q1 = _mm256_shuffle_ps(lo01, lo23, HIGH_PAIRS);
        let q2 = _mm256_shuffle_ps(hi01, hi23, LOW_PAIRS);
        let q3 = _mm256_shuffle_ps(hi01, hi23, HIGH_PAIRS);
        let q4 = _mm256_shuffle_ps(lo45, lo67, LOW_PAIRS);
        let q5 = _mm256_shuffle_ps(lo45, lo67, HIGH_PAIRS);
        let q6 = _mm256_shuffle_ps(hi45, hi67, LOW_PAIRS);
        let q7 = _mm256_shuffle_ps(hi45, hi67, HIGH_PAIRS);

        // Stage 3: 0x20 joins the low 128-bit halves of both operands and
        // 0x31 joins the high halves, yielding the finished output rows.
        let transposed: [__m256; 8] = [
            _mm256_permute2f128_ps(q0, q4, 0x20),
            _mm256_permute2f128_ps(q1, q5, 0x20),
            _mm256_permute2f128_ps(q2, q6, 0x20),
            _mm256_permute2f128_ps(q3, q7, 0x20),
            _mm256_permute2f128_ps(q0, q4, 0x31),
            _mm256_permute2f128_ps(q1, q5, 0x31),
            _mm256_permute2f128_ps(q2, q6, 0x31),
            _mm256_permute2f128_ps(q3, q7, 0x31),
        ];
        transmute(transposed)
    }
}
63
/// NEON implementation of the 8×8 transpose.
///
/// The matrix is viewed as sixteen 4-lane halves (two per row) forming four
/// 4×4 quadrants. Each quadrant is transposed with `transpose_4x4_neon`, and
/// the two off-diagonal quadrants swap places when the rows are reassembled.
#[inline(always)]
#[cfg(target_feature = "neon")]
fn _transpose(m: [S; 8]) -> [S; 8] {
    use core::mem::transmute;

    // SAFETY: `transmute` verifies at compile time that `[S; 8]` and
    // `[u32x4; 16]` have the same size. This relies on each `S` being laid
    // out as its low four lanes followed by its high four lanes.
    // NOTE(review): confirm that layout against the `wide` crate in use.
    let halves: [u32x4; 16] = unsafe { transmute(m) };

    // Even indices are the low halves of rows, odd indices the high halves.
    // Constant indices into a fixed-size array need no unchecked access.
    let top_left = transpose_4x4_neon(halves[0], halves[2], halves[4], halves[6]);
    let top_right = transpose_4x4_neon(halves[1], halves[3], halves[5], halves[7]);
    let bottom_left = transpose_4x4_neon(halves[8], halves[10], halves[12], halves[14]);
    let bottom_right = transpose_4x4_neon(halves[9], halves[11], halves[13], halves[15]);

    // Output rows 0-3 pair the transposed top-left quadrant with the
    // transposed bottom-left one; rows 4-7 pair top-right with bottom-right.
    // An array is used (rather than a tuple) because only arrays guarantee
    // that elements are stored contiguously in index order.
    let reassembled: [u32x4; 16] = [
        top_left[0],
        bottom_left[0],
        top_left[1],
        bottom_left[1],
        top_left[2],
        bottom_left[2],
        top_left[3],
        bottom_left[3],
        top_right[0],
        bottom_right[0],
        top_right[1],
        bottom_right[1],
        top_right[2],
        bottom_right[2],
        top_right[3],
        bottom_right[3],
    ];

    // SAFETY: inverse of the reinterpretation above; same size, same lane
    // ordering assumption.
    unsafe { transmute(reassembled) }
}
116
/// Transposes a 4×4 block of `u32` values given as four row vectors.
///
/// Returns the four columns of the input, in order, as row vectors.
#[inline(always)]
#[cfg(target_feature = "neon")]
fn transpose_4x4_neon(m0: u32x4, m1: u32x4, m2: u32x4, m3: u32x4) -> [u32x4; 4] {
    use core::arch::aarch64::vzipq_u32;
    use core::mem::transmute;

    // SAFETY: this function is only compiled when the `neon` target feature
    // is enabled, so `vzipq_u32` is available. The transmutes convert between
    // `u32x4` and the NEON vector type (and arrays of them); `transmute`
    // rejects mismatched sizes at compile time.
    unsafe {
        // First zip pairs rows two apart: (m0, m2) and (m1, m3).
        let even_rows = vzipq_u32(transmute(m0), transmute(m2));
        let odd_rows = vzipq_u32(transmute(m1), transmute(m3));

        // Second zip interleaves those results, leaving one full column per
        // output vector.
        let columns_01 = vzipq_u32(even_rows.0, odd_rows.0);
        let columns_23 = vzipq_u32(even_rows.1, odd_rows.1);

        // Build an array explicitly: unlike a tuple, its element order in
        // memory is guaranteed.
        transmute([columns_01.0, columns_01.1, columns_23.0, columns_23.1])
    }
}
129
130#[inline(always)]
131#[cfg(not(any(target_feature = "avx", target_feature = "neon")))]
132fn _transpose(m: [S; 8]) -> [S; 8] {
133 unsafe {
134 let m = m.map(|v| v.to_array());
135 [0, 1, 2, 3, 4, 5, 6, 7].map(|i| S::new(m.map(|v| *v.get_unchecked(i))))
136 }
137}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Lane/row indices used to generate the test matrices.
    const INDICES: [u32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];

    #[test]
    fn test_transpose() {
        // Element (row, col) holds `10 * row + col`, so every value encodes
        // its own position: rows are 0..=7, 10..=17, ..., 70..=77.
        let input = INDICES.map(|row| S::new(INDICES.map(|col| 10 * row + col)));

        // The transpose must hold the original (col, row) element at (row, col).
        let expected = INDICES.map(|row| S::new(INDICES.map(|col| 10 * col + row)));

        assert_eq!(transpose(input), expected);
    }
}