wide/
u32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse2")] {
5    #[derive(Default, Clone, Copy, PartialEq, Eq)]
6    #[repr(C, align(16))]
7    pub struct u32x4 { pub(crate) sse: m128i }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct u32x4 { pub(crate) simd: v128 }
14
15    impl Default for u32x4 {
16      fn default() -> Self {
17        Self::splat(0)
18      }
19    }
20
21    impl PartialEq for u32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(u32x4_eq(self.simd, other.simd))
24      }
25    }
26
27    impl Eq for u32x4 { }
28  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
29    use core::arch::aarch64::*;
30    #[repr(C)]
31    #[derive(Copy, Clone)]
32    pub struct u32x4 { pub(crate) neon : uint32x4_t }
33
34    impl Default for u32x4 {
35      #[inline]
36      fn default() -> Self {
37        Self::splat(0)
38      }
39    }
40
41    impl PartialEq for u32x4 {
42      #[inline]
43      fn eq(&self, other: &Self) -> bool {
44        unsafe { vminvq_u32(vceqq_u32(self.neon, other.neon))==u32::MAX }
45      }
46    }
47
48    impl Eq for u32x4 { }
49} else {
50    #[derive(Default, Clone, Copy, PartialEq, Eq)]
51    #[repr(C, align(16))]
52    pub struct u32x4 { arr: [u32;4] }
53  }
54}
55
56int_uint_consts!(u32, 4, u32x4, 128);
57
58unsafe impl Zeroable for u32x4 {}
59unsafe impl Pod for u32x4 {}
60
61impl AlignTo for u32x4 {
62  type Elem = u32;
63}
64
65impl Add for u32x4 {
66  type Output = Self;
67  #[inline]
68  fn add(self, rhs: Self) -> Self::Output {
69    pick! {
70      if #[cfg(target_feature="sse2")] {
71        Self { sse: add_i32_m128i(self.sse, rhs.sse) }
72      } else if #[cfg(target_feature="simd128")] {
73        Self { simd: u32x4_add(self.simd, rhs.simd) }
74      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
75        unsafe { Self { neon: vaddq_u32(self.neon, rhs.neon) } }
76      } else {
77        Self { arr: [
78          self.arr[0].wrapping_add(rhs.arr[0]),
79          self.arr[1].wrapping_add(rhs.arr[1]),
80          self.arr[2].wrapping_add(rhs.arr[2]),
81          self.arr[3].wrapping_add(rhs.arr[3]),
82        ]}
83      }
84    }
85  }
86}
87
88impl Sub for u32x4 {
89  type Output = Self;
90  #[inline]
91  fn sub(self, rhs: Self) -> Self::Output {
92    pick! {
93      if #[cfg(target_feature="sse2")] {
94        Self { sse: sub_i32_m128i(self.sse, rhs.sse) }
95      } else if #[cfg(target_feature="simd128")] {
96        Self { simd: u32x4_sub(self.simd, rhs.simd) }
97      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
98        unsafe {Self { neon: vsubq_u32(self.neon, rhs.neon) }}
99      } else {
100        Self { arr: [
101          self.arr[0].wrapping_sub(rhs.arr[0]),
102          self.arr[1].wrapping_sub(rhs.arr[1]),
103          self.arr[2].wrapping_sub(rhs.arr[2]),
104          self.arr[3].wrapping_sub(rhs.arr[3]),
105        ]}
106      }
107    }
108  }
109}
110
111impl Mul for u32x4 {
112  type Output = Self;
113  #[inline]
114  fn mul(self, rhs: Self) -> Self::Output {
115    pick! {
116      if #[cfg(target_feature="sse4.1")] {
117        Self { sse: mul_32_m128i(self.sse, rhs.sse) }
118      } else if #[cfg(target_feature="simd128")] {
119        Self { simd: u32x4_mul(self.simd, rhs.simd) }
120      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
121        unsafe {Self { neon: vmulq_u32(self.neon, rhs.neon) }}
122      } else {
123        let arr1: [u32; 4] = cast(self);
124        let arr2: [u32; 4] = cast(rhs);
125        cast([
126          arr1[0].wrapping_mul(arr2[0]),
127          arr1[1].wrapping_mul(arr2[1]),
128          arr1[2].wrapping_mul(arr2[2]),
129          arr1[3].wrapping_mul(arr2[3]),
130        ])
131      }
132    }
133  }
134}
135
136impl Add<u32> for u32x4 {
137  type Output = Self;
138  #[inline]
139  fn add(self, rhs: u32) -> Self::Output {
140    self.add(Self::splat(rhs))
141  }
142}
143
144impl Sub<u32> for u32x4 {
145  type Output = Self;
146  #[inline]
147  fn sub(self, rhs: u32) -> Self::Output {
148    self.sub(Self::splat(rhs))
149  }
150}
151
152impl Mul<u32> for u32x4 {
153  type Output = Self;
154  #[inline]
155  fn mul(self, rhs: u32) -> Self::Output {
156    self.mul(Self::splat(rhs))
157  }
158}
159
160impl Add<u32x4> for u32 {
161  type Output = u32x4;
162  #[inline]
163  fn add(self, rhs: u32x4) -> Self::Output {
164    u32x4::splat(self).add(rhs)
165  }
166}
167
168impl Sub<u32x4> for u32 {
169  type Output = u32x4;
170  #[inline]
171  fn sub(self, rhs: u32x4) -> Self::Output {
172    u32x4::splat(self).sub(rhs)
173  }
174}
175
176impl Mul<u32x4> for u32 {
177  type Output = u32x4;
178  #[inline]
179  fn mul(self, rhs: u32x4) -> Self::Output {
180    u32x4::splat(self).mul(rhs)
181  }
182}
183
184impl BitAnd for u32x4 {
185  type Output = Self;
186  #[inline]
187  fn bitand(self, rhs: Self) -> Self::Output {
188    pick! {
189      if #[cfg(target_feature="sse2")] {
190        Self { sse: bitand_m128i(self.sse, rhs.sse) }
191      } else if #[cfg(target_feature="simd128")] {
192        Self { simd: v128_and(self.simd, rhs.simd) }
193      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
194        unsafe {Self { neon: vandq_u32(self.neon, rhs.neon) }}
195      } else {
196        Self { arr: [
197          self.arr[0].bitand(rhs.arr[0]),
198          self.arr[1].bitand(rhs.arr[1]),
199          self.arr[2].bitand(rhs.arr[2]),
200          self.arr[3].bitand(rhs.arr[3]),
201        ]}
202      }
203    }
204  }
205}
206
207impl BitOr for u32x4 {
208  type Output = Self;
209  #[inline]
210  fn bitor(self, rhs: Self) -> Self::Output {
211    pick! {
212      if #[cfg(target_feature="sse2")] {
213        Self { sse: bitor_m128i(self.sse, rhs.sse) }
214      } else if #[cfg(target_feature="simd128")] {
215        Self { simd: v128_or(self.simd, rhs.simd) }
216      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
217        unsafe {Self { neon: vorrq_u32(self.neon, rhs.neon) }}
218      } else {
219        Self { arr: [
220          self.arr[0].bitor(rhs.arr[0]),
221          self.arr[1].bitor(rhs.arr[1]),
222          self.arr[2].bitor(rhs.arr[2]),
223          self.arr[3].bitor(rhs.arr[3]),
224        ]}
225      }
226    }
227  }
228}
229
230impl BitXor for u32x4 {
231  type Output = Self;
232  #[inline]
233  fn bitxor(self, rhs: Self) -> Self::Output {
234    pick! {
235      if #[cfg(target_feature="sse2")] {
236        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
237      } else if #[cfg(target_feature="simd128")] {
238        Self { simd: v128_xor(self.simd, rhs.simd) }
239      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
240        unsafe {Self { neon: veorq_u32(self.neon, rhs.neon) }}
241      } else {
242        Self { arr: [
243          self.arr[0].bitxor(rhs.arr[0]),
244          self.arr[1].bitxor(rhs.arr[1]),
245          self.arr[2].bitxor(rhs.arr[2]),
246          self.arr[3].bitxor(rhs.arr[3]),
247        ]}
248      }
249    }
250  }
251}
252
253macro_rules! impl_shl_t_for_u32x4 {
254  ($($shift_type:ty),+ $(,)?) => {
255    $(impl Shl<$shift_type> for u32x4 {
256      type Output = Self;
257      /// Shifts all lanes by the value given.
258      #[inline]
259      fn shl(self, rhs: $shift_type) -> Self::Output {
260        pick! {
261          if #[cfg(target_feature="sse2")] {
262            let shift = cast([rhs as u64, 0]);
263            Self { sse: shl_all_u32_m128i(self.sse, shift) }
264          } else if #[cfg(target_feature="simd128")] {
265            Self { simd: u32x4_shl(self.simd, rhs as u32) }
266          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
267            unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32(rhs as i32)) }}
268          } else {
269            let u = rhs as u32;
270            Self { arr: [
271              self.arr[0].wrapping_shl(u),
272              self.arr[1].wrapping_shl(u),
273              self.arr[2].wrapping_shl(u),
274              self.arr[3].wrapping_shl(u),
275            ]}
276          }
277        }
278      }
279    })+
280  };
281}
282impl_shl_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
283
284macro_rules! impl_shr_t_for_u32x4 {
285  ($($shift_type:ty),+ $(,)?) => {
286    $(impl Shr<$shift_type> for u32x4 {
287      type Output = Self;
288      /// Shifts all lanes by the value given.
289      #[inline]
290      fn shr(self, rhs: $shift_type) -> Self::Output {
291        pick! {
292          if #[cfg(target_feature="sse2")] {
293            let shift = cast([rhs as u64, 0]);
294            Self { sse: shr_all_u32_m128i(self.sse, shift) }
295          } else if #[cfg(target_feature="simd128")] {
296            Self { simd: u32x4_shr(self.simd, rhs as u32) }
297          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
298            unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32( -(rhs as i32))) }}
299          } else {
300            let u = rhs as u32;
301            Self { arr: [
302              self.arr[0].wrapping_shr(u),
303              self.arr[1].wrapping_shr(u),
304              self.arr[2].wrapping_shr(u),
305              self.arr[3].wrapping_shr(u),
306            ]}
307          }
308        }
309      }
310    })+
311  };
312}
313impl_shr_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
314
315/// Shifts lanes by the corresponding lane.
316///
317/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
318/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
319/// of the type. (same as `wrapping_shr`)
320impl Shr<u32x4> for u32x4 {
321  type Output = Self;
322  #[inline]
323  fn shr(self, rhs: u32x4) -> Self::Output {
324    pick! {
325      if #[cfg(target_feature="avx2")] {
326        // mask the shift count to 31 to have same behavior on all platforms
327        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
328        Self { sse: shr_each_u32_m128i(self.sse, shift_by) }
329      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
330        unsafe {
331          // mask the shift count to 31 to have same behavior on all platforms
332          // no right shift, have to pass negative value to left shift on neon
333          let shift_by = vnegq_s32(vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31))));
334          Self { neon: vshlq_u32(self.neon, shift_by) }
335        }
336      } else {
337        let arr: [u32; 4] = cast(self);
338        let rhs: [u32; 4] = cast(rhs);
339        cast([
340          arr[0].wrapping_shr(rhs[0]),
341          arr[1].wrapping_shr(rhs[1]),
342          arr[2].wrapping_shr(rhs[2]),
343          arr[3].wrapping_shr(rhs[3]),
344        ])
345      }
346    }
347  }
348}
349
350/// Shifts lanes by the corresponding lane.
351///
352/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
353/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
354/// of the type. (same as `wrapping_shl`)
355impl Shl<u32x4> for u32x4 {
356  type Output = Self;
357  #[inline]
358  fn shl(self, rhs: u32x4) -> Self::Output {
359    pick! {
360      if #[cfg(target_feature="avx2")] {
361        // mask the shift count to 31 to have same behavior on all platforms
362        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
363        Self { sse: shl_each_u32_m128i(self.sse, shift_by) }
364      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
365        unsafe {
366          // mask the shift count to 31 to have same behavior on all platforms
367          let shift_by = vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31)));
368          Self { neon: vshlq_u32(self.neon, shift_by) }
369        }
370      } else {
371        let arr: [u32; 4] = cast(self);
372        let rhs: [u32; 4] = cast(rhs);
373        cast([
374          arr[0].wrapping_shl(rhs[0]),
375          arr[1].wrapping_shl(rhs[1]),
376          arr[2].wrapping_shl(rhs[2]),
377          arr[3].wrapping_shl(rhs[3]),
378        ])
379      }
380    }
381  }
382}
383
384impl CmpEq for u32x4 {
385  type Output = Self;
386  #[inline]
387  fn simd_eq(self, rhs: Self) -> Self::Output {
388    Self::simd_eq(self, rhs)
389  }
390}
391
392impl CmpGt for u32x4 {
393  type Output = Self;
394  #[inline]
395  fn simd_gt(self, rhs: Self) -> Self::Output {
396    Self::simd_gt(self, rhs)
397  }
398}
399
400impl CmpLt for u32x4 {
401  type Output = Self;
402  #[inline]
403  fn simd_lt(self, rhs: Self) -> Self::Output {
404    // no gt, so just reverse to get same answer
405    Self::simd_gt(rhs, self)
406  }
407}
408
409impl u32x4 {
410  #[inline]
411  #[must_use]
412  pub const fn new(array: [u32; 4]) -> Self {
413    unsafe { core::mem::transmute(array) }
414  }
415  #[inline]
416  #[must_use]
417  pub fn simd_eq(self, rhs: Self) -> Self {
418    pick! {
419      if #[cfg(target_feature="sse2")] {
420        Self { sse: cmp_eq_mask_i32_m128i(self.sse, rhs.sse) }
421      } else if #[cfg(target_feature="simd128")] {
422        Self { simd: u32x4_eq(self.simd, rhs.simd) }
423      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
424        unsafe {Self { neon: vceqq_u32(self.neon, rhs.neon) }}
425      } else {
426        Self { arr: [
427          if self.arr[0] == rhs.arr[0] { u32::MAX } else { 0 },
428          if self.arr[1] == rhs.arr[1] { u32::MAX } else { 0 },
429          if self.arr[2] == rhs.arr[2] { u32::MAX } else { 0 },
430          if self.arr[3] == rhs.arr[3] { u32::MAX } else { 0 },
431        ]}
432      }
433    }
434  }
435  #[inline]
436  #[must_use]
437  pub fn simd_gt(self, rhs: Self) -> Self {
438    pick! {
439      if #[cfg(target_feature="sse2")] {
440        // no unsigned less than so inverting the high bit will get the correct result
441        let h = u32x4::splat(1 << 31);
442        Self { sse: cmp_gt_mask_i32_m128i((self ^ h).sse, (rhs ^ h).sse) }
443      } else if #[cfg(target_feature="simd128")] {
444        Self { simd: u32x4_gt(self.simd, rhs.simd) }
445      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
446        unsafe {Self { neon: vcgtq_u32(self.neon, rhs.neon) }}
447      } else {
448        Self { arr: [
449          if self.arr[0] > rhs.arr[0] { u32::MAX } else { 0 },
450          if self.arr[1] > rhs.arr[1] { u32::MAX } else { 0 },
451          if self.arr[2] > rhs.arr[2] { u32::MAX } else { 0 },
452          if self.arr[3] > rhs.arr[3] { u32::MAX } else { 0 },
453        ]}
454      }
455    }
456  }
457  #[inline]
458  #[must_use]
459  pub fn simd_lt(self, rhs: Self) -> Self {
460    // lt is just gt the other way around
461    rhs.simd_gt(self)
462  }
463
464  /// Multiplies 32x32 bit to 64 bit and then only keeps the high 32 bits of the
465  /// result. Useful for implementing divide constant value (see `t_usefulness`
466  /// example)
467  #[inline]
468  #[must_use]
469  pub fn mul_keep_high(self, rhs: Self) -> Self {
470    pick! {
471      if #[cfg(target_feature="avx2")] {
472        let a = convert_to_i64_m256i_from_u32_m128i(self.sse);
473        let b = convert_to_i64_m256i_from_u32_m128i(rhs.sse);
474        let r = mul_u64_low_bits_m256i(a, b);
475
476        // the compiler does a good job shuffling the lanes around
477        let b : [u32;8] = cast(r);
478        cast([b[1],b[3],b[5],b[7]])
479      } else if #[cfg(target_feature="sse2")] {
480        let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);
481
482        let oddp = mul_widen_u32_odd_m128i(
483          shr_imm_u64_m128i::<32>(self.sse),
484          shr_imm_u64_m128i::<32>(rhs.sse));
485
486        // the compiler does a good job shuffling the lanes around
487        let a : [u32;4]= cast(evenp);
488        let b : [u32;4]= cast(oddp);
489        cast([a[1],b[1],a[3],b[3]])
490
491      } else if #[cfg(target_feature="simd128")] {
492        let low =  u64x2_extmul_low_u32x4(self.simd, rhs.simd);
493        let high = u64x2_extmul_high_u32x4(self.simd, rhs.simd);
494
495        Self { simd: u32x4_shuffle::<1, 3, 5, 7>(low, high) }
496      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
497        unsafe {
498          let l = vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon));
499          let h = vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon));
500          u32x4 { neon: vcombine_u32(vshrn_n_u64(l,32), vshrn_n_u64(h,32)) }
501        }
502      } else {
503        let a: [u32; 4] = cast(self);
504        let b: [u32; 4] = cast(rhs);
505        cast([
506          ((u64::from(a[0]) * u64::from(b[0])) >> 32) as u32,
507          ((u64::from(a[1]) * u64::from(b[1])) >> 32) as u32,
508          ((u64::from(a[2]) * u64::from(b[2])) >> 32) as u32,
509          ((u64::from(a[3]) * u64::from(b[3])) >> 32) as u32,
510        ])
511      }
512    }
513  }
514
515  /// Multiplies corresponding 32 bit lanes and returns the 64 bit result
516  /// on the corresponding lanes.
517  ///
518  /// Effectively does two multiplies on 128 bit platforms, but is easier
519  /// to use than wrapping `mul_widen_u32_odd_m128i` individually.
520  #[inline]
521  #[must_use]
522  pub fn mul_widen(self, rhs: Self) -> u64x4 {
523    pick! {
524      if #[cfg(target_feature="avx2")] {
525        // ok to sign extend since we are throwing away the high half of the result anyway
526        let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
527        let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
528        cast(mul_u64_low_bits_m256i(a, b))
529      } else if #[cfg(target_feature="sse2")] {
530        let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);
531
532        let oddp = mul_widen_u32_odd_m128i(
533          shr_imm_u64_m128i::<32>(self.sse),
534          shr_imm_u64_m128i::<32>(rhs.sse));
535
536        u64x4 {
537          a: u64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
538          b: u64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}
539        }
540      } else if #[cfg(target_feature="simd128")] {
541        u64x4 {
542          a: u64x2 { simd: u64x2_extmul_low_u32x4(self.simd, rhs.simd) },
543          b: u64x2 { simd: u64x2_extmul_high_u32x4(self.simd, rhs.simd) },
544        }
545      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
546      unsafe {
547        u64x4 { a: u64x2 { neon: vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)) },
548                b: u64x2 { neon: vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)) } }
549        }
550      } else {
551        let a: [u32; 4] = cast(self);
552        let b: [u32; 4] = cast(rhs);
553        cast([
554          u64::from(a[0]) * u64::from(b[0]),
555          u64::from(a[1]) * u64::from(b[1]),
556          u64::from(a[2]) * u64::from(b[2]),
557          u64::from(a[3]) * u64::from(b[3]),
558        ])
559      }
560    }
561  }
562
563  #[inline]
564  #[must_use]
565  pub fn blend(self, t: Self, f: Self) -> Self {
566    pick! {
567      if #[cfg(target_feature="sse4.1")] {
568        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
569      } else if #[cfg(target_feature="simd128")] {
570        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
571      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
572        unsafe {Self { neon: vbslq_u32(self.neon, t.neon, f.neon) }}
573      } else {
574        generic_bit_blend(self, t, f)
575      }
576    }
577  }
578  #[inline]
579  #[must_use]
580  pub fn max(self, rhs: Self) -> Self {
581    pick! {
582      if #[cfg(target_feature="sse4.1")] {
583        Self { sse: max_u32_m128i(self.sse, rhs.sse) }
584      } else if #[cfg(target_feature="simd128")] {
585        Self { simd: u32x4_max(self.simd, rhs.simd) }
586      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
587        unsafe {Self { neon: vmaxq_u32(self.neon, rhs.neon) }}
588      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
589        unsafe {Self { neon: vmaxq_u16(self.neon, rhs.neon) }}
590      } else {
591        let arr: [u32; 4] = cast(self);
592        let rhs: [u32; 4] = cast(rhs);
593        cast([
594          arr[0].max(rhs[0]),
595          arr[1].max(rhs[1]),
596          arr[2].max(rhs[2]),
597          arr[3].max(rhs[3]),
598        ])
599      }
600    }
601  }
602  #[inline]
603  #[must_use]
604  pub fn min(self, rhs: Self) -> Self {
605    pick! {
606      if #[cfg(target_feature="sse4.1")] {
607        Self { sse: min_u32_m128i(self.sse, rhs.sse) }
608      } else if #[cfg(target_feature="simd128")] {
609        Self { simd: u32x4_min(self.simd, rhs.simd) }
610      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
611        unsafe {Self { neon: vminq_u32(self.neon, rhs.neon) }}
612      } else {
613        let arr: [u32; 4] = cast(self);
614        let rhs: [u32; 4] = cast(rhs);
615        cast([
616          arr[0].min(rhs[0]),
617          arr[1].min(rhs[1]),
618          arr[2].min(rhs[2]),
619          arr[3].min(rhs[3]),
620        ])
621      }
622    }
623  }
624
625  #[inline]
626  #[must_use]
627  pub fn any(self) -> bool {
628    pick! {
629      if #[cfg(target_feature="sse2")] {
630        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) != 0
631      } else if #[cfg(target_feature="simd128")] {
632        u32x4_bitmask(self.simd) != 0
633      } else {
634        let v : [u64;2] = cast(self);
635        ((v[0] | v[1]) & 0x8000000080000000) != 0
636      }
637    }
638  }
639
640  #[inline]
641  #[must_use]
642  pub fn all(self) -> bool {
643    pick! {
644      if #[cfg(target_feature="sse2")] {
645        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) == 0b1000100010001000
646      } else if #[cfg(target_feature="simd128")] {
647        u32x4_bitmask(self.simd) == 0b1111
648      } else {
649        let v : [u64;2] = cast(self);
650        (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000
651      }
652    }
653  }
654
655  #[inline]
656  #[must_use]
657  pub fn none(self) -> bool {
658    !self.any()
659  }
660
661  /// Transpose matrix of 4x4 `u32` matrix. Currently only accelerated on SSE.
662  #[must_use]
663  #[inline]
664  pub fn transpose(data: [u32x4; 4]) -> [u32x4; 4] {
665    pick! {
666      if #[cfg(target_feature="sse")] {
667        let mut e0 = data[0];
668        let mut e1 = data[1];
669        let mut e2 = data[2];
670        let mut e3 = data[3];
671
672        transpose_four_m128(
673          cast_mut(&mut e0.sse),
674          cast_mut(&mut e1.sse),
675          cast_mut(&mut e2.sse),
676          cast_mut(&mut e3.sse),
677        );
678
679        [e0, e1, e2, e3]
680      } else {
681        #[inline(always)]
682        fn transpose_column(data: &[u32x4; 4], index: usize) -> u32x4 {
683          u32x4::new([
684            data[0].as_array()[index],
685            data[1].as_array()[index],
686            data[2].as_array()[index],
687            data[3].as_array()[index],
688          ])
689        }
690
691        [
692          transpose_column(&data, 0),
693          transpose_column(&data, 1),
694          transpose_column(&data, 2),
695          transpose_column(&data, 3),
696        ]
697      }
698    }
699  }
700  
701  #[inline]
702  #[must_use]
703  pub fn to_bitmask(self) -> u32 {
704    i32x4::to_bitmask(cast(self))
705  }
706
707  #[inline]
708  pub fn to_array(self) -> [u32; 4] {
709    cast(self)
710  }
711
712  #[inline]
713  pub fn as_array(&self) -> &[u32; 4] {
714    cast_ref(self)
715  }
716
717  #[inline]
718  pub fn as_mut_array(&mut self) -> &mut [u32; 4] {
719    cast_mut(self)
720  }
721}