Skip to main content

wide/
i8x16_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse2")] {
5    #[derive(Default, Clone, Copy, PartialEq, Eq)]
6    #[repr(C, align(16))]
7    pub struct i8x16 { pub(crate) sse: m128i }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct i8x16 { pub(crate) simd: v128 }
14
15    impl Default for i8x16 {
16      fn default() -> Self {
17        Self::splat(0)
18      }
19    }
20
21    impl PartialEq for i8x16 {
22      fn eq(&self, other: &Self) -> bool {
23        u8x16_all_true(i8x16_eq(self.simd, other.simd))
24      }
25    }
26
27    impl Eq for i8x16 { }
28  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
29    use core::arch::aarch64::*;
30    #[repr(C)]
31    #[derive(Copy, Clone)]
32    pub struct i8x16 { pub(crate) neon : int8x16_t }
33
34    impl Default for i8x16 {
35      #[inline]
36      fn default() -> Self {
37        Self::splat(0)
38      }
39    }
40
41    impl PartialEq for i8x16 {
42      #[inline]
43      fn eq(&self, other: &Self) -> bool {
44        unsafe { vminvq_u8(vceqq_s8(self.neon, other.neon))==u8::MAX }
45      }
46    }
47
48    impl Eq for i8x16 { }
49  } else {
50    #[derive(Default, Clone, Copy, PartialEq, Eq)]
51    #[repr(C, align(16))]
52    pub struct i8x16 { arr: [i8;16] }
53  }
54}
55
// Expands `int_uint_consts!` for this type. The macro is defined elsewhere;
// the arguments look like lane type, lane count, vector type and total bit
// width — TODO confirm against the macro definition.
int_uint_consts!(i8, 16, i8x16, 128);

// NOTE(review): these `unsafe impl`s promise that any bit pattern (including
// all zeroes) is a valid `i8x16`. That presumably holds because every backend
// stores nothing but sixteen `i8` lanes — confirm for each backend type.
unsafe impl Zeroable for i8x16 {}
unsafe impl Pod for i8x16 {}
60
// `AlignTo` is implemented with the lane type `i8` as its element type.
impl AlignTo for i8x16 {
  type Elem = i8;
}
64
65impl Add for i8x16 {
66  type Output = Self;
67  #[inline]
68  fn add(self, rhs: Self) -> Self::Output {
69    pick! {
70      if #[cfg(target_feature="sse2")] {
71        Self { sse: add_i8_m128i(self.sse, rhs.sse) }
72      } else if #[cfg(target_feature="simd128")] {
73        Self { simd: i8x16_add(self.simd, rhs.simd) }
74      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
75        unsafe { Self { neon: vaddq_s8(self.neon, rhs.neon) } }
76      } else {
77        Self { arr: [
78          self.arr[0].wrapping_add(rhs.arr[0]),
79          self.arr[1].wrapping_add(rhs.arr[1]),
80          self.arr[2].wrapping_add(rhs.arr[2]),
81          self.arr[3].wrapping_add(rhs.arr[3]),
82          self.arr[4].wrapping_add(rhs.arr[4]),
83          self.arr[5].wrapping_add(rhs.arr[5]),
84          self.arr[6].wrapping_add(rhs.arr[6]),
85          self.arr[7].wrapping_add(rhs.arr[7]),
86          self.arr[8].wrapping_add(rhs.arr[8]),
87          self.arr[9].wrapping_add(rhs.arr[9]),
88          self.arr[10].wrapping_add(rhs.arr[10]),
89          self.arr[11].wrapping_add(rhs.arr[11]),
90          self.arr[12].wrapping_add(rhs.arr[12]),
91          self.arr[13].wrapping_add(rhs.arr[13]),
92          self.arr[14].wrapping_add(rhs.arr[14]),
93          self.arr[15].wrapping_add(rhs.arr[15]),
94        ]}
95      }
96    }
97  }
98}
99
100impl Sub for i8x16 {
101  type Output = Self;
102  #[inline]
103  fn sub(self, rhs: Self) -> Self::Output {
104    pick! {
105      if #[cfg(target_feature="sse2")] {
106        Self { sse: sub_i8_m128i(self.sse, rhs.sse) }
107      } else if #[cfg(target_feature="simd128")] {
108        Self { simd: i8x16_sub(self.simd, rhs.simd) }
109      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
110        unsafe {Self { neon: vsubq_s8(self.neon, rhs.neon) }}
111      } else {
112        Self { arr: [
113          self.arr[0].wrapping_sub(rhs.arr[0]),
114          self.arr[1].wrapping_sub(rhs.arr[1]),
115          self.arr[2].wrapping_sub(rhs.arr[2]),
116          self.arr[3].wrapping_sub(rhs.arr[3]),
117          self.arr[4].wrapping_sub(rhs.arr[4]),
118          self.arr[5].wrapping_sub(rhs.arr[5]),
119          self.arr[6].wrapping_sub(rhs.arr[6]),
120          self.arr[7].wrapping_sub(rhs.arr[7]),
121          self.arr[8].wrapping_sub(rhs.arr[8]),
122          self.arr[9].wrapping_sub(rhs.arr[9]),
123          self.arr[10].wrapping_sub(rhs.arr[10]),
124          self.arr[11].wrapping_sub(rhs.arr[11]),
125          self.arr[12].wrapping_sub(rhs.arr[12]),
126          self.arr[13].wrapping_sub(rhs.arr[13]),
127          self.arr[14].wrapping_sub(rhs.arr[14]),
128          self.arr[15].wrapping_sub(rhs.arr[15]),
129        ]}
130      }
131    }
132  }
133}
134
135impl Mul for i8x16 {
136  type Output = Self;
137
138  #[inline]
139  fn mul(self, rhs: Self) -> Self::Output {
140    // For x86 and wasm, this technically can be done explicitly by converting
141    // to `i16` then converting back after multiplication, but that may not
142    // actually be faster than auto-vectorization.
143    pick! {
144      if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
145        unsafe { Self { neon: vmulq_s8(self.neon, rhs.neon) } }
146      } else {
147        let self_array: [i8; 16] = cast(self);
148        let rhs_array: [i8; 16] = cast(rhs);
149
150        Self::new([
151          self_array[0].wrapping_mul(rhs_array[0]),
152          self_array[1].wrapping_mul(rhs_array[1]),
153          self_array[2].wrapping_mul(rhs_array[2]),
154          self_array[3].wrapping_mul(rhs_array[3]),
155          self_array[4].wrapping_mul(rhs_array[4]),
156          self_array[5].wrapping_mul(rhs_array[5]),
157          self_array[6].wrapping_mul(rhs_array[6]),
158          self_array[7].wrapping_mul(rhs_array[7]),
159          self_array[8].wrapping_mul(rhs_array[8]),
160          self_array[9].wrapping_mul(rhs_array[9]),
161          self_array[10].wrapping_mul(rhs_array[10]),
162          self_array[11].wrapping_mul(rhs_array[11]),
163          self_array[12].wrapping_mul(rhs_array[12]),
164          self_array[13].wrapping_mul(rhs_array[13]),
165          self_array[14].wrapping_mul(rhs_array[14]),
166          self_array[15].wrapping_mul(rhs_array[15]),
167        ])
168      }
169    }
170  }
171}
172
// Expands `integer_impl_div_rem!` for `i8x16`, passing the lane type, the
// vector type and the list of all sixteen lane indices. The macro is defined
// elsewhere; judging by its name it presumably provides the division and
// remainder operators — TODO confirm.
integer_impl_div_rem!(
  i8,
  i8x16,
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
);
178
179impl Shl for i8x16 {
180  type Output = Self;
181
182  /// Shifts lanes by the corresponding lane.
183  ///
184  /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
185  /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
186  /// of the type. (same as `wrapping_shl`)
187  #[inline]
188  fn shl(self, rhs: Self) -> Self::Output {
189    // For x86, this technically can be done explicitly by converting
190    // to `i16` or `i32` then converting back after multiplication, but that may
191    // not actually be faster than auto-vectorization.
192    pick! {
193      if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
194        unsafe {
195          // Mask `rhs` to 7 to match `wrapping_shl`.
196          let shift_by = vandq_s8(rhs.neon, vmovq_n_s8(7));
197          Self { neon: vshlq_s8(self.neon, shift_by) }
198        }
199      } else {
200        let self_array: [i8; 16] = cast(self);
201        let rhs_array: [i8; 16] = cast(rhs);
202
203        Self::new([
204          self_array[0].wrapping_shl(rhs_array[0] as u32),
205          self_array[1].wrapping_shl(rhs_array[1] as u32),
206          self_array[2].wrapping_shl(rhs_array[2] as u32),
207          self_array[3].wrapping_shl(rhs_array[3] as u32),
208          self_array[4].wrapping_shl(rhs_array[4] as u32),
209          self_array[5].wrapping_shl(rhs_array[5] as u32),
210          self_array[6].wrapping_shl(rhs_array[6] as u32),
211          self_array[7].wrapping_shl(rhs_array[7] as u32),
212          self_array[8].wrapping_shl(rhs_array[8] as u32),
213          self_array[9].wrapping_shl(rhs_array[9] as u32),
214          self_array[10].wrapping_shl(rhs_array[10] as u32),
215          self_array[11].wrapping_shl(rhs_array[11] as u32),
216          self_array[12].wrapping_shl(rhs_array[12] as u32),
217          self_array[13].wrapping_shl(rhs_array[13] as u32),
218          self_array[14].wrapping_shl(rhs_array[14] as u32),
219          self_array[15].wrapping_shl(rhs_array[15] as u32),
220        ])
221      }
222    }
223  }
224}
225
226impl Shr for i8x16 {
227  type Output = Self;
228
229  #[inline]
230  fn shr(self, rhs: Self) -> Self::Output {
231    // For x86, this technically can be done explicitly by converting
232    // to `i16` or `i32` then converting back after multiplication, but that may
233    // not actually be faster than auto-vectorization.
234    pick! {
235      if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
236        unsafe {
237          // Mask `rhs` to 7 to match `wrapping_shr`, and negate it because
238          // there is no shift-right intrinsic.
239          let neg_rhs = vnegq_s8(vandq_s8(rhs.neon, vmovq_n_s8(7)));
240          Self { neon: vshlq_s8(self.neon, neg_rhs) }
241        }
242      } else {
243        let self_array: [i8; 16] = cast(self);
244        let rhs_array: [i8; 16] = cast(rhs);
245
246        Self::new([
247          self_array[0].wrapping_shr(rhs_array[0] as u32),
248          self_array[1].wrapping_shr(rhs_array[1] as u32),
249          self_array[2].wrapping_shr(rhs_array[2] as u32),
250          self_array[3].wrapping_shr(rhs_array[3] as u32),
251          self_array[4].wrapping_shr(rhs_array[4] as u32),
252          self_array[5].wrapping_shr(rhs_array[5] as u32),
253          self_array[6].wrapping_shr(rhs_array[6] as u32),
254          self_array[7].wrapping_shr(rhs_array[7] as u32),
255          self_array[8].wrapping_shr(rhs_array[8] as u32),
256          self_array[9].wrapping_shr(rhs_array[9] as u32),
257          self_array[10].wrapping_shr(rhs_array[10] as u32),
258          self_array[11].wrapping_shr(rhs_array[11] as u32),
259          self_array[12].wrapping_shr(rhs_array[12] as u32),
260          self_array[13].wrapping_shr(rhs_array[13] as u32),
261          self_array[14].wrapping_shr(rhs_array[14] as u32),
262          self_array[15].wrapping_shr(rhs_array[15] as u32),
263        ])
264      }
265    }
266  }
267}
268
269impl Add<i8> for i8x16 {
270  type Output = Self;
271  #[inline]
272  fn add(self, rhs: i8) -> Self::Output {
273    self.add(Self::splat(rhs))
274  }
275}
276
277impl Sub<i8> for i8x16 {
278  type Output = Self;
279  #[inline]
280  fn sub(self, rhs: i8) -> Self::Output {
281    self.sub(Self::splat(rhs))
282  }
283}
284
285impl Mul<i8> for i8x16 {
286  type Output = Self;
287
288  #[inline]
289  fn mul(self, rhs: i8) -> Self::Output {
290    self * Self::splat(rhs)
291  }
292}
293
294macro_rules! impl_shl_scalar {
295  ($Rhs:ident) => {
296    impl Shl<$Rhs> for i8x16 {
297      type Output = Self;
298
299      /// Shifts all lanes by a uniform value.
300      ///
301      /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
302      /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
303      /// of the type. (same as `wrapping_shl`)
304      #[inline]
305      fn shl(self, rhs: $Rhs) -> Self::Output {
306        // For x86, this technically can be done explicitly by converting
307        // to `i16` or `i32` then converting back after multiplication, but that
308        // may not actually be faster than auto-vectorization.
309        pick! {
310          if #[cfg(target_feature="simd128")] {
311            // Mask `rhs` to 7 to match `wrapping_shl`.
312            Self { simd: i8x16_shl(self.simd, rhs as u32 & 7) }
313          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
314            // Mask `rhs` to 7 to match `wrapping_shl`.
315            unsafe { Self { neon: vshlq_s8(self.neon, vmovq_n_s8(rhs as i8 & 7)) } }
316          } else {
317            let self_array = self.to_array();
318            let rhs = rhs as u32;
319
320            cast([
321              self_array[0].wrapping_shl(rhs),
322              self_array[1].wrapping_shl(rhs),
323              self_array[2].wrapping_shl(rhs),
324              self_array[3].wrapping_shl(rhs),
325              self_array[4].wrapping_shl(rhs),
326              self_array[5].wrapping_shl(rhs),
327              self_array[6].wrapping_shl(rhs),
328              self_array[7].wrapping_shl(rhs),
329              self_array[8].wrapping_shl(rhs),
330              self_array[9].wrapping_shl(rhs),
331              self_array[10].wrapping_shl(rhs),
332              self_array[11].wrapping_shl(rhs),
333              self_array[12].wrapping_shl(rhs),
334              self_array[13].wrapping_shl(rhs),
335              self_array[14].wrapping_shl(rhs),
336              self_array[15].wrapping_shl(rhs),
337            ])
338          }
339        }
340      }
341    }
342  };
343}
344impl_shl_scalar!(i8);
345impl_shl_scalar!(u8);
346impl_shl_scalar!(i16);
347impl_shl_scalar!(u16);
348impl_shl_scalar!(i32);
349impl_shl_scalar!(u32);
350impl_shl_scalar!(i64);
351impl_shl_scalar!(u64);
352impl_shl_scalar!(i128);
353impl_shl_scalar!(u128);
354
355macro_rules! impl_shr_scalar {
356  ($Rhs:ident) => {
357    impl Shr<$Rhs> for i8x16 {
358      type Output = Self;
359
360      /// Shifts all lanes by a uniform value.
361      ///
362      /// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
363      /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
364      /// of the type. (same as `wrapping_shr`)
365      #[inline]
366      fn shr(self, rhs: $Rhs) -> Self::Output {
367        // For x86, this technically can be done explicitly by converting
368        // to `i16` or `i32` then converting back after multiplication, but that
369        // may not actually be faster than auto-vectorization.
370        pick! {
371          if #[cfg(target_feature="simd128")] {
372            // Mask `rhs` to 7 to match `wrapping_shr`.
373            Self { simd: i8x16_shr(self.simd, rhs as u32 & 7) }
374          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
375            // Mask `rhs` to 7 to match `wrapping_shr`, and negate it because
376            // there is no shift-right intrinsic.
377            unsafe { Self { neon: vshlq_s8(self.neon, vmovq_n_s8(-(rhs as i8 & 7))) } }
378          } else {
379            let self_array = self.to_array();
380            let rhs = rhs as u32;
381
382            cast([
383              self_array[0].wrapping_shr(rhs),
384              self_array[1].wrapping_shr(rhs),
385              self_array[2].wrapping_shr(rhs),
386              self_array[3].wrapping_shr(rhs),
387              self_array[4].wrapping_shr(rhs),
388              self_array[5].wrapping_shr(rhs),
389              self_array[6].wrapping_shr(rhs),
390              self_array[7].wrapping_shr(rhs),
391              self_array[8].wrapping_shr(rhs),
392              self_array[9].wrapping_shr(rhs),
393              self_array[10].wrapping_shr(rhs),
394              self_array[11].wrapping_shr(rhs),
395              self_array[12].wrapping_shr(rhs),
396              self_array[13].wrapping_shr(rhs),
397              self_array[14].wrapping_shr(rhs),
398              self_array[15].wrapping_shr(rhs),
399            ])
400          }
401        }
402      }
403    }
404  };
405}
406impl_shr_scalar!(i8);
407impl_shr_scalar!(u8);
408impl_shr_scalar!(i16);
409impl_shr_scalar!(u16);
410impl_shr_scalar!(i32);
411impl_shr_scalar!(u32);
412impl_shr_scalar!(i64);
413impl_shr_scalar!(u64);
414impl_shr_scalar!(i128);
415impl_shr_scalar!(u128);
416
417impl Add<i8x16> for i8 {
418  type Output = i8x16;
419  #[inline]
420  fn add(self, rhs: i8x16) -> Self::Output {
421    i8x16::splat(self).add(rhs)
422  }
423}
424
425impl Sub<i8x16> for i8 {
426  type Output = i8x16;
427  #[inline]
428  fn sub(self, rhs: i8x16) -> Self::Output {
429    i8x16::splat(self).sub(rhs)
430  }
431}
432
433impl Mul<i8x16> for i8 {
434  type Output = i8x16;
435
436  #[inline]
437  fn mul(self, rhs: i8x16) -> Self::Output {
438    i8x16::splat(self) * rhs
439  }
440}
441
442impl BitAnd for i8x16 {
443  type Output = Self;
444  #[inline]
445  fn bitand(self, rhs: Self) -> Self::Output {
446    pick! {
447      if #[cfg(target_feature="sse2")] {
448        Self { sse: bitand_m128i(self.sse, rhs.sse) }
449      } else if #[cfg(target_feature="simd128")] {
450        Self { simd: v128_and(self.simd, rhs.simd) }
451      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
452        unsafe {Self { neon: vandq_s8(self.neon, rhs.neon) }}
453      } else {
454        Self { arr: [
455          self.arr[0].bitand(rhs.arr[0]),
456          self.arr[1].bitand(rhs.arr[1]),
457          self.arr[2].bitand(rhs.arr[2]),
458          self.arr[3].bitand(rhs.arr[3]),
459          self.arr[4].bitand(rhs.arr[4]),
460          self.arr[5].bitand(rhs.arr[5]),
461          self.arr[6].bitand(rhs.arr[6]),
462          self.arr[7].bitand(rhs.arr[7]),
463          self.arr[8].bitand(rhs.arr[8]),
464          self.arr[9].bitand(rhs.arr[9]),
465          self.arr[10].bitand(rhs.arr[10]),
466          self.arr[11].bitand(rhs.arr[11]),
467          self.arr[12].bitand(rhs.arr[12]),
468          self.arr[13].bitand(rhs.arr[13]),
469          self.arr[14].bitand(rhs.arr[14]),
470          self.arr[15].bitand(rhs.arr[15]),
471        ]}
472      }
473    }
474  }
475}
476
477impl BitOr for i8x16 {
478  type Output = Self;
479  #[inline]
480  fn bitor(self, rhs: Self) -> Self::Output {
481    pick! {
482      if #[cfg(target_feature="sse2")] {
483        Self { sse: bitor_m128i(self.sse, rhs.sse) }
484      } else if #[cfg(target_feature="simd128")] {
485        Self { simd: v128_or(self.simd, rhs.simd) }
486      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
487        unsafe {Self { neon: vorrq_s8(self.neon, rhs.neon) }}
488      } else {
489        Self { arr: [
490          self.arr[0].bitor(rhs.arr[0]),
491          self.arr[1].bitor(rhs.arr[1]),
492          self.arr[2].bitor(rhs.arr[2]),
493          self.arr[3].bitor(rhs.arr[3]),
494          self.arr[4].bitor(rhs.arr[4]),
495          self.arr[5].bitor(rhs.arr[5]),
496          self.arr[6].bitor(rhs.arr[6]),
497          self.arr[7].bitor(rhs.arr[7]),
498          self.arr[8].bitor(rhs.arr[8]),
499          self.arr[9].bitor(rhs.arr[9]),
500          self.arr[10].bitor(rhs.arr[10]),
501          self.arr[11].bitor(rhs.arr[11]),
502          self.arr[12].bitor(rhs.arr[12]),
503          self.arr[13].bitor(rhs.arr[13]),
504          self.arr[14].bitor(rhs.arr[14]),
505          self.arr[15].bitor(rhs.arr[15]),
506        ]}
507      }
508    }
509  }
510}
511
512impl BitXor for i8x16 {
513  type Output = Self;
514  #[inline]
515  fn bitxor(self, rhs: Self) -> Self::Output {
516    pick! {
517      if #[cfg(target_feature="sse2")] {
518        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
519      } else if #[cfg(target_feature="simd128")] {
520        Self { simd: v128_xor(self.simd, rhs.simd) }
521      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
522        unsafe {Self { neon: veorq_s8(self.neon, rhs.neon) }}
523      } else {
524        Self { arr: [
525          self.arr[0].bitxor(rhs.arr[0]),
526          self.arr[1].bitxor(rhs.arr[1]),
527          self.arr[2].bitxor(rhs.arr[2]),
528          self.arr[3].bitxor(rhs.arr[3]),
529          self.arr[4].bitxor(rhs.arr[4]),
530          self.arr[5].bitxor(rhs.arr[5]),
531          self.arr[6].bitxor(rhs.arr[6]),
532          self.arr[7].bitxor(rhs.arr[7]),
533          self.arr[8].bitxor(rhs.arr[8]),
534          self.arr[9].bitxor(rhs.arr[9]),
535          self.arr[10].bitxor(rhs.arr[10]),
536          self.arr[11].bitxor(rhs.arr[11]),
537          self.arr[12].bitxor(rhs.arr[12]),
538          self.arr[13].bitxor(rhs.arr[13]),
539          self.arr[14].bitxor(rhs.arr[14]),
540          self.arr[15].bitxor(rhs.arr[15]),
541        ]}
542      }
543    }
544  }
545}
546
547#[expect(deprecated)]
548impl CmpEq for i8x16 {
549  type Output = Self;
550  #[inline]
551  fn simd_eq(self, rhs: Self) -> Self::Output {
552    pick! {
553      if #[cfg(target_feature="sse2")] {
554        Self { sse: cmp_eq_mask_i8_m128i(self.sse, rhs.sse) }
555      } else if #[cfg(target_feature="simd128")] {
556        Self { simd: i8x16_eq(self.simd, rhs.simd) }
557      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
558        unsafe {Self { neon: vreinterpretq_s8_u8(vceqq_s8(self.neon, rhs.neon)) }}
559      } else {
560        Self { arr: [
561          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
562          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
563          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
564          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
565          if self.arr[4] == rhs.arr[4] { -1 } else { 0 },
566          if self.arr[5] == rhs.arr[5] { -1 } else { 0 },
567          if self.arr[6] == rhs.arr[6] { -1 } else { 0 },
568          if self.arr[7] == rhs.arr[7] { -1 } else { 0 },
569          if self.arr[8] == rhs.arr[8] { -1 } else { 0 },
570          if self.arr[9] == rhs.arr[9] { -1 } else { 0 },
571          if self.arr[10] == rhs.arr[10] { -1 } else { 0 },
572          if self.arr[11] == rhs.arr[11] { -1 } else { 0 },
573          if self.arr[12] == rhs.arr[12] { -1 } else { 0 },
574          if self.arr[13] == rhs.arr[13] { -1 } else { 0 },
575          if self.arr[14] == rhs.arr[14] { -1 } else { 0 },
576          if self.arr[15] == rhs.arr[15] { -1 } else { 0 },
577        ]}
578      }
579    }
580  }
581}
582
583#[expect(deprecated)]
584impl CmpGt for i8x16 {
585  type Output = Self;
586  #[inline]
587  fn simd_gt(self, rhs: Self) -> Self::Output {
588    pick! {
589      if #[cfg(target_feature="sse2")] {
590        Self { sse: cmp_gt_mask_i8_m128i(self.sse, rhs.sse) }
591      } else if #[cfg(target_feature="simd128")] {
592        Self { simd: i8x16_gt(self.simd, rhs.simd) }
593      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
594        unsafe {Self { neon: vreinterpretq_s8_u8(vcgtq_s8(self.neon, rhs.neon)) }}
595      } else {
596        Self { arr: [
597          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
598          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
599          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
600          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
601          if self.arr[4] > rhs.arr[4] { -1 } else { 0 },
602          if self.arr[5] > rhs.arr[5] { -1 } else { 0 },
603          if self.arr[6] > rhs.arr[6] { -1 } else { 0 },
604          if self.arr[7] > rhs.arr[7] { -1 } else { 0 },
605          if self.arr[8] > rhs.arr[8] { -1 } else { 0 },
606          if self.arr[9] > rhs.arr[9] { -1 } else { 0 },
607          if self.arr[10] > rhs.arr[10] { -1 } else { 0 },
608          if self.arr[11] > rhs.arr[11] { -1 } else { 0 },
609          if self.arr[12] > rhs.arr[12] { -1 } else { 0 },
610          if self.arr[13] > rhs.arr[13] { -1 } else { 0 },
611          if self.arr[14] > rhs.arr[14] { -1 } else { 0 },
612          if self.arr[15] > rhs.arr[15] { -1 } else { 0 },
613        ]}
614      }
615    }
616  }
617}
618
619#[expect(deprecated)]
620impl CmpLt for i8x16 {
621  type Output = Self;
622  #[inline]
623  fn simd_lt(self, rhs: Self) -> Self::Output {
624    pick! {
625      if #[cfg(target_feature="sse2")] {
626        Self { sse: cmp_lt_mask_i8_m128i(self.sse, rhs.sse) }
627      } else if #[cfg(target_feature="simd128")] {
628        Self { simd: i8x16_lt(self.simd, rhs.simd) }
629      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
630        unsafe {Self { neon: vreinterpretq_s8_u8(vcltq_s8(self.neon, rhs.neon)) }}
631      } else {
632        Self { arr: [
633          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
634          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
635          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
636          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
637          if self.arr[4] < rhs.arr[4] { -1 } else { 0 },
638          if self.arr[5] < rhs.arr[5] { -1 } else { 0 },
639          if self.arr[6] < rhs.arr[6] { -1 } else { 0 },
640          if self.arr[7] < rhs.arr[7] { -1 } else { 0 },
641          if self.arr[8] < rhs.arr[8] { -1 } else { 0 },
642          if self.arr[9] < rhs.arr[9] { -1 } else { 0 },
643          if self.arr[10] < rhs.arr[10] { -1 } else { 0 },
644          if self.arr[11] < rhs.arr[11] { -1 } else { 0 },
645          if self.arr[12] < rhs.arr[12] { -1 } else { 0 },
646          if self.arr[13] < rhs.arr[13] { -1 } else { 0 },
647          if self.arr[14] < rhs.arr[14] { -1 } else { 0 },
648          if self.arr[15] < rhs.arr[15] { -1 } else { 0 },
649        ]}
650      }
651    }
652  }
653}
654
655#[expect(deprecated)]
656impl CmpNe for i8x16 {
657  type Output = Self;
658  #[inline]
659  fn simd_ne(self, rhs: Self) -> Self::Output {
660    pick! {
661      if #[cfg(target_feature="sse2")] {
662        !self.simd_eq(rhs)
663      } else if #[cfg(target_feature="simd128")] {
664        Self { simd: i8x16_ne(self.simd, rhs.simd) }
665      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
666        !self.simd_eq(rhs)
667      } else {
668        Self { arr: [
669          if self.arr[0] != rhs.arr[0] { -1 } else { 0 },
670          if self.arr[1] != rhs.arr[1] { -1 } else { 0 },
671          if self.arr[2] != rhs.arr[2] { -1 } else { 0 },
672          if self.arr[3] != rhs.arr[3] { -1 } else { 0 },
673          if self.arr[4] != rhs.arr[4] { -1 } else { 0 },
674          if self.arr[5] != rhs.arr[5] { -1 } else { 0 },
675          if self.arr[6] != rhs.arr[6] { -1 } else { 0 },
676          if self.arr[7] != rhs.arr[7] { -1 } else { 0 },
677          if self.arr[8] != rhs.arr[8] { -1 } else { 0 },
678          if self.arr[9] != rhs.arr[9] { -1 } else { 0 },
679          if self.arr[10] != rhs.arr[10] { -1 } else { 0 },
680          if self.arr[11] != rhs.arr[11] { -1 } else { 0 },
681          if self.arr[12] != rhs.arr[12] { -1 } else { 0 },
682          if self.arr[13] != rhs.arr[13] { -1 } else { 0 },
683          if self.arr[14] != rhs.arr[14] { -1 } else { 0 },
684          if self.arr[15] != rhs.arr[15] { -1 } else { 0 },
685        ]}
686      }
687    }
688  }
689}
690
691#[expect(deprecated)]
692impl CmpLe for i8x16 {
693  type Output = Self;
694  #[inline]
695  fn simd_le(self, rhs: Self) -> Self::Output {
696    pick! {
697      if #[cfg(target_feature="sse2")] {
698        !self.simd_gt(rhs)
699      } else if #[cfg(target_feature="simd128")] {
700        Self { simd: i8x16_le(self.simd, rhs.simd) }
701      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
702        !self.simd_gt(rhs)
703      } else {
704        Self { arr: [
705          if self.arr[0] <= rhs.arr[0] { -1 } else { 0 },
706          if self.arr[1] <= rhs.arr[1] { -1 } else { 0 },
707          if self.arr[2] <= rhs.arr[2] { -1 } else { 0 },
708          if self.arr[3] <= rhs.arr[3] { -1 } else { 0 },
709          if self.arr[4] <= rhs.arr[4] { -1 } else { 0 },
710          if self.arr[5] <= rhs.arr[5] { -1 } else { 0 },
711          if self.arr[6] <= rhs.arr[6] { -1 } else { 0 },
712          if self.arr[7] <= rhs.arr[7] { -1 } else { 0 },
713          if self.arr[8] <= rhs.arr[8] { -1 } else { 0 },
714          if self.arr[9] <= rhs.arr[9] { -1 } else { 0 },
715          if self.arr[10] <= rhs.arr[10] { -1 } else { 0 },
716          if self.arr[11] <= rhs.arr[11] { -1 } else { 0 },
717          if self.arr[12] <= rhs.arr[12] { -1 } else { 0 },
718          if self.arr[13] <= rhs.arr[13] { -1 } else { 0 },
719          if self.arr[14] <= rhs.arr[14] { -1 } else { 0 },
720          if self.arr[15] <= rhs.arr[15] { -1 } else { 0 },
721        ]}
722      }
723    }
724  }
725}
726
727#[expect(deprecated)]
728impl CmpGe for i8x16 {
729  type Output = Self;
730  #[inline]
731  fn simd_ge(self, rhs: Self) -> Self::Output {
732    pick! {
733      if #[cfg(target_feature="sse2")] {
734        !self.simd_lt(rhs)
735      } else if #[cfg(target_feature="simd128")] {
736        Self { simd: i8x16_ge(self.simd, rhs.simd) }
737      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
738        !self.simd_lt(rhs)
739      } else {
740        Self { arr: [
741          if self.arr[0] >= rhs.arr[0] { -1 } else { 0 },
742          if self.arr[1] >= rhs.arr[1] { -1 } else { 0 },
743          if self.arr[2] >= rhs.arr[2] { -1 } else { 0 },
744          if self.arr[3] >= rhs.arr[3] { -1 } else { 0 },
745          if self.arr[4] >= rhs.arr[4] { -1 } else { 0 },
746          if self.arr[5] >= rhs.arr[5] { -1 } else { 0 },
747          if self.arr[6] >= rhs.arr[6] { -1 } else { 0 },
748          if self.arr[7] >= rhs.arr[7] { -1 } else { 0 },
749          if self.arr[8] >= rhs.arr[8] { -1 } else { 0 },
750          if self.arr[9] >= rhs.arr[9] { -1 } else { 0 },
751          if self.arr[10] >= rhs.arr[10] { -1 } else { 0 },
752          if self.arr[11] >= rhs.arr[11] { -1 } else { 0 },
753          if self.arr[12] >= rhs.arr[12] { -1 } else { 0 },
754          if self.arr[13] >= rhs.arr[13] { -1 } else { 0 },
755          if self.arr[14] >= rhs.arr[14] { -1 } else { 0 },
756          if self.arr[15] >= rhs.arr[15] { -1 } else { 0 },
757        ]}
758      }
759    }
760  }
761}
762
763impl i8x16 {
764  #[inline]
765  #[must_use]
766  pub const fn new(array: [i8; 16]) -> Self {
767    unsafe { core::mem::transmute(array) }
768  }
769
770  simd_comparison_fns!();
771
772  /// converts `i16` to `i8`, saturating values that are too large
773  #[inline]
774  #[must_use]
775  pub fn from_i16x16_saturate(v: i16x16) -> i8x16 {
776    pick! {
777      if #[cfg(target_feature="avx2")] {
778        i8x16 { sse: pack_i16_to_i8_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2))  }
779      } else if #[cfg(target_feature="sse2")] {
780        i8x16 { sse: pack_i16_to_i8_m128i( v.a.sse, v.b.sse ) }
781      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
782        use core::arch::aarch64::*;
783
784        unsafe {
785          i8x16 { neon: vcombine_s8(vqmovn_s16(v.a.neon), vqmovn_s16(v.b.neon)) }
786        }
787      } else if #[cfg(target_feature="simd128")] {
788        use core::arch::wasm32::*;
789
790        i8x16 { simd: i8x16_narrow_i16x8(v.a.simd, v.b.simd) }
791      } else {
792        fn clamp(a : i16) -> i8 {
793            if a < i8::MIN as i16 {
794              i8::MIN
795            }
796            else if a > i8::MAX as i16 {
797              i8::MAX
798            } else {
799                a as i8
800            }
801        }
802
803        i8x16::new([
804          clamp(v.as_array()[0]),
805          clamp(v.as_array()[1]),
806          clamp(v.as_array()[2]),
807          clamp(v.as_array()[3]),
808          clamp(v.as_array()[4]),
809          clamp(v.as_array()[5]),
810          clamp(v.as_array()[6]),
811          clamp(v.as_array()[7]),
812          clamp(v.as_array()[8]),
813          clamp(v.as_array()[9]),
814          clamp(v.as_array()[10]),
815          clamp(v.as_array()[11]),
816          clamp(v.as_array()[12]),
817          clamp(v.as_array()[13]),
818          clamp(v.as_array()[14]),
819          clamp(v.as_array()[15]),
820        ])
821      }
822    }
823  }
824
825  /// converts `i16` to `i8`, truncating the upper bits if they are set
826  #[inline]
827  #[must_use]
828  pub fn from_i16x16_truncate(v: i16x16) -> i8x16 {
829    pick! {
830      if #[cfg(target_feature="avx2")] {
831        let a = v.avx2.bitand(set_splat_i16_m256i(0xff));
832        i8x16 { sse: pack_i16_to_u8_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a))  }
833      } else if #[cfg(target_feature="sse2")] {
834        let mask = set_splat_i16_m128i(0xff);
835        i8x16 { sse: pack_i16_to_u8_m128i( v.a.sse.bitand(mask), v.b.sse.bitand(mask) ) }
836      } else {
837        // no super good intrinsics on other platforms... plain old codegen does a reasonable job
838        i8x16::new([
839          v.as_array()[0] as i8,
840          v.as_array()[1] as i8,
841          v.as_array()[2] as i8,
842          v.as_array()[3] as i8,
843          v.as_array()[4] as i8,
844          v.as_array()[5] as i8,
845          v.as_array()[6] as i8,
846          v.as_array()[7] as i8,
847          v.as_array()[8] as i8,
848          v.as_array()[9] as i8,
849          v.as_array()[10] as i8,
850          v.as_array()[11] as i8,
851          v.as_array()[12] as i8,
852          v.as_array()[13] as i8,
853          v.as_array()[14] as i8,
854          v.as_array()[15] as i8,
855        ])
856      }
857    }
858  }
859
860  #[inline]
861  #[must_use]
862  pub fn blend(self, t: Self, f: Self) -> Self {
863    pick! {
864      if #[cfg(target_feature="sse4.1")] {
865        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
866      } else if #[cfg(target_feature="simd128")] {
867        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
868      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
869        unsafe {Self { neon: vbslq_s8(vreinterpretq_u8_s8(self.neon), t.neon, f.neon) }}
870      } else {
871        generic_bit_blend(self, t, f)
872      }
873    }
874  }
875
876  /// Returns true for each positive element and false if it is zero or
877  /// negative.
878  #[inline]
879  #[must_use]
880  pub fn is_positive(self) -> Self {
881    pick! {
882      if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
883        Self { neon: unsafe { vreinterpretq_s8_u8(vcgtzq_s8(self.neon)) } }
884      } else {
885        self.simd_gt(Self::ZERO)
886      }
887    }
888  }
889
890  /// Returns true for each negative element and false if it is zero or
891  /// positive.
892  #[inline]
893  #[must_use]
894  pub fn is_negative(self) -> Self {
895    pick! {
896      if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
897        Self { neon: unsafe { vreinterpretq_s8_u8(vcltzq_s8(self.neon)) } }
898      } else {
899        self.simd_lt(Self::ZERO)
900      }
901    }
902  }
903
904  #[inline]
905  #[must_use]
906  pub fn reduce_add(self) -> i8 {
907    #[allow(dead_code)]
908    const SHUFFLE_1: [i8; 16] =
909      [8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0];
910    #[allow(dead_code)]
911    const SHUFFLE_2: [i8; 16] =
912      [4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0];
913    #[allow(dead_code)]
914    const SHUFFLE_3: [i8; 16] =
915      [2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
916    #[allow(dead_code)]
917    const SHUFFLE_4: [i8; 16] =
918      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
919
920    pick! {
921      if #[cfg(target_feature="ssse3")] {
922        let rhs = shuffle_av_i8z_all_m128i(self.sse, m128i::from(SHUFFLE_1));
923        let sum = add_i8_m128i(self.sse, rhs);
924        let rhs = shuffle_av_i8z_all_m128i(sum, m128i::from(SHUFFLE_2));
925        let sum = add_i8_m128i(sum, rhs);
926        let rhs = shuffle_av_i8z_all_m128i(sum, m128i::from(SHUFFLE_3));
927        let sum = add_i8_m128i(sum, rhs);
928        let rhs = shuffle_av_i8z_all_m128i(sum, m128i::from(SHUFFLE_4));
929        let sum = add_i8_m128i(sum, rhs);
930        get_i32_from_m128i_s(sum) as i8
931      } else if #[cfg(target_feature="simd128")] {
932        let rhs = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7>(self.simd, self.simd);
933        let sum = i8x16_add(self.simd, rhs);
934        let rhs = i8x16_shuffle::<4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0>(sum, sum);
935        let sum = i8x16_add(sum, rhs);
936        let rhs = i8x16_shuffle::<2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(sum, sum);
937        let sum = i8x16_add(sum, rhs);
938        let rhs = i8x16_shuffle::<1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(sum, sum);
939        let sum = i8x16_add(sum, rhs);
940        i8x16_extract_lane::<0>(sum)
941      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
942        unsafe {
943          // Use `transmute` instead of `cast` because `int8x16_t` does not
944          // implement `bytemuck::Pod`.
945          let rhs = vqtbl1q_s8(self.neon, core::mem::transmute(SHUFFLE_1));
946          let sum = vaddq_s8(self.neon, rhs);
947          let rhs = vqtbl1q_s8(sum, core::mem::transmute(SHUFFLE_2));
948          let sum = vaddq_s8(sum, rhs);
949          let rhs = vqtbl1q_s8(sum, core::mem::transmute(SHUFFLE_3));
950          let sum = vaddq_s8(sum, rhs);
951          let rhs = vqtbl1q_s8(sum, core::mem::transmute(SHUFFLE_4));
952          let sum = vaddq_s8(sum, rhs);
953          vgetq_lane_s8(sum, 0)
954        }
955      } else {
956        let array: [i8; 16] = cast(self);
957        array.into_iter().reduce(i8::wrapping_add).unwrap()
958      }
959    }
960  }
961
962  #[inline]
963  #[must_use]
964  pub fn reduce_max(self) -> i8 {
965    #[allow(dead_code)]
966    const SHUFFLE_1: [i8; 16] =
967      [8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0];
968    #[allow(dead_code)]
969    const SHUFFLE_2: [i8; 16] =
970      [4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0];
971    #[allow(dead_code)]
972    const SHUFFLE_3: [i8; 16] =
973      [2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
974    #[allow(dead_code)]
975    const SHUFFLE_4: [i8; 16] =
976      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
977
978    pick! {
979      if #[cfg(all(target_feature="ssse3", target_feature="sse4.1"))] {
980        let rhs = shuffle_av_i8z_all_m128i(self.sse, m128i::from(SHUFFLE_1));
981        let max = max_i8_m128i(self.sse, rhs);
982        let rhs = shuffle_av_i8z_all_m128i(max, m128i::from(SHUFFLE_2));
983        let max = max_i8_m128i(max, rhs);
984        let rhs = shuffle_av_i8z_all_m128i(max, m128i::from(SHUFFLE_3));
985        let max = max_i8_m128i(max, rhs);
986        let rhs = shuffle_av_i8z_all_m128i(max, m128i::from(SHUFFLE_4));
987        let max = max_i8_m128i(max, rhs);
988        get_i32_from_m128i_s(max) as i8
989      } else if #[cfg(target_feature="simd128")] {
990        let rhs = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7>(self.simd, self.simd);
991        let max = i8x16_max(self.simd, rhs);
992        let rhs = i8x16_shuffle::<4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0>(max, max);
993        let max = i8x16_max(max, rhs);
994        let rhs = i8x16_shuffle::<2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(max, max);
995        let max = i8x16_max(max, rhs);
996        let rhs = i8x16_shuffle::<1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(max, max);
997        let max = i8x16_max(max, rhs);
998        i8x16_extract_lane::<0>(max)
999      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1000        unsafe {
1001          // Use `transmute` instead of `cast` because `int8x16_t` does not
1002          // implement `bytemuck::Pod`.
1003          let rhs = vqtbl1q_s8(self.neon, core::mem::transmute(SHUFFLE_1));
1004          let max = vmaxq_s8(self.neon, rhs);
1005          let rhs = vqtbl1q_s8(max, core::mem::transmute(SHUFFLE_2));
1006          let max = vmaxq_s8(max, rhs);
1007          let rhs = vqtbl1q_s8(max, core::mem::transmute(SHUFFLE_3));
1008          let max = vmaxq_s8(max, rhs);
1009          let rhs = vqtbl1q_s8(max, core::mem::transmute(SHUFFLE_4));
1010          let max = vmaxq_s8(max, rhs);
1011          vgetq_lane_s8(max, 0)
1012        }
1013      } else {
1014        let array: [i8; 16] = cast(self);
1015        array.into_iter().reduce(i8::max).unwrap()
1016      }
1017    }
1018  }
1019
1020  #[inline]
1021  #[must_use]
1022  pub fn reduce_min(self) -> i8 {
1023    #[allow(dead_code)]
1024    const SHUFFLE_1: [i8; 16] =
1025      [8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0];
1026    #[allow(dead_code)]
1027    const SHUFFLE_2: [i8; 16] =
1028      [4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0];
1029    #[allow(dead_code)]
1030    const SHUFFLE_3: [i8; 16] =
1031      [2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
1032    #[allow(dead_code)]
1033    const SHUFFLE_4: [i8; 16] =
1034      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
1035
1036    pick! {
1037      if #[cfg(all(target_feature="ssse3", target_feature="sse4.1"))] {
1038        let rhs = shuffle_av_i8z_all_m128i(self.sse, m128i::from(SHUFFLE_1));
1039        let min = min_i8_m128i(self.sse, rhs);
1040        let rhs = shuffle_av_i8z_all_m128i(min, m128i::from(SHUFFLE_2));
1041        let min = min_i8_m128i(min, rhs);
1042        let rhs = shuffle_av_i8z_all_m128i(min, m128i::from(SHUFFLE_3));
1043        let min = min_i8_m128i(min, rhs);
1044        let rhs = shuffle_av_i8z_all_m128i(min, m128i::from(SHUFFLE_4));
1045        let min = min_i8_m128i(min, rhs);
1046        get_i32_from_m128i_s(min) as i8
1047      } else if #[cfg(target_feature="simd128")] {
1048        let rhs = i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7>(self.simd, self.simd);
1049        let min = i8x16_min(self.simd, rhs);
1050        let rhs = i8x16_shuffle::<4, 5, 6, 7, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0>(min, min);
1051        let min = i8x16_min(min, rhs);
1052        let rhs = i8x16_shuffle::<2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(min, min);
1053        let min = i8x16_min(min, rhs);
1054        let rhs = i8x16_shuffle::<1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(min, min);
1055        let min = i8x16_min(min, rhs);
1056        i8x16_extract_lane::<0>(min)
1057      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1058        unsafe {
1059          // Use `transmute` instead of `cast` because `int8x16_t` does not
1060          // implement `bytemuck::Pod`.
1061          let rhs = vqtbl1q_s8(self.neon, core::mem::transmute(SHUFFLE_1));
1062          let min = vminq_s8(self.neon, rhs);
1063          let rhs = vqtbl1q_s8(min, core::mem::transmute(SHUFFLE_2));
1064          let min = vminq_s8(min, rhs);
1065          let rhs = vqtbl1q_s8(min, core::mem::transmute(SHUFFLE_3));
1066          let min = vminq_s8(min, rhs);
1067          let rhs = vqtbl1q_s8(min, core::mem::transmute(SHUFFLE_4));
1068          let min = vminq_s8(min, rhs);
1069          vgetq_lane_s8(min, 0)
1070        }
1071      } else {
1072        let array: [i8; 16] = cast(self);
1073        array.into_iter().reduce(i8::min).unwrap()
1074      }
1075    }
1076  }
1077
1078  #[inline]
1079  #[must_use]
1080  pub fn abs(self) -> Self {
1081    pick! {
1082      if #[cfg(target_feature="ssse3")] {
1083        Self { sse: abs_i8_m128i(self.sse) }
1084      } else if #[cfg(target_feature="simd128")] {
1085        Self { simd: i8x16_abs(self.simd) }
1086      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1087        unsafe {Self { neon: vabsq_s8(self.neon) }}
1088      } else {
1089        let arr: [i8; 16] = cast(self);
1090        cast([
1091          arr[0].wrapping_abs(),
1092          arr[1].wrapping_abs(),
1093          arr[2].wrapping_abs(),
1094          arr[3].wrapping_abs(),
1095          arr[4].wrapping_abs(),
1096          arr[5].wrapping_abs(),
1097          arr[6].wrapping_abs(),
1098          arr[7].wrapping_abs(),
1099          arr[8].wrapping_abs(),
1100          arr[9].wrapping_abs(),
1101          arr[10].wrapping_abs(),
1102          arr[11].wrapping_abs(),
1103          arr[12].wrapping_abs(),
1104          arr[13].wrapping_abs(),
1105          arr[14].wrapping_abs(),
1106          arr[15].wrapping_abs(),
1107        ])
1108      }
1109    }
1110  }
1111
1112  #[inline]
1113  #[must_use]
1114  pub fn unsigned_abs(self) -> u8x16 {
1115    pick! {
1116      if #[cfg(target_feature="ssse3")] {
1117        u8x16 { sse: abs_i8_m128i(self.sse) }
1118      } else if #[cfg(target_feature="simd128")] {
1119        u8x16 { simd: i8x16_abs(self.simd) }
1120      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1121        unsafe { u8x16 { neon: vreinterpretq_u8_s8(vabsq_s8(self.neon)) }}
1122      } else {
1123        let arr: [i8; 16] = cast(self);
1124        cast(
1125          [
1126            arr[0].unsigned_abs(),
1127            arr[1].unsigned_abs(),
1128            arr[2].unsigned_abs(),
1129            arr[3].unsigned_abs(),
1130            arr[4].unsigned_abs(),
1131            arr[5].unsigned_abs(),
1132            arr[6].unsigned_abs(),
1133            arr[7].unsigned_abs(),
1134            arr[8].unsigned_abs(),
1135            arr[9].unsigned_abs(),
1136            arr[10].unsigned_abs(),
1137            arr[11].unsigned_abs(),
1138            arr[12].unsigned_abs(),
1139            arr[13].unsigned_abs(),
1140            arr[14].unsigned_abs(),
1141            arr[15].unsigned_abs(),
1142            ])
1143      }
1144    }
1145  }
1146
1147  signed_fn_signum!();
1148
1149  #[inline]
1150  #[must_use]
1151  pub fn max(self, rhs: Self) -> Self {
1152    pick! {
1153      if #[cfg(target_feature="sse4.1")] {
1154        Self { sse: max_i8_m128i(self.sse, rhs.sse) }
1155      } else if #[cfg(target_feature="simd128")] {
1156        Self { simd: i8x16_max(self.simd, rhs.simd) }
1157      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1158        unsafe {Self { neon: vmaxq_s8(self.neon, rhs.neon) }}
1159      } else {
1160        self.simd_lt(rhs).blend(rhs, self)
1161      }
1162    }
1163  }
1164  #[inline]
1165  #[must_use]
1166  pub fn min(self, rhs: Self) -> Self {
1167    pick! {
1168      if #[cfg(target_feature="sse4.1")] {
1169        Self { sse: min_i8_m128i(self.sse, rhs.sse) }
1170      } else if #[cfg(target_feature="simd128")] {
1171        Self { simd: i8x16_min(self.simd, rhs.simd) }
1172      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1173        unsafe {Self { neon: vminq_s8(self.neon, rhs.neon) }}
1174      } else {
1175        self.simd_lt(rhs).blend(self, rhs)
1176      }
1177    }
1178  }
1179
1180  integer_fn_clamp!();
1181
1182  #[inline]
1183  #[must_use]
1184  pub fn from_slice_unaligned(input: &[i8]) -> Self {
1185    assert!(input.len() >= 16);
1186
1187    pick! {
1188      if #[cfg(target_feature="sse2")] {
1189        unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } }
1190      } else if #[cfg(target_feature="simd128")] {
1191        unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } }
1192      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1193        unsafe { Self { neon: vld1q_s8( input.as_ptr() as *const i8 ) } }
1194      } else {
1195        // 2018 edition doesn't have try_into
1196        unsafe { Self::new( *(input.as_ptr() as * const [i8;16]) ) }
1197      }
1198    }
1199  }
1200
1201  #[inline]
1202  #[must_use]
1203  #[doc(alias("movemask", "move_mask"))]
1204  pub fn to_bitmask(self) -> u32 {
1205    pick! {
1206      if #[cfg(target_feature="sse2")] {
1207        move_mask_i8_m128i(self.sse) as u32
1208      } else if #[cfg(target_feature="simd128")] {
1209        i8x16_bitmask(self.simd) as u32
1210      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1211        unsafe
1212        {
1213          // set all to 1 if top bit is set, else 0
1214          let masked = vcltq_s8(self.neon, vdupq_n_s8(0));
1215
1216          // select the right bit out of each lane
1217          let selectbit : uint8x16_t = core::mem::transmute([1u8, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128]);
1218          let out = vandq_u8(masked, selectbit);
1219
1220          // interleave the lanes so that a 16-bit sum accumulates the bits in the right order
1221          let table : uint8x16_t = core::mem::transmute([0u8, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]);
1222          let r = vqtbl1q_u8(out, table);
1223
1224          // horizontally add the 16-bit lanes
1225          vaddvq_u16(vreinterpretq_u16_u8(r)) as u32
1226        }
1227       } else {
1228        ((self.arr[0] < 0) as u32) << 0 |
1229        ((self.arr[1] < 0) as u32) << 1 |
1230        ((self.arr[2] < 0) as u32) << 2 |
1231        ((self.arr[3] < 0) as u32) << 3 |
1232        ((self.arr[4] < 0) as u32) << 4 |
1233        ((self.arr[5] < 0) as u32) << 5 |
1234        ((self.arr[6] < 0) as u32) << 6 |
1235        ((self.arr[7] < 0) as u32) << 7 |
1236        ((self.arr[8] < 0) as u32) << 8 |
1237        ((self.arr[9] < 0) as u32) << 9 |
1238        ((self.arr[10] < 0) as u32) << 10 |
1239        ((self.arr[11] < 0) as u32) << 11 |
1240        ((self.arr[12] < 0) as u32) << 12 |
1241        ((self.arr[13] < 0) as u32) << 13 |
1242        ((self.arr[14] < 0) as u32) << 14 |
1243        ((self.arr[15] < 0) as u32) << 15
1244      }
1245    }
1246  }
1247
1248  #[inline]
1249  #[must_use]
1250  pub fn any(self) -> bool {
1251    pick! {
1252      if #[cfg(target_feature="sse2")] {
1253        move_mask_i8_m128i(self.sse) != 0
1254      } else if #[cfg(target_feature="simd128")] {
1255        u8x16_bitmask(self.simd) != 0
1256      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1257        unsafe {
1258          vminvq_s8(self.neon) < 0
1259        }
1260      } else {
1261        let v : [u64;2] = cast(self);
1262        ((v[0] | v[1]) & 0x80808080808080) != 0
1263      }
1264    }
1265  }
1266  #[inline]
1267  #[must_use]
1268  pub fn all(self) -> bool {
1269    pick! {
1270      if #[cfg(target_feature="sse2")] {
1271        move_mask_i8_m128i(self.sse) == 0b1111_1111_1111_1111
1272      } else if #[cfg(target_feature="simd128")] {
1273        u8x16_bitmask(self.simd) == 0b1111_1111_1111_1111
1274      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1275        unsafe {
1276          vmaxvq_s8(self.neon) < 0
1277        }
1278      } else {
1279        let v : [u64;2] = cast(self);
1280        (v[0] & v[1] & 0x80808080808080) == 0x80808080808080
1281      }
1282    }
1283  }
1284
1285  /// Returns a new vector where each element is based on the index values in
1286  /// `rhs`.
1287  ///
1288  /// * Index values in the range `[0, 15]` select the i-th element of `self`.
1289  /// * Index values that are out of range will cause that output lane to be
1290  ///   `0`.
1291  #[inline]
1292  pub fn swizzle(self, rhs: i8x16) -> i8x16 {
1293    pick! {
1294      if #[cfg(target_feature="ssse3")] {
1295        Self { sse: shuffle_av_i8z_all_m128i(self.sse, add_saturating_u8_m128i(rhs.sse, set_splat_i8_m128i(0x70))) }
1296      } else if #[cfg(target_feature="simd128")] {
1297        Self { simd: i8x16_swizzle(self.simd, rhs.simd) }
1298      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1299        unsafe { Self { neon: vqtbl1q_s8(self.neon, vreinterpretq_u8_s8(rhs.neon)) } }
1300      } else {
1301        let idxs = rhs.to_array();
1302        let arr = self.to_array();
1303        let mut out = [0i8;16];
1304        for i in 0..16 {
1305          let idx = idxs[i] as usize;
1306          if idx >= 16 {
1307            out[i] = 0;
1308          } else {
1309            out[i] = arr[idx];
1310          }
1311        }
1312        Self::new(out)
1313      }
1314    }
1315  }
1316
1317  /// Works like [`swizzle`](Self::swizzle) with the following additional
1318  /// details
1319  ///
1320  /// * Indices in the range `[0, 15]` will select the i-th element of `self`.
1321  /// * If the high bit of any index is set (meaning that the index is
1322  ///   negative), then the corresponding output lane is guaranteed to be zero.
1323  /// * Otherwise the output lane is either `0` or `self[rhs[i] % 16]`,
1324  ///   depending on the implementation.
1325  #[inline]
1326  pub fn swizzle_relaxed(self, rhs: i8x16) -> i8x16 {
1327    pick! {
1328      if #[cfg(target_feature="ssse3")] {
1329        Self { sse: shuffle_av_i8z_all_m128i(self.sse, rhs.sse) }
1330      } else if #[cfg(target_feature="simd128")] {
1331        Self { simd: i8x16_swizzle(self.simd, rhs.simd) }
1332      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1333        unsafe { Self { neon: vqtbl1q_s8(self.neon, vreinterpretq_u8_s8(rhs.neon)) } }
1334      } else {
1335        let idxs = rhs.to_array();
1336        let arr = self.to_array();
1337        let mut out = [0i8;16];
1338        for i in 0..16 {
1339          let idx = idxs[i] as usize;
1340          if idx >= 16 {
1341            out[i] = 0;
1342          } else {
1343            out[i] = arr[idx];
1344          }
1345        }
1346        Self::new(out)
1347      }
1348    }
1349  }
1350
1351  #[inline]
1352  #[must_use]
1353  pub fn none(self) -> bool {
1354    !self.any()
1355  }
1356
1357  #[inline]
1358  #[must_use]
1359  pub fn saturating_add(self, rhs: Self) -> Self {
1360    pick! {
1361      if #[cfg(target_feature="sse2")] {
1362        Self { sse: add_saturating_i8_m128i(self.sse, rhs.sse) }
1363      } else if #[cfg(target_feature="simd128")] {
1364        Self { simd: i8x16_add_sat(self.simd, rhs.simd) }
1365      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1366        unsafe {Self { neon: vqaddq_s8(self.neon, rhs.neon) }}
1367      } else {
1368        Self { arr: [
1369          self.arr[0].saturating_add(rhs.arr[0]),
1370          self.arr[1].saturating_add(rhs.arr[1]),
1371          self.arr[2].saturating_add(rhs.arr[2]),
1372          self.arr[3].saturating_add(rhs.arr[3]),
1373          self.arr[4].saturating_add(rhs.arr[4]),
1374          self.arr[5].saturating_add(rhs.arr[5]),
1375          self.arr[6].saturating_add(rhs.arr[6]),
1376          self.arr[7].saturating_add(rhs.arr[7]),
1377          self.arr[8].saturating_add(rhs.arr[8]),
1378          self.arr[9].saturating_add(rhs.arr[9]),
1379          self.arr[10].saturating_add(rhs.arr[10]),
1380          self.arr[11].saturating_add(rhs.arr[11]),
1381          self.arr[12].saturating_add(rhs.arr[12]),
1382          self.arr[13].saturating_add(rhs.arr[13]),
1383          self.arr[14].saturating_add(rhs.arr[14]),
1384          self.arr[15].saturating_add(rhs.arr[15]),
1385        ]}
1386      }
1387    }
1388  }
1389  #[inline]
1390  #[must_use]
1391  pub fn saturating_sub(self, rhs: Self) -> Self {
1392    pick! {
1393      if #[cfg(target_feature="sse2")] {
1394        Self { sse: sub_saturating_i8_m128i(self.sse, rhs.sse) }
1395      } else if #[cfg(target_feature="simd128")] {
1396        Self { simd: i8x16_sub_sat(self.simd, rhs.simd) }
1397      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1398        unsafe { Self { neon: vqsubq_s8(self.neon, rhs.neon) } }
1399      } else {
1400        Self { arr: [
1401          self.arr[0].saturating_sub(rhs.arr[0]),
1402          self.arr[1].saturating_sub(rhs.arr[1]),
1403          self.arr[2].saturating_sub(rhs.arr[2]),
1404          self.arr[3].saturating_sub(rhs.arr[3]),
1405          self.arr[4].saturating_sub(rhs.arr[4]),
1406          self.arr[5].saturating_sub(rhs.arr[5]),
1407          self.arr[6].saturating_sub(rhs.arr[6]),
1408          self.arr[7].saturating_sub(rhs.arr[7]),
1409          self.arr[8].saturating_sub(rhs.arr[8]),
1410          self.arr[9].saturating_sub(rhs.arr[9]),
1411          self.arr[10].saturating_sub(rhs.arr[10]),
1412          self.arr[11].saturating_sub(rhs.arr[11]),
1413          self.arr[12].saturating_sub(rhs.arr[12]),
1414          self.arr[13].saturating_sub(rhs.arr[13]),
1415          self.arr[14].saturating_sub(rhs.arr[14]),
1416          self.arr[15].saturating_sub(rhs.arr[15]),
1417        ]}
1418      }
1419    }
1420  }
1421
1422  /// Lanewise saturating multiply.
1423  #[inline]
1424  #[must_use]
1425  pub fn saturating_mul(self, rhs: Self) -> Self {
1426    pick! {
1427      if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
1428        unsafe {
1429          let low_wide_mul = vreinterpretq_s8_s16(
1430            vmull_s8(vget_low_s8(self.neon), vget_low_s8(rhs.neon)),
1431          );
1432          let high_wide_mul = vreinterpretq_s8_s16(
1433            vmull_s8(vget_high_s8(self.neon), vget_high_s8(rhs.neon)),
1434          );
1435          let low_high = vuzpq_s8(low_wide_mul, high_wide_mul);
1436          let low = Self { neon: low_high.0 };
1437          let high = Self { neon: low_high.1 };
1438
1439          let no_overflow = high.simd_eq(low.is_negative());
1440          let limit = Self::MAX ^ (self ^ rhs).is_negative();
1441          no_overflow.blend(low, limit)
1442        }
1443      } else {
1444        let self_array = self.to_array();
1445        let rhs_array = rhs.to_array();
1446
1447        Self::new([
1448          self_array[0].saturating_mul(rhs_array[0]),
1449          self_array[1].saturating_mul(rhs_array[1]),
1450          self_array[2].saturating_mul(rhs_array[2]),
1451          self_array[3].saturating_mul(rhs_array[3]),
1452          self_array[4].saturating_mul(rhs_array[4]),
1453          self_array[5].saturating_mul(rhs_array[5]),
1454          self_array[6].saturating_mul(rhs_array[6]),
1455          self_array[7].saturating_mul(rhs_array[7]),
1456          self_array[8].saturating_mul(rhs_array[8]),
1457          self_array[9].saturating_mul(rhs_array[9]),
1458          self_array[10].saturating_mul(rhs_array[10]),
1459          self_array[11].saturating_mul(rhs_array[11]),
1460          self_array[12].saturating_mul(rhs_array[12]),
1461          self_array[13].saturating_mul(rhs_array[13]),
1462          self_array[14].saturating_mul(rhs_array[14]),
1463          self_array[15].saturating_mul(rhs_array[15]),
1464        ])
1465      }
1466    }
1467  }
1468
1469  integer_fn_saturating_div!([
1470    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1471  ]);
1472
1473  /// Transpose matrix of 16x16 `i8` matrix. Currently not accelerated.
1474  #[must_use]
1475  #[inline]
1476  pub fn transpose(data: [i8x16; 16]) -> [i8x16; 16] {
1477    // Can this be optimized?
1478
1479    #[inline(always)]
1480    fn transpose_column(data: &[i8x16; 16], index: usize) -> i8x16 {
1481      i8x16::new([
1482        data[0].as_array()[index],
1483        data[1].as_array()[index],
1484        data[2].as_array()[index],
1485        data[3].as_array()[index],
1486        data[4].as_array()[index],
1487        data[5].as_array()[index],
1488        data[6].as_array()[index],
1489        data[7].as_array()[index],
1490        data[8].as_array()[index],
1491        data[9].as_array()[index],
1492        data[10].as_array()[index],
1493        data[11].as_array()[index],
1494        data[12].as_array()[index],
1495        data[13].as_array()[index],
1496        data[14].as_array()[index],
1497        data[15].as_array()[index],
1498      ])
1499    }
1500
1501    [
1502      transpose_column(&data, 0),
1503      transpose_column(&data, 1),
1504      transpose_column(&data, 2),
1505      transpose_column(&data, 3),
1506      transpose_column(&data, 4),
1507      transpose_column(&data, 5),
1508      transpose_column(&data, 6),
1509      transpose_column(&data, 7),
1510      transpose_column(&data, 8),
1511      transpose_column(&data, 9),
1512      transpose_column(&data, 10),
1513      transpose_column(&data, 11),
1514      transpose_column(&data, 12),
1515      transpose_column(&data, 13),
1516      transpose_column(&data, 14),
1517      transpose_column(&data, 15),
1518    ]
1519  }
1520
1521  #[inline]
1522  pub fn to_array(self) -> [i8; 16] {
1523    cast(self)
1524  }
1525
1526  #[inline]
1527  pub fn as_array(&self) -> &[i8; 16] {
1528    cast_ref(self)
1529  }
1530
1531  #[inline]
1532  pub fn as_mut_array(&mut self) -> &mut [i8; 16] {
1533    cast_mut(self)
1534  }
1535}