// wide/f32x8_.rs
1use super::*;
2
pick! {
  if #[cfg(target_feature="avx")] {
    // With AVX, all 8 lanes live in a single 256-bit register.
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else {
    // Without AVX, emulate 8 lanes as a pair of f32x4 halves.
    // Same size/alignment as the AVX layout thanks to repr(C, align(32)).
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { a : f32x4, b : f32x4 }
  }
}
14
/// Declares a `pub const` of type `f32x8` with all 8 lanes set to `$f`.
/// Used both for the associated constants below and for local polynomial
/// coefficients inside the math functions.
macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x8 = f32x8::new([$f; 8]);
  };
}
21
impl f32x8 {
  // Common mathematical constants, splatted across all 8 lanes. These mirror
  // `core::f32::consts` plus a few convenience values (ONE/HALF/ZERO).
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}
46
// SAFETY: f32x8 is `repr(C)` with fields that are themselves plain float
// data (one m256, or two f32x4 halves) filling the full 32-byte size, so the
// all-zero bit pattern is valid and any bit pattern is a valid value.
unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl AlignTo for f32x8 {
  // Element type used for alignment-aware slice conversions.
  type Elem = f32;
}
53
// Lanewise arithmetic. On AVX each operation is a single 256-bit op;
// otherwise it is applied to each 128-bit half separately.

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Neg for f32x8 {
  type Output = Self;
  #[inline]
  fn neg(self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Negation is just flipping each lane's sign bit: XOR with -0.0.
        Self { avx: bitxor_m256(self.avx, Self::splat(-0.0).avx) }
      } else {
        Self {
          a : self.a.neg(),
          b : self.b.neg(),
        }
      }
    }
  }
}
138
139impl Add<f32> for f32x8 {
140  type Output = Self;
141  #[inline]
142  fn add(self, rhs: f32) -> Self::Output {
143    self.add(Self::splat(rhs))
144  }
145}
146
147impl Sub<f32> for f32x8 {
148  type Output = Self;
149  #[inline]
150  fn sub(self, rhs: f32) -> Self::Output {
151    self.sub(Self::splat(rhs))
152  }
153}
154
155impl Mul<f32> for f32x8 {
156  type Output = Self;
157  #[inline]
158  fn mul(self, rhs: f32) -> Self::Output {
159    self.mul(Self::splat(rhs))
160  }
161}
162
163impl Div<f32> for f32x8 {
164  type Output = Self;
165  #[inline]
166  fn div(self, rhs: f32) -> Self::Output {
167    self.div(Self::splat(rhs))
168  }
169}
170
171impl Add<f32x8> for f32 {
172  type Output = f32x8;
173  #[inline]
174  fn add(self, rhs: f32x8) -> Self::Output {
175    f32x8::splat(self).add(rhs)
176  }
177}
178
179impl Sub<f32x8> for f32 {
180  type Output = f32x8;
181  #[inline]
182  fn sub(self, rhs: f32x8) -> Self::Output {
183    f32x8::splat(self).sub(rhs)
184  }
185}
186
187impl Mul<f32x8> for f32 {
188  type Output = f32x8;
189  #[inline]
190  fn mul(self, rhs: f32x8) -> Self::Output {
191    f32x8::splat(self).mul(rhs)
192  }
193}
194
195impl Div<f32x8> for f32 {
196  type Output = f32x8;
197  #[inline]
198  fn div(self, rhs: f32x8) -> Self::Output {
199    f32x8::splat(self).div(rhs)
200  }
201}
202
// Bitwise ops on the raw lane bits. Primarily useful for combining the
// all-ones/all-zeros lane masks produced by the comparison operators.

impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}
253
// Lanewise comparisons. Each lane of the result is all-ones bits (true) or
// all-zero bits (false), suitable for use with `blend` and the bitwise ops.
// The `Ordered` compare variants define behavior for NaN inputs — see the
// AVX `_mm256_cmp_ps` predicate documentation for exact semantics.

impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_eq(rhs.a),
          b : self.b.simd_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ge(rhs.a),
          b : self.b.simd_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_gt(rhs.a),
          b : self.b.simd_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ne(rhs.a),
          b : self.b.simd_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_le(rhs.a),
          b : self.b.simd_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_lt(rhs.a),
          b : self.b.simd_lt(rhs.b),
        }
      }
    }
  }
}
355
356impl f32x8 {
  /// Builds a vector from an array of 8 lane values.
  #[inline]
  #[must_use]
  pub const fn new(array: [f32; 8]) -> Self {
    // SAFETY: `[f32; 8]` and `f32x8` are both exactly 32 bytes of f32 data
    // (the struct is repr(C)), so reinterpreting the array bytes is valid.
    unsafe { core::mem::transmute(array) }
  }
  /// Lanewise select: where `self`'s mask lane is true, take the lane from
  /// `t`, otherwise from `f`. `self` should be a mask as produced by the
  /// comparison operators.
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }
  /// Lanewise absolute value.
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Clear each lane's sign bit: AND with 0x7FFF_FFFF in every lane.
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }
  /// Lanewise round toward negative infinity.
  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: floor_m256(self.avx) }
      } else {
        Self {
          a : self.a.floor(),
          b : self.b.floor(),
        }
      }
    }
  }
  /// Lanewise round toward positive infinity.
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: ceil_m256(self.avx) }
      } else {
        Self {
          a : self.a.ceil(),
          b : self.b.ceil(),
        }
      }
    }
  }
419
  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This doesn't match
  /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`.
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }

    }
  }
458
  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  /// Lanewise NaN test: a lane mask that is true where the lane is NaN.
  /// Relies on NaN comparing unordered with itself.
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }
  /// Lanewise finiteness test: true where the lane is neither infinite nor NaN.
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    // Shifting the bits left by one discards the sign bit and moves the 8
    // exponent bits to the top of the lane; an all-ones exponent (0xFF)
    // marks infinity or NaN, so finite lanes are those where it isn't set.
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
    cast(out)
  }
  /// Lanewise infinity test: true where the lane is +inf or -inf.
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    // Infinity is exactly: all-ones exponent with a zero mantissa. After
    // shifting out the sign bit, both +inf and -inf equal 0xFF000000.
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).simd_eq(shifted_inf);
    cast(out)
  }
529
  /// Lanewise round to the nearest integer value (as a float).
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? f32x4 version probably translates but I've not had time to figure it out
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }
545
  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else {
        // Concatenate the two half results into an 8-lane integer vector.
        cast([
          self.a.fast_round_int(),
          self.b.fast_round_int()])
      }
    }
  }
562
  /// Rounds each lane into an integer. This saturates out of range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out of range values or NaNs.
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        // NaN lanes fail the self==self compare, so the mask zeroes them out
        // before conversion (NaN -> 0).
        let non_nan_mask = self.simd_eq(self);
        let non_nan = self & non_nan_mask;
        // Lanes >= 2^31 convert to i32::MIN; XOR with the all-ones mask for
        // those lanes flips that to i32::MAX, giving positive saturation.
        let flip_to_max: i32x8 = cast(self.simd_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.round_int(),
          self.b.round_int(),
        ])
      }
    }
  }
585
586  /// Truncates each lane into an integer. This is a faster implementation than
587  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
588  /// values you get implementation defined behavior.
589  #[inline]
590  #[must_use]
591  pub fn fast_trunc_int(self) -> i32x8 {
592    pick! {
593      if #[cfg(all(target_feature="avx"))] {
594        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
595      } else {
596        cast([
597          self.a.fast_trunc_int(),
598          self.b.fast_trunc_int(),
599        ])
600      }
601    }
602  }
603
604  /// Truncates each lane into an integer. This saturates out of range values
605  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
606  /// that doesn't handle out of range values or NaNs.
607  #[inline]
608  #[must_use]
609  pub fn trunc_int(self) -> i32x8 {
610    pick! {
611        if #[cfg(target_feature="avx")] {
612        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
613        let non_nan_mask = self.simd_eq(self);
614        let non_nan = self & non_nan_mask;
615        let flip_to_max: i32x8 = cast(self.simd_ge(Self::splat(2147483648.0)));
616        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
617        flip_to_max ^ cast
618      } else {
619        cast([
620          self.a.trunc_int(),
621          self.b.trunc_int(),
622        ])
623      }
624    }
625  }
  /// Performs a multiply-add operation: `self * m + a`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and add operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfmadd` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) + a` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([10.0; 8]);
  ///
  /// let result = a.mul_add(b, c);
  ///
  /// let expected = f32x8::from([12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        // single-rounding hardware FMA
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a multiply-subtract operation: `self * m - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and subtract operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfmsub` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([10.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([5.0; 8]);
  ///
  /// let result = a.mul_sub(b, c);
  ///
  /// let expected = f32x8::from([15.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        // single-rounding hardware FMA
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - s
      } else {
        Self {
          a : self.a.mul_sub(m.a, s.a),
          b : self.b.mul_sub(m.b, s.b),
        }
      }
    }
  }

  /// Performs a negative multiply-add operation: `a - (self * m)`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfnmadd` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `a - (self * m)` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([3.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([10.0; 8]);
  ///
  /// let result = a.mul_neg_add(b, c);
  ///
  /// let expected = f32x8::from([4.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        // single-rounding hardware FMA
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfnmsub` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `-(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([3.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([1.0; 8]);
  ///
  /// let result = a.mul_neg_sub(b, c);
  ///
  /// let expected = f32x8::from([-7.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        // single-rounding hardware FMA
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - s
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, s.a),
          b : self.b.mul_neg_sub(m.b, s.b),
        }
      }
    }
  }
793
  /// Flips the sign of each lane of `self` where the corresponding lane of
  /// `signs` is negative (XOR with the masked sign bits).
  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  /// Returns each lane's magnitude from `self` combined with the sign bit
  /// taken from the matching lane of `sign`.
  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    // u32::MAX >> 1 keeps everything except the sign bit.
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }
806
  /// Computes both lanewise arcsine and arccosine in one pass, returning
  /// `(asin, acos)`.
  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    // |x| >= 0.5 lanes use the sqrt-based identity; smaller lanes use the
    // polynomial on x^2 directly.
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    // z = p(x3) * x3 * x4 + x4
    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.simd_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }
845
  /// Lanewise arcsine. Same algorithm as [`f32x8::asin_acos`] but only
  /// produces the asin half.
  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    // |x| >= 0.5 lanes use the sqrt-based identity; smaller lanes use the
    // polynomial on x^2 directly.
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    // z = p(x3) * x3 * x4 + x4
    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }
880
  /// Lanewise arccosine. Same algorithm as [`f32x8::asin_acos`] but only
  /// produces the acos half.
  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    // |x| >= 0.5 lanes use the sqrt-based identity; smaller lanes use the
    // polynomial on x^2 directly.
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    // z = p(x3) * x3 * x4 + x4
    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.simd_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }
915
  /// Lanewise arctangent, in radians.
  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // Range reduction so the polynomial only sees small arguments:
    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);

    // Per-range additive constant: 0, pi/4, or pi/2.
    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    // Build numerator a and denominator b per the range table above.
    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }
953
  /// Lanewise four-quadrant arctangent of `self` (y) over `x`, in radians.
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.simd_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      // AND-ing with -1.0's bit pattern turns inf lanes into a finite value
      // so the division below produces a usable ratio.
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }
1009
  /// Computes both lanewise sine and cosine in one pass, returning
  /// `(sin, cos)`.
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    // Extended-precision split of pi/2 used for argument reduction (Cody-Waite
    // style: three terms subtracted in sequence to limit cancellation error).
    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    // Reduce the argument into [-pi/4, pi/4] via the three-term split.
    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    // Polynomial approximations of sin and cos on the reduced argument.
    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    // Odd quadrants swap the roles of sin and cos.
    let swap = !(q & i32x8::from(1)).simd_eq(i32x8::from(0));

    // Arguments too large for reliable reduction collapse to (sin, cos) = (0, 1).
    let mut overflow: f32x8 = cast(q.simd_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
1062  #[inline]
1063  #[must_use]
1064  pub fn sin(self) -> Self {
1065    let (s, _) = self.sin_cos();
1066    s
1067  }
1068  #[inline]
1069  #[must_use]
1070  pub fn cos(self) -> Self {
1071    let (_, c) = self.sin_cos();
1072    c
1073  }
1074  #[inline]
1075  #[must_use]
1076  pub fn tan(self) -> Self {
1077    let (s, c) = self.sin_cos();
1078    s / c
1079  }
1080  #[inline]
1081  #[must_use]
1082  pub fn to_degrees(self) -> Self {
1083    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1084    self * RAD_TO_DEG_RATIO
1085  }
1086  #[inline]
1087  #[must_use]
1088  pub fn to_radians(self) -> Self {
1089    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1090    self * DEG_TO_RAD_RATIO
1091  }
  /// Lanewise reciprocal (1/x).
  ///
  /// NOTE(review): the AVX path goes through `reciprocal_m256`, which by its
  /// name maps to the hardware reciprocal approximation (limited precision,
  /// not a full-precision divide) — confirm against the fallback's behavior
  /// if exact results are needed.
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else {
        // Split into two 128-bit halves and recurse into f32x4.
        Self {
          a : self.a.recip(),
          b : self.b.recip(),
        }
      }
    }
  }
  /// Lanewise reciprocal square root (1/sqrt(x)).
  ///
  /// NOTE(review): the AVX path uses `reciprocal_sqrt_m256`, which by its
  /// name maps to the hardware rsqrt approximation (limited precision) —
  /// confirm precision expectations against the non-AVX fallback.
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else {
        // Split into two 128-bit halves and recurse into f32x4.
        Self {
          a : self.a.recip_sqrt(),
          b : self.b.recip_sqrt(),
        }
      }
    }
  }
  /// Lanewise square root.
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else {
        // Split into two 128-bit halves and recurse into f32x4.
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
  /// Collapses the vector to an 8-bit mask, one bit per lane (lane 0 in the
  /// least significant bit). Intended for mask vectors produced by the
  /// comparison operations.
  #[inline]
  #[must_use]
  #[doc(alias("movemask", "move_mask"))]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) as u32
      } else {
        // Low half supplies bits 0-3, high half bits 4-7.
        (self.b.to_bitmask() << 4) | self.a.to_bitmask()
      }
    }
  }
  /// Returns `true` if any lane of the mask is set (non-zero move-mask bit).
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) != 0
      } else {
        // Short-circuits on the first half that reports a set lane.
        self.a.any() || self.b.any()
      }
    }
  }
  /// Returns `true` only if every one of the 8 lanes of the mask is set.
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        // All 8 move-mask bits must be 1.
        move_mask_m256(self.avx) == 0b11111111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
1168  #[inline]
1169  #[must_use]
1170  pub fn none(self) -> bool {
1171    !self.any()
1172  }
1173
  /// Builds 2^n for lanes holding (small) integral values `n`, by
  /// constructing the IEEE-754 bit pattern directly.
  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x8!(pow2_23, 8388608.0); // 2^23
    const_f32_as_f32x8!(bias, 127.0); // single-precision exponent bias
    // Adding (bias + 2^23) parks (n + bias) in the low mantissa bits.
    let a = self + (bias + pow2_23);
    // Shifting left by 23 moves (n + bias) into the exponent field,
    // yielding the bit pattern of 2^n.
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }
1182
  /// Natural exponential `e^x` of each lane.
  ///
  /// Lanes whose magnitude is >= 87.3, and non-finite lanes, produce `0.0`
  /// (both overflow and underflow saturate to zero here).
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    // Taylor coefficients 1/2! .. 1/7!.
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1. / 24.);
    const_f32_as_f32x8!(P3, 1. / 120.);
    const_f32_as_f32x8!(P4, 1. / 720.);
    const_f32_as_f32x8!(P5, 1. / 5040.);
    // ln(2) split in two parts for extended-precision range reduction.
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    // r = round(x / ln 2), so x = r*ln2 + remainder with |remainder| small.
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    // Polynomial for e^remainder - 1 - remainder, then reassemble.
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    // Scale by 2^r via direct exponent-bit construction.
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().simd_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }
1209
1210  #[inline]
1211  fn exponent(self) -> f32x8 {
1212    const_f32_as_f32x8!(pow2_23, 8388608.0);
1213    const_f32_as_f32x8!(bias, 127.0);
1214    let a = cast::<_, u32x8>(self);
1215    let b = a >> 23;
1216    let c = b | cast::<_, u32x8>(pow2_23);
1217    let d = cast::<_, f32x8>(c);
1218    let e = d - (pow2_23 + bias);
1219    e
1220  }
1221
1222  #[inline]
1223  fn fraction_2(self) -> Self {
1224    let t1 = cast::<_, u32x8>(self);
1225    let t2 = cast::<_, u32x8>(
1226      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
1227    );
1228    cast::<_, f32x8>(t2)
1229  }
  /// Lane mask (as floats) of which lanes are zero or subnormal — exactly
  /// the lanes whose 8-bit exponent field is all zero.
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x8>(self);
    // Isolate the exponent bits.
    let t = t & i32x8::splat(0x7F800000);
    // Convert the integer compare mask into a float-typed mask for use
    // with the blend operations.
    i32x8::round_float(t.simd_eq(i32x8::splat(0)))
  }
1236  #[inline]
1237  fn infinity() -> Self {
1238    cast::<_, f32x8>(i32x8::splat(0x7F800000))
1239  }
1240  #[inline]
1241  fn nan_log() -> Self {
1242    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1243  }
1244  #[inline]
1245  fn nan_pow() -> Self {
1246    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1247  }
  /// Lane mask that is true exactly where the lane's sign bit is set
  /// (negative values, negative zero, negative NaNs).
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x8>(self);
    // Arithmetic shift smears the sign bit across the whole lane:
    // sign set -> all ones, sign clear -> all zeros.
    let t2 = t1 >> 31;
    // All-ones reinterpreted as f32 is a NaN, which compares unequal to
    // zero; negating the equality therefore yields true precisely for the
    // lanes whose sign bit was set.
    !cast::<_, f32x8>(t2).simd_eq(f32x8::ZERO)
  }
1254
  /// Horizontal add: returns the sum of all 8 lanes as a scalar `f32`.
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")]{
        // Fold the 256-bit vector down in halves: 8 -> 4 -> 2 -> 1 lanes.
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad,hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad,sum_quad);
        let sum_dual = add_m128(lo_dual,hi_dual);
        let lo = sum_dual;
        // Bring lane 1 down to lane 0 for the final scalar add.
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }
1277
  /// Natural log (ln(x)), computed per lane.
  ///
  /// Non-finite inputs pass through; inputs below the smallest normal
  /// (including negatives and subnormals) produce NaN, and zero/subnormal
  /// lanes produce the infinity bit pattern (see the final blend order).
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    // Polynomial coefficients for ln(1+x) on the reduced interval.
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    // ln(2) split in two parts for extended precision.
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    // Decompose into mantissa in [0.5, 1.0) and integer exponent.
    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    // Center the mantissa around 1.0: double values below sqrt(2)/2 and
    // compensate in the exponent.
    let mask = x.simd_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    // ln(1+x) ~= x - x^2/2 + x^3 * P(x), then add fe * ln(2) in two parts.
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      // Fast path: every lane was a finite normal number.
      res
    } else {
      // Special-case lanes: NaN for sub-smallest-normal inputs, then the
      // infinity pattern for zero/subnormal lanes, then pass non-finite
      // inputs through unchanged.
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }
1322
1323  #[inline]
1324  #[must_use]
1325  pub fn log2(self) -> Self {
1326    Self::ln(self) * Self::LOG2_E
1327  }
1328  #[inline]
1329  #[must_use]
1330  pub fn log10(self) -> Self {
1331    Self::ln(self) * Self::LOG10_E
1332  }
1333
  /// Lanewise power: `self^y` with a vector exponent, computed as
  /// 2^(y * log2(self)) with extended-precision bookkeeping, plus the full
  /// set of IEEE-style special cases (zero base, negative base with
  /// integer/non-integer exponent, overflow/underflow, NaN propagation).
  #[inline]
  #[must_use]
  pub fn pow_f32x8(self, y: Self) -> Self {
    // ln(2) split in two parts for extended precision.
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    // Coefficients for the log polynomial (same set as `ln`).
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    // --- log stage: ln(|self|) with mantissa centered around 1.0 ---
    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.simd_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    // Exponent part contributes e1 = round(exponent * y) directly, with
    // yr holding the rounding residue.
    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    // lg = ln(mantissa); x2_err / lg_err track the rounding errors of the
    // truncated terms so they can be folded back in below.
    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    // --- multiply stage: v = y * ln(mantissa) reduced mod ln(2) ---
    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    // --- exp stage: e^v with one more round of range reduction ---
    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    // Total power-of-two scale accumulated across the three stages.
    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    // ej approximates the final biased exponent, for range checking.
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.simd_gt(i32x8::splat(0x0FF)))
      | (ee.simd_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.simd_lt(i32x8::splat(0x000)))
      | (ee.simd_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    // 0^negative -> inf, 0^0 -> 1, 0^positive -> 0.
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.simd_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.simd_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    // Negative base: valid only for integer exponents (sign follows the
    // exponent's parity); otherwise the result is a tagged NaN.
    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Y into an integer
      let yi = y.simd_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    // Fast path out when every lane stayed finite through the pipeline.
    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    // NaN in either operand propagates (self + y yields a NaN there).
    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
1441  #[inline]
1442  pub fn powf(self, y: f32) -> Self {
1443    Self::pow_f32x8(self, f32x8::splat(y))
1444  }
1445
  /// Transpose matrix of 8x8 `f32` matrix. Currently only accelerated on AVX.
  ///
  /// Rows go in, columns come out: output row `i` holds element `i` of every
  /// input row.
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Stage 1: interleave adjacent row pairs at 32-bit granularity.
        let a0 = unpack_lo_m256(data[0].avx, data[1].avx);
        let a1 = unpack_hi_m256(data[0].avx, data[1].avx);
        let a2 = unpack_lo_m256(data[2].avx, data[3].avx);
        let a3 = unpack_hi_m256(data[2].avx, data[3].avx);
        let a4 = unpack_lo_m256(data[4].avx, data[5].avx);
        let a5 = unpack_hi_m256(data[4].avx, data[5].avx);
        let a6 = unpack_lo_m256(data[6].avx, data[7].avx);
        let a7 = unpack_hi_m256(data[6].avx, data[7].avx);

        // Same encoding as the classic _MM_SHUFFLE(z, y, x, w) macro.
        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: intel performance manual suggests alternative with blend to avoid port 5 pressure
        // (since blend runs on a different port than shuffle)
        // Stage 2: combine the interleaved pairs at 64-bit granularity.
        let b0 = shuffle_m256::<SHUFF_LO>(a0,a2);
        let b1 = shuffle_m256::<SHUFF_HI>(a0,a2);
        let b2 = shuffle_m256::<SHUFF_LO>(a1,a3);
        let b3 = shuffle_m256::<SHUFF_HI>(a1,a3);
        let b4 = shuffle_m256::<SHUFF_LO>(a4,a6);
        let b5 = shuffle_m256::<SHUFF_HI>(a4,a6);
        let b6 = shuffle_m256::<SHUFF_LO>(a5,a7);
        let b7 = shuffle_m256::<SHUFF_HI>(a5,a7);

        // Stage 3: swap the 128-bit halves between the two row groups.
        [
          f32x8 { avx: permute2z_m256::<0x20>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x20>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x20>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x20>(b3, b7) },
          f32x8 { avx: permute2z_m256::<0x31>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x31>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x31>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128bit SIMD gives us a a lot of speedup here

        // Scalar fallback: gather column `index` from every row.
        #[inline(always)]
        fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 {
          f32x8::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
            data[4].as_array()[index],
            data[5].as_array()[index],
            data[6].as_array()[index],
            data[7].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }
1519
1520  #[inline]
1521  pub fn to_array(self) -> [f32; 8] {
1522    cast(self)
1523  }
1524
1525  #[inline]
1526  pub fn as_array(&self) -> &[f32; 8] {
1527    cast_ref(self)
1528  }
1529
1530  #[inline]
1531  pub fn as_mut_array(&mut self) -> &mut [f32; 8] {
1532    cast_mut(self)
1533  }
1534
  /// Converts each `i32` lane to `f32` (the scalar path uses an `as` cast
  /// per lane).
  #[inline]
  pub fn from_i32x8(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx: convert_to_m256_from_i32_m256i(v.avx2) }
      } else {
        Self::new([
            v.as_array()[0] as f32,
            v.as_array()[1] as f32,
            v.as_array()[2] as f32,
            v.as_array()[3] as f32,
            v.as_array()[4] as f32,
            v.as_array()[5] as f32,
            v.as_array()[6] as f32,
            v.as_array()[7] as f32,
          ])
      }
    }
  }
1554}
1555
/// Bitwise NOT of the raw lane bits. Primarily meaningful on mask values
/// produced by the comparison operations.
impl Not for f32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not()  }
      } else {
        // Split into two 128-bit halves and recurse into f32x4.
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}