pulp/
lib.rs

1//! `pulp` is a safe abstraction over SIMD instructions that allows you to write a function once
2//! and dispatch to equivalent vectorized versions based on the features detected at runtime.
3//!
4//! # Autovectorization example
5//!
6//! ```
7//! use pulp::Arch;
8//!
9//! let mut v = (0..1000).map(|i| i as f64).collect::<Vec<_>>();
10//! let arch = Arch::new();
11//!
12//! arch.dispatch(|| {
13//! 	for x in &mut v {
14//! 		*x *= 2.0;
15//! 	}
16//! });
17//!
18//! for (i, x) in v.into_iter().enumerate() {
19//! 	assert_eq!(x, 2.0 * i as f64);
20//! }
21//! ```
22//!
23//! # Manual vectorization example
24//!
25//! ```
26//! use pulp::{Arch, Simd, WithSimd};
27//!
28//! struct TimesThree<'a>(&'a mut [f64]);
29//! impl<'a> WithSimd for TimesThree<'a> {
30//! 	type Output = ();
31//!
32//! 	#[inline(always)]
33//! 	fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
34//! 		let v = self.0;
35//! 		let (head, tail) = S::as_mut_simd_f64s(v);
36//!
37//! 		let three = simd.splat_f64s(3.0);
38//! 		for x in head {
39//! 			*x = simd.mul_f64s(three, *x);
40//! 		}
41//!
42//! 		for x in tail {
43//! 			*x = *x * 3.0;
44//! 		}
45//! 	}
46//! }
47//!
48//! let mut v = (0..1000).map(|i| i as f64).collect::<Vec<_>>();
49//! let arch = Arch::new();
50//!
51//! arch.dispatch(TimesThree(&mut v));
52//!
53//! for (i, x) in v.into_iter().enumerate() {
54//! 	assert_eq!(x, 3.0 * i as f64);
55//! }
56//! ```
57
58// FIXME: replace x86 non-IEEE min/max functions so that they propagate NaNs instead
59
60#![allow(
61	non_camel_case_types,
62	unknown_lints,
63	clippy::zero_prefixed_literal,
64	clippy::identity_op,
65	clippy::too_many_arguments,
66	clippy::type_complexity,
67	clippy::missing_transmute_annotations,
68	clippy::tabs_in_doc_comments,
69	clippy::modulo_one,
70	clippy::useless_transmute,
71	clippy::not_unsafe_ptr_arg_deref,
72	clippy::manual_is_multiple_of
73)]
74#![cfg_attr(
75	all(feature = "nightly", any(target_arch = "aarch64")),
76	feature(stdarch_neon_i8mm),
77	feature(stdarch_neon_sm4),
78	feature(stdarch_neon_ftts),
79	feature(stdarch_neon_fcma),
80	feature(stdarch_neon_dotprod)
81)]
82#![cfg_attr(not(feature = "std"), no_std)]
83#![cfg_attr(docsrs, feature(doc_cfg))]
84
85macro_rules! match_cfg {
86    (item, match cfg!() {
87        $(
88            const { $i_meta:meta } => { $( $i_tokens:tt )* },
89        )*
90        $(_ => { $( $e_tokens:tt )* },)?
91    }) => {
92        $crate::match_cfg! {
93            @__items () ;
94            $(
95                (( $i_meta ) ( $( $i_tokens )* )) ,
96            )*
97            $((() ( $( $e_tokens )* )),)?
98        }
99    };
100
101    (match cfg!() {
102        $(
103            const { $i_meta:meta } => $i_expr: expr,
104        )*
105        $(_ => $e_expr: expr,)?
106    }) => {
107        $crate::match_cfg! {
108            @ __result @ __exprs ();
109            $(
110                (( $i_meta ) ( $i_expr  )) ,
111            )*
112            $((() ( $e_expr  )),)?
113        }
114    };
115
116    // Internal and recursive macro to emit all the items
117    //
118    // Collects all the previous cfgs in a list at the beginning, so they can be
119    // negated. After the semicolon are all the remaining items.
120    (@__items ( $( $_:meta , )* ) ; ) => {};
121    (
122        @__items ( $( $no:meta , )* ) ;
123        (( $( $yes:meta )? ) ( $( $tokens:tt )* )) ,
124        $( $rest:tt , )*
125    ) => {
126        // Emit all items within one block, applying an appropriate [cfg]. The
127        // [cfg] will require all `$yes` matchers specified and must also negate
128        // all previous matchers.
129        #[cfg(all(
130            $( $yes , )?
131            not(any( $( $no ),* ))
132        ))]
133        $crate::match_cfg! { @__identity $( $tokens )* }
134
135        // Recurse to emit all other items in `$rest`, and when we do so add all
136        // our `$yes` matchers to the list of `$no` matchers as future emissions
137        // will have to negate everything we just matched as well.
138        $crate::match_cfg! {
139            @__items ( $( $no , )* $( $yes , )? ) ;
140            $( $rest , )*
141        }
142    };
143
144    // Internal and recursive macro to emit all the exprs
145    //
146    // Collects all the previous cfgs in a list at the beginning, so they can be
147    // negated. After the semicolon are all the remaining exprs.
148    (@ $ret: ident @ __exprs ( $( $_:meta , )* ) ; ) => {
149    	$ret
150    };
151
152    (
153        @ $ret: ident @__exprs ( $( $no:meta , )* ) ;
154        (( $( $yes:meta )? ) ( $( $tokens:tt )* )) ,
155        $( $rest:tt , )*
156    ) => {{
157        // Emit all exprs within one block, applying an appropriate [cfg]. The
158        // [cfg] will require all `$yes` matchers specified and must also negate
159        // all previous matchers.
160        #[cfg(all(
161            $( $yes , )?
162            not(any( $( $no ),* ))
163        ))]
164        let $ret = $crate::match_cfg! { @__identity $( $tokens )* };
165
166        // Recurse to emit all other exprs in `$rest`, and when we do so add all
167        // our `$yes` matchers to the list of `$no` matchers as future emissions
168        // will have to negate everything we just matched as well.
169        $crate::match_cfg! {
170            @ $ret @ __exprs ( $( $no , )* $( $yes , )? ) ;
171            $( $rest , )*
172        }
173    }};
174
175    // Internal macro to make __apply work out right for different match types,
176    // because of how macros match/expand stuff.
177    (@__identity $( $tokens:tt )* ) => {
178        $( $tokens )*
179    };
180}
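// A minimal usage sketch of the expression form of `match_cfg!` (the same shape
// as the `fma_f32`/`sqrt_f32` helpers further down): each `const { ... }` arm
// carries a cfg predicate, and the first arm whose predicate holds provides the
// value.
//
//     let two_or_three = match_cfg!(match cfg!() {
//         const { feature = "std" } => 2,
//         _ => 3,
//     });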
181
182const MAX_REGISTER_BYTES: usize = 256;
183
184use match_cfg;
185
186/// Safe transmute macro.
187///
188/// This macro asserts at compile time that the two types have the same size.
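///
/// A minimal usage sketch, reinterpreting a value as another type of the same
/// size:
///
/// ```
/// let x: i32 = pulp::cast!(1u32);
/// assert_eq!(x, 1);
/// ```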
189#[macro_export]
190macro_rules! cast {
191	($val: expr $(,)?) => {{
192		let __val = $val;
193		if const { false } {
194			// checks type constraints
195			$crate::cast(__val)
196		} else {
197			#[allow(
198				unused_unsafe,
199				unnecessary_transmutes,
200				clippy::missing_transmute_annotations
201			)]
202			unsafe {
203				::core::mem::transmute(__val)
204			}
205		}
206	}};
207}
208
209use bytemuck::{AnyBitPattern, CheckedBitPattern, NoUninit, Pod, Zeroable, checked};
210use core::fmt::Debug;
211use core::marker::PhantomData;
212use core::mem::MaybeUninit;
213use core::ops::*;
214use core::slice::{from_raw_parts, from_raw_parts_mut};
215use num_complex::Complex;
216use paste::paste;
217use seal::Seal;
218
219/// Requires the first non-lifetime generic parameter, as well as the function's
220/// first input parameter, to be the SIMD type.
221/// Also currently requires that all lifetimes be explicitly specified.
222#[cfg(feature = "macro")]
223#[cfg_attr(docsrs, doc(cfg(feature = "macro")))]
224pub use pulp_macro::with_simd;
225
226pub use {bytemuck, num_complex};
227
228pub type c32 = Complex<f32>;
229pub type c64 = Complex<f64>;
230
231#[derive(Copy, Clone)]
232#[repr(transparent)]
233struct DebugCplx<T>(T);
234
235unsafe impl<T: Zeroable> Zeroable for DebugCplx<T> {}
236unsafe impl<T: Pod> Pod for DebugCplx<T> {}
237
238impl Debug for DebugCplx<c32> {
239	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
240		let c32 { re, im } = self.0;
241		re.fmt(f)?;
242
243		let sign = if im.is_sign_positive() { " + " } else { " - " };
244		f.write_str(sign)?;
245
246		let im = f32::from_bits(im.to_bits() & (u32::MAX >> 1));
247		im.abs().fmt(f)?;
248
249		f.write_str("i")?;
250
251		Ok(())
252	}
253}
254
255impl Debug for DebugCplx<c64> {
256	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
257		let c64 { re, im } = self.0;
258		re.fmt(f)?;
259
260		let sign = if im.is_sign_positive() { " + " } else { " - " };
261		f.write_str(sign)?;
262
263		let im = f64::from_bits(im.to_bits() & (u64::MAX >> 1));
264		im.abs().fmt(f)?;
265
266		f.write_str("i")?;
267
268		Ok(())
269	}
270}
271
272match_cfg!(
273	item,
274	match cfg!() {
275		const { any(target_arch = "x86_64") } => {
276			#[derive(Debug, Copy, Clone)]
277			pub struct MemMask<T> {
278				mask: T,
279				load: Option<unsafe extern "C" fn()>,
280				store: Option<unsafe extern "C" fn()>,
281			}
282
283			impl<T> MemMask<T> {
284				#[inline]
285				pub fn new(mask: T) -> Self {
286					Self {
287						mask,
288						load: None,
289						store: None,
290					}
291				}
292			}
293
294			impl<T> From<T> for MemMask<T> {
295				#[inline]
296				fn from(value: T) -> Self {
297					Self {
298						mask: value,
299						load: None,
300						store: None,
301					}
302				}
303			}
304		},
305
306		_ => {
307			#[derive(Debug, Copy, Clone)]
308			pub struct MemMask<T> {
309				mask: T,
310			}
311
312			impl<T> MemMask<T> {
313				#[inline]
314				pub fn new(mask: T) -> Self {
315					Self { mask }
316				}
317			}
318
319			impl<T> From<T> for MemMask<T> {
320				#[inline]
321				fn from(value: T) -> Self {
322					Self { mask: value }
323				}
324			}
325		},
326	}
327);
328
329impl<T: Copy> MemMask<T> {
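	/// Returns the wrapped lane mask.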
330	#[inline]
331	pub fn mask(self) -> T {
332		self.mask
333	}
334}
335
336mod seal {
337	pub trait Seal {}
338}
339
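/// A callable taking no arguments. The blanket impl below covers every
/// `FnOnce() -> R`, which is what lets plain closures be passed to
/// `Arch::dispatch` (see the crate-level examples).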
340pub trait NullaryFnOnce {
341	type Output;
342
343	fn call(self) -> Self::Output;
344}
345
346impl<R, F: FnOnce() -> R> NullaryFnOnce for F {
347	type Output = R;
348
349	#[inline(always)]
350	fn call(self) -> Self::Output {
351		self()
352	}
353}
354
355pub trait WithSimd {
356	type Output;
357
358	fn with_simd<S: Simd>(self, simd: S) -> Self::Output;
359}
360
361impl<F: NullaryFnOnce> WithSimd for F {
362	type Output = F::Output;
363
364	#[inline(always)]
365	fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
366		let _simd = &simd;
367		self.call()
368	}
369}
370
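// Scalar math helpers: `fma_f32`/`fma_f64` compute `a * b + c` as a fused
// multiply-add and `sqrt_f32`/`sqrt_f64` compute square roots, going through
// `std` when the `std` feature is enabled and through `libm` otherwise.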
371#[inline(always)]
372fn fma_f32(a: f32, b: f32, c: f32) -> f32 {
373	match_cfg!(match cfg!() {
374		const { feature = "std" } => f32::mul_add(a, b, c),
375		_ => libm::fmaf(a, b, c),
376	})
377}
378
379#[inline(always)]
380fn fma_f64(a: f64, b: f64, c: f64) -> f64 {
381	match_cfg!(match cfg!() {
382		const { feature = "std" } => f64::mul_add(a, b, c),
383		_ => libm::fma(a, b, c),
384	})
385}
386
387#[inline(always)]
388fn sqrt_f32(a: f32) -> f32 {
389	match_cfg!(match cfg!() {
390		const { feature = "std" } => f32::sqrt(a),
391		_ => libm::sqrtf(a),
392	})
393}
394
395#[inline(always)]
396fn sqrt_f64(a: f64) -> f64 {
397	match_cfg!(match cfg!() {
398		const { feature = "std" } => f64::sqrt(a),
399		_ => libm::sqrt(a),
400	})
401}
402
403// a0,0 ... a0,m-1
404// ...
405// an-1,0 ... an-1,m-1
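// A worked example of the index mapping below, assuming n = 2 rows of m = 4
// units each: the row-major input `[a0, a1, a2, a3, b0, b1, b2, b3]` is
// transposed, one `Unit` at a time, into the interleaved output
// `[a0, b0, a1, b1, a2, b2, a3, b3]`.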
406#[inline(always)]
407unsafe fn interleave_fallback<Unit: Pod, Reg: Pod, AosReg>(x: AosReg) -> AosReg {
408	assert!(core::mem::size_of::<AosReg>() % core::mem::size_of::<Reg>() == 0);
409	assert!(core::mem::size_of::<Reg>() % core::mem::size_of::<Unit>() == 0);
410	assert!(!core::mem::needs_drop::<AosReg>());
411
412	if const { core::mem::size_of::<AosReg>() == core::mem::size_of::<Reg>() } {
413		x
414	} else {
415		let mut y = core::ptr::read(&x);
416
417		let n = const { core::mem::size_of::<AosReg>() / core::mem::size_of::<Reg>() };
418		let m = const { core::mem::size_of::<Reg>() / core::mem::size_of::<Unit>() };
419
420		unsafe {
421			let y = (&mut y) as *mut _ as *mut Unit;
422			let x = (&x) as *const _ as *const Unit;
423			for j in 0..m {
424				for i in 0..n {
425					*y.add(i + n * j) = *x.add(j + i * m);
426				}
427			}
428		}
429
430		y
431	}
432}
433
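// `deinterleave_fallback` applies the inverse index mapping: the interleaved
// layout is transposed back to the row-major one, e.g. `[a0, b0, a1, b1]` back
// to `[a0, a1, b0, b1]`.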
434#[inline(always)]
435unsafe fn deinterleave_fallback<Unit: Pod, Reg: Pod, SoaReg>(y: SoaReg) -> SoaReg {
436	assert!(core::mem::size_of::<SoaReg>() % core::mem::size_of::<Reg>() == 0);
437	assert!(core::mem::size_of::<Reg>() % core::mem::size_of::<Unit>() == 0);
438	assert!(!core::mem::needs_drop::<SoaReg>());
439
440	if const { core::mem::size_of::<SoaReg>() == core::mem::size_of::<Reg>() } {
441		y
442	} else {
443		let mut x = core::ptr::read(&y);
444
445		let n = const { core::mem::size_of::<SoaReg>() / core::mem::size_of::<Reg>() };
446		let m = const { core::mem::size_of::<Reg>() / core::mem::size_of::<Unit>() };
447
448		unsafe {
449			let y = (&y) as *const _ as *const Unit;
450			let x = (&mut x) as *mut _ as *mut Unit;
451			for j in 0..m {
452				for i in 0..n {
453					*x.add(j + i * m) = *y.add(i + n * j);
454				}
455			}
456		}
457
458		x
459	}
460}
461
462macro_rules! define_binop {
463	($func: ident, $ty: ident, $out: ident) => {
464		paste! {
465			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$out s>];
466		}
467	};
468}
469
470macro_rules! define_binop_all {
471	($func: ident, $($ty: ident),*) => {
472		$(define_binop!($func, $ty, $ty);)*
473	};
474	($func: ident, $($ty: ident => $out: ident),*) => {
475		$(define_binop!($func, $ty, $out);)*
476	};
477}
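// Expansion sketch: `define_binop_all!(add, f32)` goes through `define_binop!`
// and `paste!` to produce the trait method declaration
// `fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;`.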
478
479macro_rules! transmute_binop {
480	($func: ident, $ty: ident, $to: ident) => {
481		paste! {
482			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$ty s>] {
483				self.[<transmute_ $ty s_ $to s>](
484					self.[<$func _ $to s>](self.[<transmute_ $to s_ $ty s>](a), self.[<transmute_ $to s_ $ty s>](b)),
485				)
486			}
487		}
488	};
489	($func: ident, $($ty: ident => $to: ident),*) => {
490		$(transmute_binop!($func, $ty, $to);)*
491	};
492}
493
494macro_rules! define_unop {
495	($func: ident, $ty: ident, $out: ident) => {
496		paste! {
497			fn [<$func _ $ty s>](self, a: Self::[<$ty s>]) -> Self::[<$out s>];
498		}
499	};
500}
501
502macro_rules! define_unop_all {
503	($func: ident, $($ty: ident),*) => {
504		$(define_unop!($func, $ty, $ty);)*
505	};
506	($func: ident, $($ty: ident => $out: ident),*) => {
507		$(define_unop!($func, $ty, $out);)*
508	};
509}
510
511macro_rules! transmute_unop {
512	($func: ident, $ty: ident, $to: ident) => {
513		paste! {
514			fn [<$func _ $ty s>](self, a: Self::[<$ty s>]) -> Self::[<$ty s>] {
515				self.[<transmute_ $ty s_ $to s>](
516					self.[<$func _ $to s>](self.[<transmute_ $to s_ $ty s>](a)),
517				)
518			}
519		}
520	};
521	($func: ident, $($ty: ident => $to: ident),*) => {
522		$(transmute_unop!($func, $ty, $to);)*
523	};
524}
525
526macro_rules! transmute_cmp {
527	($func: ident, $ty: ident, $to: ident, $out: ident) => {
528		paste! {
529			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$out s>] {
530				self.[<$func _ $to s>](self.[<transmute_ $to s_ $ty s>](a), self.[<transmute_ $to s_ $ty s>](b))
531			}
532		}
533	};
534	($func: ident, $($ty: ident => $to: ident => $out: ident),*) => {
535		$(transmute_cmp!($func, $ty, $to, $out);)*
536	};
537}
538
539macro_rules! define_splat {
540	($ty: ty) => {
541		paste! {
542			fn [<splat_ $ty s>](self, value: $ty) -> Self::[<$ty s>];
543		}
544	};
545	($($ty: ident),*) => {
546		$(define_splat!($ty);)*
547	};
548}
549
550macro_rules! split_slice {
551	($ty: ident) => {
552		paste! {
553			#[inline(always)]
554			fn [<as_mut_rsimd_ $ty s>](slice: &mut [$ty]) -> (&mut [$ty], &mut [Self::[<$ty s>]]) {
555				unsafe { rsplit_mut_slice(slice) }
556			}
557			#[inline(always)]
558			fn [<as_rsimd_ $ty s>](slice: &[$ty]) -> (&[$ty], &[Self::[<$ty s>]]) {
559				unsafe { rsplit_slice(slice) }
560			}
561			#[inline(always)]
562			fn [<as_mut_simd_ $ty s>](slice: &mut [$ty]) -> (&mut [Self::[<$ty s>]], &mut [$ty]) {
563				unsafe { split_mut_slice(slice) }
564			}
565			#[inline(always)]
566			fn [<as_simd_ $ty s>](slice: &[$ty]) -> (&[Self::[<$ty s>]], &[$ty]) {
567				unsafe { split_slice(slice) }
568			}
569			#[inline(always)]
570			fn [<as_uninit_mut_rsimd_ $ty s>](
571				slice: &mut [MaybeUninit<$ty>],
572			) -> (&mut [MaybeUninit<$ty>], &mut [MaybeUninit<Self::[<$ty s>]>]) {
573				unsafe { rsplit_mut_slice(slice) }
574			}
575			#[inline(always)]
576			fn [<as_uninit_mut_simd_ $ty s>](
577				slice: &mut [MaybeUninit<$ty>],
578			) -> (&mut [MaybeUninit<Self::[<$ty s>]>], &mut [MaybeUninit<$ty>]) {
579				unsafe { split_mut_slice(slice) }
580			}
581		}
582	};
583	($($ty: ident),*) => {
584		$(split_slice!($ty);)*
585	};
586}
587
588/// Types that allow \[de\]interleaving.
589///
590/// # Safety
591/// Instances of this type passed to simd \[de\]interleave functions must be `Pod`.
592pub unsafe trait Interleave {}
593unsafe impl<T: Pod> Interleave for T {}
594
595pub trait Simd: Seal + Debug + Copy + Send + Sync + 'static {
596	const IS_SCALAR: bool = false;
597
598	const M64_LANES: usize = core::mem::size_of::<Self::m64s>() / core::mem::size_of::<m64>();
599	const U64_LANES: usize = core::mem::size_of::<Self::u64s>() / core::mem::size_of::<u64>();
600	const I64_LANES: usize = core::mem::size_of::<Self::i64s>() / core::mem::size_of::<i64>();
601	const F64_LANES: usize = core::mem::size_of::<Self::f64s>() / core::mem::size_of::<f64>();
602	const C64_LANES: usize = core::mem::size_of::<Self::c64s>() / core::mem::size_of::<c64>();
603
604	const M32_LANES: usize = core::mem::size_of::<Self::m32s>() / core::mem::size_of::<m32>();
605	const U32_LANES: usize = core::mem::size_of::<Self::u32s>() / core::mem::size_of::<u32>();
606	const I32_LANES: usize = core::mem::size_of::<Self::i32s>() / core::mem::size_of::<i32>();
607	const F32_LANES: usize = core::mem::size_of::<Self::f32s>() / core::mem::size_of::<f32>();
608	const C32_LANES: usize = core::mem::size_of::<Self::c32s>() / core::mem::size_of::<c32>();
609
610	const M16_LANES: usize = core::mem::size_of::<Self::m16s>() / core::mem::size_of::<m16>();
611	const U16_LANES: usize = core::mem::size_of::<Self::u16s>() / core::mem::size_of::<u16>();
612	const I16_LANES: usize = core::mem::size_of::<Self::i16s>() / core::mem::size_of::<i16>();
613
614	const M8_LANES: usize = core::mem::size_of::<Self::m8s>() / core::mem::size_of::<m8>();
615	const U8_LANES: usize = core::mem::size_of::<Self::u8s>() / core::mem::size_of::<u8>();
616	const I8_LANES: usize = core::mem::size_of::<Self::i8s>() / core::mem::size_of::<i8>();
617
618	const REGISTER_COUNT: usize;
619
620	type m8s: Debug + Copy + Send + Sync + Zeroable + NoUninit + CheckedBitPattern + 'static;
621	type i8s: Debug + Copy + Send + Sync + Pod + 'static;
622	type u8s: Debug + Copy + Send + Sync + Pod + 'static;
623
624	type m16s: Debug + Copy + Send + Sync + Zeroable + NoUninit + CheckedBitPattern + 'static;
625	type i16s: Debug + Copy + Send + Sync + Pod + 'static;
626	type u16s: Debug + Copy + Send + Sync + Pod + 'static;
627
628	type m32s: Debug + Copy + Send + Sync + Zeroable + NoUninit + CheckedBitPattern + 'static;
629	type f32s: Debug + Copy + Send + Sync + Pod + 'static;
630	type c32s: Debug + Copy + Send + Sync + Pod + 'static;
631	type i32s: Debug + Copy + Send + Sync + Pod + 'static;
632	type u32s: Debug + Copy + Send + Sync + Pod + 'static;
633
634	type m64s: Debug + Copy + Send + Sync + Zeroable + NoUninit + CheckedBitPattern + 'static;
635	type f64s: Debug + Copy + Send + Sync + Pod + 'static;
636	type c64s: Debug + Copy + Send + Sync + Pod + 'static;
637	type i64s: Debug + Copy + Send + Sync + Pod + 'static;
638	type u64s: Debug + Copy + Send + Sync + Pod + 'static;
639
640	/// Contains the square of the norm in both the real and imaginary components.
641	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;
642
643	/// Contains the square of the norm in both the real and imaginary components.
644	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
645	#[inline]
646	fn abs_f32s(self, a: Self::f32s) -> Self::f32s {
647		self.and_f32s(self.not_f32s(self.splat_f32s(-0.0)), a)
648	}
649	#[inline]
650	fn abs_f64s(self, a: Self::f64s) -> Self::f64s {
651		self.and_f64s(self.not_f64s(self.splat_f64s(-0.0)), a)
652	}
653	/// Contains the max norm in both the real and imaginary components.
654	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
655	/// Contains the max norm in both the real and imaginary components.
656	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
657
658	define_binop_all!(add, c32, c64, f32, f64, u8, u16, u32, u64);
659	define_binop_all!(
660		sub, c32, c64, f32, f64, u8, i8, u16, i16, u32, i32, u64, i64
661	);
662	define_binop_all!(mul, c32, c64, f32, f64, u16, i16, u32, i32, u64, i64);
663	define_binop_all!(div, f32, f64);
664	define_binop_all!(equal, u8 => m8, u16 => m16, u32 => m32, u64 => m64, c32 => m32, f32 => m32, c64 => m64, f64 => m64);
665	define_binop_all!(greater_than, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
666	define_binop_all!(greater_than_or_equal, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
667	define_binop_all!(less_than_or_equal, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
668	define_binop_all!(less_than, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
669
670	define_binop_all!(and, u8, u16, u32, u64);
671	define_binop_all!(or, u8, u16, u32, u64);
672	define_binop_all!(xor, u8, u16, u32, u64);
673
674	transmute_binop!(and, m8 => u8, i8 => u8, m16 => u16, i16 => u16, m32 => u32, i32 => u32, m64 => u64, i64 => u64, f32 => u32, f64 => u64);
675	transmute_binop!(or, m8 => u8, i8 => u8, m16 => u16, i16 => u16, m32 => u32, i32 => u32, m64 => u64, i64 => u64, f32 => u32, f64 => u64);
676	transmute_binop!(xor, m8 => u8, i8 => u8, m16 => u16, i16 => u16, m32 => u32, i32 => u32, m64 => u64, i64 => u64, f32 => u32, f64 => u64);
677
678	transmute_binop!(add, i8 => u8, i16 => u16, i32 => u32, i64 => u64);
679	transmute_cmp!(equal, m8 => u8 => m8, i8 => u8 => m8, m16 => u16 => m16, i16 => u16 => m16, m32 => u32 => m32, i32 => u32 => m32, m64 => u64 => m64, i64 => u64 => m64);
680
681	define_binop_all!(min, f32, f64, u8, i8, u16, i16, u32, i32, u64, i64);
682	define_binop_all!(max, f32, f64, u8, i8, u16, i16, u32, i32, u64, i64);
683
684	define_unop_all!(neg, c32, c64);
685	define_unop_all!(not, m8, u8, m16, u16, m32, u32, m64, u64);
686
687	transmute_unop!(not, i8 => u8, i16 => u16, i32 => u32, i64 => u64, f32 => u32, f64 => u64);
688
689	split_slice!(u8, i8, u16, i16, u32, i32, u64, i64, c32, f32, c64, f64);
690	define_splat!(u8, i8, u16, i16, u32, i32, u64, i64, c32, f32, c64, f64);
691
692	fn sqrt_f32s(self, a: Self::f32s) -> Self::f32s;
693	fn sqrt_f64s(self, a: Self::f64s) -> Self::f64s;
694
695	fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
696	fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
697	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
698	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
699
700	/// Computes `conj(a) * b + c`
701	#[inline]
702	fn conj_mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
703		self.conj_mul_add_c32s(a, b, c)
704	}
705	/// Computes `conj(a) * b + c`
706	#[inline]
707	fn conj_mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
708		self.conj_mul_add_c64s(a, b, c)
709	}
710	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
711
712	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
713	/// Computes `conj(a) * b`
714	#[inline]
715	fn conj_mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
716		self.conj_mul_c32s(a, b)
717	}
718	/// Computes `conj(a) * b`
719	#[inline]
720	fn conj_mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
721		self.conj_mul_c64s(a, b)
722	}
723	#[inline(always)]
724	fn deinterleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
725		unsafe { deinterleave_fallback::<f32, Self::f32s, T>(values) }
726	}
727
728	#[inline(always)]
729	fn deinterleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
730		unsafe { deinterleave_fallback::<f64, Self::f64s, T>(values) }
731	}
732
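	/// Returns the index of the first set lane of `mask` (the lane count if no
	/// lane is set).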
733	#[inline(always)]
734	fn first_true_m8s(self, mask: Self::m8s) -> usize {
735		if const { core::mem::size_of::<Self::m8s>() == core::mem::size_of::<Self::u8s>() } {
736			let mask: Self::u8s = bytemuck::cast(mask);
737			let slice = bytemuck::cast_slice::<Self::u8s, u8>(core::slice::from_ref(&mask));
738			let mut i = 0;
739			for &x in slice.iter() {
740				if x != 0 {
741					break;
742				}
743				i += 1;
744			}
745			i
746		} else if const { core::mem::size_of::<Self::m8s>() == core::mem::size_of::<u8>() } {
747			let mask: u8 = bytemuck::cast(mask);
748			mask.trailing_zeros() as usize
749		} else if const { core::mem::size_of::<Self::m8s>() == core::mem::size_of::<u16>() } {
750			let mask: u16 = bytemuck::cast(mask);
751			mask.trailing_zeros() as usize
752		} else {
753			panic!()
754		}
755	}
756
757	#[inline(always)]
758	fn first_true_m16s(self, mask: Self::m16s) -> usize {
759		if const { core::mem::size_of::<Self::m16s>() == core::mem::size_of::<Self::u16s>() } {
760			let mask: Self::u16s = bytemuck::cast(mask);
761			let slice = bytemuck::cast_slice::<Self::u16s, u16>(core::slice::from_ref(&mask));
762			let mut i = 0;
763			for &x in slice.iter() {
764				if x != 0 {
765					break;
766				}
767				i += 1;
768			}
769			i
770		} else if const { core::mem::size_of::<Self::m16s>() == core::mem::size_of::<u8>() } {
771			let mask: u8 = bytemuck::cast(mask);
772			mask.trailing_zeros() as usize
773		} else if const { core::mem::size_of::<Self::m16s>() == core::mem::size_of::<u16>() } {
774			let mask: u16 = bytemuck::cast(mask);
775			mask.trailing_zeros() as usize
776		} else {
777			panic!()
778		}
779	}
780
781	#[inline(always)]
782	fn first_true_m32s(self, mask: Self::m32s) -> usize {
783		if const { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<Self::u32s>() } {
784			let mask: Self::u32s = bytemuck::cast(mask);
785			let slice = bytemuck::cast_slice::<Self::u32s, u32>(core::slice::from_ref(&mask));
786			let mut i = 0;
787			for &x in slice.iter() {
788				if x != 0 {
789					break;
790				}
791				i += 1;
792			}
793			i
794		} else if const { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<u8>() } {
795			let mask: u8 = bytemuck::cast(mask);
796			mask.trailing_zeros() as usize
797		} else if const { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<u16>() } {
798			let mask: u16 = bytemuck::cast(mask);
799			mask.trailing_zeros() as usize
800		} else {
801			panic!()
802		}
803	}
804
805	#[inline(always)]
806	fn first_true_m64s(self, mask: Self::m64s) -> usize {
807		if const { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<Self::u64s>() } {
808			let mask: Self::u64s = bytemuck::cast(mask);
809			let slice = bytemuck::cast_slice::<Self::u64s, u64>(core::slice::from_ref(&mask));
810			let mut i = 0;
811			for &x in slice.iter() {
812				if x != 0 {
813					break;
814				}
815				i += 1;
816			}
817			i
818		} else if const { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<u8>() } {
819			let mask: u8 = bytemuck::cast(mask);
820			mask.trailing_zeros() as usize
821		} else if const { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<u16>() } {
822			let mask: u16 = bytemuck::cast(mask);
823			mask.trailing_zeros() as usize
824		} else {
825			panic!()
826		}
827	}
828
829	#[inline(always)]
830	fn interleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
831		unsafe { interleave_fallback::<f32, Self::f32s, T>(values) }
832	}
833
834	#[inline(always)]
835	fn interleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
836		unsafe { interleave_fallback::<f64, Self::f64s, T>(values) }
837	}
838
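	/// Returns a memory mask whose lanes with indices in `start..end` are
	/// enabled, and whose remaining lanes are disabled.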
839	#[inline(always)]
840	fn mask_between_m8s(self, start: u8, end: u8) -> MemMask<Self::m8s> {
841		let iota: Self::u8s = const {
842			unsafe { core::mem::transmute_copy(&iota_8::<u8, { MAX_REGISTER_BYTES / 1 }>()) }
843		};
844		self.and_m8s(
845			self.greater_than_or_equal_u8s(iota, self.splat_u8s(start)),
846			self.less_than_u8s(iota, self.splat_u8s(end)),
847		)
848		.into()
849	}
850
851	#[inline(always)]
852	fn mask_between_m16s(self, start: u16, end: u16) -> MemMask<Self::m16s> {
853		let iota: Self::u16s = const {
854			unsafe { core::mem::transmute_copy(&iota_16::<u16, { MAX_REGISTER_BYTES / 2 }>()) }
855		};
856		self.and_m16s(
857			self.greater_than_or_equal_u16s(iota, self.splat_u16s(start)),
858			self.less_than_u16s(iota, self.splat_u16s(end)),
859		)
860		.into()
861	}
862
863	#[inline(always)]
864	fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s> {
865		let iota: Self::u32s = const {
866			unsafe { core::mem::transmute_copy(&iota_32::<u32, { MAX_REGISTER_BYTES / 4 }>()) }
867		};
868		self.and_m32s(
869			self.greater_than_or_equal_u32s(iota, self.splat_u32s(start)),
870			self.less_than_u32s(iota, self.splat_u32s(end)),
871		)
872		.into()
873	}
874
875	#[inline(always)]
876	fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s> {
877		let iota: Self::u64s = const {
878			unsafe { core::mem::transmute_copy(&iota_64::<u64, { MAX_REGISTER_BYTES / 8 }>()) }
879		};
880		self.and_m64s(
881			self.greater_than_or_equal_u64s(iota, self.splat_u64s(start)),
882			self.less_than_u64s(iota, self.splat_u64s(end)),
883		)
884		.into()
885	}
886	/// # Safety
887	///
888	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
889	/// [`core::ptr::read`].
890	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s;
891	/// # Safety
892	///
893	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
894	/// [`core::ptr::read`].
895	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s;
896	/// # Safety
897	///
898	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
899	/// [`core::ptr::read`].
900	#[inline(always)]
901	unsafe fn mask_load_ptr_f32s(self, mask: MemMask<Self::m32s>, ptr: *const f32) -> Self::f32s {
902		self.transmute_f32s_u32s(self.mask_load_ptr_u32s(mask, ptr as *const u32))
903	}
904
905	/// # Safety
906	///
907	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
908	/// [`core::ptr::read`].
909	#[inline(always)]
910	unsafe fn mask_load_ptr_f64s(self, mask: MemMask<Self::m64s>, ptr: *const f64) -> Self::f64s {
911		self.transmute_f64s_u64s(self.mask_load_ptr_u64s(mask, ptr as *const u64))
912	}
913	/// # Safety
914	///
915	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
916	/// [`core::ptr::read`].
917	#[inline(always)]
918	unsafe fn mask_load_ptr_i8s(self, mask: MemMask<Self::m8s>, ptr: *const i8) -> Self::i8s {
919		self.transmute_i8s_u8s(self.mask_load_ptr_u8s(mask, ptr as *const u8))
920	}
921	/// # Safety
922	///
923	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
924	/// [`core::ptr::read`].
925	#[inline(always)]
926	unsafe fn mask_load_ptr_i16s(self, mask: MemMask<Self::m16s>, ptr: *const i16) -> Self::i16s {
927		self.transmute_i16s_u16s(self.mask_load_ptr_u16s(mask, ptr as *const u16))
928	}
929	/// # Safety
930	///
931	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
932	/// [`core::ptr::read`].
933	#[inline(always)]
934	unsafe fn mask_load_ptr_i32s(self, mask: MemMask<Self::m32s>, ptr: *const i32) -> Self::i32s {
935		self.transmute_i32s_u32s(self.mask_load_ptr_u32s(mask, ptr as *const u32))
936	}
937	/// # Safety
938	///
939	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
940	/// [`core::ptr::read`].
941	#[inline(always)]
942	unsafe fn mask_load_ptr_i64s(self, mask: MemMask<Self::m64s>, ptr: *const i64) -> Self::i64s {
943		self.transmute_i64s_u64s(self.mask_load_ptr_u64s(mask, ptr as *const u64))
944	}
945
946	/// # Safety
947	///
948	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
949	/// [`core::ptr::read`].
950	unsafe fn mask_load_ptr_u8s(self, mask: MemMask<Self::m8s>, ptr: *const u8) -> Self::u8s;
951
952	/// # Safety
953	///
954	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
955	/// [`core::ptr::read`].
956	unsafe fn mask_load_ptr_u16s(self, mask: MemMask<Self::m16s>, ptr: *const u16) -> Self::u16s;
957
958	/// # Safety
959	///
960	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
961	/// [`core::ptr::read`].
962	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s;
963
964	/// # Safety
965	///
966	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
967	/// [`core::ptr::read`].
968	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s;
969	/// # Safety
970	///
971	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
972	/// [`core::ptr::write`].
973	unsafe fn mask_store_ptr_c32s(
974		self,
975		mask: MemMask<Self::m32s>,
976		ptr: *mut c32,
977		values: Self::c32s,
978	);
979	/// # Safety
980	///
981	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
982	/// [`core::ptr::write`].
983	unsafe fn mask_store_ptr_c64s(
984		self,
985		mask: MemMask<Self::m64s>,
986		ptr: *mut c64,
987		values: Self::c64s,
988	);
989	/// # Safety
990	///
991	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
992	/// [`core::ptr::write`].
993	#[inline(always)]
994	unsafe fn mask_store_ptr_f32s(
995		self,
996		mask: MemMask<Self::m32s>,
997		ptr: *mut f32,
998		values: Self::f32s,
999	) {
1000		self.mask_store_ptr_u32s(mask, ptr as *mut u32, self.transmute_u32s_f32s(values));
1001	}
1002
1003	/// # Safety
1004	///
1005	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1006	/// [`core::ptr::write`].
1007	#[inline(always)]
1008	unsafe fn mask_store_ptr_f64s(
1009		self,
1010		mask: MemMask<Self::m64s>,
1011		ptr: *mut f64,
1012		values: Self::f64s,
1013	) {
1014		self.mask_store_ptr_u64s(mask, ptr as *mut u64, self.transmute_u64s_f64s(values));
1015	}
1016	/// # Safety
1017	///
1018	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1019	/// [`core::ptr::write`].
1020	#[inline(always)]
1021	unsafe fn mask_store_ptr_i8s(self, mask: MemMask<Self::m8s>, ptr: *mut i8, values: Self::i8s) {
1022		self.mask_store_ptr_u8s(mask, ptr as *mut u8, self.transmute_u8s_i8s(values));
1023	}
1024	/// # Safety
1025	///
1026	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1027	/// [`core::ptr::write`].
1028	#[inline(always)]
1029	unsafe fn mask_store_ptr_i16s(
1030		self,
1031		mask: MemMask<Self::m16s>,
1032		ptr: *mut i16,
1033		values: Self::i16s,
1034	) {
1035		self.mask_store_ptr_u16s(mask, ptr as *mut u16, self.transmute_u16s_i16s(values));
1036	}
1037	/// # Safety
1038	///
1039	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1040	/// [`core::ptr::write`].
1041	#[inline(always)]
1042	unsafe fn mask_store_ptr_i32s(
1043		self,
1044		mask: MemMask<Self::m32s>,
1045		ptr: *mut i32,
1046		values: Self::i32s,
1047	) {
1048		self.mask_store_ptr_u32s(mask, ptr as *mut u32, self.transmute_u32s_i32s(values));
1049	}
1050	/// # Safety
1051	///
1052	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1053	/// [`core::ptr::write`].
1054	#[inline(always)]
1055	unsafe fn mask_store_ptr_i64s(
1056		self,
1057		mask: MemMask<Self::m64s>,
1058		ptr: *mut i64,
1059		values: Self::i64s,
1060	) {
1061		self.mask_store_ptr_u64s(mask, ptr as *mut u64, self.transmute_u64s_i64s(values));
1062	}
1063
1064	/// # Safety
1065	///
1066	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1067	/// [`core::ptr::write`].
1068	unsafe fn mask_store_ptr_u8s(self, mask: MemMask<Self::m8s>, ptr: *mut u8, values: Self::u8s);
1069
1070	/// # Safety
1071	///
1072	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1073	/// [`core::ptr::write`].
1074	unsafe fn mask_store_ptr_u16s(
1075		self,
1076		mask: MemMask<Self::m16s>,
1077		ptr: *mut u16,
1078		values: Self::u16s,
1079	);
1080
1081	/// # Safety
1082	///
1083	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1084	/// [`core::ptr::write`].
1085	unsafe fn mask_store_ptr_u32s(
1086		self,
1087		mask: MemMask<Self::m32s>,
1088		ptr: *mut u32,
1089		values: Self::u32s,
1090	);
1091
1092	/// # Safety
1093	///
1094	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1095	/// [`core::ptr::write`].
1096	unsafe fn mask_store_ptr_u64s(
1097		self,
1098		mask: MemMask<Self::m64s>,
1099		ptr: *mut u64,
1100		values: Self::u64s,
1101	);
1102
1103	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
1104	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
1105	/// Computes `a * b + c`
1106	#[inline]
1107	fn mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
1108		self.mul_add_c32s(a, b, c)
1109	}
1110	/// Computes `a * b + c`
1111	#[inline]
1112	fn mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
1113		self.mul_add_c64s(a, b, c)
1114	}
1115	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
1116	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
1117	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
1118	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
1119	/// Computes `a * b`
1120	#[inline]
1121	fn mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
1122		self.mul_c32s(a, b)
1123	}
1124	/// Computes `a * b`
	#[inline]
1125	fn mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
1126		self.mul_c64s(a, b)
1127	}
1128
1129	#[inline]
1130	fn neg_f32s(self, a: Self::f32s) -> Self::f32s {
1131		self.xor_f32s(self.splat_f32s(-0.0), a)
1132	}
1133	#[inline]
1134	fn neg_f64s(self, a: Self::f64s) -> Self::f64s {
1135		self.xor_f64s(a, self.splat_f64s(-0.0))
1136	}
1137
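	// The `partial_load_*` / `partial_store_*` helpers read or write only the
	// first `slice.len()` lanes of a register: the unsigned-integer variants
	// build a `mask_between_*` mask over `0..slice.len()` and do a masked load
	// or store, and the other element types delegate to them through bit casts.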
1138	#[inline(always)]
1139	fn partial_load_c32s(self, slice: &[c32]) -> Self::c32s {
1140		cast(self.partial_load_f64s(bytemuck::cast_slice(slice)))
1141	}
1142	#[inline(always)]
1143	fn partial_load_c64s(self, slice: &[c64]) -> Self::c64s {
1144		cast(self.partial_load_f64s(bytemuck::cast_slice(slice)))
1145	}
1146	#[inline(always)]
1147	fn partial_load_f32s(self, slice: &[f32]) -> Self::f32s {
1148		cast(self.partial_load_u32s(bytemuck::cast_slice(slice)))
1149	}
1150	#[inline(always)]
1151	fn partial_load_f64s(self, slice: &[f64]) -> Self::f64s {
1152		cast(self.partial_load_u64s(bytemuck::cast_slice(slice)))
1153	}
1154	#[inline(always)]
1155	fn partial_load_i8s(self, slice: &[i8]) -> Self::i8s {
1156		cast(self.partial_load_u8s(bytemuck::cast_slice(slice)))
1157	}
1158	#[inline(always)]
1159	fn partial_load_i16s(self, slice: &[i16]) -> Self::i16s {
1160		cast(self.partial_load_u16s(bytemuck::cast_slice(slice)))
1161	}
1162	#[inline(always)]
1163	fn partial_load_i32s(self, slice: &[i32]) -> Self::i32s {
1164		cast(self.partial_load_u32s(bytemuck::cast_slice(slice)))
1165	}
1166	#[inline(always)]
1167	fn partial_load_i64s(self, slice: &[i64]) -> Self::i64s {
1168		cast(self.partial_load_u64s(bytemuck::cast_slice(slice)))
1169	}
1170	#[inline(always)]
1171	fn partial_load_u8s(self, slice: &[u8]) -> Self::u8s {
1172		unsafe {
1173			self.mask_load_ptr_u8s(self.mask_between_m8s(0, slice.len() as u8), slice.as_ptr())
1174		}
1175	}
1176	#[inline(always)]
1177	fn partial_load_u16s(self, slice: &[u16]) -> Self::u16s {
1178		unsafe {
1179			self.mask_load_ptr_u16s(
1180				self.mask_between_m16s(0, slice.len() as u16),
1181				slice.as_ptr(),
1182			)
1183		}
1184	}
1185	#[inline(always)]
1186	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
1187		unsafe {
1188			self.mask_load_ptr_u32s(
1189				self.mask_between_m32s(0, slice.len() as u32),
1190				slice.as_ptr(),
1191			)
1192		}
1193	}
1194	#[inline(always)]
1195	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
1196		unsafe {
1197			self.mask_load_ptr_u64s(
1198				self.mask_between_m64s(0, slice.len() as u64),
1199				slice.as_ptr(),
1200			)
1201		}
1202	}
1203
1204	#[inline(always)]
1205	fn partial_store_c32s(self, slice: &mut [c32], values: Self::c32s) {
1206		self.partial_store_f64s(bytemuck::cast_slice_mut(slice), cast(values))
1207	}
1208	#[inline(always)]
1209	fn partial_store_c64s(self, slice: &mut [c64], values: Self::c64s) {
1210		self.partial_store_f64s(bytemuck::cast_slice_mut(slice), cast(values))
1211	}
1212
1213	#[inline(always)]
1214	fn partial_store_f32s(self, slice: &mut [f32], values: Self::f32s) {
1215		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast(values))
1216	}
1217	#[inline(always)]
1218	fn partial_store_f64s(self, slice: &mut [f64], values: Self::f64s) {
1219		self.partial_store_u64s(bytemuck::cast_slice_mut(slice), cast(values))
1220	}
1221	#[inline(always)]
1222	fn partial_store_i8s(self, slice: &mut [i8], values: Self::i8s) {
1223		self.partial_store_u8s(bytemuck::cast_slice_mut(slice), cast(values))
1224	}
1225	#[inline(always)]
1226	fn partial_store_i16s(self, slice: &mut [i16], values: Self::i16s) {
1227		self.partial_store_u16s(bytemuck::cast_slice_mut(slice), cast(values))
1228	}
1229	#[inline(always)]
1230	fn partial_store_i32s(self, slice: &mut [i32], values: Self::i32s) {
1231		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast(values))
1232	}
1233	#[inline(always)]
1234	fn partial_store_i64s(self, slice: &mut [i64], values: Self::i64s) {
1235		self.partial_store_u64s(bytemuck::cast_slice_mut(slice), cast(values))
1236	}
1237	#[inline(always)]
1238	fn partial_store_u8s(self, slice: &mut [u8], values: Self::u8s) {
1239		unsafe {
1240			self.mask_store_ptr_u8s(
1241				self.mask_between_m8s(0, slice.len() as u8),
1242				slice.as_mut_ptr(),
1243				values,
1244			)
1245		}
1246	}
1247	#[inline(always)]
1248	fn partial_store_u16s(self, slice: &mut [u16], values: Self::u16s) {
1249		unsafe {
1250			self.mask_store_ptr_u16s(
1251				self.mask_between_m16s(0, slice.len() as u16),
1252				slice.as_mut_ptr(),
1253				values,
1254			)
1255		}
1256	}
1257	#[inline(always)]
1258	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
1259		unsafe {
1260			self.mask_store_ptr_u32s(
1261				self.mask_between_m32s(0, slice.len() as u32),
1262				slice.as_mut_ptr(),
1263				values,
1264			)
1265		}
1266	}
1267	#[inline(always)]
1268	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
1269		unsafe {
1270			self.mask_store_ptr_u64s(
1271				self.mask_between_m64s(0, slice.len() as u64),
1272				slice.as_mut_ptr(),
1273				values,
1274			)
1275		}
1276	}
1277	fn reduce_max_c32s(self, a: Self::c32s) -> c32;
1278	fn reduce_max_c64s(self, a: Self::c64s) -> c64;
1279	fn reduce_max_f32s(self, a: Self::f32s) -> f32;
1280	fn reduce_max_f64s(self, a: Self::f64s) -> f64;
1281	fn reduce_min_c32s(self, a: Self::c32s) -> c32;
1282	fn reduce_min_c64s(self, a: Self::c64s) -> c64;
1283	fn reduce_min_f32s(self, a: Self::f32s) -> f32;
1284	fn reduce_min_f64s(self, a: Self::f64s) -> f64;
1285
1286	fn reduce_product_f32s(self, a: Self::f32s) -> f32;
1287	fn reduce_product_f64s(self, a: Self::f64s) -> f64;
1288	fn reduce_sum_c32s(self, a: Self::c32s) -> c32;
1289	fn reduce_sum_c64s(self, a: Self::c64s) -> c64;
1290
1291	fn reduce_sum_f32s(self, a: Self::f32s) -> f32;
1292	fn reduce_sum_f64s(self, a: Self::f64s) -> f64;
1293	#[inline(always)]
1294	fn rotate_left_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
1295		self.rotate_right_c32s(a, amount.wrapping_neg())
1296	}
1297	#[inline(always)]
1298	fn rotate_left_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
1299		self.rotate_right_c64s(a, amount.wrapping_neg())
1300	}
1301
1302	#[inline(always)]
1303	fn rotate_left_f32s(self, a: Self::f32s, amount: usize) -> Self::f32s {
1304		cast(self.rotate_left_u32s(cast(a), amount))
1305	}
1306	#[inline(always)]
1307	fn rotate_left_f64s(self, a: Self::f64s, amount: usize) -> Self::f64s {
1308		cast(self.rotate_left_u64s(cast(a), amount))
1309	}
1310	#[inline(always)]
1311	fn rotate_left_i32s(self, a: Self::i32s, amount: usize) -> Self::i32s {
1312		cast(self.rotate_left_u32s(cast(a), amount))
1313	}
1314
1315	#[inline(always)]
1316	fn rotate_left_i64s(self, a: Self::i64s, amount: usize) -> Self::i64s {
1317		cast(self.rotate_left_u64s(cast(a), amount))
1318	}
1319
1320	#[inline(always)]
1321	fn rotate_left_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
1322		self.rotate_right_u32s(a, amount.wrapping_neg())
1323	}
1324	#[inline(always)]
1325	fn rotate_left_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
1326		self.rotate_right_u64s(a, amount.wrapping_neg())
1327	}
1328	fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s;
1329	fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s;
1330	#[inline(always)]
1331	fn rotate_right_f32s(self, a: Self::f32s, amount: usize) -> Self::f32s {
1332		cast(self.rotate_right_u32s(cast(a), amount))
1333	}
1334	#[inline(always)]
1335	fn rotate_right_f64s(self, a: Self::f64s, amount: usize) -> Self::f64s {
1336		cast(self.rotate_right_u64s(cast(a), amount))
1337	}
1338	#[inline(always)]
1339	fn rotate_right_i32s(self, a: Self::i32s, amount: usize) -> Self::i32s {
1340		cast(self.rotate_right_u32s(cast(a), amount))
1341	}
1342	#[inline(always)]
1343	fn rotate_right_i64s(self, a: Self::i64s, amount: usize) -> Self::i64s {
1344		cast(self.rotate_right_u64s(cast(a), amount))
1345	}
1346	fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s;
1347	fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s;
1348
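	/// Lane-wise select: picks `if_true` in lanes where `mask` is set and
	/// `if_false` in the remaining lanes.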
1349	#[inline]
1350	fn select_f32s(
1351		self,
1352		mask: Self::m32s,
1353		if_true: Self::f32s,
1354		if_false: Self::f32s,
1355	) -> Self::f32s {
1356		self.transmute_f32s_u32s(self.select_u32s(
1357			mask,
1358			self.transmute_u32s_f32s(if_true),
1359			self.transmute_u32s_f32s(if_false),
1360		))
1361	}
1362	#[inline]
1363	fn select_f64s(
1364		self,
1365		mask: Self::m64s,
1366		if_true: Self::f64s,
1367		if_false: Self::f64s,
1368	) -> Self::f64s {
1369		self.transmute_f64s_u64s(self.select_u64s(
1370			mask,
1371			self.transmute_u64s_f64s(if_true),
1372			self.transmute_u64s_f64s(if_false),
1373		))
1374	}
1375	#[inline]
1376	fn select_i32s(
1377		self,
1378		mask: Self::m32s,
1379		if_true: Self::i32s,
1380		if_false: Self::i32s,
1381	) -> Self::i32s {
1382		self.transmute_i32s_u32s(self.select_u32s(
1383			mask,
1384			self.transmute_u32s_i32s(if_true),
1385			self.transmute_u32s_i32s(if_false),
1386		))
1387	}
1388	#[inline]
1389	fn select_i64s(
1390		self,
1391		mask: Self::m64s,
1392		if_true: Self::i64s,
1393		if_false: Self::i64s,
1394	) -> Self::i64s {
1395		self.transmute_i64s_u64s(self.select_u64s(
1396			mask,
1397			self.transmute_u64s_i64s(if_true),
1398			self.transmute_u64s_i64s(if_false),
1399		))
1400	}
1401	fn select_u32s(self, mask: Self::m32s, if_true: Self::u32s, if_false: Self::u32s)
1402	-> Self::u32s;
1403	fn select_u64s(self, mask: Self::m64s, if_true: Self::u64s, if_false: Self::u64s)
1404	-> Self::u64s;
1405
1406	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
1407	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
1408
1409	#[inline]
1410	fn transmute_f32s_i32s(self, a: Self::i32s) -> Self::f32s {
1411		cast(a)
1412	}
1413	#[inline]
1414	fn transmute_f32s_u32s(self, a: Self::u32s) -> Self::f32s {
1415		cast(a)
1416	}
1417
1418	#[inline]
1419	fn transmute_f64s_i64s(self, a: Self::i64s) -> Self::f64s {
1420		cast(a)
1421	}
1422	#[inline]
1423	fn transmute_f64s_u64s(self, a: Self::u64s) -> Self::f64s {
1424		cast(a)
1425	}
1426	#[inline]
1427	fn transmute_i32s_f32s(self, a: Self::f32s) -> Self::i32s {
1428		cast(a)
1429	}
1430	#[inline]
1431	fn transmute_m8s_u8s(self, a: Self::u8s) -> Self::m8s {
1432		checked::cast(a)
1433	}
1434	#[inline]
1435	fn transmute_u8s_m8s(self, a: Self::m8s) -> Self::u8s {
1436		cast(a)
1437	}
1438	#[inline]
1439	fn transmute_m16s_u16s(self, a: Self::u16s) -> Self::m16s {
1440		checked::cast(a)
1441	}
1442	#[inline]
1443	fn transmute_u16s_m16s(self, a: Self::m16s) -> Self::u16s {
1444		cast(a)
1445	}
1446	#[inline]
1447	fn transmute_m32s_u32s(self, a: Self::u32s) -> Self::m32s {
1448		checked::cast(a)
1449	}
1450	#[inline]
1451	fn transmute_u32s_m32s(self, a: Self::m32s) -> Self::u32s {
1452		cast(a)
1453	}
1454	#[inline]
1455	fn transmute_m64s_u64s(self, a: Self::u64s) -> Self::m64s {
1456		checked::cast(a)
1457	}
1458	#[inline]
1459	fn transmute_u64s_m64s(self, a: Self::m64s) -> Self::u64s {
1460		cast(a)
1461	}
1462	#[inline]
1463	fn transmute_i8s_u8s(self, a: Self::u8s) -> Self::i8s {
1464		cast(a)
1465	}
1466	#[inline]
1467	fn transmute_u8s_i8s(self, a: Self::i8s) -> Self::u8s {
1468		cast(a)
1469	}
1470	#[inline]
1471	fn transmute_u16s_i16s(self, a: Self::i16s) -> Self::u16s {
1472		cast(a)
1473	}
1474	#[inline]
1475	fn transmute_i16s_u16s(self, a: Self::u16s) -> Self::i16s {
1476		cast(a)
1477	}
1478	#[inline]
1479	fn transmute_i32s_u32s(self, a: Self::u32s) -> Self::i32s {
1480		cast(a)
1481	}
1482	#[inline]
1483	fn transmute_i64s_f64s(self, a: Self::f64s) -> Self::i64s {
1484		cast(a)
1485	}
1486	#[inline]
1487	fn transmute_i64s_u64s(self, a: Self::u64s) -> Self::i64s {
1488		cast(a)
1489	}
1490
1491	#[inline]
1492	fn transmute_u32s_f32s(self, a: Self::f32s) -> Self::u32s {
1493		cast(a)
1494	}
1495	#[inline]
1496	fn transmute_u32s_i32s(self, a: Self::i32s) -> Self::u32s {
1497		cast(a)
1498	}
1499	#[inline]
1500	fn transmute_u64s_f64s(self, a: Self::f64s) -> Self::u64s {
1501		cast(a)
1502	}
1503	#[inline]
1504	fn transmute_u64s_i64s(self, a: Self::i64s) -> Self::u64s {
1505		cast(a)
1506	}
1507
1508	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output;
1509	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
1510	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
1511	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
1512}
1513
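/// Marker trait for the portable `Simd` backends defined below (`Scalar` and
/// the fixed-width `Scalar*b` variants), which use no target-specific
/// intrinsics.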
1514pub trait PortableSimd: Simd {}
1515
1516impl PortableSimd for Scalar {}
1517impl PortableSimd for Scalar128b {}
1518impl PortableSimd for Scalar256b {}
1519impl PortableSimd for Scalar512b {}
1520
1521#[derive(Debug, Copy, Clone)]
1522pub struct Scalar;
1523
1524#[derive(Debug, Copy, Clone)]
1525pub struct Scalar128b;
1526#[derive(Debug, Copy, Clone)]
1527pub struct Scalar256b;
1528#[derive(Debug, Copy, Clone)]
1529pub struct Scalar512b;
1530
1531macro_rules! scalar_simd_binop_impl {
1532	($func: ident, $op: ident, $ty: ty) => {
1533		paste! {
1534			#[inline]
1535			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$ty s>] {
1536				let mut out = [<$ty as Default>::default(); Self::[<$ty:upper _LANES>]];
1537				let a: [$ty; Self::[<$ty:upper _LANES>]] = cast(a);
1538				let b: [$ty; Self::[<$ty:upper _LANES>]] = cast(b);
1539
1540				for i in 0..Self::[<$ty:upper _LANES>] {
1541					out[i] = a[i].$op(b[i]);
1542				}
1543
1544				cast(out)
1545			}
1546		}
1547	};
1548}
1549
1550macro_rules! scalar_simd_binop {
1551	($func: ident, op $op: ident, $($ty: ty),*) => {
1552		$(scalar_simd_binop_impl!($func, $op, $ty);)*
1553	};
1554	($func: ident, $($ty: ty),*) => {
1555		$(scalar_simd_binop_impl!($func, $func, $ty);)*
1556	};
1557}
1558
1559macro_rules! scalar_simd_unop_impl {
1560	($func: ident, $op: ident, $ty: ty) => {
1561		paste! {
1562			#[inline]
1563			fn [<$func _ $ty s>](self, a: Self::[<$ty s>]) -> Self::[<$ty s>] {
1564				let mut out = [<$ty as Default>::default(); Self::[<$ty:upper _LANES>]];
1565				let a: [$ty; Self::[<$ty:upper _LANES>]] = cast(a);
1566
1567				for i in 0..Self::[<$ty:upper _LANES>] {
1568					out[i] = a[i].$op();
1569				}
1570
1571				cast(out)
1572			}
1573		}
1574	};
1575}
1576
1577macro_rules! scalar_simd_unop {
1578	($func: ident, $($ty: ty),*) => {
1579		$(scalar_simd_unop_impl!($func, $func, $ty);)*
1580	};
1581}
1582
1583macro_rules! scalar_simd_cmp {
1584	($func: ident, $op: ident, $ty: ty, $mask: ty) => {
1585		paste! {
1586			#[inline]
1587			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$mask s>] {
1588				let mut out = [<$mask>::new(false); Self::[<$ty:upper _LANES>]];
1589				let a: [$ty; Self::[<$ty:upper _LANES>]] = cast(a);
1590				let b: [$ty; Self::[<$ty:upper _LANES>]] = cast(b);
1591				for i in 0..Self::[<$ty:upper _LANES>] {
1592					out[i] = <$mask>::new(a[i].$op(&b[i]));
1593				}
1594				cast(out)
1595			}
1596		}
1597	};
1598	($func: ident, op $op: ident, $($ty: ty => $mask: ty),*) => {
1599		$(scalar_simd_cmp!($func, $op, $ty, $mask);)*
1600	};
1601	($func: ident, $($ty: ty => $mask: ty),*) => {
1602		$(scalar_simd_cmp!($func, $func, $ty, $mask);)*
1603	};
1604}
1605
1606macro_rules! scalar_splat {
1607	($ty: ident) => {
1608		paste! {
1609			#[inline]
1610			fn [<splat_ $ty s>](self, value: $ty) -> Self::[<$ty s>] {
1611				cast([value; Self::[<$ty:upper _LANES>]])
1612			}
1613		}
1614	};
1615	($($ty: ident),*) => {
1616		$(scalar_splat!($ty);)*
1617	};
1618}
1619
1620macro_rules! scalar_partial_load {
1621	($ty: ident) => {
1622		paste! {
1623			#[inline]
1624			fn [<partial_load_ $ty s>](self, slice: &[$ty]) -> Self::[<$ty s>] {
1625				let mut values = [<$ty as Default>::default(); Self::[<$ty:upper _LANES>]];
1626				for i in 0..Ord::min(values.len(), slice.len()) {
1627					values[i] = slice[i];
1628				}
1629				cast(values)
1630			}
1631		}
1632	};
1633	($($ty: ident),*) => {
1634		$(scalar_partial_load!($ty);)*
1635	};
1636}
1637
1638macro_rules! scalar_partial_store {
1639	($ty: ident) => {
1640		paste! {
1641			#[inline]
1642			fn [<partial_store_ $ty s>](self, slice: &mut [$ty], values: Self::[<$ty s>]) {
1643				let values: [$ty; Self::[<$ty:upper _LANES>]] = cast(values);
1644				for i in 0..Ord::min(values.len(), slice.len()) {
1645					slice[i] = values[i];
1646				}
1647			}
1648		}
1649	};
1650	($($ty: ident),*) => {
1651		$(scalar_partial_store!($ty);)*
1652	};
1653}
1654
1655macro_rules! mask_load_ptr {
1656	($ty: ident, $mask: ident) => {
1657		paste! {
1658			#[inline]
1659			unsafe fn [<mask_load_ptr_ $ty s>](
1660				self,
1661				mask: MemMask<Self::[<$mask s>]>,
1662				ptr: *const $ty,
1663			) -> Self::[<$ty s>] {
1664				let mut values = [<$ty as Default>::default(); Self::[<$ty:upper _LANES>]];
1665				let mask: [$mask; Self::[<$ty:upper _LANES>]] = cast(mask.mask());
1666				for i in 0..Self::[<$ty:upper _LANES>] {
1667					if mask[i].is_set() {
1668						values[i] = *ptr.add(i);
1669					}
1670				}
1671				cast(values)
1672			}
1673		}
1674	};
1675	(cast $ty: ident, $to: ident, $mask: ident) => {
1676		paste! {
1677			#[inline]
1678			unsafe fn [<mask_load_ptr_ $ty s>](
1679				self,
1680				mask: MemMask<Self::[<$mask s>]>,
1681				ptr: *const $ty,
1682			) -> Self::[<$ty s>] {
1683				cast(self.[<mask_load_ptr_ $to s>](mask, ptr as *const $to))
1684			}
1685		}
1686	};
1687	($($ty: ident: $mask: ident),*) => {
1688		$(mask_load_ptr!($ty, $mask);)*
1689	};
1690	(cast $($ty: ident: $mask: ident => $to: ident),*) => {
1691		$(mask_load_ptr!(cast $ty, $to, $mask);)*
1692	};
1693}
1694
1695macro_rules! mask_store_ptr {
1696	($ty: ident, $mask: ident) => {
1697		paste! {
1698			#[inline]
1699			unsafe fn [<mask_store_ptr_ $ty s>](
1700				self,
1701				mask: MemMask<Self::[<$mask s>]>,
1702				ptr: *mut $ty,
1703				values: Self::[<$ty s>],
1704			) {
1705				let mask: [$mask; Self::[<$ty:upper _LANES>]] = cast(mask.mask());
1706				let values: [$ty; Self::[<$ty:upper _LANES>]] = cast(values);
1707				for i in 0..Self::[<$ty:upper _LANES>] {
1708					if mask[i].is_set() {
1709						*ptr.add(i) = values[i];
1710					}
1711				}
1712			}
1713		}
1714	};
1715	(cast $ty: ident, $to: ident, $mask: ident) => {
1716		paste! {
1717			#[inline]
1718			unsafe fn [<mask_store_ptr_ $ty s>](
1719				self,
1720				mask: MemMask<Self::[<$mask s>]>,
1721				ptr: *mut $ty,
1722				values: Self::[<$ty s>],
1723			) {
1724				self.[<mask_store_ptr_ $to s>](mask, ptr as *mut $to, cast(values));
1725			}
1726		}
1727	};
1728	($($ty: ident: $mask: ident),*) => {
1729		$(mask_store_ptr!($ty, $mask);)*
1730	};
1731	(cast $($ty: ident: $mask: ident => $to: ident),*) => {
1732		$(mask_store_ptr!(cast $ty, $to, $mask);)*
1733	};
1734}
1735
1736macro_rules! scalar_simd {
1737	($ty: ty, $register_count: expr, $m8s: ty, $i8s: ty, $u8s: ty, $m16s: ty, $i16s: ty, $u16s: ty, $m32s: ty, $f32s: ty, $i32s: ty, $u32s: ty, $m64s: ty, $f64s: ty, $i64s: ty, $u64s: ty $(,)?) => {
1738		impl Seal for $ty {}
1739		impl Simd for $ty {
1740			type m8s = $m8s;
1741			type m16s = $m16s;
1742			type c32s = $f32s;
1743			type c64s = $f64s;
1744			type f32s = $f32s;
1745			type f64s = $f64s;
1746			type i16s = $i16s;
1747			type i32s = $i32s;
1748			type i64s = $i64s;
1749			type i8s = $i8s;
1750			type m32s = $m32s;
1751			type m64s = $m64s;
1752			type u16s = $u16s;
1753			type u32s = $u32s;
1754			type u64s = $u64s;
1755			type u8s = $u8s;
1756
1757			const REGISTER_COUNT: usize = $register_count;
1758
1759			scalar_simd_binop!(min, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
1760
1761			scalar_simd_binop!(max, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
1762
1763			scalar_simd_binop!(add, c32, f32, c64, f64);
1764			scalar_simd_binop!(add, op wrapping_add, u8, i8, u16, i16, u32, i32, u64, i64);
1765			scalar_simd_binop!(sub, c32, f32, c64, f64);
1766			scalar_simd_binop!(sub, op wrapping_sub, u8, i8, u16, i16, u32, i32, u64, i64);
1767			scalar_simd_binop!(mul, c32, f32, c64, f64);
1768			scalar_simd_binop!(mul, op wrapping_mul, u16, i16, u32, i32, u64, i64);
1769			scalar_simd_binop!(div, f32, f64);
1770
1771			scalar_simd_binop!(and, op bitand, u8, u16, u32, u64);
1772			scalar_simd_binop!(or,  op bitor, u8, u16, u32, u64);
1773			scalar_simd_binop!(xor, op bitxor, u8, u16, u32, u64);
1774
1775			scalar_simd_cmp!(equal, op eq, u8 => m8, u16 => m16, u32 => m32, u64 => m64, c32 => m32, f32 => m32, c64 => m64, f64 => m64);
1776			scalar_simd_cmp!(greater_than, op gt, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
1777			scalar_simd_cmp!(greater_than_or_equal, op ge, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
1778			scalar_simd_cmp!(less_than_or_equal, op le, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
1779			scalar_simd_cmp!(less_than, op lt, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
1780
1781			scalar_simd_unop!(not, m8, u8, m16, u16, m32, u32, m64, u64);
1782
1783			scalar_splat!(u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
1784
1785			scalar_partial_load!(u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
1786			scalar_partial_store!(u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
1787
1788			mask_load_ptr!(u8: m8, u16: m16, u32: m32, u64: m64);
1789			mask_load_ptr!(cast i8: m8 => u8, i16: m16 => u16, i32: m32 => u32, i64: m64 => u64, c32: m32 => u32, f32: m32 => u32, c64: m64 => u64, f64: m64 => u64);
1790			mask_store_ptr!(u8: m8, u16: m16, u32: m32, u64: m64);
1791			mask_store_ptr!(cast i8: m8 => u8, i16: m16 => u16, i32: m32 => u32, i64: m64 => u64, c32: m32 => u32, f32: m32 => u32, c64: m64 => u64, f64: m64 => u64);
1792
1793			#[inline]
1794			fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
1795				op.with_simd(self)
1796			}
1797
1798			#[inline]
1799			fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1800				let mut out = [m32::new(false); Self::F32_LANES];
1801				let a: [m32; Self::F32_LANES] = cast(a);
1802				let b: [m32; Self::F32_LANES] = cast(b);
1803				for i in 0..Self::F32_LANES {
1804					out[i] = a[i] & b[i];
1805				}
1806				cast(out)
1807			}
1808
1809			#[inline]
1810			fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1811				let mut out = [m32::new(false); Self::F32_LANES];
1812				let a: [m32; Self::F32_LANES] = cast(a);
1813				let b: [m32; Self::F32_LANES] = cast(b);
1814				for i in 0..Self::F32_LANES {
1815					out[i] = a[i] | b[i];
1816				}
1817				cast(out)
1818			}
1819
1820			#[inline]
1821			fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1822				let mut out = [m32::new(false); Self::F32_LANES];
1823				let a: [m32; Self::F32_LANES] = cast(a);
1824				let b: [m32; Self::F32_LANES] = cast(b);
1825				for i in 0..Self::F32_LANES {
1826					out[i] = a[i] ^ b[i];
1827				}
1828				cast(out)
1829			}
1830
1831			#[inline]
1832			fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1833				let mut out = [m64::new(false); Self::F64_LANES];
1834				let a: [m64; Self::F64_LANES] = cast(a);
1835				let b: [m64; Self::F64_LANES] = cast(b);
1836				for i in 0..Self::F64_LANES {
1837					out[i] = a[i] & b[i];
1838				}
1839				cast(out)
1840			}
1841
1842			#[inline]
1843			fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1844				let mut out = [m64::new(false); Self::F64_LANES];
1845				let a: [m64; Self::F64_LANES] = cast(a);
1846				let b: [m64; Self::F64_LANES] = cast(b);
1847				for i in 0..Self::F64_LANES {
1848					out[i] = a[i] | b[i];
1849				}
1850				cast(out)
1851			}
1852
1853			#[inline]
1854			fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1855				let mut out = [m64::new(false); Self::F64_LANES];
1856				let a: [m64; Self::F64_LANES] = cast(a);
1857				let b: [m64; Self::F64_LANES] = cast(b);
1858				for i in 0..Self::F64_LANES {
1859					out[i] = a[i] ^ b[i];
1860				}
1861				cast(out)
1862			}
1863
1864			#[inline]
1865			fn select_u32s(
1866				self,
1867				mask: Self::m32s,
1868				if_true: Self::u32s,
1869				if_false: Self::u32s,
1870			) -> Self::u32s {
1871				let mut out = [0u32; Self::F32_LANES];
1872				let mask: [m32; Self::F32_LANES] = cast(mask);
1873				let if_true: [u32; Self::F32_LANES] = cast(if_true);
1874				let if_false: [u32; Self::F32_LANES] = cast(if_false);
1875
1876				for i in 0..Self::F32_LANES {
1877					out[i] = if mask[i].is_set() {
1878						if_true[i]
1879					} else {
1880						if_false[i]
1881					};
1882				}
1883
1884				cast(out)
1885			}
1886
1887			#[inline]
1888			fn select_u64s(
1889				self,
1890				mask: Self::m64s,
1891				if_true: Self::u64s,
1892				if_false: Self::u64s,
1893			) -> Self::u64s {
1894				let mut out = [0u64; Self::F64_LANES];
1895				let mask: [m64; Self::F64_LANES] = cast(mask);
1896				let if_true: [u64; Self::F64_LANES] = cast(if_true);
1897				let if_false: [u64; Self::F64_LANES] = cast(if_false);
1898
1899				for i in 0..Self::F64_LANES {
1900					out[i] = if mask[i].is_set() {
1901						if_true[i]
1902					} else {
1903						if_false[i]
1904					};
1905				}
1906
1907				cast(out)
1908			}
1909
1910			#[inline]
1911			fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
1912				let mut out = [0u32; Self::F32_LANES];
1913				let a: [u32; Self::F32_LANES] = cast(a);
1914				let b: [u32; Self::F32_LANES] = cast(amount);
1915				for i in 0..Self::F32_LANES {
1916					out[i] = a[i].wrapping_shl(b[i]);
1917				}
1918				cast(out)
1919			}
1920
1921			#[inline]
1922			fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
1923				let mut out = [0u32; Self::F32_LANES];
1924				let a: [u32; Self::F32_LANES] = cast(a);
1925				let b: [u32; Self::F32_LANES] = cast(amount);
1926				for i in 0..Self::F32_LANES {
1927					out[i] = a[i].wrapping_shr(b[i]);
1928				}
1929				cast(out)
1930			}
1931
1932			#[inline]
1933			fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
1934				let mut lo = [0u32; Self::F32_LANES];
1935				let mut hi = [0u32; Self::F32_LANES];
1936				let a: [u32; Self::F32_LANES] = cast(a);
1937				let b: [u32; Self::F32_LANES] = cast(b);
1938				for i in 0..Self::F32_LANES {
1939					let m = a[i] as u64 * b[i] as u64;
1940
1941					(lo[i], hi[i]) = (m as u32, (m >> 32) as u32);
1942				}
1943				(cast(lo), cast(hi))
1944			}
1945
1946			#[inline]
1947			fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
1948				let mut out = [0.0f32; Self::F32_LANES];
1949				let a: [f32; Self::F32_LANES] = cast(a);
1950				let b: [f32; Self::F32_LANES] = cast(b);
1951				let c: [f32; Self::F32_LANES] = cast(c);
1952
1953				for i in 0..Self::F32_LANES {
1954					out[i] = fma_f32(a[i], b[i], c[i]);
1955				}
1956
1957				cast(out)
1958			}
1959
1960			#[inline]
1961			fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
1962				let mut a: [f32; Self::F32_LANES] = cast(a);
1963
1964				let mut n = Self::F32_LANES;
1965				while n > 1 {
1966					n /= 2;
1967					for i in 0..n {
1968						a[i] += a[i + n];
1969					}
1970				}
1971
1972				a[0]
1973			}
1974
1975			#[inline]
1976			fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
1977				let mut a: [f32; Self::F32_LANES] = cast(a);
1978
1979				let mut n = Self::F32_LANES;
1980				while n > 1 {
1981					n /= 2;
1982					for i in 0..n {
1983						a[i] *= a[i + n];
1984					}
1985				}
1986
1987				a[0]
1988			}
1989
1990			#[inline]
1991			fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
1992				let mut a: [f32; Self::F32_LANES] = cast(a);
1993
1994				let mut n = Self::F32_LANES;
1995				while n > 1 {
1996					n /= 2;
1997					for i in 0..n {
1998						a[i] = f32::min(a[i], a[i + n]);
1999					}
2000				}
2001
2002				a[0]
2003			}
2004
2005			#[inline]
2006			fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
2007				let mut a: [f32; Self::F32_LANES] = cast(a);
2008
2009				let mut n = Self::F32_LANES;
2010				while n > 1 {
2011					n /= 2;
2012					for i in 0..n {
2013						a[i] = f32::max(a[i], a[i + n]);
2014					}
2015				}
2016
2017				a[0]
2018			}
2019
2020			#[inline]
2021			fn splat_c32s(self, value: c32) -> Self::c32s {
2022				cast([value; Self::C32_LANES])
2023			}
2024
2025			#[inline]
2026			fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
2027				let mut out = [c32::ZERO; Self::C32_LANES];
2028				let a: [c32; Self::C32_LANES] = cast(a);
2029
2030				for i in 0..Self::C32_LANES {
2031					out[i] = c32::new(a[i].re, -a[i].im);
2032				}
2033
2034				cast(out)
2035			}
2036
2037			#[inline]
2038			fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
2039				let mut out = [c32::ZERO; Self::C32_LANES];
2040				let a: [c32; Self::C32_LANES] = cast(a);
2041
2042				for i in 0..Self::C32_LANES {
2043					out[i] = c32::new(-a[i].re, -a[i].im);
2044				}
2045
2046				cast(out)
2047			}
2048
2049			#[inline]
2050			fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
2051				let mut out = [c32::ZERO; Self::C32_LANES];
2052				let a: [c32; Self::C32_LANES] = cast(a);
2053
2054				for i in 0..Self::C32_LANES {
2055					out[i] = c32::new(a[i].im, a[i].re);
2056				}
2057
2058				cast(out)
2059			}
2060
2061			#[inline]
2062			fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2063				let mut out = [c32::ZERO; Self::C32_LANES];
2064				let a: [c32; Self::C32_LANES] = cast(a);
2065				let b: [c32; Self::C32_LANES] = cast(b);
2066
2067				for i in 0..Self::C32_LANES {
2068					out[i].re = fma_f32(a[i].re, b[i].re, a[i].im * b[i].im);
2069					out[i].im = fma_f32(a[i].re, b[i].im, -(a[i].im * b[i].re));
2070				}
2071
2072				cast(out)
2073			}
2074
2075			#[inline]
2076			fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2077				let mut out = [c32::ZERO; Self::C32_LANES];
2078				let a: [c32; Self::C32_LANES] = cast(a);
2079				let b: [c32; Self::C32_LANES] = cast(b);
2080				let c: [c32; Self::C32_LANES] = cast(c);
2081
2082				for i in 0..Self::C32_LANES {
2083					out[i].re = fma_f32(a[i].re, b[i].re, -fma_f32(a[i].im, b[i].im, -c[i].re));
2084					out[i].im = fma_f32(a[i].re, b[i].im, fma_f32(a[i].im, b[i].re, c[i].im));
2085				}
2086
2087				cast(out)
2088			}
2089
2090			#[inline]
2091			fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2092				let mut out = [c32::ZERO; Self::C32_LANES];
2093				let a: [c32; Self::C32_LANES] = cast(a);
2094				let b: [c32; Self::C32_LANES] = cast(b);
2095				let c: [c32; Self::C32_LANES] = cast(c);
2096
2097				for i in 0..Self::C32_LANES {
2098					out[i].re = fma_f32(a[i].re, b[i].re, fma_f32(a[i].im, b[i].im, c[i].re));
2099					out[i].im = fma_f32(a[i].re, b[i].im, -fma_f32(a[i].im, b[i].re, -c[i].im));
2100				}
2101
2102				cast(out)
2103			}
2104
2105			#[inline]
2106			fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
2107				let mut out = [c32::ZERO; Self::C32_LANES];
2108				let a: [c32; Self::C32_LANES] = cast(a);
2109
2110				for i in 0..Self::C32_LANES {
2111					let x = a[i].re * a[i].re + a[i].im * a[i].im;
2112					out[i].re = x;
2113					out[i].im = x;
2114				}
2115
2116				cast(out)
2117			}
2118
2119			#[inline]
2120			fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
2121				let mut out = [c32::ZERO; Self::C32_LANES];
2122				let a: [c32; Self::C32_LANES] = cast(self.abs_f32s(a));
2123
2124				for i in 0..Self::C32_LANES {
2125					let x = f32::max(a[i].re, a[i].im);
2126					out[i].re = x;
2127					out[i].im = x;
2128				}
2129
2130				cast(out)
2131			}
2132
2133			#[inline]
2134			fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
2135				let mut a: [c32; Self::C32_LANES] = cast(a);
2136
2137				let mut n = Self::C32_LANES;
2138				while n > 1 {
2139					n /= 2;
2140					for i in 0..n {
2141						a[i].re += a[i + n].re;
2142						a[i].im += a[i + n].im;
2143					}
2144				}
2145
2146				a[0]
2147			}
2148
2149			#[inline]
2150			fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
2151				let mut a: [c32; Self::C32_LANES] = cast(a);
2152
2153				let mut n = Self::C32_LANES;
2154				while n > 1 {
2155					n /= 2;
2156					for i in 0..n {
2157						a[i].re = f32::min(a[i].re, a[i + n].re);
2158						a[i].im = f32::min(a[i].im, a[i + n].im);
2159					}
2160				}
2161
2162				a[0]
2163			}
2164
2165			#[inline]
2166			fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
2167				let mut a: [c32; Self::C32_LANES] = cast(a);
2168
2169				let mut n = Self::C32_LANES;
2170				while n > 1 {
2171					n /= 2;
2172					for i in 0..n {
2173						a[i].re = f32::max(a[i].re, a[i + n].re);
2174						a[i].im = f32::max(a[i].im, a[i + n].im);
2175					}
2176				}
2177
2178				a[0]
2179			}
2180
2181			#[inline]
2182			fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
2183				let mut a: [u32; Self::F32_LANES] = cast(a);
2184				let amount = amount % Self::F32_LANES;
2185				a.rotate_right(amount);
2186				cast(a)
2187			}
2188
2189			#[inline]
2190			fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
2191				let mut a: [c32; Self::C32_LANES] = cast(a);
2192				let amount = amount % Self::C32_LANES;
2193				a.rotate_right(amount);
2194				cast(a)
2195			}
2196
2197			#[inline]
2198			fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
2199				let mut out = [0.0f64; Self::F64_LANES];
2200				let a: [f64; Self::F64_LANES] = cast(a);
2201				let b: [f64; Self::F64_LANES] = cast(b);
2202				let c: [f64; Self::F64_LANES] = cast(c);
2203
2204				for i in 0..Self::F64_LANES {
2205					out[i] = fma_f64(a[i], b[i], c[i]);
2206				}
2207
2208				cast(out)
2209			}
2210
2211			#[inline]
2212			fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
2213				let mut a: [f64; Self::F64_LANES] = cast(a);
2214
2215				let mut n = Self::F64_LANES;
2216				while n > 1 {
2217					n /= 2;
2218					for i in 0..n {
2219						a[i] += a[i + n];
2220					}
2221				}
2222
2223				a[0]
2224			}
2225
2226			#[inline]
2227			fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
2228				let mut a: [f64; Self::F64_LANES] = cast(a);
2229
2230				let mut n = Self::F64_LANES;
2231				while n > 1 {
2232					n /= 2;
2233					for i in 0..n {
2234						a[i] *= a[i + n];
2235					}
2236				}
2237
2238				a[0]
2239			}
2240
2241			#[inline]
2242			fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
2243				let mut a: [f64; Self::F64_LANES] = cast(a);
2244
2245				let mut n = Self::F64_LANES;
2246				while n > 1 {
2247					n /= 2;
2248					for i in 0..n {
2249						a[i] = f64::min(a[i], a[i + n]);
2250					}
2251				}
2252
2253				a[0]
2254			}
2255
2256			#[inline]
2257			fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
2258				let mut a: [f64; Self::F64_LANES] = cast(a);
2259
2260				let mut n = Self::F64_LANES;
2261				while n > 1 {
2262					n /= 2;
2263					for i in 0..n {
2264						a[i] = f64::max(a[i], a[i + n]);
2265					}
2266				}
2267
2268				a[0]
2269			}
2270
2271			#[inline]
2272			fn splat_c64s(self, value: c64) -> Self::c64s {
2273				cast([value; Self::C64_LANES])
2274			}
2275
2276			#[inline]
2277			fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
2278				let mut out = [c64::ZERO; Self::C64_LANES];
2279				let a: [c64; Self::C64_LANES] = cast(a);
2280
2281				for i in 0..Self::C64_LANES {
2282					out[i] = c64::new(a[i].re, -a[i].im);
2283				}
2284
2285				cast(out)
2286			}
2287
2288			#[inline]
2289			fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
2290				let mut out = [c64::ZERO; Self::C64_LANES];
2291				let a: [c64; Self::C64_LANES] = cast(a);
2292
2293				for i in 0..Self::C64_LANES {
2294					out[i] = c64::new(-a[i].re, -a[i].im);
2295				}
2296
2297				cast(out)
2298			}
2299
2300			#[inline]
2301			fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
2302				let mut out = [c64::ZERO; Self::C64_LANES];
2303				let a: [c64; Self::C64_LANES] = cast(a);
2304
2305				for i in 0..Self::C64_LANES {
2306					out[i] = c64::new(a[i].im, a[i].re);
2307				}
2308
2309				cast(out)
2310			}
2311
2312			#[inline]
2313			fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2314				let mut out = [c64::ZERO; Self::C64_LANES];
2315				let a: [c64; Self::C64_LANES] = cast(a);
2316				let b: [c64; Self::C64_LANES] = cast(b);
2317
2318				for i in 0..Self::C64_LANES {
2319					out[i].re = fma_f64(a[i].re, b[i].re, a[i].im * b[i].im);
2320					out[i].im = fma_f64(a[i].re, b[i].im, -(a[i].im * b[i].re));
2321				}
2322
2323				cast(out)
2324			}
2325
2326			#[inline]
2327			fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2328				let mut out = [c64::ZERO; Self::C64_LANES];
2329				let a: [c64; Self::C64_LANES] = cast(a);
2330				let b: [c64; Self::C64_LANES] = cast(b);
2331				let c: [c64; Self::C64_LANES] = cast(c);
2332
2333				for i in 0..Self::C64_LANES {
2334					out[i].re = fma_f64(a[i].re, b[i].re, -fma_f64(a[i].im, b[i].im, -c[i].re));
2335					out[i].im = fma_f64(a[i].re, b[i].im, fma_f64(a[i].im, b[i].re, c[i].im));
2336				}
2337
2338				cast(out)
2339			}
2340
2341			#[inline]
2342			fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2343				let mut out = [c64::ZERO; Self::C64_LANES];
2344				let a: [c64; Self::C64_LANES] = cast(a);
2345				let b: [c64; Self::C64_LANES] = cast(b);
2346				let c: [c64; Self::C64_LANES] = cast(c);
2347
2348				for i in 0..Self::C64_LANES {
2349					out[i].re = fma_f64(a[i].re, b[i].re, fma_f64(a[i].im, b[i].im, c[i].re));
2350					out[i].im = fma_f64(a[i].re, b[i].im, -fma_f64(a[i].im, b[i].re, -c[i].im));
2351				}
2352
2353				cast(out)
2354			}
2355
2356			#[inline]
2357			fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
2358				let mut out = [c64::ZERO; Self::C64_LANES];
2359				let a: [c64; Self::C64_LANES] = cast(a);
2360
2361				for i in 0..Self::C64_LANES {
2362					let x = a[i].re * a[i].re + a[i].im * a[i].im;
2363					out[i].re = x;
2364					out[i].im = x;
2365				}
2366
2367				cast(out)
2368			}
2369
2370			#[inline]
2371			fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
2372				let mut out = [c64::ZERO; Self::C64_LANES];
2373				let a: [c64; Self::C64_LANES] = cast(self.abs_f64s(a));
2374
2375				for i in 0..Self::C64_LANES {
2376					let x = f64::max(a[i].re, a[i].im);
2377					out[i].re = x;
2378					out[i].im = x;
2379				}
2380
2381				cast(out)
2382			}
2383
2384			#[inline]
2385			fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
2386				let mut a: [c64; Self::C64_LANES] = cast(a);
2387
2388				let mut n = Self::C64_LANES;
2389				while n > 1 {
2390					n /= 2;
2391					for i in 0..n {
2392						a[i].re += a[i + n].re;
2393						a[i].im += a[i + n].im;
2394					}
2395				}
2396
2397				a[0]
2398			}
2399
2400			#[inline]
2401			fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
2402				let mut a: [c64; Self::C64_LANES] = cast(a);
2403
2404				let mut n = Self::C64_LANES;
2405				while n > 1 {
2406					n /= 2;
2407					for i in 0..n {
2408						a[i].re = f64::min(a[i].re, a[i + n].re);
2409						a[i].im = f64::min(a[i].im, a[i + n].im);
2410					}
2411				}
2412
2413				a[0]
2414			}
2415
2416			#[inline]
2417			fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
2418				let mut a: [c64; Self::C64_LANES] = cast(a);
2419
2420				let mut n = Self::C64_LANES;
2421				while n > 1 {
2422					n /= 2;
2423					for i in 0..n {
2424						a[i].re = f64::max(a[i].re, a[i + n].re);
2425						a[i].im = f64::max(a[i].im, a[i + n].im);
2426					}
2427				}
2428
2429				a[0]
2430			}
2431
2432			#[inline]
2433			fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
2434				let mut a: [u64; Self::F64_LANES] = cast(a);
2435				let amount = amount % Self::F64_LANES;
2436				a.rotate_right(amount);
2437				cast(a)
2438			}
2439
2440			#[inline]
2441			fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
2442				let mut a: [c64; Self::C64_LANES] = cast(a);
2443				let amount = amount % Self::C64_LANES;
2444				a.rotate_right(amount);
2445				cast(a)
2446			}
2447
2448			#[inline]
2449			fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
2450				self.mul_add_f32s(a, b, c)
2451			}
2452
2453			#[inline]
2454			fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
2455				self.mul_add_f64s(a, b, c)
2456			}
2457
2458			#[inline(always)]
2459			fn sqrt_f32s(self, a: Self::f32s) -> Self::f32s {
2460				let mut out = [0.0_f32; Self::F32_LANES];
2461				let a: [f32; Self::F32_LANES] = cast(a);
2462
2463				for i in 0..Self::F32_LANES {
2464					out[i] = sqrt_f32(a[i]);
2465				}
2466
2467				cast(out)
2468			}
2469			#[inline(always)]
2470			fn sqrt_f64s(self, a: Self::f64s) -> Self::f64s {
2471				let mut out = [0.0_f64; Self::F64_LANES];
2472				let a: [f64; Self::F64_LANES] = cast(a);
2473
2474				for i in 0..Self::F64_LANES {
2475					out[i] = sqrt_f64(a[i]);
2476				}
2477
2478				cast(out)
2479			}
2480		}
2481	};
2482}
2483
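// Scalar fallbacks emulating 128-, 256- and 512-bit registers with plain arrays.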
2484scalar_simd!(
2485	Scalar128b, 16, m8x16, i8x16, u8x16, m16x8, i16x8, u16x8, m32x4, f32x4, i32x4, u32x4, m64x2,
2486	f64x2, i64x2, u64x2
2487);
2488scalar_simd!(
2489	Scalar256b, 16, m8x32, i8x32, u8x32, m16x16, i16x16, u16x16, m32x8, f32x8, i32x8, u32x8, m64x4,
2490	f64x4, i64x4, u64x4
2491);
2492scalar_simd!(
2493	Scalar512b, 8, m8x64, i8x64, u8x64, m16x32, i16x32, u16x32, m32x16, f32x16, i32x16, u32x16,
2494	m64x8, f64x8, i64x8, u64x8
2495);
2496
2497impl Default for Scalar {
2498	#[inline]
2499	fn default() -> Self {
2500		Self::new()
2501	}
2502}
2503
2504impl Scalar {
2505	#[inline]
2506	pub fn new() -> Self {
2507		Self
2508	}
2509}
2510
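// Helpers for the single-lane `Scalar` implementation below: each generated method
// forwards to the corresponding primitive operator or method (the `ref` variants take the
// right-hand side by reference, matching the `PartialEq`/`PartialOrd` method signatures).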
2511macro_rules! impl_primitive_binop {
2512	($func: ident, $op: ident, $ty: ident, $out: ty) => {
2513		paste! {
2514			#[inline(always)]
2515			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$out s>] {
2516				a.$op(b)
2517			}
2518		}
2519	};
2520	(ref $func: ident, $op: ident, $ty: ident, $out: ty) => {
2521		paste! {
2522			#[inline(always)]
2523			fn [<$func _ $ty s>](self, a: Self::[<$ty s>], b: Self::[<$ty s>]) -> Self::[<$out s>] {
2524				a.$op(&b)
2525			}
2526		}
2527	};
2528}
2529
2530macro_rules! primitive_binop {
2531	(ref $func: ident, op $op: ident, $($ty: ident => $out: ty),*) => {
2532		$(impl_primitive_binop!(ref $func, $op, $ty, $out);)*
2533	};
2534	($func: ident, $($ty: ident => $out: ty),*) => {
2535		$(impl_primitive_binop!($func, $func, $ty, $out);)*
2536	};
2537	($func: ident, op $op: ident, $($ty: ident),*) => {
2538		$(impl_primitive_binop!($func, $op, $ty, $ty);)*
2539	};
2540	($func: ident, $($ty: ident),*) => {
2541		$(impl_primitive_binop!($func, $func, $ty, $ty);)*
2542	};
2543}
2544
2545macro_rules! impl_primitive_unop {
2546	($func: ident, $op: ident, $ty: ident, $out: ty) => {
2547		paste! {
2548			#[inline(always)]
2549			fn [<$func _ $ty s>](self, a: Self::[<$ty s>]) -> Self::[<$out s>] {
2550				a.$op()
2551			}
2552		}
2553	};
2554}
2555
2556macro_rules! primitive_unop {
2557	($func: ident, $($ty: ident),*) => {
2558		$(impl_primitive_unop!($func, $func, $ty, $ty);)*
2559	};
2560}
2561
2562macro_rules! splat_primitive {
2563	($ty: ty) => {
2564		paste! {
2565			#[inline]
2566			fn [<splat_ $ty s>](self, value: $ty) -> Self::[<$ty s>] {
2567				value
2568			}
2569		}
2570	};
2571	($($ty: ty),*) => {
2572		$(splat_primitive!($ty);)*
2573	}
2574}
2575
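// Single-lane implementation: every "vector" holds exactly one element, so reductions and
// rotations are the identity and the remaining operations map directly to primitive ones.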
2576impl Seal for Scalar {}
2577impl Simd for Scalar {
2578	type c32s = c32;
2579	type c64s = c64;
2580	type f32s = f32;
2581	type f64s = f64;
2582	type i16s = i16;
2583	type i32s = i32;
2584	type i64s = i64;
2585	type i8s = i8;
2586	type m16s = bool;
2587	type m32s = bool;
2588	type m64s = bool;
2589	type m8s = bool;
2590	type u16s = u16;
2591	type u32s = u32;
2592	type u64s = u64;
2593	type u8s = u8;
2594
2595	const IS_SCALAR: bool = true;
2596	const REGISTER_COUNT: usize = 16;
2597
2598	primitive_binop!(add, c32, f32, c64, f64);
2599
2600	primitive_binop!(add, op wrapping_add, u8, i8, u16, i16, u32, i32, u64, i64);
2601
2602	primitive_binop!(sub, c32, f32, c64, f64);
2603
2604	primitive_binop!(sub, op wrapping_sub, u8, i8, u16, i16, u32, i32, u64, i64);
2605
2606	primitive_binop!(mul, f32, f64);
2607
2608	primitive_binop!(mul, op wrapping_mul, u16, i16, u32, i32, u64, i64);
2609
2610	primitive_binop!(div, f32, f64);
2611
2612	primitive_binop!(and, op bitand, m8, u8, m16, u16, m32, u32, m64, u64);
2613
2614	primitive_binop!(or, op bitor, m8, u8, m16, u16, m32, u32, m64, u64);
2615
2616	primitive_binop!(xor, op bitxor, m8, u8, m16, u16, m32, u32, m64, u64);
2617
2618	primitive_binop!(ref equal, op eq, m8 => m8, u8 => m8, m16 => m16, u16 => m16, m32 => m32, u32 => m32, m64 => m64, u64 => m64, c32 => m32, f32 => m32, c64 => m64, f64 => m64);
2619
2620	primitive_binop!(ref greater_than, op gt, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
2621
2622	primitive_binop!(ref greater_than_or_equal, op ge, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
2623
2624	primitive_binop!(ref less_than, op lt, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
2625
2626	primitive_binop!(ref less_than_or_equal, op le, u8 => m8, i8 => m8, u16 => m16, i16 => m16, u32 => m32, i32 => m32, u64 => m64, i64 => m64, f32 => m32, f64 => m64);
2627
2628	primitive_binop!(min, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
2629
2630	primitive_binop!(max, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
2631
2632	primitive_unop!(neg, c32, c64, f32, f64);
2633
2634	primitive_unop!(not, m8, u8, m16, u16, m32, u32, m64, u64);
2635
2636	splat_primitive!(u8, i8, u16, i16, u32, i32, u64, i64, c32, f32, c64, f64);
2637
2638	#[inline]
2639	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
2640		let norm2 = a.re * a.re + a.im * a.im;
2641		c32::new(norm2, norm2)
2642	}
2643
2644	#[inline]
2645	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
2646		let norm2 = a.re * a.re + a.im * a.im;
2647		c64::new(norm2, norm2)
2648	}
2649
2650	#[inline(always)]
2651	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
2652		let re = if a.re > a.im { a.re } else { a.im };
2653		let im = re;
2654		Complex { re, im }
2655	}
2656
2657	#[inline(always)]
2658	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
2659		let re = if a.re > a.im { a.re } else { a.im };
2660		let im = re;
2661		Complex { re, im }
2662	}
2663
2664	#[inline]
2665	fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
2666		a.conj()
2667	}
2668
2669	#[inline]
2670	fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
2671		a.conj()
2672	}
2673
2674	#[inline]
2675	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2676		let re = fma_f32(a.re, b.re, fma_f32(a.im, b.im, c.re));
2677		let im = fma_f32(a.re, b.im, -fma_f32(a.im, b.re, -c.im));
2678		Complex { re, im }
2679	}
2680
2681	#[inline]
2682	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2683		let re = fma_f64(a.re, b.re, fma_f64(a.im, b.im, c.re));
2684		let im = fma_f64(a.re, b.im, -fma_f64(a.im, b.re, -c.im));
2685		Complex { re, im }
2686	}
2687
2688	#[inline]
2689	fn conj_mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2690		a.conj() * b + c
2691	}
2692
2693	#[inline]
2694	fn conj_mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2695		a.conj() * b + c
2696	}
2697
2698	#[inline]
2699	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2700		let re = fma_f32(a.re, b.re, a.im * b.im);
2701		let im = fma_f32(a.re, b.im, -(a.im * b.re));
2702		Complex { re, im }
2703	}
2704
2705	#[inline]
2706	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2707		let re = fma_f64(a.re, b.re, a.im * b.im);
2708		let im = fma_f64(a.re, b.im, -(a.im * b.re));
2709		Complex { re, im }
2710	}
2711
2712	#[inline]
2713	fn conj_mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2714		a.conj() * b
2715	}
2716
2717	#[inline]
2718	fn conj_mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2719		a.conj() * b
2720	}
2721
2722	#[inline(always)]
2723	fn first_true_m32s(self, mask: Self::m32s) -> usize {
2724		if mask { 0 } else { 1 }
2725	}
2726
2727	#[inline(always)]
2728	fn first_true_m64s(self, mask: Self::m64s) -> usize {
2729		if mask { 0 } else { 1 }
2730	}
2731
2732	#[inline(always)]
2733	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
2734		if mask.mask { *ptr } else { core::mem::zeroed() }
2735	}
2736
2737	#[inline(always)]
2738	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
2739		if mask.mask { *ptr } else { core::mem::zeroed() }
2740	}
2741
2742	#[inline(always)]
2743	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
2744		if mask.mask { *ptr } else { 0 }
2745	}
2746
2747	#[inline(always)]
2748	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
2749		if mask.mask { *ptr } else { 0 }
2750	}
2751
2752	#[inline(always)]
2753	unsafe fn mask_store_ptr_c32s(
2754		self,
2755		mask: MemMask<Self::m32s>,
2756		ptr: *mut c32,
2757		values: Self::c32s,
2758	) {
2759		if mask.mask {
2760			*ptr = values
2761		}
2762	}
2763
2764	#[inline(always)]
2765	unsafe fn mask_store_ptr_c64s(
2766		self,
2767		mask: MemMask<Self::m64s>,
2768		ptr: *mut c64,
2769		values: Self::c64s,
2770	) {
2771		if mask.mask {
2772			*ptr = values
2773		}
2774	}
2775
2776	#[inline(always)]
2777	unsafe fn mask_store_ptr_u8s(self, mask: MemMask<Self::m8s>, ptr: *mut u8, values: Self::u8s) {
2778		if mask.mask {
2779			*ptr = values
2780		}
2781	}
2782
2783	#[inline(always)]
2784	unsafe fn mask_store_ptr_u16s(
2785		self,
2786		mask: MemMask<Self::m16s>,
2787		ptr: *mut u16,
2788		values: Self::u16s,
2789	) {
2790		if mask.mask {
2791			*ptr = values
2792		}
2793	}
2794
2795	#[inline(always)]
2796	unsafe fn mask_store_ptr_u32s(
2797		self,
2798		mask: MemMask<Self::m32s>,
2799		ptr: *mut u32,
2800		values: Self::u32s,
2801	) {
2802		if mask.mask {
2803			*ptr = values
2804		}
2805	}
2806
2807	#[inline(always)]
2808	unsafe fn mask_store_ptr_u64s(
2809		self,
2810		mask: MemMask<Self::m64s>,
2811		ptr: *mut u64,
2812		values: Self::u64s,
2813	) {
2814		if mask.mask {
2815			*ptr = values
2816		}
2817	}
2818
2819	#[inline]
2820	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2821		let re = fma_f32(a.re, b.re, -fma_f32(a.im, b.im, -c.re));
2822		let im = fma_f32(a.re, b.im, fma_f32(a.im, b.re, c.im));
2823		Complex { re, im }
2824	}
2825
2826	#[inline]
2827	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2828		let re = fma_f64(a.re, b.re, -fma_f64(a.im, b.im, -c.re));
2829		let im = fma_f64(a.re, b.im, fma_f64(a.im, b.re, c.im));
2830		Complex { re, im }
2831	}
2832
2833	#[inline]
2834	fn mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
2835		a * b + c
2836	}
2837
2838	#[inline]
2839	fn mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
2840		a * b + c
2841	}
2842
2843	#[inline(always)]
2844	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
2845		a * b + c
2846	}
2847
2848	#[inline(always)]
2849	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
2850		a * b + c
2851	}
2852
2853	#[inline]
2854	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
2855		fma_f32(a, b, c)
2856	}
2857
2858	#[inline]
2859	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
2860		fma_f64(a, b, c)
2861	}
2862
2863	#[inline]
2864	fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2865		let re = fma_f32(a.re, b.re, -(a.im * b.im));
2866		let im = fma_f32(a.re, b.im, a.im * b.re);
2867		Complex { re, im }
2868	}
2869
2870	#[inline]
2871	fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2872		let re = fma_f64(a.re, b.re, -(a.im * b.im));
2873		let im = fma_f64(a.re, b.im, a.im * b.re);
2874		Complex { re, im }
2875	}
2876
2877	#[inline]
2878	fn mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2879		a * b
2880	}
2881
2882	#[inline]
2883	fn mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2884		a * b
2885	}
2886
2887	#[inline]
2888	fn partial_load_c64s(self, slice: &[c64]) -> Self::c64s {
2889		if let Some((head, _)) = slice.split_first() {
2890			*head
2891		} else {
2892			c64 { re: 0.0, im: 0.0 }
2893		}
2894	}
2895
2896	#[inline]
2897	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
2898		if let Some((head, _)) = slice.split_first() {
2899			*head
2900		} else {
2901			0
2902		}
2903	}
2904
2905	#[inline]
2906	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
2907		if let Some((head, _)) = slice.split_first() {
2908			*head
2909		} else {
2910			0
2911		}
2912	}
2913
2914	#[inline]
2915	fn partial_store_c64s(self, slice: &mut [c64], values: Self::c64s) {
2916		if let Some((head, _)) = slice.split_first_mut() {
2917			*head = values;
2918		}
2919	}
2920
2921	#[inline]
2922	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
2923		if let Some((head, _)) = slice.split_first_mut() {
2924			*head = values;
2925		}
2926	}
2927
2928	#[inline]
2929	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
2930		if let Some((head, _)) = slice.split_first_mut() {
2931			*head = values;
2932		}
2933	}
2934
2935	#[inline(always)]
2936	fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
2937		a
2938	}
2939
2940	#[inline(always)]
2941	fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
2942		a
2943	}
2944
2945	#[inline]
2946	fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
2947		a
2948	}
2949
2950	#[inline]
2951	fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
2952		a
2953	}
2954
2955	#[inline(always)]
2956	fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
2957		a
2958	}
2959
2960	#[inline(always)]
2961	fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
2962		a
2963	}
2964
2965	#[inline]
2966	fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
2967		a
2968	}
2969
2970	#[inline]
2971	fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
2972		a
2973	}
2974
2975	#[inline]
2976	fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
2977		a
2978	}
2979
2980	#[inline]
2981	fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
2982		a
2983	}
2984
2985	#[inline]
2986	fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
2987		a
2988	}
2989
2990	#[inline]
2991	fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
2992		a
2993	}
2994
2995	#[inline]
2996	fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
2997		a
2998	}
2999
3000	#[inline]
3001	fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
3002		a
3003	}
3004
3005	#[inline(always)]
3006	fn rotate_right_c32s(self, a: Self::c32s, _amount: usize) -> Self::c32s {
3007		a
3008	}
3009
3010	#[inline(always)]
3011	fn rotate_right_c64s(self, a: Self::c64s, _amount: usize) -> Self::c64s {
3012		a
3013	}
3014
3015	#[inline(always)]
3016	fn rotate_right_u32s(self, a: Self::u32s, _amount: usize) -> Self::u32s {
3017		a
3018	}
3019
3020	#[inline(always)]
3021	fn rotate_right_u64s(self, a: Self::u64s, _amount: usize) -> Self::u64s {
3022		a
3023	}
3024
3025	#[inline]
3026	fn select_u32s(
3027		self,
3028		mask: Self::m32s,
3029		if_true: Self::u32s,
3030		if_false: Self::u32s,
3031	) -> Self::u32s {
3032		if mask { if_true } else { if_false }
3033	}
3034
3035	#[inline]
3036	fn select_u64s(
3037		self,
3038		mask: Self::m64s,
3039		if_true: Self::u64s,
3040		if_false: Self::u64s,
3041	) -> Self::u64s {
3042		if mask { if_true } else { if_false }
3043	}
3044
3045	#[inline]
3046	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
3047		c32 { re: a.im, im: a.re }
3048	}
3049
	#[inline]
	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
3051		c64 { re: a.im, im: a.re }
3052	}
3053
3054	#[inline]
3055	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
3056		op.with_simd(self)
3057	}
3058
3059	#[inline]
3060	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
3061		let c = a as u64 * b as u64;
3062		let lo = c as u32;
3063		let hi = (c >> 32) as u32;
3064		(lo, hi)
3065	}
3066
3067	#[inline]
3068	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
3069		a.wrapping_shl(amount)
3070	}
3071
3072	#[inline]
3073	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
3074		a.wrapping_shr(amount)
3075	}
3076
	#[inline(always)]
	unsafe fn mask_load_ptr_u8s(self, mask: MemMask<Self::m8s>, ptr: *const u8) -> Self::u8s {
3078		if mask.mask { *ptr } else { 0 }
3079	}
3080
	#[inline(always)]
	unsafe fn mask_load_ptr_u16s(self, mask: MemMask<Self::m16s>, ptr: *const u16) -> Self::u16s {
3082		if mask.mask { *ptr } else { 0 }
3083	}
3084
3085	#[inline(always)]
3086	fn sqrt_f32s(self, a: Self::f32s) -> Self::f32s {
3087		sqrt_f32(a)
3088	}
3089
3090	#[inline(always)]
3091	fn sqrt_f64s(self, a: Self::f64s) -> Self::f64s {
3092		sqrt_f64(a)
3093	}
3094}
3095
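// Splits `slice` into a prefix reinterpreted as `U` chunks plus the leftover `T` tail.
// The asserts check that `U` is a whole number of `T`s with the same alignment; the
// caller must still guarantee that the reinterpreted bytes form valid `U` values.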
3096#[inline(always)]
3097unsafe fn split_slice<T, U>(slice: &[T]) -> (&[U], &[T]) {
3098	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
3099	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
3100
3101	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
3102
3103	let len = slice.len();
3104	let data = slice.as_ptr();
3105
3106	let div = len / chunk_size;
3107	let rem = len % chunk_size;
3108	(
3109		from_raw_parts(data as *const U, div),
3110		from_raw_parts(data.add(len - rem), rem),
3111	)
3112}
3113
3114#[inline(always)]
3115unsafe fn split_mut_slice<T, U>(slice: &mut [T]) -> (&mut [U], &mut [T]) {
3116	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
3117	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
3118
3119	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
3120
3121	let len = slice.len();
3122	let data = slice.as_mut_ptr();
3123
3124	let div = len / chunk_size;
3125	let rem = len % chunk_size;
3126	(
3127		from_raw_parts_mut(data as *mut U, div),
3128		from_raw_parts_mut(data.add(len - rem), rem),
3129	)
3130}
3131
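// Same as the splits above, but the chunked `U` view is aligned to the end of the slice
// and the leftover `T` elements come first.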
3132#[inline(always)]
3133unsafe fn rsplit_slice<T, U>(slice: &[T]) -> (&[T], &[U]) {
3134	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
3135	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
3136
3137	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
3138
3139	let len = slice.len();
3140	let data = slice.as_ptr();
3141
3142	let div = len / chunk_size;
3143	let rem = len % chunk_size;
3144	(
3145		from_raw_parts(data, rem),
3146		from_raw_parts(data.add(rem) as *const U, div),
3147	)
3148}
3149
3150#[inline(always)]
3151unsafe fn rsplit_mut_slice<T, U>(slice: &mut [T]) -> (&mut [T], &mut [U]) {
3152	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
3153	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
3154
3155	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
3156
3157	let len = slice.len();
3158	let data = slice.as_mut_ptr();
3159
3160	let div = len / chunk_size;
3161	let rem = len % chunk_size;
3162	(
3163		from_raw_parts_mut(data, rem),
3164		from_raw_parts_mut(data.add(rem) as *mut U, div),
3165	)
3166}
3167
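// Select the public `Arch` type: re-export the platform-specific one when available,
// otherwise fall back to a scalar-only dispatcher.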
3168match_cfg!(
3169	item,
3170	match cfg!() {
3171		const { any(target_arch = "x86", target_arch = "x86_64") } => {
3172			pub use x86::Arch;
3173		},
3174		const { target_arch = "aarch64" } => {
3175			pub use aarch64::Arch;
3176		},
3177		const { target_arch = "wasm32" } => {
3178			pub use wasm::Arch;
3179		},
3180		_ => {
3181			#[derive(Debug, Clone, Copy)]
3182			#[non_exhaustive]
3183			pub enum Arch {
3184				Scalar,
3185			}
3186
3187			impl Arch {
3188				#[inline(always)]
3189				pub fn new() -> Self {
3190					Self::Scalar
3191				}
3192
3193				#[inline(always)]
3194				pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
3195					op.with_simd(Scalar)
3196				}
3197			}
3198			impl Default for Arch {
3199				#[inline]
3200				fn default() -> Self {
3201					Self::new()
3202				}
3203			}
3204		},
3205	}
3206);
3207
3208#[doc(hidden)]
3209pub struct CheckSameSize<T, U>(PhantomData<(T, U)>);
3210impl<T, U> CheckSameSize<T, U> {
3211	pub const VALID: () = {
3212		assert!(core::mem::size_of::<T>() == core::mem::size_of::<U>());
3213	};
3214}
3215
3216#[doc(hidden)]
3217pub struct CheckSizeLessThanOrEqual<T, U>(PhantomData<(T, U)>);
3218impl<T, U> CheckSizeLessThanOrEqual<T, U> {
3219	pub const VALID: () = {
3220		assert!(core::mem::size_of::<T>() <= core::mem::size_of::<U>());
3221	};
3222}
3223
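/// Asserts at compile time that the two types have the same size.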
3224#[macro_export]
3225macro_rules! static_assert_same_size {
3226	($t: ty, $u: ty) => {
3227		let _ = $crate::CheckSameSize::<$t, $u>::VALID;
3228	};
3229}
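/// Asserts at compile time that the size of the first type is less than or equal to the
/// size of the second type.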
3230#[macro_export]
3231macro_rules! static_assert_size_less_than_or_equal {
3232	($t: ty, $u: ty) => {
3233		let _ = $crate::CheckSizeLessThanOrEqual::<$t, $u>::VALID;
3234	};
3235}
3236
3237/// Safe transmute function.
3238///
3239/// This function asserts at compile time that the two types have the same size.
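///
/// # Example
///
/// A minimal usage sketch, round-tripping an integer through its native-endian bytes:
///
/// ```
/// let bytes = 0x01020304u32.to_ne_bytes();
/// let x: u32 = pulp::cast(bytes);
/// assert_eq!(x, 0x01020304);
/// ```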
3240#[inline(always)]
3241pub const fn cast<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
3242	static_assert_same_size!(T, U);
3243	let ptr = &raw const value as *const U;
3244	unsafe { ptr.read_unaligned() }
3245}
3246
3247/// Safe lossy transmute function, where the destination type may be smaller than the source type.
3248///
3249/// This property is checked at compile time.
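///
/// # Example
///
/// A minimal sketch: only the leading bytes of the source survive, so reading a `u32` out
/// of a `[u32; 2]` yields its first element.
///
/// ```
/// let first: u32 = pulp::cast_lossy([7u32, 8u32]);
/// assert_eq!(first, 7);
/// ```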
3250#[inline(always)]
3251pub const fn cast_lossy<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
3252	static_assert_size_less_than_or_equal!(U, T);
3253	let value = core::mem::ManuallyDrop::new(value);
3254	let ptr = &raw const value as *const U;
3255	unsafe { ptr.read_unaligned() }
3256}
3257
3258/// Splits a slice into chunks of equal size (known at compile time).
3259///
3260/// Returns the chunks and the remaining section of the input slice.
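///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// let v = [0u32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
/// let (chunks, tail) = pulp::as_arrays::<4, u32>(&v);
/// assert_eq!(chunks, &[[0u32, 1, 2, 3], [4, 5, 6, 7]]);
/// assert_eq!(tail, &[8, 9]);
/// ```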
3261#[inline(always)]
3262pub fn as_arrays<const N: usize, T>(slice: &[T]) -> (&[[T; N]], &[T]) {
3263	let n = slice.len();
3264	let mid_div_n = n / N;
3265	let mid = mid_div_n * N;
3266	let ptr = slice.as_ptr();
3267	unsafe {
3268		(
3269			from_raw_parts(ptr as *const [T; N], mid_div_n),
3270			from_raw_parts(ptr.add(mid), n - mid),
3271		)
3272	}
3273}
3274
3275/// Splits a slice into chunks of equal size (known at compile time).
3276///
3277/// Returns the chunks and the remaining section of the input slice.
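///
/// # Example
///
/// A minimal sketch, summing adjacent pairs in place:
///
/// ```
/// let mut v = [1.0f64, 2.0, 3.0, 4.0, 5.0];
/// let (head, _tail) = pulp::as_arrays_mut::<2, f64>(&mut v);
/// for chunk in head.iter_mut() {
/// 	chunk[0] += chunk[1];
/// }
/// assert_eq!(v, [3.0, 2.0, 7.0, 4.0, 5.0]);
/// ```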
3278#[inline(always)]
3279pub fn as_arrays_mut<const N: usize, T>(slice: &mut [T]) -> (&mut [[T; N]], &mut [T]) {
3280	let n = slice.len();
3281	let mid_div_n = n / N;
3282	let mid = mid_div_n * N;
3283	let ptr = slice.as_mut_ptr();
3284	unsafe {
3285		(
3286			from_raw_parts_mut(ptr as *mut [T; N], mid_div_n),
3287			from_raw_parts_mut(ptr.add(mid), n - mid),
3288		)
3289	}
3290}
3291
/// Platform-dependent intrinsics.
3293pub mod core_arch;
3294
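// Generates forwarding implementations for the listed `Simd` methods by delegating each
// call to `(*self)`.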
3295#[allow(unused_macros)]
3296macro_rules! inherit {
3297    ({$(
3298        $(#[$attr: meta])*
3299        $(unsafe $($placeholder: lifetime)?)?
3300        fn $func: ident(self
3301            $(,$arg: ident: $ty: ty)* $(,)?
3302        ) $(-> $ret: ty)?;
3303    )*}) => {
3304        $(
3305            $(#[$attr])*
3306            #[inline(always)]
3307            $(unsafe $($placeholder)?)? fn $func (self, $($arg: $ty,)*) $(-> $ret)? {
3308                (*self).$func ($($arg,)*)
3309            }
3310        )*
3311    };
3312}
3313
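// Implements double-width operations on a pair of registers: each argument is split into
// two halves, `$base` is applied to each half, and the results are recombined. The `splat`
// variant passes the arguments through unchanged to both halves, and the `wide` variant
// recombines the two result pairs lane by lane.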
3314#[allow(unused_macros)]
3315macro_rules! inherit_x2 {
3316    ($base: expr, {$(
3317        $(#[$attr: meta])*
3318        $(unsafe $($placeholder: lifetime)?)?
3319        fn $func: ident ($self: ident
3320            $(,$arg: ident: $ty: ty)* $(,)?
3321        ) $(-> $ret: ty)?;
3322    )*}) => {
3323        $(
3324            $(#[$attr])*
3325            #[inline(always)]
3326            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
3327            	$(let $arg: [_; 2] = cast!($arg);)*
3328                cast!([($base).$func ($($arg[0],)*), ($base).$func ($($arg[1],)*)])
3329            }
3330        )*
3331    };
3332
3333    ($base: expr, splat, {$(
3334        $(#[$attr: meta])*
3335        $(unsafe $($placeholder: lifetime)?)?
3336        fn $func: ident ($self: ident
3337            $(,$arg: ident: $ty: ty)* $(,)?
3338        ) $(-> $ret: ty)?;
3339    )*}) => {
3340        $(
3341            $(#[$attr])*
3342            #[inline(always)]
3343            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
3344                cast!([($base).$func ($($arg,)*), ($base).$func ($($arg,)*)])
3345            }
3346        )*
3347    };
3348
3349    ($base: expr, wide, {$(
3350        $(#[$attr: meta])*
3351        $(unsafe $($placeholder: lifetime)?)?
3352        fn $func: ident ($self: ident
3353            $(,$arg: ident: $ty: ty)* $(,)?
3354        ) $(-> $ret: ty)?;
3355    )*}) => {
3356        $(
3357            $(#[$attr])*
3358            #[inline(always)]
3359            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
3360            	$(let $arg: [_; 2] = cast!($arg);)*
3361                let (r0, r1) = ($base).$func ($($arg[0],)*); let (s0, s1) = ($base).$func ($($arg[1],)*);
3362                (cast!([r0, s0]), cast!([r1, s1]))
3363            }
3364        )*
3365    };
3366}
3367
3368#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3369#[cfg_attr(docsrs, doc(cfg(any(target_arch = "x86", target_arch = "x86_64"))))]
/// Low-level x86 API.
3371pub mod x86;
3372
3373#[cfg(target_arch = "wasm32")]
3374#[cfg_attr(docsrs, doc(cfg(target_arch = "wasm32")))]
/// Low-level wasm API.
3376pub mod wasm;
3377
3378#[cfg(target_arch = "aarch64")]
3379#[cfg_attr(docsrs, doc(cfg(target_arch = "aarch64")))]
/// Low-level aarch64 API.
3381pub mod aarch64;
3382
3383/// Mask type with 8 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
3384/// depend on this, however.
3385#[derive(Copy, Clone, PartialEq, Eq, Default)]
3386#[repr(transparent)]
3387pub struct m8(u8);
3388/// Mask type with 16 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
3389/// depend on this, however.
3390#[derive(Copy, Clone, PartialEq, Eq, Default)]
3391#[repr(transparent)]
3392pub struct m16(u16);
3393/// Mask type with 32 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
3394/// depend on this, however.
3395#[derive(Copy, Clone, PartialEq, Eq, Default)]
3396#[repr(transparent)]
3397pub struct m32(u32);
3398/// Mask type with 64 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
3399/// depend on this, however.
3400#[derive(Copy, Clone, PartialEq, Eq, Default)]
3401#[repr(transparent)]
3402pub struct m64(u64);
3403
3404/// Bitmask type for 8 elements, used for mask operations on AVX512.
3405#[derive(Copy, Clone, PartialEq, Eq)]
3406#[repr(transparent)]
3407pub struct b8(pub u8);
3408/// Bitmask type for 16 elements, used for mask operations on AVX512.
3409#[derive(Copy, Clone, PartialEq, Eq)]
3410#[repr(transparent)]
3411pub struct b16(pub u16);
3412/// Bitmask type for 32 elements, used for mask operations on AVX512.
3413#[derive(Copy, Clone, PartialEq, Eq)]
3414#[repr(transparent)]
3415pub struct b32(pub u32);
3416/// Bitmask type for 64 elements, used for mask operations on AVX512.
3417#[derive(Copy, Clone, PartialEq, Eq)]
3418#[repr(transparent)]
3419pub struct b64(pub u64);
3420
3421impl core::ops::Not for b8 {
3422	type Output = b8;
3423
3424	#[inline(always)]
3425	fn not(self) -> Self::Output {
3426		b8(!self.0)
3427	}
3428}
3429impl core::ops::BitAnd for b8 {
3430	type Output = b8;
3431
3432	#[inline(always)]
3433	fn bitand(self, rhs: Self) -> Self::Output {
3434		b8(self.0 & rhs.0)
3435	}
3436}
3437impl core::ops::BitOr for b8 {
3438	type Output = b8;
3439
3440	#[inline(always)]
3441	fn bitor(self, rhs: Self) -> Self::Output {
3442		b8(self.0 | rhs.0)
3443	}
3444}
3445impl core::ops::BitXor for b8 {
3446	type Output = b8;
3447
3448	#[inline(always)]
3449	fn bitxor(self, rhs: Self) -> Self::Output {
3450		b8(self.0 ^ rhs.0)
3451	}
3452}
3453
3454impl core::ops::Not for m8 {
3455	type Output = m8;
3456
3457	#[inline(always)]
3458	fn not(self) -> Self::Output {
3459		m8(!self.0)
3460	}
3461}
3462impl core::ops::BitAnd for m8 {
3463	type Output = m8;
3464
3465	#[inline(always)]
3466	fn bitand(self, rhs: Self) -> Self::Output {
3467		m8(self.0 & rhs.0)
3468	}
3469}
3470impl core::ops::BitOr for m8 {
3471	type Output = m8;
3472
3473	#[inline(always)]
3474	fn bitor(self, rhs: Self) -> Self::Output {
3475		m8(self.0 | rhs.0)
3476	}
3477}
3478impl core::ops::BitXor for m8 {
3479	type Output = m8;
3480
3481	#[inline(always)]
3482	fn bitxor(self, rhs: Self) -> Self::Output {
3483		m8(self.0 ^ rhs.0)
3484	}
3485}
3486
3487impl core::ops::Not for m16 {
3488	type Output = m16;
3489
3490	#[inline(always)]
3491	fn not(self) -> Self::Output {
3492		m16(!self.0)
3493	}
3494}
3495impl core::ops::BitAnd for m16 {
3496	type Output = m16;
3497
3498	#[inline(always)]
3499	fn bitand(self, rhs: Self) -> Self::Output {
3500		m16(self.0 & rhs.0)
3501	}
3502}
3503impl core::ops::BitOr for m16 {
3504	type Output = m16;
3505
3506	#[inline(always)]
3507	fn bitor(self, rhs: Self) -> Self::Output {
3508		m16(self.0 | rhs.0)
3509	}
3510}
3511impl core::ops::BitXor for m16 {
3512	type Output = m16;
3513
3514	#[inline(always)]
3515	fn bitxor(self, rhs: Self) -> Self::Output {
3516		m16(self.0 ^ rhs.0)
3517	}
3518}
3519
3520impl core::ops::Not for m32 {
3521	type Output = m32;
3522
3523	#[inline(always)]
3524	fn not(self) -> Self::Output {
3525		m32(!self.0)
3526	}
3527}
3528impl core::ops::BitAnd for m32 {
3529	type Output = m32;
3530
3531	#[inline(always)]
3532	fn bitand(self, rhs: Self) -> Self::Output {
3533		m32(self.0 & rhs.0)
3534	}
3535}
3536impl core::ops::BitOr for m32 {
3537	type Output = m32;
3538
3539	#[inline(always)]
3540	fn bitor(self, rhs: Self) -> Self::Output {
3541		m32(self.0 | rhs.0)
3542	}
3543}
3544impl core::ops::BitXor for m32 {
3545	type Output = m32;
3546
3547	#[inline(always)]
3548	fn bitxor(self, rhs: Self) -> Self::Output {
3549		m32(self.0 ^ rhs.0)
3550	}
3551}
3552
3553impl core::ops::Not for m64 {
3554	type Output = m64;
3555
3556	#[inline(always)]
3557	fn not(self) -> Self::Output {
3558		m64(!self.0)
3559	}
3560}
3561impl core::ops::BitAnd for m64 {
3562	type Output = m64;
3563
3564	#[inline(always)]
3565	fn bitand(self, rhs: Self) -> Self::Output {
3566		m64(self.0 & rhs.0)
3567	}
3568}
3569impl core::ops::BitOr for m64 {
3570	type Output = m64;
3571
3572	#[inline(always)]
3573	fn bitor(self, rhs: Self) -> Self::Output {
3574		m64(self.0 | rhs.0)
3575	}
3576}
3577impl core::ops::BitXor for m64 {
3578	type Output = m64;
3579
3580	#[inline(always)]
3581	fn bitxor(self, rhs: Self) -> Self::Output {
3582		m64(self.0 ^ rhs.0)
3583	}
3584}
3585
3586impl core::ops::Not for b16 {
3587	type Output = b16;
3588
3589	#[inline(always)]
3590	fn not(self) -> Self::Output {
3591		b16(!self.0)
3592	}
3593}
3594impl core::ops::BitAnd for b16 {
3595	type Output = b16;
3596
3597	#[inline(always)]
3598	fn bitand(self, rhs: Self) -> Self::Output {
3599		b16(self.0 & rhs.0)
3600	}
3601}
3602impl core::ops::BitOr for b16 {
3603	type Output = b16;
3604
3605	#[inline(always)]
3606	fn bitor(self, rhs: Self) -> Self::Output {
3607		b16(self.0 | rhs.0)
3608	}
3609}
3610impl core::ops::BitXor for b16 {
3611	type Output = b16;
3612
3613	#[inline(always)]
3614	fn bitxor(self, rhs: Self) -> Self::Output {
3615		b16(self.0 ^ rhs.0)
3616	}
3617}
3618
3619impl core::ops::Not for b32 {
3620	type Output = b32;
3621
3622	#[inline(always)]
3623	fn not(self) -> Self::Output {
3624		b32(!self.0)
3625	}
3626}
3627impl core::ops::BitAnd for b32 {
3628	type Output = b32;
3629
3630	#[inline(always)]
3631	fn bitand(self, rhs: Self) -> Self::Output {
3632		b32(self.0 & rhs.0)
3633	}
3634}
3635impl core::ops::BitOr for b32 {
3636	type Output = b32;
3637
3638	#[inline(always)]
3639	fn bitor(self, rhs: Self) -> Self::Output {
3640		b32(self.0 | rhs.0)
3641	}
3642}
3643impl core::ops::BitXor for b32 {
3644	type Output = b32;
3645
3646	#[inline(always)]
3647	fn bitxor(self, rhs: Self) -> Self::Output {
3648		b32(self.0 ^ rhs.0)
3649	}
3650}
3651
3652impl core::ops::Not for b64 {
3653	type Output = b64;
3654
3655	#[inline(always)]
3656	fn not(self) -> Self::Output {
3657		b64(!self.0)
3658	}
3659}
3660impl core::ops::BitAnd for b64 {
3661	type Output = b64;
3662
3663	#[inline(always)]
3664	fn bitand(self, rhs: Self) -> Self::Output {
3665		b64(self.0 & rhs.0)
3666	}
3667}
3668impl core::ops::BitOr for b64 {
3669	type Output = b64;
3670
3671	#[inline(always)]
3672	fn bitor(self, rhs: Self) -> Self::Output {
3673		b64(self.0 | rhs.0)
3674	}
3675}
3676impl core::ops::BitXor for b64 {
3677	type Output = b64;
3678
3679	#[inline(always)]
3680	fn bitxor(self, rhs: Self) -> Self::Output {
3681		b64(self.0 ^ rhs.0)
3682	}
3683}
3684
3685impl Debug for b8 {
3686	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3687		#[allow(dead_code)]
3688		#[derive(Copy, Clone, Debug)]
3689		struct b8(bool, bool, bool, bool, bool, bool, bool, bool);
3690		b8(
3691			((self.0 >> 0) & 1) == 1,
3692			((self.0 >> 1) & 1) == 1,
3693			((self.0 >> 2) & 1) == 1,
3694			((self.0 >> 3) & 1) == 1,
3695			((self.0 >> 4) & 1) == 1,
3696			((self.0 >> 5) & 1) == 1,
3697			((self.0 >> 6) & 1) == 1,
3698			((self.0 >> 7) & 1) == 1,
3699		)
3700		.fmt(f)
3701	}
3702}
3703impl Debug for b16 {
3704	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3705		#[allow(dead_code)]
3706		#[derive(Copy, Clone, Debug)]
3707		struct b16(
3708			bool,
3709			bool,
3710			bool,
3711			bool,
3712			bool,
3713			bool,
3714			bool,
3715			bool,
3716			bool,
3717			bool,
3718			bool,
3719			bool,
3720			bool,
3721			bool,
3722			bool,
3723			bool,
3724		);
3725		b16(
3726			((self.0 >> 00) & 1) == 1,
3727			((self.0 >> 01) & 1) == 1,
3728			((self.0 >> 02) & 1) == 1,
3729			((self.0 >> 03) & 1) == 1,
3730			((self.0 >> 04) & 1) == 1,
3731			((self.0 >> 05) & 1) == 1,
3732			((self.0 >> 06) & 1) == 1,
3733			((self.0 >> 07) & 1) == 1,
3734			((self.0 >> 08) & 1) == 1,
3735			((self.0 >> 09) & 1) == 1,
3736			((self.0 >> 10) & 1) == 1,
3737			((self.0 >> 11) & 1) == 1,
3738			((self.0 >> 12) & 1) == 1,
3739			((self.0 >> 13) & 1) == 1,
3740			((self.0 >> 14) & 1) == 1,
3741			((self.0 >> 15) & 1) == 1,
3742		)
3743		.fmt(f)
3744	}
3745}
3746impl Debug for b32 {
3747	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3748		#[allow(dead_code)]
3749		#[derive(Copy, Clone, Debug)]
3750		struct b32(
3751			bool,
3752			bool,
3753			bool,
3754			bool,
3755			bool,
3756			bool,
3757			bool,
3758			bool,
3759			bool,
3760			bool,
3761			bool,
3762			bool,
3763			bool,
3764			bool,
3765			bool,
3766			bool,
3767			bool,
3768			bool,
3769			bool,
3770			bool,
3771			bool,
3772			bool,
3773			bool,
3774			bool,
3775			bool,
3776			bool,
3777			bool,
3778			bool,
3779			bool,
3780			bool,
3781			bool,
3782			bool,
3783		);
3784		b32(
3785			((self.0 >> 00) & 1) == 1,
3786			((self.0 >> 01) & 1) == 1,
3787			((self.0 >> 02) & 1) == 1,
3788			((self.0 >> 03) & 1) == 1,
3789			((self.0 >> 04) & 1) == 1,
3790			((self.0 >> 05) & 1) == 1,
3791			((self.0 >> 06) & 1) == 1,
3792			((self.0 >> 07) & 1) == 1,
3793			((self.0 >> 08) & 1) == 1,
3794			((self.0 >> 09) & 1) == 1,
3795			((self.0 >> 10) & 1) == 1,
3796			((self.0 >> 11) & 1) == 1,
3797			((self.0 >> 12) & 1) == 1,
3798			((self.0 >> 13) & 1) == 1,
3799			((self.0 >> 14) & 1) == 1,
3800			((self.0 >> 15) & 1) == 1,
3801			((self.0 >> 16) & 1) == 1,
3802			((self.0 >> 17) & 1) == 1,
3803			((self.0 >> 18) & 1) == 1,
3804			((self.0 >> 19) & 1) == 1,
3805			((self.0 >> 20) & 1) == 1,
3806			((self.0 >> 21) & 1) == 1,
3807			((self.0 >> 22) & 1) == 1,
3808			((self.0 >> 23) & 1) == 1,
3809			((self.0 >> 24) & 1) == 1,
3810			((self.0 >> 25) & 1) == 1,
3811			((self.0 >> 26) & 1) == 1,
3812			((self.0 >> 27) & 1) == 1,
3813			((self.0 >> 28) & 1) == 1,
3814			((self.0 >> 29) & 1) == 1,
3815			((self.0 >> 30) & 1) == 1,
3816			((self.0 >> 31) & 1) == 1,
3817		)
3818		.fmt(f)
3819	}
3820}
3821impl Debug for b64 {
3822	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3823		#[allow(dead_code)]
3824		#[derive(Copy, Clone, Debug)]
3825		struct b64(
3826			bool,
3827			bool,
3828			bool,
3829			bool,
3830			bool,
3831			bool,
3832			bool,
3833			bool,
3834			bool,
3835			bool,
3836			bool,
3837			bool,
3838			bool,
3839			bool,
3840			bool,
3841			bool,
3842			bool,
3843			bool,
3844			bool,
3845			bool,
3846			bool,
3847			bool,
3848			bool,
3849			bool,
3850			bool,
3851			bool,
3852			bool,
3853			bool,
3854			bool,
3855			bool,
3856			bool,
3857			bool,
3858			bool,
3859			bool,
3860			bool,
3861			bool,
3862			bool,
3863			bool,
3864			bool,
3865			bool,
3866			bool,
3867			bool,
3868			bool,
3869			bool,
3870			bool,
3871			bool,
3872			bool,
3873			bool,
3874			bool,
3875			bool,
3876			bool,
3877			bool,
3878			bool,
3879			bool,
3880			bool,
3881			bool,
3882			bool,
3883			bool,
3884			bool,
3885			bool,
3886			bool,
3887			bool,
3888			bool,
3889			bool,
3890		);
3891		b64(
3892			((self.0 >> 00) & 1) == 1,
3893			((self.0 >> 01) & 1) == 1,
3894			((self.0 >> 02) & 1) == 1,
3895			((self.0 >> 03) & 1) == 1,
3896			((self.0 >> 04) & 1) == 1,
3897			((self.0 >> 05) & 1) == 1,
3898			((self.0 >> 06) & 1) == 1,
3899			((self.0 >> 07) & 1) == 1,
3900			((self.0 >> 08) & 1) == 1,
3901			((self.0 >> 09) & 1) == 1,
3902			((self.0 >> 10) & 1) == 1,
3903			((self.0 >> 11) & 1) == 1,
3904			((self.0 >> 12) & 1) == 1,
3905			((self.0 >> 13) & 1) == 1,
3906			((self.0 >> 14) & 1) == 1,
3907			((self.0 >> 15) & 1) == 1,
3908			((self.0 >> 16) & 1) == 1,
3909			((self.0 >> 17) & 1) == 1,
3910			((self.0 >> 18) & 1) == 1,
3911			((self.0 >> 19) & 1) == 1,
3912			((self.0 >> 20) & 1) == 1,
3913			((self.0 >> 21) & 1) == 1,
3914			((self.0 >> 22) & 1) == 1,
3915			((self.0 >> 23) & 1) == 1,
3916			((self.0 >> 24) & 1) == 1,
3917			((self.0 >> 25) & 1) == 1,
3918			((self.0 >> 26) & 1) == 1,
3919			((self.0 >> 27) & 1) == 1,
3920			((self.0 >> 28) & 1) == 1,
3921			((self.0 >> 29) & 1) == 1,
3922			((self.0 >> 30) & 1) == 1,
3923			((self.0 >> 31) & 1) == 1,
3924			((self.0 >> 32) & 1) == 1,
3925			((self.0 >> 33) & 1) == 1,
3926			((self.0 >> 34) & 1) == 1,
3927			((self.0 >> 35) & 1) == 1,
3928			((self.0 >> 36) & 1) == 1,
3929			((self.0 >> 37) & 1) == 1,
3930			((self.0 >> 38) & 1) == 1,
3931			((self.0 >> 39) & 1) == 1,
3932			((self.0 >> 40) & 1) == 1,
3933			((self.0 >> 41) & 1) == 1,
3934			((self.0 >> 42) & 1) == 1,
3935			((self.0 >> 43) & 1) == 1,
3936			((self.0 >> 44) & 1) == 1,
3937			((self.0 >> 45) & 1) == 1,
3938			((self.0 >> 46) & 1) == 1,
3939			((self.0 >> 47) & 1) == 1,
3940			((self.0 >> 48) & 1) == 1,
3941			((self.0 >> 49) & 1) == 1,
3942			((self.0 >> 50) & 1) == 1,
3943			((self.0 >> 51) & 1) == 1,
3944			((self.0 >> 52) & 1) == 1,
3945			((self.0 >> 53) & 1) == 1,
3946			((self.0 >> 54) & 1) == 1,
3947			((self.0 >> 55) & 1) == 1,
3948			((self.0 >> 56) & 1) == 1,
3949			((self.0 >> 57) & 1) == 1,
3950			((self.0 >> 58) & 1) == 1,
3951			((self.0 >> 59) & 1) == 1,
3952			((self.0 >> 60) & 1) == 1,
3953			((self.0 >> 61) & 1) == 1,
3954			((self.0 >> 62) & 1) == 1,
3955			((self.0 >> 63) & 1) == 1,
3956		)
3957		.fmt(f)
3958	}
3959}
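
// A small sketch of what the `Debug` impls above print: each bit of the
// underlying integer is rendered as a `bool`, least significant bit first,
// using the derived tuple-struct formatting. Gated on the `std` feature since
// it relies on `format!`.
#[cfg(all(test, feature = "std"))]
mod bitmask_debug_tests {
	#[test]
	fn b8_debug_prints_bits_lsb_first() {
		let mask = super::b8(0b0000_0101);
		assert_eq!(
			format!("{:?}", mask),
			"b8(true, false, true, false, false, false, false, false)",
		);
	}
}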
3960
3961impl Debug for m8 {
3962	#[inline]
3963	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3964		self.is_set().fmt(f)
3965	}
3966}
3967impl Debug for m16 {
3968	#[inline]
3969	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3970		self.is_set().fmt(f)
3971	}
3972}
3973impl Debug for m32 {
3974	#[inline]
3975	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3976		self.is_set().fmt(f)
3977	}
3978}
3979impl Debug for m64 {
3980	#[inline]
3981	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
3982		self.is_set().fmt(f)
3983	}
3984}
3985
3986impl m8 {
3987	/// Returns a mask with all bits set to one if `flag` is `true`, otherwise returns a mask with
3988	/// all bits set to zero.
3989	#[inline(always)]
3990	pub const fn new(flag: bool) -> Self {
3991		Self(if flag { u8::MAX } else { 0 })
3992	}
3993
3994	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
3995	#[inline(always)]
3996	pub const fn is_set(self) -> bool {
3997		self.0 != 0
3998	}
3999}
4000impl m16 {
4001	/// Returns a mask with all bits set to one if `flag` is `true`, otherwise returns a mask with
4002	/// all bits set to zero.
4003	#[inline(always)]
4004	pub const fn new(flag: bool) -> Self {
4005		Self(if flag { u16::MAX } else { 0 })
4006	}
4007
4008	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4009	#[inline(always)]
4010	pub const fn is_set(self) -> bool {
4011		self.0 != 0
4012	}
4013}
4014impl m32 {
4015	/// Returns a mask with all bits set to one if `flag` is `true`, otherwise returns a mask with
4016	/// all bits set to zero.
4017	#[inline(always)]
4018	pub const fn new(flag: bool) -> Self {
4019		Self(if flag { u32::MAX } else { 0 })
4020	}
4021
4022	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4023	#[inline(always)]
4024	pub const fn is_set(self) -> bool {
4025		self.0 != 0
4026	}
4027}
4028impl m64 {
4029	/// Returns a mask with all bits set to one if `flag` is `true`, otherwise returns a mask with
4030	/// all bits set to zero.
4031	#[inline(always)]
4032	pub const fn new(flag: bool) -> Self {
4033		Self(if flag { u64::MAX } else { 0 })
4034	}
4035
4036	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4037	#[inline(always)]
4038	pub const fn is_set(self) -> bool {
4039		self.0 != 0
4040	}
4041}
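
// A minimal sketch of the mask-scalar behaviour documented above: `new(true)`
// yields an all-ones mask, `new(false)` an all-zeros one, and `is_set` reports
// whether the mask is non-zero.
#[cfg(test)]
mod mask_scalar_tests {
	use super::*;

	#[test]
	fn mask_new_and_is_set() {
		assert!(m8::new(true).is_set());
		assert!(!m8::new(false).is_set());
		assert_eq!(m64::new(true).0, u64::MAX);
		assert_eq!(m64::new(false).0, 0);
	}
}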
4042
4043/// A 128-bit SIMD vector with 16 elements of type [`i8`].
4044#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4045#[repr(C)]
4046pub struct i8x16(
4047	pub i8,
4048	pub i8,
4049	pub i8,
4050	pub i8,
4051	pub i8,
4052	pub i8,
4053	pub i8,
4054	pub i8,
4055	pub i8,
4056	pub i8,
4057	pub i8,
4058	pub i8,
4059	pub i8,
4060	pub i8,
4061	pub i8,
4062	pub i8,
4063);
4064/// A 256-bit SIMD vector with 32 elements of type [`i8`].
4065#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4066#[repr(C)]
4067pub struct i8x32(
4068	pub i8,
4069	pub i8,
4070	pub i8,
4071	pub i8,
4072	pub i8,
4073	pub i8,
4074	pub i8,
4075	pub i8,
4076	pub i8,
4077	pub i8,
4078	pub i8,
4079	pub i8,
4080	pub i8,
4081	pub i8,
4082	pub i8,
4083	pub i8,
4084	pub i8,
4085	pub i8,
4086	pub i8,
4087	pub i8,
4088	pub i8,
4089	pub i8,
4090	pub i8,
4091	pub i8,
4092	pub i8,
4093	pub i8,
4094	pub i8,
4095	pub i8,
4096	pub i8,
4097	pub i8,
4098	pub i8,
4099	pub i8,
4100);
4101/// A 512-bit SIMD vector with 64 elements of type [`i8`].
4102#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4103#[repr(C)]
4104pub struct i8x64(
4105	pub i8,
4106	pub i8,
4107	pub i8,
4108	pub i8,
4109	pub i8,
4110	pub i8,
4111	pub i8,
4112	pub i8,
4113	pub i8,
4114	pub i8,
4115	pub i8,
4116	pub i8,
4117	pub i8,
4118	pub i8,
4119	pub i8,
4120	pub i8,
4121	pub i8,
4122	pub i8,
4123	pub i8,
4124	pub i8,
4125	pub i8,
4126	pub i8,
4127	pub i8,
4128	pub i8,
4129	pub i8,
4130	pub i8,
4131	pub i8,
4132	pub i8,
4133	pub i8,
4134	pub i8,
4135	pub i8,
4136	pub i8,
4137	pub i8,
4138	pub i8,
4139	pub i8,
4140	pub i8,
4141	pub i8,
4142	pub i8,
4143	pub i8,
4144	pub i8,
4145	pub i8,
4146	pub i8,
4147	pub i8,
4148	pub i8,
4149	pub i8,
4150	pub i8,
4151	pub i8,
4152	pub i8,
4153	pub i8,
4154	pub i8,
4155	pub i8,
4156	pub i8,
4157	pub i8,
4158	pub i8,
4159	pub i8,
4160	pub i8,
4161	pub i8,
4162	pub i8,
4163	pub i8,
4164	pub i8,
4165	pub i8,
4166	pub i8,
4167	pub i8,
4168	pub i8,
4169);
4170
4171/// A 128-bit SIMD vector with 16 elements of type [`u8`].
4172#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4173#[repr(C)]
4174pub struct u8x16(
4175	pub u8,
4176	pub u8,
4177	pub u8,
4178	pub u8,
4179	pub u8,
4180	pub u8,
4181	pub u8,
4182	pub u8,
4183	pub u8,
4184	pub u8,
4185	pub u8,
4186	pub u8,
4187	pub u8,
4188	pub u8,
4189	pub u8,
4190	pub u8,
4191);
4192/// A 256-bit SIMD vector with 32 elements of type [`u8`].
4193#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4194#[repr(C)]
4195pub struct u8x32(
4196	pub u8,
4197	pub u8,
4198	pub u8,
4199	pub u8,
4200	pub u8,
4201	pub u8,
4202	pub u8,
4203	pub u8,
4204	pub u8,
4205	pub u8,
4206	pub u8,
4207	pub u8,
4208	pub u8,
4209	pub u8,
4210	pub u8,
4211	pub u8,
4212	pub u8,
4213	pub u8,
4214	pub u8,
4215	pub u8,
4216	pub u8,
4217	pub u8,
4218	pub u8,
4219	pub u8,
4220	pub u8,
4221	pub u8,
4222	pub u8,
4223	pub u8,
4224	pub u8,
4225	pub u8,
4226	pub u8,
4227	pub u8,
4228);
4229/// A 512-bit SIMD vector with 64 elements of type [`u8`].
4230#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4231#[repr(C)]
4232pub struct u8x64(
4233	pub u8,
4234	pub u8,
4235	pub u8,
4236	pub u8,
4237	pub u8,
4238	pub u8,
4239	pub u8,
4240	pub u8,
4241	pub u8,
4242	pub u8,
4243	pub u8,
4244	pub u8,
4245	pub u8,
4246	pub u8,
4247	pub u8,
4248	pub u8,
4249	pub u8,
4250	pub u8,
4251	pub u8,
4252	pub u8,
4253	pub u8,
4254	pub u8,
4255	pub u8,
4256	pub u8,
4257	pub u8,
4258	pub u8,
4259	pub u8,
4260	pub u8,
4261	pub u8,
4262	pub u8,
4263	pub u8,
4264	pub u8,
4265	pub u8,
4266	pub u8,
4267	pub u8,
4268	pub u8,
4269	pub u8,
4270	pub u8,
4271	pub u8,
4272	pub u8,
4273	pub u8,
4274	pub u8,
4275	pub u8,
4276	pub u8,
4277	pub u8,
4278	pub u8,
4279	pub u8,
4280	pub u8,
4281	pub u8,
4282	pub u8,
4283	pub u8,
4284	pub u8,
4285	pub u8,
4286	pub u8,
4287	pub u8,
4288	pub u8,
4289	pub u8,
4290	pub u8,
4291	pub u8,
4292	pub u8,
4293	pub u8,
4294	pub u8,
4295	pub u8,
4296	pub u8,
4297);
4298
4299/// A 128-bit SIMD vector with 16 elements of type [`m8`].
4300#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4301#[repr(C)]
4302pub struct m8x16(
4303	pub m8,
4304	pub m8,
4305	pub m8,
4306	pub m8,
4307	pub m8,
4308	pub m8,
4309	pub m8,
4310	pub m8,
4311	pub m8,
4312	pub m8,
4313	pub m8,
4314	pub m8,
4315	pub m8,
4316	pub m8,
4317	pub m8,
4318	pub m8,
4319);
4320/// A 256-bit SIMD vector with 32 elements of type [`m8`].
4321#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4322#[repr(C)]
4323pub struct m8x32(
4324	pub m8,
4325	pub m8,
4326	pub m8,
4327	pub m8,
4328	pub m8,
4329	pub m8,
4330	pub m8,
4331	pub m8,
4332	pub m8,
4333	pub m8,
4334	pub m8,
4335	pub m8,
4336	pub m8,
4337	pub m8,
4338	pub m8,
4339	pub m8,
4340	pub m8,
4341	pub m8,
4342	pub m8,
4343	pub m8,
4344	pub m8,
4345	pub m8,
4346	pub m8,
4347	pub m8,
4348	pub m8,
4349	pub m8,
4350	pub m8,
4351	pub m8,
4352	pub m8,
4353	pub m8,
4354	pub m8,
4355	pub m8,
4356);
4357
4358/// A 512-bit SIMD vector with 64 elements of type [`m8`].
4359#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4360#[repr(C)]
4361pub struct m8x64(
4362	pub m8,
4363	pub m8,
4364	pub m8,
4365	pub m8,
4366	pub m8,
4367	pub m8,
4368	pub m8,
4369	pub m8,
4370	pub m8,
4371	pub m8,
4372	pub m8,
4373	pub m8,
4374	pub m8,
4375	pub m8,
4376	pub m8,
4377	pub m8,
4378	pub m8,
4379	pub m8,
4380	pub m8,
4381	pub m8,
4382	pub m8,
4383	pub m8,
4384	pub m8,
4385	pub m8,
4386	pub m8,
4387	pub m8,
4388	pub m8,
4389	pub m8,
4390	pub m8,
4391	pub m8,
4392	pub m8,
4393	pub m8,
4394	pub m8,
4395	pub m8,
4396	pub m8,
4397	pub m8,
4398	pub m8,
4399	pub m8,
4400	pub m8,
4401	pub m8,
4402	pub m8,
4403	pub m8,
4404	pub m8,
4405	pub m8,
4406	pub m8,
4407	pub m8,
4408	pub m8,
4409	pub m8,
4410	pub m8,
4411	pub m8,
4412	pub m8,
4413	pub m8,
4414	pub m8,
4415	pub m8,
4416	pub m8,
4417	pub m8,
4418	pub m8,
4419	pub m8,
4420	pub m8,
4421	pub m8,
4422	pub m8,
4423	pub m8,
4424	pub m8,
4425	pub m8,
4426);
4427
4428/// A 128-bit SIMD vector with 8 elements of type [`i16`].
4429#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4430#[repr(C)]
4431pub struct i16x8(
4432	pub i16,
4433	pub i16,
4434	pub i16,
4435	pub i16,
4436	pub i16,
4437	pub i16,
4438	pub i16,
4439	pub i16,
4440);
4441/// A 256-bit SIMD vector with 16 elements of type [`i16`].
4442#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4443#[repr(C)]
4444pub struct i16x16(
4445	pub i16,
4446	pub i16,
4447	pub i16,
4448	pub i16,
4449	pub i16,
4450	pub i16,
4451	pub i16,
4452	pub i16,
4453	pub i16,
4454	pub i16,
4455	pub i16,
4456	pub i16,
4457	pub i16,
4458	pub i16,
4459	pub i16,
4460	pub i16,
4461);
4462/// A 512-bit SIMD vector with 32 elements of type [`i16`].
4463#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4464#[repr(C)]
4465pub struct i16x32(
4466	pub i16,
4467	pub i16,
4468	pub i16,
4469	pub i16,
4470	pub i16,
4471	pub i16,
4472	pub i16,
4473	pub i16,
4474	pub i16,
4475	pub i16,
4476	pub i16,
4477	pub i16,
4478	pub i16,
4479	pub i16,
4480	pub i16,
4481	pub i16,
4482	pub i16,
4483	pub i16,
4484	pub i16,
4485	pub i16,
4486	pub i16,
4487	pub i16,
4488	pub i16,
4489	pub i16,
4490	pub i16,
4491	pub i16,
4492	pub i16,
4493	pub i16,
4494	pub i16,
4495	pub i16,
4496	pub i16,
4497	pub i16,
4498);
4499
4500/// A 128-bit SIMD vector with 8 elements of type [`u16`].
4501#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4502#[repr(C)]
4503pub struct u16x8(
4504	pub u16,
4505	pub u16,
4506	pub u16,
4507	pub u16,
4508	pub u16,
4509	pub u16,
4510	pub u16,
4511	pub u16,
4512);
4513/// A 256-bit SIMD vector with 16 elements of type [`u16`].
4514#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4515#[repr(C)]
4516pub struct u16x16(
4517	pub u16,
4518	pub u16,
4519	pub u16,
4520	pub u16,
4521	pub u16,
4522	pub u16,
4523	pub u16,
4524	pub u16,
4525	pub u16,
4526	pub u16,
4527	pub u16,
4528	pub u16,
4529	pub u16,
4530	pub u16,
4531	pub u16,
4532	pub u16,
4533);
4534/// A 512-bit SIMD vector with 32 elements of type [`u16`].
4535#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4536#[repr(C)]
4537pub struct u16x32(
4538	pub u16,
4539	pub u16,
4540	pub u16,
4541	pub u16,
4542	pub u16,
4543	pub u16,
4544	pub u16,
4545	pub u16,
4546	pub u16,
4547	pub u16,
4548	pub u16,
4549	pub u16,
4550	pub u16,
4551	pub u16,
4552	pub u16,
4553	pub u16,
4554	pub u16,
4555	pub u16,
4556	pub u16,
4557	pub u16,
4558	pub u16,
4559	pub u16,
4560	pub u16,
4561	pub u16,
4562	pub u16,
4563	pub u16,
4564	pub u16,
4565	pub u16,
4566	pub u16,
4567	pub u16,
4568	pub u16,
4569	pub u16,
4570);
4571
4572/// A 128-bit SIMD vector with 8 elements of type [`m16`].
4573#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4574#[repr(C)]
4575pub struct m16x8(
4576	pub m16,
4577	pub m16,
4578	pub m16,
4579	pub m16,
4580	pub m16,
4581	pub m16,
4582	pub m16,
4583	pub m16,
4584);
4585/// A 256-bit SIMD vector with 16 elements of type [`m16`].
4586#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4587#[repr(C)]
4588pub struct m16x16(
4589	pub m16,
4590	pub m16,
4591	pub m16,
4592	pub m16,
4593	pub m16,
4594	pub m16,
4595	pub m16,
4596	pub m16,
4597	pub m16,
4598	pub m16,
4599	pub m16,
4600	pub m16,
4601	pub m16,
4602	pub m16,
4603	pub m16,
4604	pub m16,
4605);
4606/// A 512-bit SIMD vector with 32 elements of type [`m16`].
4607#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4608#[repr(C)]
4609pub struct m16x32(
4610	pub m16,
4611	pub m16,
4612	pub m16,
4613	pub m16,
4614	pub m16,
4615	pub m16,
4616	pub m16,
4617	pub m16,
4618	pub m16,
4619	pub m16,
4620	pub m16,
4621	pub m16,
4622	pub m16,
4623	pub m16,
4624	pub m16,
4625	pub m16,
4626	pub m16,
4627	pub m16,
4628	pub m16,
4629	pub m16,
4630	pub m16,
4631	pub m16,
4632	pub m16,
4633	pub m16,
4634	pub m16,
4635	pub m16,
4636	pub m16,
4637	pub m16,
4638	pub m16,
4639	pub m16,
4640	pub m16,
4641	pub m16,
4642);
4643
4644/// A 128-bit SIMD vector with 4 elements of type [`f32`].
4645#[derive(Debug, Copy, Clone, PartialEq)]
4646#[repr(C)]
4647pub struct f32x4(pub f32, pub f32, pub f32, pub f32);
4648
4649/// A 256-bit SIMD vector with 8 elements of type [`f32`].
4650#[derive(Debug, Copy, Clone, PartialEq)]
4651#[repr(C)]
4652pub struct f32x8(
4653	pub f32,
4654	pub f32,
4655	pub f32,
4656	pub f32,
4657	pub f32,
4658	pub f32,
4659	pub f32,
4660	pub f32,
4661);
4662/// A 512-bit SIMD vector with 16 elements of type [`f32`].
4663#[derive(Debug, Copy, Clone, PartialEq)]
4664#[repr(C)]
4665pub struct f32x16(
4666	pub f32,
4667	pub f32,
4668	pub f32,
4669	pub f32,
4670	pub f32,
4671	pub f32,
4672	pub f32,
4673	pub f32,
4674	pub f32,
4675	pub f32,
4676	pub f32,
4677	pub f32,
4678	pub f32,
4679	pub f32,
4680	pub f32,
4681	pub f32,
4682);
4683
4684/// A 128-bit SIMD vector with 2 elements of type [`c32`].
4685#[derive(Copy, Clone, PartialEq)]
4686#[repr(C)]
4687pub struct c32x2(pub c32, pub c32);
4688
4689impl Debug for c32x2 {
4690	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4691		#[derive(Copy, Clone, Debug)]
4692		#[repr(C)]
4693		pub struct c32x2(pub DebugCplx<c32>, pub DebugCplx<c32>);
4694		unsafe impl Zeroable for c32x2 {}
4695		unsafe impl Pod for c32x2 {}
4696
4697		let this: c32x2 = cast!(*self);
4698		this.fmt(f)
4699	}
4700}
4701
4702/// A 256-bit SIMD vector with 4 elements of type [`c32`].
4703#[derive(Copy, Clone, PartialEq)]
4704#[repr(C)]
4705pub struct c32x4(pub c32, pub c32, pub c32, pub c32);
4706
4707impl Debug for c32x4 {
4708	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4709		#[derive(Copy, Clone, Debug)]
4710		#[repr(C)]
4711		pub struct c32x4(
4712			pub DebugCplx<c32>,
4713			pub DebugCplx<c32>,
4714			pub DebugCplx<c32>,
4715			pub DebugCplx<c32>,
4716		);
4717		unsafe impl Zeroable for c32x4 {}
4718		unsafe impl Pod for c32x4 {}
4719
4720		let this: c32x4 = cast!(*self);
4721		this.fmt(f)
4722	}
4723}
4724
4725/// A 512-bit SIMD vector with 8 elements of type [`c32`].
4726#[derive(Copy, Clone, PartialEq)]
4727#[repr(C)]
4728pub struct c32x8(
4729	pub c32,
4730	pub c32,
4731	pub c32,
4732	pub c32,
4733	pub c32,
4734	pub c32,
4735	pub c32,
4736	pub c32,
4737);
4738
4739impl Debug for c32x8 {
4740	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4741		#[derive(Copy, Clone, Debug)]
4742		#[repr(C)]
4743		pub struct c32x8(
4744			pub DebugCplx<c32>,
4745			pub DebugCplx<c32>,
4746			pub DebugCplx<c32>,
4747			pub DebugCplx<c32>,
4748			pub DebugCplx<c32>,
4749			pub DebugCplx<c32>,
4750			pub DebugCplx<c32>,
4751			pub DebugCplx<c32>,
4752		);
4753		unsafe impl Zeroable for c32x8 {}
4754		unsafe impl Pod for c32x8 {}
4755
4756		let this: c32x8 = cast!(*self);
4757		this.fmt(f)
4758	}
4759}
4760/// A 128-bit SIMD vector with 4 elements of type [`i32`].
4761#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4762#[repr(C)]
4763pub struct i32x4(pub i32, pub i32, pub i32, pub i32);
4764/// A 256-bit SIMD vector with 8 elements of type [`i32`].
4765#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4766#[repr(C)]
4767pub struct i32x8(
4768	pub i32,
4769	pub i32,
4770	pub i32,
4771	pub i32,
4772	pub i32,
4773	pub i32,
4774	pub i32,
4775	pub i32,
4776);
4777/// A 512-bit SIMD vector with 16 elements of type [`i32`].
4778#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4779#[repr(C)]
4780pub struct i32x16(
4781	pub i32,
4782	pub i32,
4783	pub i32,
4784	pub i32,
4785	pub i32,
4786	pub i32,
4787	pub i32,
4788	pub i32,
4789	pub i32,
4790	pub i32,
4791	pub i32,
4792	pub i32,
4793	pub i32,
4794	pub i32,
4795	pub i32,
4796	pub i32,
4797);
4798
4799/// A 128-bit SIMD vector with 4 elements of type [`u32`].
4800#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4801#[repr(C)]
4802pub struct u32x4(pub u32, pub u32, pub u32, pub u32);
4803/// A 256-bit SIMD vector with 8 elements of type [`u32`].
4804#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4805#[repr(C)]
4806pub struct u32x8(
4807	pub u32,
4808	pub u32,
4809	pub u32,
4810	pub u32,
4811	pub u32,
4812	pub u32,
4813	pub u32,
4814	pub u32,
4815);
4816/// A 512-bit SIMD vector with 16 elements of type [`u32`].
4817#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4818#[repr(C)]
4819pub struct u32x16(
4820	pub u32,
4821	pub u32,
4822	pub u32,
4823	pub u32,
4824	pub u32,
4825	pub u32,
4826	pub u32,
4827	pub u32,
4828	pub u32,
4829	pub u32,
4830	pub u32,
4831	pub u32,
4832	pub u32,
4833	pub u32,
4834	pub u32,
4835	pub u32,
4836);
4837
4838/// A 128-bit SIMD vector with 4 elements of type [`m32`].
4839#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4840#[repr(C)]
4841pub struct m32x4(pub m32, pub m32, pub m32, pub m32);
4842/// A 256-bit SIMD vector with 8 elements of type [`m32`].
4843#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4844#[repr(C)]
4845pub struct m32x8(
4846	pub m32,
4847	pub m32,
4848	pub m32,
4849	pub m32,
4850	pub m32,
4851	pub m32,
4852	pub m32,
4853	pub m32,
4854);
4855/// A 512-bit SIMD vector with 16 elements of type [`m32`].
4856#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4857#[repr(C)]
4858pub struct m32x16(
4859	pub m32,
4860	pub m32,
4861	pub m32,
4862	pub m32,
4863	pub m32,
4864	pub m32,
4865	pub m32,
4866	pub m32,
4867	pub m32,
4868	pub m32,
4869	pub m32,
4870	pub m32,
4871	pub m32,
4872	pub m32,
4873	pub m32,
4874	pub m32,
4875);
4876
4877/// A 128-bit SIMD vector with 2 elements of type [`f64`].
4878#[derive(Debug, Copy, Clone, PartialEq)]
4879#[repr(C)]
4880pub struct f64x2(pub f64, pub f64);
4881/// A 256-bit SIMD vector with 4 elements of type [`f64`].
4882#[derive(Debug, Copy, Clone, PartialEq)]
4883#[repr(C)]
4884pub struct f64x4(pub f64, pub f64, pub f64, pub f64);
4885/// A 512-bit SIMD vector with 8 elements of type [`f64`].
4886#[derive(Debug, Copy, Clone, PartialEq)]
4887#[repr(C)]
4888pub struct f64x8(
4889	pub f64,
4890	pub f64,
4891	pub f64,
4892	pub f64,
4893	pub f64,
4894	pub f64,
4895	pub f64,
4896	pub f64,
4897);
4898
4899/// A 128-bit SIMD vector with 1 element of type [`c64`].
4900#[derive(Copy, Clone, PartialEq)]
4901#[repr(C)]
4902pub struct c64x1(pub c64);
4903
4904impl Debug for c64x1 {
4905	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4906		#[derive(Copy, Clone, Debug)]
4907		#[repr(C)]
4908		pub struct c64x1(pub DebugCplx<c64>);
4909		unsafe impl Zeroable for c64x1 {}
4910		unsafe impl Pod for c64x1 {}
4911
4912		let this: c64x1 = cast!(*self);
4913		this.fmt(f)
4914	}
4915}
4916
4917/// A 256-bit SIMD vector with 2 elements of type [`c64`].
4918#[derive(Copy, Clone, PartialEq)]
4919#[repr(C)]
4920pub struct c64x2(pub c64, pub c64);
4921
4922impl Debug for c64x2 {
4923	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4924		#[derive(Copy, Clone, Debug)]
4925		#[repr(C)]
4926		pub struct c64x2(pub DebugCplx<c64>, pub DebugCplx<c64>);
4927		unsafe impl Zeroable for c64x2 {}
4928		unsafe impl Pod for c64x2 {}
4929
4930		let this: c64x2 = cast!(*self);
4931		this.fmt(f)
4932	}
4933}
4934
4935/// A 512-bit SIMD vector with 4 elements of type [`c64`].
4936#[derive(Copy, Clone, PartialEq)]
4937#[repr(C)]
4938pub struct c64x4(pub c64, pub c64, pub c64, pub c64);
4939
4940impl Debug for c64x4 {
4941	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
4942		#[derive(Copy, Clone, Debug)]
4943		#[repr(C)]
4944		pub struct c64x4(
4945			pub DebugCplx<c64>,
4946			pub DebugCplx<c64>,
4947			pub DebugCplx<c64>,
4948			pub DebugCplx<c64>,
4949		);
4950		unsafe impl Zeroable for c64x4 {}
4951		unsafe impl Pod for c64x4 {}
4952
4953		let this: c64x4 = cast!(*self);
4954		this.fmt(f)
4955	}
4956}
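
// Note: the `c32xN` and `c64xN` vectors above implement `Debug` by hand,
// casting to a mirror struct whose lanes are wrapped in `DebugCplx`,
// presumably so that each lane is rendered as a complex value rather than as
// an opaque pair of floats.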
4957
4958/// A 128-bit SIMD vector with 2 elements of type [`i64`].
4959#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4960#[repr(C)]
4961pub struct i64x2(pub i64, pub i64);
4962/// A 256-bit SIMD vector with 4 elements of type [`i64`].
4963#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4964#[repr(C)]
4965pub struct i64x4(pub i64, pub i64, pub i64, pub i64);
4966/// A 512-bit SIMD vector with 8 elements of type [`i64`].
4967#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4968#[repr(C)]
4969pub struct i64x8(
4970	pub i64,
4971	pub i64,
4972	pub i64,
4973	pub i64,
4974	pub i64,
4975	pub i64,
4976	pub i64,
4977	pub i64,
4978);
4979
4980/// A 128-bit SIMD vector with 2 elements of type [`u64`].
4981#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4982#[repr(C)]
4983pub struct u64x2(pub u64, pub u64);
4984/// A 256-bit SIMD vector with 4 elements of type [`u64`].
4985#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4986#[repr(C)]
4987pub struct u64x4(pub u64, pub u64, pub u64, pub u64);
4988/// A 512-bit SIMD vector with 8 elements of type [`u64`].
4989#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4990#[repr(C)]
4991pub struct u64x8(
4992	pub u64,
4993	pub u64,
4994	pub u64,
4995	pub u64,
4996	pub u64,
4997	pub u64,
4998	pub u64,
4999	pub u64,
5000);
5001
5002/// A 128-bit SIMD vector with 2 elements of type [`m64`].
5003#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5004#[repr(C)]
5005pub struct m64x2(pub m64, pub m64);
5006/// A 256-bit SIMD vector with 4 elements of type [`m64`].
5007#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5008#[repr(C)]
5009pub struct m64x4(pub m64, pub m64, pub m64, pub m64);
5010/// A 512-bit SIMD vector with 8 elements of type [`m64`].
5011#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5012#[repr(C)]
5013pub struct m64x8(
5014	pub m64,
5015	pub m64,
5016	pub m64,
5017	pub m64,
5018	pub m64,
5019	pub m64,
5020	pub m64,
5021	pub m64,
5022);
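
// The vector types above are plain `#[repr(C)]` tuple structs with public
// fields, so individual lanes can be read and written by field index. A
// minimal sketch:
#[cfg(test)]
mod vector_layout_tests {
	use super::*;

	#[test]
	fn lanes_are_plain_tuple_struct_fields() {
		let v = f64x4(0.0, 1.0, 2.0, 3.0);
		assert_eq!(v.2, 2.0);

		let mut m = u32x4(0, 0, 0, 0);
		m.3 = 7;
		assert_eq!(m, u32x4(0, 0, 0, 7));
	}
}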
5023
5024unsafe impl Zeroable for m8 {}
5025unsafe impl Zeroable for m16 {}
5026unsafe impl Zeroable for m32 {}
5027unsafe impl Zeroable for m64 {}
5028unsafe impl Pod for m8 {}
5029unsafe impl Pod for m16 {}
5030unsafe impl Pod for m32 {}
5031unsafe impl Pod for m64 {}
5032
5033unsafe impl Zeroable for b8 {}
5034unsafe impl Pod for b8 {}
5035unsafe impl Zeroable for b16 {}
5036unsafe impl Pod for b16 {}
5037unsafe impl Zeroable for b32 {}
5038unsafe impl Pod for b32 {}
5039unsafe impl Zeroable for b64 {}
5040unsafe impl Pod for b64 {}
5041
5042unsafe impl Zeroable for i8x16 {}
5043unsafe impl Zeroable for i8x32 {}
5044unsafe impl Zeroable for i8x64 {}
5045unsafe impl Pod for i8x16 {}
5046unsafe impl Pod for i8x32 {}
5047unsafe impl Pod for i8x64 {}
5048unsafe impl Zeroable for u8x16 {}
5049unsafe impl Zeroable for u8x32 {}
5050unsafe impl Zeroable for u8x64 {}
5051unsafe impl Pod for u8x16 {}
5052unsafe impl Pod for u8x32 {}
5053unsafe impl Pod for u8x64 {}
5054unsafe impl Zeroable for m8x16 {}
5055unsafe impl Zeroable for m8x32 {}
5056unsafe impl Zeroable for m8x64 {}
5057unsafe impl Pod for m8x16 {}
5058unsafe impl Pod for m8x32 {}
5059unsafe impl Pod for m8x64 {}
5060
5061unsafe impl Zeroable for i16x8 {}
5062unsafe impl Zeroable for i16x16 {}
5063unsafe impl Zeroable for i16x32 {}
5064unsafe impl Pod for i16x8 {}
5065unsafe impl Pod for i16x16 {}
5066unsafe impl Pod for i16x32 {}
5067unsafe impl Zeroable for u16x8 {}
5068unsafe impl Zeroable for u16x16 {}
5069unsafe impl Zeroable for u16x32 {}
5070unsafe impl Pod for u16x8 {}
5071unsafe impl Pod for u16x16 {}
5072unsafe impl Pod for u16x32 {}
5073unsafe impl Zeroable for m16x8 {}
5074unsafe impl Zeroable for m16x16 {}
5075unsafe impl Zeroable for m16x32 {}
5076unsafe impl Pod for m16x8 {}
5077unsafe impl Pod for m16x16 {}
5078unsafe impl Pod for m16x32 {}
5079
5080unsafe impl Zeroable for f32x4 {}
5081unsafe impl Zeroable for f32x8 {}
5082unsafe impl Zeroable for f32x16 {}
5083unsafe impl Pod for f32x4 {}
5084unsafe impl Pod for f32x8 {}
5085unsafe impl Pod for f32x16 {}
5086unsafe impl Zeroable for c32x2 {}
5087unsafe impl Zeroable for c32x4 {}
5088unsafe impl Zeroable for c32x8 {}
5089unsafe impl Pod for c32x2 {}
5090unsafe impl Pod for c32x4 {}
5091unsafe impl Pod for c32x8 {}
5092unsafe impl Zeroable for i32x4 {}
5093unsafe impl Zeroable for i32x8 {}
5094unsafe impl Zeroable for i32x16 {}
5095unsafe impl Pod for i32x4 {}
5096unsafe impl Pod for i32x8 {}
5097unsafe impl Pod for i32x16 {}
5098unsafe impl Zeroable for u32x4 {}
5099unsafe impl Zeroable for u32x8 {}
5100unsafe impl Zeroable for u32x16 {}
5101unsafe impl Pod for u32x4 {}
5102unsafe impl Pod for u32x8 {}
5103unsafe impl Pod for u32x16 {}
5104unsafe impl Zeroable for m32x4 {}
5105unsafe impl Zeroable for m32x8 {}
5106unsafe impl Zeroable for m32x16 {}
5107unsafe impl Pod for m32x4 {}
5108unsafe impl Pod for m32x8 {}
5109unsafe impl Pod for m32x16 {}
5110
5111unsafe impl Zeroable for f64x2 {}
5112unsafe impl Zeroable for f64x4 {}
5113unsafe impl Zeroable for f64x8 {}
5114unsafe impl Pod for f64x2 {}
5115unsafe impl Pod for f64x4 {}
5116unsafe impl Pod for f64x8 {}
5117unsafe impl Zeroable for c64x1 {}
5118unsafe impl Zeroable for c64x2 {}
5119unsafe impl Zeroable for c64x4 {}
5120unsafe impl Pod for c64x1 {}
5121unsafe impl Pod for c64x2 {}
5122unsafe impl Pod for c64x4 {}
5123unsafe impl Zeroable for i64x2 {}
5124unsafe impl Zeroable for i64x4 {}
5125unsafe impl Zeroable for i64x8 {}
5126unsafe impl Pod for i64x2 {}
5127unsafe impl Pod for i64x4 {}
5128unsafe impl Pod for i64x8 {}
5129unsafe impl Zeroable for u64x2 {}
5130unsafe impl Zeroable for u64x4 {}
5131unsafe impl Zeroable for u64x8 {}
5132unsafe impl Pod for u64x2 {}
5133unsafe impl Pod for u64x4 {}
5134unsafe impl Pod for u64x8 {}
5135unsafe impl Zeroable for m64x2 {}
5136unsafe impl Zeroable for m64x4 {}
5137unsafe impl Zeroable for m64x8 {}
5138unsafe impl Pod for m64x2 {}
5139unsafe impl Pod for m64x4 {}
5140unsafe impl Pod for m64x8 {}
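
// A minimal sketch of what the `Pod`/`Zeroable` impls above enable, assuming
// these traits come from the `bytemuck` crate (as the `unsafe impl`s suggest):
// a vector value can be reinterpreted as a plain array of its scalar type and
// back, since both sides have the same size and no padding.
#[cfg(test)]
mod pod_cast_tests {
	use super::*;

	#[test]
	fn vectors_cast_to_and_from_arrays() {
		let v = f64x2(1.0, 2.0);
		let arr: [f64; 2] = bytemuck::cast(v);
		assert_eq!(arr, [1.0, 2.0]);
		assert_eq!(bytemuck::cast::<[f64; 2], f64x2>(arr), v);
	}
}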
5141
5142macro_rules! iota {
5143	($T: ty, $N: expr, $int: ty) => {
5144		const {
5145			unsafe {
5146				let mut iota = [const { core::mem::MaybeUninit::uninit() }; $N];
5147				{
5148					let mut i = 0;
5149					while i < $N {
5150						let v = (&raw mut iota[i]) as *mut $int;
5151
5152						let mut j = 0;
5153						while j < core::mem::size_of::<$T>() / core::mem::size_of::<$int>() {
5154							v.add(j).write_unaligned(i as $int);
5155							j += 1;
5156						}
5157
5158						i += 1;
5159					}
5160				}
5161				iota
5162			}
5163		}
5164	};
5165}
5166
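// Compile-time helpers built on the `iota!` macro above: element `i` of the
// returned `[MaybeUninit<T>; N]` array has every chunk of the corresponding
// integer width (u8/u16/u32/u64) written with the value `i`.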
5167pub const fn iota_8<T: Interleave, const N: usize>() -> [MaybeUninit<T>; N] {
5168	iota!(T, N, u8)
5169}
5170pub const fn iota_16<T: Interleave, const N: usize>() -> [MaybeUninit<T>; N] {
5171	iota!(T, N, u16)
5172}
5173pub const fn iota_32<T: Interleave, const N: usize>() -> [MaybeUninit<T>; N] {
5174	iota!(T, N, u32)
5175}
5176pub const fn iota_64<T: Interleave, const N: usize>() -> [MaybeUninit<T>; N] {
5177	iota!(T, N, u64)
5178}
5179
5180#[cfg(target_arch = "x86_64")]
5181#[cfg(test)]
5182mod tests {
5183	use super::*;
5184
5185	#[test]
5186	fn test_interleave() {
5187		if let Some(simd) = x86::V3::try_new() {
5188			{
5189				let src = [f64x4(0.0, 0.1, 1.0, 1.1), f64x4(2.0, 2.1, 3.0, 3.1)];
5190				let dst = unsafe { deinterleave_fallback::<f64, f64x4, [f64x4; 2]>(src) };
5191				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
5192				assert_eq!(src, unsafe {
5193					interleave_fallback::<f64, f64x4, [f64x4; 2]>(dst)
5194				});
5195			}
5196			{
5197				let src = [
5198					f64x4(0.0, 0.1, 0.2, 0.3),
5199					f64x4(1.0, 1.1, 1.2, 1.3),
5200					f64x4(2.0, 2.1, 2.2, 2.3),
5201					f64x4(3.0, 3.1, 3.2, 3.3),
5202				];
5203				let dst = unsafe { deinterleave_fallback::<f64, f64x4, [f64x4; 4]>(src) };
5204				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
5205				assert_eq!(dst[2], simd.add_f64x4(dst[0], simd.splat_f64x4(0.2)));
5206				assert_eq!(dst[3], simd.add_f64x4(dst[0], simd.splat_f64x4(0.3)));
5207				assert_eq!(src, unsafe {
5208					interleave_fallback::<f64, f64x4, [f64x4; 4]>(dst)
5209				});
5210			}
5211		}
5212	}
5213}