safe_arch/x86_x64/avx2.rs
1#![cfg(target_feature = "avx2")]
2
3use super::*;
4
5/// Blends the `i32` lanes in `a` and `b` into a single value.
6///
7/// * The blend is controlled by an immediate mask value (an `i32`).
8/// * For each lane `0..=3`, use `0` if you want that lane of the output to be
9/// from `a` and use `1` if you want that lane of the output to be from `b`.
10///
11/// ```
12/// # use safe_arch::*;
13/// let a = m128i::from([10, 20, 30, 40]);
14/// let b = m128i::from([100, 200, 300, 400]);
15/// //
16/// let c: [i32; 4] = blend_imm_i32_m128i::<0b0110>(a, b).into();
17/// assert_eq!(c, [10, 200, 300, 40]);
18/// ```
19/// * **Intrinsic:** [`_mm_blend_epi32`]
20/// * **Assembly:** `vpblendd xmm, xmm, xmm, imm8`
21#[must_use]
22#[inline(always)]
23#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
24pub fn blend_imm_i32_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i {
25 m128i(unsafe { _mm_blend_epi32(a.0, b.0, IMM) })
26}
27
28/// Splat the lowest 8-bit lane across the entire 128 bits.
29/// ```
30/// # use safe_arch::*;
31/// let a = m128i::from(0x77_i128);
32/// let b: [i8; 16] = splat_i8_m128i_s_m128i(a).into();
33/// assert_eq!(b, [0x77_i8; 16]);
34/// ```
35/// * **Intrinsic:** [`_mm_broadcastb_epi8`]
36/// * **Assembly:** `vpbroadcastb xmm, xmm`
37#[must_use]
38#[inline(always)]
39#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
40pub fn splat_i8_m128i_s_m128i(a: m128i) -> m128i {
41 m128i(unsafe { _mm_broadcastb_epi8(a.0) })
42}
43
44/// Splat the lowest 16-bit lane across the entire 128 bits.
45/// ```
46/// # use safe_arch::*;
47/// let a = m128i::from(0x77_i128);
48/// let b: [i16; 8] = splat_i16_m128i_s_m128i(a).into();
49/// assert_eq!(b, [0x77_i16; 8]);
50/// ```
51/// * **Intrinsic:** [`_mm_broadcastw_epi16`]
52/// * **Assembly:** `vpbroadcastw xmm, xmm`
53#[must_use]
54#[inline(always)]
55#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
56pub fn splat_i16_m128i_s_m128i(a: m128i) -> m128i {
57 m128i(unsafe { _mm_broadcastw_epi16(a.0) })
58}
59
60/// Splat the lowest 32-bit lane across the entire 128 bits.
61/// ```
62/// # use safe_arch::*;
63/// let a = m128i::from(0x77_i128);
64/// let b: [i32; 4] = splat_i32_m128i_s_m128i(a).into();
65/// assert_eq!(b, [0x77; 4]);
66/// ```
67/// * **Intrinsic:** [`_mm_broadcastd_epi32`]
68/// * **Assembly:** `vpbroadcastd xmm, xmm`
69#[must_use]
70#[inline(always)]
71#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
72pub fn splat_i32_m128i_s_m128i(a: m128i) -> m128i {
73 m128i(unsafe { _mm_broadcastd_epi32(a.0) })
74}
75
76/// Splat the lowest 64-bit lane across the entire 128 bits.
77/// ```
78/// # use safe_arch::*;
79/// let a = m128i::from(0x77_i128);
80/// let b: [i64; 2] = splat_i64_m128i_s_m128i(a).into();
81/// assert_eq!(b, [0x77_i64; 2]);
82/// ```
83/// * **Intrinsic:** [`_mm_broadcastq_epi64`]
84/// * **Assembly:** `vpbroadcastq xmm, xmm`
85#[must_use]
86#[inline(always)]
87#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
88pub fn splat_i64_m128i_s_m128i(a: m128i) -> m128i {
89 m128i(unsafe { _mm_broadcastq_epi64(a.0) })
90}
91
92/// Splat the lower `f64` across both lanes of `m128d`.
93/// ```
94/// # use safe_arch::*;
95/// let a = m128d::from([1.0, 2.0]);
96/// let b = splat_m128d_s_m128d(a).to_array();
97/// assert_eq!(b, [1.0, 1.0]);
98/// ```
99/// * **Intrinsic:** [`_mm_broadcastsd_pd`]
100/// * **Assembly:** `movddup xmm, xmm`
101#[must_use]
102#[inline(always)]
103#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
104pub fn splat_m128d_s_m128d(a: m128d) -> m128d {
105 m128d(unsafe { _mm_broadcastsd_pd(a.0) })
106}
107
108/// Splat the 128-bits across 256-bits.
109/// ```
110/// # use safe_arch::*;
111/// let a = m128i::from(1_i128);
112/// let b: [i128; 2] = splat_m128i_m256i(a).into();
113/// assert_eq!(b, [1_i128, 1]);
114/// ```
115/// * **Intrinsic:** [`_mm256_broadcastsi128_si256`]
116/// * **Assembly:** `vbroadcasti128 ymm, m128`
117#[must_use]
118#[inline(always)]
119#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
120pub fn splat_m128i_m256i(a: m128i) -> m256i {
121 m256i(unsafe { _mm256_broadcastsi128_si256(a.0) })
122}
123
124/// Splat the lowest `f32` across all four lanes.
125/// ```
126/// # use safe_arch::*;
127/// let a = set_m128_s(1.0);
128/// let b = splat_m128_s_m128(a).to_array();
129/// assert_eq!(b, [1.0, 1.0, 1.0, 1.0]);
130/// ```
131/// * **Intrinsic:** [`_mm_broadcastss_ps`]
132/// * **Assembly:** `vbroadcastss xmm, xmm`
133#[must_use]
134#[inline(always)]
135#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
136pub fn splat_m128_s_m128(a: m128) -> m128 {
137 m128(unsafe { _mm_broadcastss_ps(a.0) })
138}
139
140/// Loads the reference given and zeroes any `i32` lanes not in the mask.
141///
142/// * A lane is "in" the mask if that lane's mask value is set in the high bit
143/// (aka "if the lane's value is negative").
144/// ```
145/// # use safe_arch::*;
146/// let a = set_splat_i32_m128i(5);
147/// let b = load_masked_i32_m128i(&a, m128i::from([-1_i32, 0, 0, -1]));
148/// assert_eq!(<[i32; 4]>::from(b), [5, 0, 0, 5]);
149/// ```
150/// * **Intrinsic:** [`_mm_maskload_epi32`]
151/// * **Assembly:** `vpmaskmovd xmm, xmm, m128`
152#[must_use]
153#[inline(always)]
154#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
155pub fn load_masked_i32_m128i(a: &m128i, mask: m128i) -> m128i {
156 m128i(unsafe { _mm_maskload_epi32(a as *const m128i as *const i32, mask.0) })
157}
158
159/// Loads the reference given and zeroes any `i64` lanes not in the mask.
160///
161/// * A lane is "in" the mask if that lane's mask value is set in the high bit
162/// (aka "if the lane's value is negative").
163/// ```
164/// # use safe_arch::*;
165/// let a = set_splat_i64_m128i(5);
166/// let b = load_masked_i64_m128i(&a, m128i::from([0_i64, -1]));
167/// assert_eq!(<[i64; 2]>::from(b), [0_i64, 5]);
168/// ```
169/// * **Intrinsic:** [`_mm_maskload_epi64`]
170/// * **Assembly:** `vpmaskmovq xmm, xmm, m128`
171#[must_use]
172#[inline(always)]
173#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
174pub fn load_masked_i64_m128i(a: &m128i, mask: m128i) -> m128i {
175 m128i(unsafe { _mm_maskload_epi64(a as *const m128i as *const i64, mask.0) })
176}
177
178/// Stores the `i32` masked lanes given to the reference.
179///
180/// * A lane is "in" the mask if that lane's mask value is set in the high bit
181/// (aka "if the lane's value is negative").
182/// * Lanes not in the mask are not modified.
183/// ```
184/// # use safe_arch::*;
185/// let mut a = m128i::default();
186/// store_masked_i32_m128i(&mut a, m128i::from([-1_i32, 0, 0, -1]), set_splat_i32_m128i(5));
187/// assert_eq!(<[i32; 4]>::from(a), [5, 0, 0, 5]);
188/// ```
189/// * **Intrinsic:** [`_mm_maskstore_epi32`]
190/// * **Assembly:** `vpmaskmovd m128, xmm, xmm`
191#[inline(always)]
192#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
193pub fn store_masked_i32_m128i(addr: &mut m128i, mask: m128i, a: m128i) {
194 unsafe { _mm_maskstore_epi32(addr as *mut m128i as *mut i32, mask.0, a.0) };
195}
196
197/// Stores the `i32` masked lanes given to the reference.
198///
199/// * A lane is "in" the mask if that lane's mask value is set in the high bit
200/// (aka "if the lane's value is negative").
201/// * Lanes not in the mask are not modified.
202/// ```
203/// # use safe_arch::*;
204/// let mut a = m128i::default();
205/// store_masked_i64_m128i(&mut a, m128i::from([0_i64, -1]), set_splat_i64_m128i(5));
206/// assert_eq!(<[i64; 2]>::from(a), [0, 5]);
207/// ```
208/// * **Intrinsic:** [`_mm_maskstore_epi64`]
209/// * **Assembly:** `vpmaskmovq m128, xmm, xmm`
210#[inline(always)]
211#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
212pub fn store_masked_i64_m128i(addr: &mut m128i, mask: m128i, a: m128i) {
213 unsafe { _mm_maskstore_epi64(addr as *mut m128i as *mut i64, mask.0, a.0) };
214}
215
216/// Shift `u32` values to the left by `count` bits.
217///
218/// * Each `u32` lane in `a` is shifted by the same indexed `u32` lane in
219/// `count`.
220/// ```
221/// # use safe_arch::*;
222/// let a = m128i::from([1, 2, 3, 4]);
223/// let count = m128i::from([5, 6, 7, 8]);
224/// let out: [u32; 4] = shl_each_u32_m128i(a, count).into();
225/// assert_eq!(out, [1 << 5, 2 << 6, 3 << 7, 4 << 8]);
226/// ```
227/// * **Intrinsic:** [`_mm_sllv_epi32`]
228/// * **Assembly:** `vpsllvd xmm, xmm, xmm`
229#[must_use]
230#[inline(always)]
231#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
232pub fn shl_each_u32_m128i(a: m128i, count: m128i) -> m128i {
233 m128i(unsafe { _mm_sllv_epi32(a.0, count.0) })
234}
235
236/// Shift `u64` values to the left by `count` bits.
237///
238/// * Each `u64` lane in `a` is shifted by the same indexed `u64` lane in
239/// `count`.
240/// ```
241/// # use safe_arch::*;
242/// let a = m128i::from([1_u64, 2]);
243/// let count = m128i::from([3_u64, 4]);
244/// let out: [u64; 2] = shl_each_u64_m128i(a, count).into();
245/// assert_eq!(out, [1_u64 << 3, 2 << 4]);
246/// ```
247/// * **Intrinsic:** [`_mm_sllv_epi64`]
248/// * **Assembly:** `vpsllvq xmm, xmm, xmm`
249#[must_use]
250#[inline(always)]
251#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
252pub fn shl_each_u64_m128i(a: m128i, count: m128i) -> m128i {
253 m128i(unsafe { _mm_sllv_epi64(a.0, count.0) })
254}
255
256/// Shift `i32` values to the right by `count` bits.
257///
258/// * Each `i32` lane in `a` is shifted by the same indexed `u32` lane in
259/// `count`.
260/// ```
261/// # use safe_arch::*;
262/// let a = m128i::from([100, 110, 120, -130]);
263/// let count = m128i::from([1, 2, 3, 4]);
264/// let out: [i32; 4] = shr_each_i32_m128i(a, count).into();
265/// assert_eq!(out, [100 >> 1, 110 >> 2, 120 >> 3, (-130) >> 4]);
266/// ```
267/// * **Intrinsic:** [`_mm_srav_epi32`]
268/// * **Assembly:** `vpsravd xmm, xmm, xmm`
269#[must_use]
270#[inline(always)]
271#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
272pub fn shr_each_i32_m128i(a: m128i, count: m128i) -> m128i {
273 m128i(unsafe { _mm_srav_epi32(a.0, count.0) })
274}
275
276/// Shift `u32` values to the left by `count` bits.
277///
278/// * Each `u32` lane in `a` is shifted by the same indexed `u32` lane in
279/// `count`.
280/// ```
281/// # use safe_arch::*;
282/// let a = m128i::from([100, 110, 120, 130]);
283/// let count = m128i::from([1, 2, 3, 4]);
284/// let out: [u32; 4] = shr_each_u32_m128i(a, count).into();
285/// assert_eq!(out, [100 >> 1, 110 >> 2, 120 >> 3, 130 >> 4]);
286/// ```
287/// * **Intrinsic:** [`_mm_srlv_epi32`]
288/// * **Assembly:** `vpsrlvd xmm, xmm, xmm`
289#[must_use]
290#[inline(always)]
291#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
292pub fn shr_each_u32_m128i(a: m128i, count: m128i) -> m128i {
293 m128i(unsafe { _mm_srlv_epi32(a.0, count.0) })
294}
295
296/// Shift `u64` values to the left by `count` bits.
297///
298/// * Each `u64` lane in `a` is shifted by the same indexed `u64` lane in
299/// `count`.
300/// ```
301/// # use safe_arch::*;
302/// let a = m128i::from([100_u64, 110]);
303/// let count = m128i::from([1_u64, 2]);
304/// let out: [u64; 2] = shr_each_u64_m128i(a, count).into();
305/// assert_eq!(out, [100_u64 >> 1, 110 >> 2]);
306/// ```
307/// * **Intrinsic:** [`_mm_srlv_epi64`]
308/// * **Assembly:** `vpsrlvq xmm, xmm, xmm`
309#[must_use]
310#[inline(always)]
311#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
312pub fn shr_each_u64_m128i(a: m128i, count: m128i) -> m128i {
313 m128i(unsafe { _mm_srlv_epi64(a.0, count.0) })
314}
315
316/// Absolute value of `i8` lanes.
317/// ```
318/// # use safe_arch::*;
319/// let a = m256i::from([-7_i8; 32]);
320/// let b: [i8; 32] = abs_i8_m256i(a).into();
321/// assert_eq!(b, [7_i8; 32]);
322/// ```
323/// * **Intrinsic:** [`_mm256_abs_epi8`]
324/// * **Assembly:** `vpabsb ymm, ymm`
325#[must_use]
326#[inline(always)]
327#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
328pub fn abs_i8_m256i(a: m256i) -> m256i {
329 m256i(unsafe { _mm256_abs_epi8(a.0) })
330}
331
332/// Absolute value of `i16` lanes.
333/// ```
334/// # use safe_arch::*;
335/// let a = m256i::from([-7_i16; 16]);
336/// let b: [i16; 16] = abs_i16_m256i(a).into();
337/// assert_eq!(b, [7_i16; 16]);
338/// ```
339/// * **Intrinsic:** [`_mm256_abs_epi16`]
340/// * **Assembly:** `vpabsw ymm, ymm`
341#[must_use]
342#[inline(always)]
343#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
344pub fn abs_i16_m256i(a: m256i) -> m256i {
345 m256i(unsafe { _mm256_abs_epi16(a.0) })
346}
347
348/// Absolute value of `i32` lanes.
349/// ```
350/// # use safe_arch::*;
351/// let a = m256i::from([-7_i32; 8]);
352/// let b: [i32; 8] = abs_i32_m256i(a).into();
353/// assert_eq!(b, [7_i32; 8]);
354/// ```
355/// * **Intrinsic:** [`_mm256_abs_epi32`]
356/// * **Assembly:** `vpabsd ymm, ymm`
357#[must_use]
358#[inline(always)]
359#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
360pub fn abs_i32_m256i(a: m256i) -> m256i {
361 m256i(unsafe { _mm256_abs_epi32(a.0) })
362}
363
364/// Lanewise `a + b` with lanes as `i8`.
365/// ```
366/// # use safe_arch::*;
367/// let a = m256i::from([5_i8; 32]);
368/// let b = m256i::from([10_i8; 32]);
369/// let c: [i8; 32] = add_i8_m256i(a, b).into();
370/// assert_eq!(c, [15_i8; 32]);
371/// ```
372/// * **Intrinsic:** [`_mm256_add_epi8`]
373/// * **Assembly:** `vpaddb ymm, ymm, ymm`
374#[must_use]
375#[inline(always)]
376#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
377pub fn add_i8_m256i(a: m256i, b: m256i) -> m256i {
378 m256i(unsafe { _mm256_add_epi8(a.0, b.0) })
379}
380
381/// Lanewise `a + b` with lanes as `i16`.
382/// ```
383/// # use safe_arch::*;
384/// let a = m256i::from([5_i16; 16]);
385/// let b = m256i::from([10_i16; 16]);
386/// let c: [i16; 16] = add_i16_m256i(a, b).into();
387/// assert_eq!(c, [15_i16; 16]);
388/// ```
389/// * **Intrinsic:** [`_mm256_add_epi16`]
390/// * **Assembly:** `vpaddw ymm, ymm, ymm`
391#[must_use]
392#[inline(always)]
393#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
394pub fn add_i16_m256i(a: m256i, b: m256i) -> m256i {
395 m256i(unsafe { _mm256_add_epi16(a.0, b.0) })
396}
397
398/// Lanewise `a + b` with lanes as `i32`.
399/// ```
400/// # use safe_arch::*;
401/// let a = m256i::from([5_i32; 8]);
402/// let b = m256i::from([10_i32; 8]);
403/// let c: [i32; 8] = add_i32_m256i(a, b).into();
404/// assert_eq!(c, [15_i32; 8]);
405/// ```
406/// * **Intrinsic:** [`_mm256_add_epi32`]
407/// * **Assembly:** `vpaddd ymm, ymm, ymm`
408#[must_use]
409#[inline(always)]
410#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
411pub fn add_i32_m256i(a: m256i, b: m256i) -> m256i {
412 m256i(unsafe { _mm256_add_epi32(a.0, b.0) })
413}
414
415/// Lanewise `a + b` with lanes as `i64`.
416/// ```
417/// # use safe_arch::*;
418/// let a = m256i::from([5_i64; 4]);
419/// let b = m256i::from([10_i64; 4]);
420/// let c: [i64; 4] = add_i64_m256i(a, b).into();
421/// assert_eq!(c, [15_i64; 4]);
422/// ```
423/// * **Intrinsic:** [`_mm256_add_epi64`]
424/// * **Assembly:** `vpaddq ymm, ymm, ymm`
425#[must_use]
426#[inline(always)]
427#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
428pub fn add_i64_m256i(a: m256i, b: m256i) -> m256i {
429 m256i(unsafe { _mm256_add_epi64(a.0, b.0) })
430}
431
432/// Lanewise saturating `a + b` with lanes as `i8`.
433/// ```
434/// # use safe_arch::*;
435/// let a = m256i::from([126_i8; 32]);
436/// let b = m256i::from([125_i8; 32]);
437/// let c: [i8; 32] = add_saturating_i8_m256i(a, b).into();
438/// assert_eq!(c, [127_i8; 32]);
439/// ```
440/// * **Intrinsic:** [`_mm256_adds_epi8`]
441/// * **Assembly:** `vpaddsb ymm, ymm, ymm`
442#[must_use]
443#[inline(always)]
444#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
445pub fn add_saturating_i8_m256i(a: m256i, b: m256i) -> m256i {
446 m256i(unsafe { _mm256_adds_epi8(a.0, b.0) })
447}
448
449/// Lanewise saturating `a + b` with lanes as `i16`.
450/// ```
451/// # use safe_arch::*;
452/// let a = m256i::from([32700_i16; 16]);
453/// let b = m256i::from([32000_i16; 16]);
454/// let c: [i16; 16] = add_saturating_i16_m256i(a, b).into();
455/// assert_eq!(c, [32767_i16; 16]);
456/// ```
457/// * **Intrinsic:** [`_mm256_adds_epi16`]
458/// * **Assembly:** `vpaddsw ymm, ymm, ymm`
459#[must_use]
460#[inline(always)]
461#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
462pub fn add_saturating_i16_m256i(a: m256i, b: m256i) -> m256i {
463 m256i(unsafe { _mm256_adds_epi16(a.0, b.0) })
464}
465
466/// Lanewise saturating `a + b` with lanes as `u8`.
467/// ```
468/// # use safe_arch::*;
469/// let a = m256i::from([126_u8; 32]);
470/// let b = m256i::from([125_u8; 32]);
471/// let c: [u8; 32] = add_saturating_u8_m256i(a, b).into();
472/// assert_eq!(c, [251_u8; 32]);
473/// ```
474/// * **Intrinsic:** [`_mm256_adds_epu8`]
475/// * **Assembly:** `vpaddusb ymm, ymm, ymm`
476#[must_use]
477#[inline(always)]
478#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
479pub fn add_saturating_u8_m256i(a: m256i, b: m256i) -> m256i {
480 m256i(unsafe { _mm256_adds_epu8(a.0, b.0) })
481}
482
483/// Lanewise saturating `a + b` with lanes as `u16`.
484/// ```
485/// # use safe_arch::*;
486/// let a = m256i::from([32700_u16; 16]);
487/// let b = m256i::from([32000_u16; 16]);
488/// let c: [u16; 16] = add_saturating_u16_m256i(a, b).into();
489/// assert_eq!(c, [64700_u16; 16]);
490/// ```
491/// * **Intrinsic:** [`_mm256_adds_epu16`]
492/// * **Assembly:** `vpaddusw ymm, ymm, ymm`
493#[must_use]
494#[inline(always)]
495#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
496pub fn add_saturating_u16_m256i(a: m256i, b: m256i) -> m256i {
497 m256i(unsafe { _mm256_adds_epu16(a.0, b.0) })
498}
499
500/// Works like [`combined_byte_shr_imm_m128i`], but twice as wide.
501///
502/// The low half of the bytes and high half of the bytes are both processed
503/// separately.
504///
505/// ```
506/// # use safe_arch::*;
507/// let a = m256i::from([5_i8; 32]);
508/// let b = m256i::from([12_i8; 32]);
509/// // `a` bytes come in to the _high_ indexes because these are LE bytes.
510/// // Also note that the three 5 values at the low half and high half.
511/// let c: [i8; 32] = combined_byte_shr_imm_m256i::<3>(a, b).into();
512/// assert_eq!(
513/// c,
514/// [
515/// 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12, 12,
516/// 12, 12, 12, 12, 12, 5, 5, 5_i8
517/// ]
518/// );
519/// ```
520/// * **Intrinsic:** [`_mm256_alignr_epi8`]
521/// * **Assembly:** `vpalignr ymm, ymm, ymm, imm8`
522#[must_use]
523#[inline(always)]
524#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
525pub fn combined_byte_shr_imm_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i {
526 m256i(unsafe { _mm256_alignr_epi8(a.0, b.0, IMM) })
527}
528
529/// Bitwise `a & b`.
530/// ```
531/// # use safe_arch::*;
532/// let a = m256i::from([0_i64, 0, 1, 1]);
533/// let b = m256i::from([0_i64, 1, 0, 1]);
534/// let c: [i64; 4] = bitand_m256i(a, b).into();
535/// assert_eq!(c, [0_i64, 0, 0, 1]);
536/// ```
537/// * **Intrinsic:** [`_mm256_and_si256`]
538/// * **Assembly:** `vpand ymm, ymm, ymm`
539#[must_use]
540#[inline(always)]
541#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
542pub fn bitand_m256i(a: m256i, b: m256i) -> m256i {
543 m256i(unsafe { _mm256_and_si256(a.0, b.0) })
544}
545
546/// Bitwise `(!a) & b`.
547/// ```
548/// # use safe_arch::*;
549/// let a = m256i::from([0_i64, 0, 1, 1]);
550/// let b = m256i::from([0_i64, 1, 0, 1]);
551/// let c: [i64; 4] = bitandnot_m256i(a, b).into();
552/// assert_eq!(c, [0_i64, 1, 0, 0]);
553/// ```
554/// * **Intrinsic:** [`_mm256_andnot_si256`]
555/// * **Assembly:** `vpandn ymm, ymm, ymm`
556#[must_use]
557#[inline(always)]
558#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
559pub fn bitandnot_m256i(a: m256i, b: m256i) -> m256i {
560 m256i(unsafe { _mm256_andnot_si256(a.0, b.0) })
561}
562
563/// Average `u8` lanes.
564/// ```
565/// # use safe_arch::*;
566/// let a = m256i::from([100_u8; 32]);
567/// let b = m256i::from([120_u8; 32]);
568/// let c: [u8; 32] = average_u8_m256i(a, b).into();
569/// assert_eq!(c, [110_u8; 32]);
570/// ```
571/// * **Intrinsic:** [`_mm256_avg_epu8`]
572/// * **Assembly:** `vpavgb ymm, ymm, ymm`
573#[must_use]
574#[inline(always)]
575#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
576pub fn average_u8_m256i(a: m256i, b: m256i) -> m256i {
577 m256i(unsafe { _mm256_avg_epu8(a.0, b.0) })
578}
579
580/// Average `u16` lanes.
581/// ```
582/// # use safe_arch::*;
583/// let a = m256i::from([100_u16; 16]);
584/// let b = m256i::from([120_u16; 16]);
585/// let c: [u16; 16] = average_u16_m256i(a, b).into();
586/// assert_eq!(c, [110_u16; 16]);
587/// ```
588/// * **Intrinsic:** [`_mm256_avg_epu16`]
589/// * **Assembly:** `vpavgw ymm, ymm, ymm`
590#[must_use]
591#[inline(always)]
592#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
593pub fn average_u16_m256i(a: m256i, b: m256i) -> m256i {
594 m256i(unsafe { _mm256_avg_epu16(a.0, b.0) })
595}
596
597/// Blends the `i16` lanes according to the immediate value.
598///
599/// * The low 8 lanes and high 8 lanes both use the same immediate.
600/// * Each bit in `0..=7` should be set for `$b` and unset for `$a` within that
601/// half of the `i16` values.
602///
603/// ```
604/// # use safe_arch::*;
605/// let a = m256i::from([5_i16; 16]);
606/// let b = m256i::from([10_i16; 16]);
607/// //
608/// let c: [i16; 16] = blend_imm_i16_m256i::<0b11001000>(a, b).into();
609/// assert_eq!(c, [5_i16, 5, 5, 10, 5, 5, 10, 10, 5, 5, 5, 10, 5, 5, 10, 10]);
610/// ```
611/// * **Intrinsic:** [`_mm256_blend_epi16`]
612/// * **Assembly:** `vpblendw ymm, ymm, ymm, imm8`
613#[must_use]
614#[inline(always)]
615#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
616pub fn blend_imm_i16_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i {
617 m256i(unsafe { _mm256_blend_epi16(a.0, b.0, IMM) })
618}
619
620/// Blends the `i32` lanes according to the immediate value.
621///
622/// * Each bit in `0..=7` should be set for `$b` and unset for `$a`
623///
624/// ```
625/// # use safe_arch::*;
626/// let a = m256i::from([5_i32; 8]);
627/// let b = m256i::from([10_i32; 8]);
628/// //
629/// let c: [i32; 8] = blend_imm_i32_m256i::<0b11001000>(a, b).into();
630/// assert_eq!(c, [5, 5, 5, 10, 5, 5, 10, 10]);
631/// ```
632/// * **Intrinsic:** [`_mm256_blend_epi32`]
633/// * **Assembly:** `vpblendd ymm, ymm, ymm, imm8`
634#[must_use]
635#[inline(always)]
636#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
637pub fn blend_imm_i32_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i {
638 m256i(unsafe { _mm256_blend_epi32(a.0, b.0, IMM) })
639}
640
641/// Blend `i8` lanes according to a runtime varying mask.
642///
643/// * Mask lanes should be non-negative for `a` and negative for `b`.
644/// ```
645/// # use safe_arch::*;
646/// let a = m256i::from([5_i8; 32]);
647/// let b = m256i::from([10_i8; 32]);
648/// let mask = m256i::from([
649/// 0_i8, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0,
650/// -1, -1, -1, 0, 0,
651/// ]);
652/// let c: [i8; 32] = blend_varying_i8_m256i(a, b, mask).into();
653/// assert_eq!(
654/// c,
655/// [
656/// 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5,
657/// 10, 10, 10, 5, 5
658/// ]
659/// );
660/// ```
661/// * **Intrinsic:** [`_mm256_blendv_epi8`]
662/// * **Assembly:** `vpavgw ymm, ymm, ymm`
663#[must_use]
664#[inline(always)]
665#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
666pub fn blend_varying_i8_m256i(a: m256i, b: m256i, mask: m256i) -> m256i {
667 m256i(unsafe { _mm256_blendv_epi8(a.0, b.0, mask.0) })
668}
669
670/// Sets the lowest `i8` lane of an `m128i` as all lanes of an `m256i`.
671/// ```
672/// # use safe_arch::*;
673/// let a = m128i::from(5_i8 as i128);
674/// let b: [i8; 32] = set_splat_i8_m128i_s_m256i(a).into();
675/// assert_eq!(b, [5_i8; 32]);
676/// ```
677/// * **Intrinsic:** [`_mm256_broadcastb_epi8`]
678/// * **Assembly:** `vpbroadcastb ymm, xmm`
679#[must_use]
680#[inline(always)]
681#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
682pub fn set_splat_i8_m128i_s_m256i(a: m128i) -> m256i {
683 m256i(unsafe { _mm256_broadcastb_epi8(a.0) })
684}
685
686/// Sets the lowest `i16` lane of an `m128i` as all lanes of an `m256i`.
687/// ```
688/// # use safe_arch::*;
689/// let a = m128i::from(5_i16 as i128);
690/// let b: [i16; 16] = set_splat_i16_m128i_s_m256i(a).into();
691/// assert_eq!(b, [5_i16; 16]);
692/// ```
693/// * **Intrinsic:** [`_mm256_broadcastw_epi16`]
694/// * **Assembly:** `vpbroadcastw ymm, xmm`
695#[must_use]
696#[inline(always)]
697#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
698pub fn set_splat_i16_m128i_s_m256i(a: m128i) -> m256i {
699 m256i(unsafe { _mm256_broadcastw_epi16(a.0) })
700}
701
702/// Sets the lowest `i32` lane of an `m128i` as all lanes of an `m256i`.
703/// ```
704/// # use safe_arch::*;
705/// let a = m128i::from(5_i32 as i128);
706/// let b: [i32; 8] = set_splat_i32_m128i_s_m256i(a).into();
707/// assert_eq!(b, [5_i32; 8]);
708/// ```
709/// * **Intrinsic:** [`_mm256_broadcastd_epi32`]
710/// * **Assembly:** `vpbroadcastd ymm, xmm`
711#[must_use]
712#[inline(always)]
713#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
714pub fn set_splat_i32_m128i_s_m256i(a: m128i) -> m256i {
715 m256i(unsafe { _mm256_broadcastd_epi32(a.0) })
716}
717
718/// Sets the lowest `i64` lane of an `m128i` as all lanes of an `m256i`.
719/// ```
720/// # use safe_arch::*;
721/// let a = m128i::from(5_i64 as i128);
722/// let b: [i64; 4] = set_splat_i64_m128i_s_m256i(a).into();
723/// assert_eq!(b, [5_i64; 4]);
724/// ```
725/// * **Intrinsic:** [`_mm256_broadcastq_epi64`]
726/// * **Assembly:** `vpbroadcastq ymm, xmm`
727#[must_use]
728#[inline(always)]
729#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
730pub fn set_splat_i64_m128i_s_m256i(a: m128i) -> m256i {
731 m256i(unsafe { _mm256_broadcastq_epi64(a.0) })
732}
733
734/// Sets the lowest lane of an `m128d` as all lanes of an `m256d`.
735/// ```
736/// # use safe_arch::*;
737/// let a = set_m128d_s(5.0);
738/// let b = set_splat_m128d_s_m256d(a).to_array();
739/// assert_eq!(b, [5.0; 4]);
740/// ```
741/// * **Intrinsic:** [`_mm256_broadcastsd_pd`]
742/// * **Assembly:** `vbroadcastsd ymm, xmm`
743#[must_use]
744#[inline(always)]
745#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
746pub fn set_splat_m128d_s_m256d(a: m128d) -> m256d {
747 m256d(unsafe { _mm256_broadcastsd_pd(a.0) })
748}
749
750/// Sets the lowest lane of an `m128` as all lanes of an `m256`.
751/// ```
752/// # use safe_arch::*;
753/// let a = set_m128_s(5.0);
754/// let b = set_splat_m128_s_m256(a).to_array();
755/// assert_eq!(b, [5.0; 8]);
756/// ```
757/// * **Intrinsic:** [`_mm256_broadcastss_ps`]
758/// * **Assembly:** `vbroadcastss ymm, xmm`
759#[must_use]
760#[inline(always)]
761#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
762pub fn set_splat_m128_s_m256(a: m128) -> m256 {
763 m256(unsafe { _mm256_broadcastss_ps(a.0) })
764}
765
766/// Shifts each `u128` lane left by a number of **bytes**.
767///
768/// ```
769/// # use safe_arch::*;
770/// let a = m256i::from([0x0000000B_0000000A_0000000F_11111111_u128; 2]);
771/// //
772/// let b: [u128; 2] = byte_shl_imm_u128_m256i::<1>(a).into();
773/// assert_eq!(b, [0x00000B00_00000A00_00000F11_11111100_u128; 2]);
774/// ```
775/// * **Intrinsic:** [`_mm256_bslli_epi128`]
776/// * **Assembly:** `vpslldq ymm, ymm, imm8`
777#[must_use]
778#[inline(always)]
779#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
780pub fn byte_shl_imm_u128_m256i<const IMM: i32>(a: m256i) -> m256i {
781 m256i(unsafe { _mm256_bslli_epi128(a.0, IMM) })
782}
783
784/// Shifts each `u128` lane right by a number of **bytes**.
785///
786/// ```
787/// # use safe_arch::*;
788/// let a = m256i::from([0x0000000B_0000000A_0000000F_11111111_u128; 2]);
789/// //
790/// let b: [u128; 2] = byte_shr_imm_u128_m256i::<1>(a).into();
791/// assert_eq!(b, [0x00000000_0B000000_0A000000_0F111111; 2]);
792/// ```
793/// * **Intrinsic:** [`_mm256_bsrli_epi128`]
794/// * **Assembly:** `vpsrldq ymm, ymm, imm8`
795#[must_use]
796#[inline(always)]
797#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
798pub fn byte_shr_imm_u128_m256i<const IMM: i32>(a: m256i) -> m256i {
799 m256i(unsafe { _mm256_bsrli_epi128(a.0, IMM) })
800}
801
802/// Compare `i8` lanes for equality, mask output.
803/// ```
804/// # use safe_arch::*;
805/// assert_eq!(
806/// <[i8; 32]>::from(cmp_eq_mask_i8_m256i(m256i::from([1_i8; 32]), m256i::from([1_i8; 32]))),
807/// [-1_i8; 32]
808/// );
809/// assert_eq!(
810/// <[i8; 32]>::from(cmp_eq_mask_i8_m256i(m256i::from([5_i8; 32]), m256i::from([6_i8; 32]))),
811/// [0_i8; 32]
812/// );
813/// ```
814/// * **Intrinsic:** [`_mm256_cmpeq_epi8`]
815/// * **Assembly:** `vpcmpeqb ymm, ymm, ymm`
816#[must_use]
817#[inline(always)]
818#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
819pub fn cmp_eq_mask_i8_m256i(a: m256i, b: m256i) -> m256i {
820 m256i(unsafe { _mm256_cmpeq_epi8(a.0, b.0) })
821}
822
823/// Compare `i16` lanes for equality, mask output.
824/// ```
825/// # use safe_arch::*;
826/// assert_eq!(
827/// <[i16; 16]>::from(cmp_eq_mask_i16_m256i(m256i::from([1_i16; 16]), m256i::from([1_i16; 16]))),
828/// [-1_i16; 16]
829/// );
830/// assert_eq!(
831/// <[i16; 16]>::from(cmp_eq_mask_i16_m256i(m256i::from([5_i16; 16]), m256i::from([6_i16; 16]))),
832/// [0_i16; 16]
833/// );
834/// ```
835/// * **Intrinsic:** [`_mm256_cmpeq_epi16`]
836/// * **Assembly:** `vpcmpeqw ymm, ymm, ymm`
837#[must_use]
838#[inline(always)]
839#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
840pub fn cmp_eq_mask_i16_m256i(a: m256i, b: m256i) -> m256i {
841 m256i(unsafe { _mm256_cmpeq_epi16(a.0, b.0) })
842}
843
844/// Compare `i32` lanes for equality, mask output.
845/// ```
846/// # use safe_arch::*;
847/// assert_eq!(
848/// <[i32; 8]>::from(cmp_eq_mask_i32_m256i(m256i::from([1_i32; 8]), m256i::from([1_i32; 8]))),
849/// [-1_i32; 8]
850/// );
851/// assert_eq!(
852/// <[i32; 8]>::from(cmp_eq_mask_i32_m256i(m256i::from([5_i32; 8]), m256i::from([6_i32; 8]))),
853/// [0_i32; 8]
854/// );
855/// ```
856/// * **Intrinsic:** [`_mm256_cmpeq_epi32`]
857/// * **Assembly:** `vpcmpeqd ymm, ymm, ymm`
858#[must_use]
859#[inline(always)]
860#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
861pub fn cmp_eq_mask_i32_m256i(a: m256i, b: m256i) -> m256i {
862 m256i(unsafe { _mm256_cmpeq_epi32(a.0, b.0) })
863}
864
865/// Compare `i64` lanes for equality, mask output.
866/// ```
867/// # use safe_arch::*;
868/// assert_eq!(
869/// <[i64; 4]>::from(cmp_eq_mask_i64_m256i(m256i::from([1_i64; 4]), m256i::from([1_i64; 4]))),
870/// [-1_i64; 4]
871/// );
872/// assert_eq!(
873/// <[i64; 4]>::from(cmp_eq_mask_i64_m256i(m256i::from([5_i64; 4]), m256i::from([6_i64; 4]))),
874/// [0_i64; 4]
875/// );
876/// ```
877/// * **Intrinsic:** [`_mm256_cmpeq_epi64`]
878/// * **Assembly:** `vpcmpeqq ymm, ymm, ymm`
879#[must_use]
880#[inline(always)]
881#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
882pub fn cmp_eq_mask_i64_m256i(a: m256i, b: m256i) -> m256i {
883 m256i(unsafe { _mm256_cmpeq_epi64(a.0, b.0) })
884}
885
886/// Compare `i8` lanes for `a > b`, mask output.
887/// ```
888/// # use safe_arch::*;
889/// assert_eq!(
890/// <[i8; 32]>::from(cmp_gt_mask_i8_m256i(m256i::from([1_i8; 32]), m256i::from([0_i8; 32]))),
891/// [-1_i8; 32]
892/// );
893/// assert_eq!(
894/// <[i8; 32]>::from(cmp_gt_mask_i8_m256i(m256i::from([5_i8; 32]), m256i::from([5_i8; 32]))),
895/// [0_i8; 32]
896/// );
897/// ```
898/// * **Intrinsic:** [`_mm256_cmpgt_epi8`]
899/// * **Assembly:** `vpcmpgtb ymm, ymm, ymm`
900#[must_use]
901#[inline(always)]
902#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
903pub fn cmp_gt_mask_i8_m256i(a: m256i, b: m256i) -> m256i {
904 m256i(unsafe { _mm256_cmpgt_epi8(a.0, b.0) })
905}
906
907/// Compare `i16` lanes for `a > b`, mask output.
908/// ```
909/// # use safe_arch::*;
910/// assert_eq!(
911/// <[i16; 16]>::from(cmp_gt_mask_i16_m256i(m256i::from([1_i16; 16]), m256i::from([0_i16; 16]))),
912/// [-1_i16; 16]
913/// );
914/// assert_eq!(
915/// <[i16; 16]>::from(cmp_gt_mask_i16_m256i(m256i::from([5_i16; 16]), m256i::from([5_i16; 16]))),
916/// [0_i16; 16]
917/// );
918/// ```
919/// * **Intrinsic:** [`_mm256_cmpgt_epi16`]
920/// * **Assembly:** `vpcmpgtw ymm, ymm, ymm`
921#[must_use]
922#[inline(always)]
923#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
924pub fn cmp_gt_mask_i16_m256i(a: m256i, b: m256i) -> m256i {
925 m256i(unsafe { _mm256_cmpgt_epi16(a.0, b.0) })
926}
927
928/// Compare `i32` lanes for `a > b`, mask output.
929/// ```
930/// # use safe_arch::*;
931/// assert_eq!(
932/// <[i32; 8]>::from(cmp_gt_mask_i32_m256i(m256i::from([1_i32; 8]), m256i::from([0_i32; 8]))),
933/// [-1_i32; 8]
934/// );
935/// assert_eq!(
936/// <[i32; 8]>::from(cmp_gt_mask_i32_m256i(m256i::from([5_i32; 8]), m256i::from([5_i32; 8]))),
937/// [0_i32; 8]
938/// );
939/// ```
940/// * **Intrinsic:** [`_mm256_cmpgt_epi32`]
941/// * **Assembly:** `vpcmpgtd ymm, ymm, ymm`
942#[must_use]
943#[inline(always)]
944#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
945pub fn cmp_gt_mask_i32_m256i(a: m256i, b: m256i) -> m256i {
946 m256i(unsafe { _mm256_cmpgt_epi32(a.0, b.0) })
947}
948
949/// Compare `i64` lanes for `a > b`, mask output.
950/// ```
951/// # use safe_arch::*;
952/// assert_eq!(
953/// <[i64; 4]>::from(cmp_gt_mask_i64_m256i(m256i::from([1_i64; 4]), m256i::from([0_i64; 4]))),
954/// [-1_i64; 4]
955/// );
956/// assert_eq!(
957/// <[i64; 4]>::from(cmp_gt_mask_i64_m256i(m256i::from([5_i64; 4]), m256i::from([5_i64; 4]))),
958/// [0_i64; 4]
959/// );
960/// ```
961/// * **Intrinsic:** [`_mm256_cmpgt_epi64`]
962/// * **Assembly:** `vpcmpgtq ymm, ymm, ymm`
963#[must_use]
964#[inline(always)]
965#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
966pub fn cmp_gt_mask_i64_m256i(a: m256i, b: m256i) -> m256i {
967 m256i(unsafe { _mm256_cmpgt_epi64(a.0, b.0) })
968}
969
970/// Convert `i16` values to `i32` values.
971/// ```
972/// # use safe_arch::*;
973/// let a = m128i::from([-5_i16; 8]);
974/// let b: [i32; 8] = convert_to_i32_m256i_from_i16_m128i(a).into();
975/// assert_eq!(b, [-5_i32; 8]);
976/// ```
977/// * **Intrinsic:** [`_mm256_cvtepi16_epi32`]
978/// * **Assembly:** `vpmovsxwd ymm, xmm`
979#[must_use]
980#[inline(always)]
981#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
982pub fn convert_to_i32_m256i_from_i16_m128i(a: m128i) -> m256i {
983 m256i(unsafe { _mm256_cvtepi16_epi32(a.0) })
984}
985
986/// Convert `i16` values to `i64` values.
987/// ```
988/// # use safe_arch::*;
989/// let a = m128i::from([-5_i16; 8]);
990/// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_i16_m128i(a).into();
991/// assert_eq!(b, [-5_i64; 4]);
992/// ```
993/// * **Intrinsic:** [`_mm256_cvtepi16_epi64`]
994/// * **Assembly:** `vpmovsxwq ymm, xmm`
995#[must_use]
996#[inline(always)]
997#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
998pub fn convert_to_i64_m256i_from_lower4_i16_m128i(a: m128i) -> m256i {
999 m256i(unsafe { _mm256_cvtepi16_epi64(a.0) })
1000}
1001
1002/// Convert `i32` values to `i64` values.
1003/// ```
1004/// # use safe_arch::*;
1005/// let a = m128i::from([-5_i32; 4]);
1006/// let b: [i64; 4] = convert_to_i64_m256i_from_i32_m128i(a).into();
1007/// assert_eq!(b, [-5_i64; 4]);
1008/// ```
1009/// * **Intrinsic:** [`_mm256_cvtepi32_epi64`]
1010/// * **Assembly:** `vpmovsxdq ymm, xmm`
1011#[must_use]
1012#[inline(always)]
1013#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1014pub fn convert_to_i64_m256i_from_i32_m128i(a: m128i) -> m256i {
1015 m256i(unsafe { _mm256_cvtepi32_epi64(a.0) })
1016}
1017
1018/// Convert `i8` values to `i16` values.
1019/// ```
1020/// # use safe_arch::*;
1021/// let a = m128i::from([-5_i8; 16]);
1022/// let b: [i16; 16] = convert_to_i16_m256i_from_i8_m128i(a).into();
1023/// assert_eq!(b, [-5_i16; 16]);
1024/// ```
1025/// * **Intrinsic:** [`_mm256_cvtepi8_epi16`]
1026/// * **Assembly:** `vpmovsxbw ymm, xmm`
1027#[must_use]
1028#[inline(always)]
1029#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1030pub fn convert_to_i16_m256i_from_i8_m128i(a: m128i) -> m256i {
1031 m256i(unsafe { _mm256_cvtepi8_epi16(a.0) })
1032}
1033
1034/// Convert the lower 8 `i8` values to `i32` values.
1035/// ```
1036/// # use safe_arch::*;
1037/// let a = m128i::from([-5_i8; 16]);
1038/// let b: [i32; 8] = convert_to_i32_m256i_from_lower8_i8_m128i(a).into();
1039/// assert_eq!(b, [-5_i32; 8]);
1040/// ```
1041/// * **Intrinsic:** [`_mm256_cvtepi8_epi32`]
1042/// * **Assembly:** `vpmovsxbd ymm, xmm`
1043#[must_use]
1044#[inline(always)]
1045#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1046pub fn convert_to_i32_m256i_from_lower8_i8_m128i(a: m128i) -> m256i {
1047 m256i(unsafe { _mm256_cvtepi8_epi32(a.0) })
1048}
1049
1050/// Convert the lower 4 `i8` values to `i64` values.
1051/// ```
1052/// # use safe_arch::*;
1053/// let a = m128i::from([-5_i8; 16]);
1054/// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_i8_m128i(a).into();
1055/// assert_eq!(b, [-5_i64; 4]);
1056/// ```
1057/// * **Intrinsic:** [`_mm256_cvtepi8_epi64`]
1058/// * **Assembly:** `vpmovsxbq ymm, xmm`
1059#[must_use]
1060#[inline(always)]
1061#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1062pub fn convert_to_i64_m256i_from_lower4_i8_m128i(a: m128i) -> m256i {
1063 m256i(unsafe { _mm256_cvtepi8_epi64(a.0) })
1064}
1065
1066/// Convert `u16` values to `i32` values.
1067/// ```
1068/// # use safe_arch::*;
1069/// let a = m128i::from([5_u16; 8]);
1070/// let b: [i32; 8] = convert_to_i32_m256i_from_u16_m128i(a).into();
1071/// assert_eq!(b, [5_i32; 8]);
1072/// ```
1073/// * **Intrinsic:** [`_mm256_cvtepu16_epi32`]
1074/// * **Assembly:** `vpmovzxwd ymm, xmm`
1075#[must_use]
1076#[inline(always)]
1077#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1078pub fn convert_to_i32_m256i_from_u16_m128i(a: m128i) -> m256i {
1079 m256i(unsafe { _mm256_cvtepu16_epi32(a.0) })
1080}
1081
1082/// Convert `u16` values to `i64` values.
1083/// ```
1084/// # use safe_arch::*;
1085/// let a = m128i::from([5_u16; 8]);
1086/// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_u16_m128i(a).into();
1087/// assert_eq!(b, [5_i64; 4]);
1088/// ```
1089/// * **Intrinsic:** [`_mm256_cvtepu16_epi64`]
1090/// * **Assembly:** `vpmovzxwq ymm, xmm`
1091#[must_use]
1092#[inline(always)]
1093#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1094pub fn convert_to_i64_m256i_from_lower4_u16_m128i(a: m128i) -> m256i {
1095 m256i(unsafe { _mm256_cvtepu16_epi64(a.0) })
1096}
1097
1098/// Convert `u32` values to `i64` values.
1099/// ```
1100/// # use safe_arch::*;
1101/// let a = m128i::from([5_u32; 4]);
1102/// let b: [i64; 4] = convert_to_i64_m256i_from_u32_m128i(a).into();
1103/// assert_eq!(b, [5_i64; 4]);
1104/// ```
1105/// * **Intrinsic:** [`_mm256_cvtepu32_epi64`]
1106/// * **Assembly:** `vpmovzxdq ymm, xmm`
1107#[must_use]
1108#[inline(always)]
1109#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1110pub fn convert_to_i64_m256i_from_u32_m128i(a: m128i) -> m256i {
1111 m256i(unsafe { _mm256_cvtepu32_epi64(a.0) })
1112}
1113
1114/// Convert `u8` values to `i16` values.
1115/// ```
1116/// # use safe_arch::*;
1117/// let a = m128i::from([5_u8; 16]);
1118/// let b: [i16; 16] = convert_to_i16_m256i_from_u8_m128i(a).into();
1119/// assert_eq!(b, [5_i16; 16]);
1120/// ```
1121/// * **Intrinsic:** [`_mm256_cvtepu8_epi16`]
1122/// * **Assembly:** `vpmovzxbw ymm, xmm`
1123#[must_use]
1124#[inline(always)]
1125#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1126pub fn convert_to_i16_m256i_from_u8_m128i(a: m128i) -> m256i {
1127 m256i(unsafe { _mm256_cvtepu8_epi16(a.0) })
1128}
1129
/// Convert lower 8 `u8` values to `i32` values.
///
/// NOTE(review): despite the `i16` in this function's name, the output lanes
/// are `i32` (zero-extended), as shown by the intrinsic used and the example
/// below. The name is kept for backward compatibility.
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([5_u8; 16]);
/// let b: [i32; 8] = convert_to_i16_m256i_from_lower8_u8_m128i(a).into();
/// assert_eq!(b, [5_i32; 8]);
/// ```
/// * **Intrinsic:** [`_mm256_cvtepu8_epi32`]
/// * **Assembly:** `vpmovzxbd ymm, xmm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn convert_to_i16_m256i_from_lower8_u8_m128i(a: m128i) -> m256i {
  // SAFETY: lanewise zero extension of plain integer data; `avx2` is
  // statically enabled for this module.
  m256i(unsafe { _mm256_cvtepu8_epi32(a.0) })
}
1145
/// Convert lower 4 `u8` values to `i64` values.
///
/// NOTE(review): despite the `i16` in this function's name, the output lanes
/// are `i64` (zero-extended), as shown by the intrinsic used and the example
/// below. The name is kept for backward compatibility.
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([5_u8; 16]);
/// let b: [i64; 4] = convert_to_i16_m256i_from_lower4_u8_m128i(a).into();
/// assert_eq!(b, [5_i64; 4]);
/// ```
/// * **Intrinsic:** [`_mm256_cvtepu8_epi64`]
/// * **Assembly:** `vpmovzxbq ymm, xmm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn convert_to_i16_m256i_from_lower4_u8_m128i(a: m128i) -> m256i {
  // SAFETY: lanewise zero extension of plain integer data; `avx2` is
  // statically enabled for this module.
  m256i(unsafe { _mm256_cvtepu8_epi64(a.0) })
}
1161
/// Gets an `i16` value out of an `m256i`, returns as `i32`.
///
/// The lane to get must be a constant in the range `0..16`.
///
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([0xA_i16, 0xB, 0xC, 0xD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
/// //
/// assert_eq!(extract_i16_as_i32_m256i::<0>(a), 0xA);
/// assert_eq!(extract_i16_as_i32_m256i::<1>(a), 0xB);
/// ```
/// * **Intrinsic:** [`_mm256_extract_epi16`]
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn extract_i16_as_i32_m256i<const LANE: i32>(a: m256i) -> i32 {
  // SAFETY: pure lane extraction from plain integer data; `avx2` is
  // statically enabled for this module.
  unsafe { _mm256_extract_epi16(a.0, LANE) }
}
1180
/// Gets an `i8` value out of an `m256i`, returns as `i32`.
///
/// The lane to get must be a constant in the range `0..32`.
///
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0xA_i8, 0xB, 0xC, 0xD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///   0, 0, 0, 0,
/// ]);
/// //
/// assert_eq!(extract_i8_as_i32_m256i::<0>(a), 0xA);
/// assert_eq!(extract_i8_as_i32_m256i::<1>(a), 0xB);
/// ```
/// * **Intrinsic:** [`_mm256_extract_epi8`]
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn extract_i8_as_i32_m256i<const LANE: i32>(a: m256i) -> i32 {
  // SAFETY: pure lane extraction from plain integer data; `avx2` is
  // statically enabled for this module.
  unsafe { _mm256_extract_epi8(a.0, LANE) }
}
1202
/// Gets an `m128i` value out of an `m256i`.
///
/// The lane to get must be a constant 0 or 1.
///
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([5_u128, 6_u128]);
/// //
/// assert_eq!(extract_m128i_m256i::<0>(a), m128i::from(5_u128));
/// assert_eq!(extract_m128i_m256i::<1>(a), m128i::from(6_u128));
/// ```
/// * **Intrinsic:** [`_mm256_extracti128_si256`]
/// * **Assembly:** `vextracti128 xmm, ymm, imm8`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn extract_m128i_m256i<const LANE: i32>(a: m256i) -> m128i {
  // SAFETY: pure half-register extraction from plain integer data; `avx2` is
  // statically enabled for this module.
  m128i(unsafe { _mm256_extracti128_si256(a.0, LANE) })
}
1222
1223/// Horizontal `a + b` with lanes as `i16`.
1224///
1225/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
1226/// b.high
1227/// ```
1228/// # use safe_arch::*;
1229/// let a = m256i::from([5_i16; 16]);
1230/// let b = m256i::from([6_i16; 16]);
1231/// let c: [i16; 16] = add_horizontal_i16_m256i(a, b).into();
1232/// assert_eq!(c, [10_i16, 10, 10, 10, 12, 12, 12, 12, 10, 10, 10, 10, 12, 12, 12, 12]);
1233/// ```
1234/// * **Intrinsic:** [`_mm256_hadd_epi16`]
1235/// * **Assembly:** `vphaddw ymm, ymm, ymm`
1236#[must_use]
1237#[inline(always)]
1238#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1239pub fn add_horizontal_i16_m256i(a: m256i, b: m256i) -> m256i {
1240 m256i(unsafe { _mm256_hadd_epi16(a.0, b.0) })
1241}
1242
/// Horizontal saturating `a + b` with lanes as `i16`.
///
/// * Each output lane is the saturating sum of one adjacent pair of input
/// lanes.
/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
/// b.high
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([i16::MAX; 16]);
/// let b = m256i::from([i16::MIN; 16]);
/// let c: [i16; 16] = add_horizontal_saturating_i16_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     i16::MAX, i16::MAX, i16::MAX, i16::MAX,
///     i16::MIN, i16::MIN, i16::MIN, i16::MIN,
///     i16::MAX, i16::MAX, i16::MAX, i16::MAX,
///     i16::MIN, i16::MIN, i16::MIN, i16::MIN,
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_hadds_epi16`]
/// * **Assembly:** `vphaddsw ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
#[rustfmt::skip]
pub fn add_horizontal_saturating_i16_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: pure arithmetic on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_hadds_epi16(a.0, b.0) })
}
1271
1272/// Horizontal `a + b` with lanes as `i32`.
1273///
1274/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
1275/// b.high
1276/// ```
1277/// # use safe_arch::*;
1278/// let a = m256i::from([5_i32; 8]);
1279/// let b = m256i::from([6_i32; 8]);
1280/// let c: [i32; 8] = add_horizontal_i32_m256i(a, b).into();
1281/// assert_eq!(c, [10, 10, 12, 12, 10, 10, 12, 12]);
1282/// ```
1283/// * **Intrinsic:** [`_mm256_hadd_epi32`]
1284/// * **Assembly:** `vphaddd ymm, ymm, ymm`
1285#[must_use]
1286#[inline(always)]
1287#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1288pub fn add_horizontal_i32_m256i(a: m256i, b: m256i) -> m256i {
1289 m256i(unsafe { _mm256_hadd_epi32(a.0, b.0) })
1290}
1291
/// Horizontal `a - b` with lanes as `i16`.
///
/// * Each output lane is (even lane) minus (odd lane) of one adjacent pair of
/// input lanes.
/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
/// b.high
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
/// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]);
/// let c: [i16; 16] = sub_horizontal_i16_m256i(a, b).into();
/// assert_eq!(c, [-1_i16, -3, 1, 1, -1000, 6, -1, -1, -25, 35, 1, 1, 1, 1, -420, -855]);
/// ```
/// * **Intrinsic:** [`_mm256_hsub_epi16`]
/// * **Assembly:** `vphsubw ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn sub_horizontal_i16_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: pure arithmetic on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_hsub_epi16(a.0, b.0) })
}
1311
1312/// Horizontal `a - b` with lanes as `i32`.
1313///
1314/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
1315/// b.high
1316/// ```
1317/// # use safe_arch::*;
1318/// let a = m256i::from([5, 6, 2, 5, 4, 3, 1, 0]);
1319/// let b = m256i::from([-12, 13, 56, 21, 8, 7, 6, 5]);
1320/// let c: [i32; 8] = sub_horizontal_i32_m256i(a, b).into();
1321/// assert_eq!(c, [-1, -3, -25, 35, 1, 1, 1, 1]);
1322/// ```
1323/// * **Intrinsic:** [`_mm256_hsub_epi32`]
1324/// * **Assembly:** `vphsubd ymm, ymm, ymm`
1325#[must_use]
1326#[inline(always)]
1327#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1328pub fn sub_horizontal_i32_m256i(a: m256i, b: m256i) -> m256i {
1329 m256i(unsafe { _mm256_hsub_epi32(a.0, b.0) })
1330}
1331
/// Horizontal saturating `a - b` with lanes as `i16`.
///
/// * Each output lane is the saturating difference (even lane minus odd lane)
/// of one adjacent pair of input lanes.
/// * The results are interleaved 128-bits at a time: a.low, b.low, a.high,
/// b.high
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN,
///   i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN,
/// ]);
/// let b = m256i::from([
///   i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX,
///   i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX,
/// ]);
/// let c: [i16; 16] = sub_horizontal_saturating_i16_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     i16::MAX, i16::MAX, i16::MAX, i16::MAX,
///     i16::MIN, i16::MIN, i16::MIN, i16::MIN,
///     i16::MAX, i16::MAX, i16::MAX, i16::MAX,
///     i16::MIN, i16::MIN, i16::MIN, i16::MIN,
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_hsubs_epi16`]
/// * **Assembly:** `vphsubsw ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
#[rustfmt::skip]
pub fn sub_horizontal_saturating_i16_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: pure arithmetic on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_hsubs_epi16(a.0, b.0) })
}
1360
1361/// Multiply `i16` lanes producing `i32` values, horizontal add pairs of `i32`
1362/// values to produce the final output.
1363/// ```
1364/// # use safe_arch::*;
1365/// let a = m256i::from([1_i16, 2, 3, 4, -1, -2, -3, -4, 12, 13, -14, -15, 100, 200, 300, -400]);
1366/// let b = m256i::from([5_i16, 6, 7, 8, -15, -26, -37, 48, 50, 60, 70, -80, 90, 100, 12, -80]);
1367/// let c: [i32; 8] = mul_i16_horizontal_add_m256i(a, b).into();
1368/// assert_eq!(c, [17, 53, 67, -81, 1380, 220, 29000, 35600]);
1369/// ```
1370/// * **Intrinsic:** [`_mm256_madd_epi16`]
1371/// * **Assembly:** `vpmaddwd ymm, ymm, ymm`
1372#[must_use]
1373#[inline(always)]
1374#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1375pub fn mul_i16_horizontal_add_m256i(a: m256i, b: m256i) -> m256i {
1376 m256i(unsafe { _mm256_madd_epi16(a.0, b.0) })
1377}
1378
/// This is dumb and weird.
///
/// * Vertically multiplies each `u8` lane from `a` with an `i8` lane from `b`,
/// producing an `i16` intermediate value.
/// * These intermediate `i16` values are horizontally added with saturation.
///
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   255_u8, 255, 0, 0, 255, 255, 1, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
///   18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// ]);
/// let b = m256i::from([
///   127_i8, 127, 0, 0, -127, -127, 1, 1, 24, 25, 26, 27, 28, 29, 30, 31, 16,
///   17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// ]);
/// let c: [i16; 16] = mul_u8i8_add_horizontal_saturating_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [i16::MAX, 0, i16::MIN, 2, 417, 557, 713, 885,
///   545, 685, 841, 1013, 1201, 1405, 1625, 1861]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_maddubs_epi16`]
/// * **Assembly:** `vpmaddubsw ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
#[rustfmt::skip]
pub fn mul_u8i8_add_horizontal_saturating_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: pure arithmetic on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_maddubs_epi16(a.0, b.0) })
}
1411
/// Loads the reference given and zeroes any `i32` lanes not in the mask.
///
/// * A lane is "in" the mask if that lane's mask value is set in the high bit
/// (aka "if the lane's value is negative").
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([5_i32; 8]);
/// let b = load_masked_i32_m256i(&a, m256i::from([-1_i32, 0, 0, -1, -1, -1, 0, 0]));
/// assert_eq!(<[i32; 8]>::from(b), [5, 0, 0, 5, 5, 5, 0, 0]);
/// ```
/// * **Intrinsic:** [`_mm256_maskload_epi32`]
/// * **Assembly:** `vpmaskmovd ymm, ymm, m256`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn load_masked_i32_m256i(a: &m256i, mask: m256i) -> m256i {
  // SAFETY: `a` is a live `&m256i`, so all 32 bytes behind the pointer are
  // valid to read; the cast only reinterprets them as eight `i32` lanes.
  m256i(unsafe { _mm256_maskload_epi32(a as *const m256i as *const i32, mask.0) })
}
1430
/// Loads the reference given and zeroes any `i64` lanes not in the mask.
///
/// * A lane is "in" the mask if that lane's mask value is set in the high bit
/// (aka "if the lane's value is negative").
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([5_i64; 4]);
/// let b = load_masked_i64_m256i(&a, m256i::from([0_i64, -1, -1, 0]));
/// assert_eq!(<[i64; 4]>::from(b), [0_i64, 5, 5, 0]);
/// ```
/// * **Intrinsic:** [`_mm256_maskload_epi64`]
/// * **Assembly:** `vpmaskmovq ymm, ymm, m256`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn load_masked_i64_m256i(a: &m256i, mask: m256i) -> m256i {
  // SAFETY: `a` is a live `&m256i`, so all 32 bytes behind the pointer are
  // valid to read; the cast only reinterprets them as four `i64` lanes.
  m256i(unsafe { _mm256_maskload_epi64(a as *const m256i as *const i64, mask.0) })
}
1449
/// Stores the `i32` masked lanes given to the reference.
///
/// * A lane is "in" the mask if that lane's mask value is set in the high bit
/// (aka "if the lane's value is negative").
/// * Lanes not in the mask are not modified.
/// ```
/// # use safe_arch::*;
/// let mut a = m256i::default();
/// store_masked_i32_m256i(
///   &mut a,
///   m256i::from([-1_i32, 0, 0, -1, -1, -1, 0, 0]),
///   m256i::from([5_i32; 8]),
/// );
/// assert_eq!(<[i32; 8]>::from(a), [5, 0, 0, 5, 5, 5, 0, 0]);
/// ```
/// * **Intrinsic:** [`_mm256_maskstore_epi32`]
/// * **Assembly:** `vpmaskmovd m256, ymm, ymm`
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn store_masked_i32_m256i(addr: &mut m256i, mask: m256i, a: m256i) {
  // SAFETY: `addr` is a live `&mut m256i`, so all 32 bytes behind the pointer
  // are valid to write and exclusively borrowed for the duration of the call.
  unsafe { _mm256_maskstore_epi32(addr as *mut m256i as *mut i32, mask.0, a.0) };
}
1472
/// Stores the `i64` masked lanes given to the reference.
///
/// * A lane is "in" the mask if that lane's mask value is set in the high bit
/// (aka "if the lane's value is negative").
/// * Lanes not in the mask are not modified.
/// ```
/// # use safe_arch::*;
/// let mut a = m256i::default();
/// store_masked_i64_m256i(&mut a, m256i::from([0_i64, -1, -1, 0]), m256i::from([5_i64; 4]));
/// assert_eq!(<[i64; 4]>::from(a), [0, 5, 5, 0]);
/// ```
/// * **Intrinsic:** [`_mm256_maskstore_epi64`]
/// * **Assembly:** `vpmaskmovq m256, ymm, ymm`
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn store_masked_i64_m256i(addr: &mut m256i, mask: m256i, a: m256i) {
  // SAFETY: `addr` is a live `&mut m256i`, so all 32 bytes behind the pointer
  // are valid to write and exclusively borrowed for the duration of the call.
  unsafe { _mm256_maskstore_epi64(addr as *mut m256i as *mut i64, mask.0, a.0) };
}
1491
1492/// Inserts an `m128i` to an `m256i` at the high or low position.
1493///
1494/// ```
1495/// # use safe_arch::*;
1496/// let a = m256i::from([0_i32; 8]);
1497/// let b: [i32; 8] = insert_m128i_to_m256i::<1>(a, m128i::from([1, 2, 3, 4])).into();
1498/// assert_eq!(b, [0, 0, 0, 0, 1, 2, 3, 4]);
1499/// ```
1500/// * **Intrinsic:** [`_mm256_inserti128_si256`]
1501/// * **Assembly:** `vinserti128 ymm, ymm, xmm, imm8`
1502#[must_use]
1503#[inline(always)]
1504#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1505pub fn insert_m128i_to_m256i<const LANE: i32>(a: m256i, b: m128i) -> m256i {
1506 m256i(unsafe { _mm256_inserti128_si256(a.0, b.0, LANE) })
1507}
1508
/// Lanewise `max(a, b)` with lanes as `i8`.
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27,
///   28, 29, 30, 31, 32,
/// ]);
/// let b = m256i::from([
///   0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4,
///   -8, 12, 13, 14, 29, 30, -31, -32,
/// ]);
/// let c: [i8; 32] = max_i8_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     0, 11, 2, 3, 4, 15, 6, 7, 8, 19, 10, 21, 22, 13, 24, 127, 1, 3, 5, 7, 5, 3, 5, 12, 13, 16,
///     27, 28, 29, 30, 31, 32
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_max_epi8`]
/// * **Assembly:** `vpmaxsb ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn max_i8_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: lanewise signed max on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_max_epi8(a.0, b.0) })
}
1537
1538/// Lanewise `max(a, b)` with lanes as `i16`.
1539/// ```
1540/// # use safe_arch::*;
1541/// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]);
1542/// let b = m256i::from([0_i16, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, -24, 25]);
1543/// let c: [i16; 16] = max_i16_m256i(a, b).into();
1544/// assert_eq!(c, [0, 11, 2, 3, 4, 15, 6, 7, 8, 19, 10, 21, 22, 13, 14, 127]);
1545/// ```
1546/// * **Intrinsic:** [`_mm256_max_epi16`]
1547/// * **Assembly:** `vpmaxsw ymm, ymm, ymm`
1548#[must_use]
1549#[inline(always)]
1550#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1551pub fn max_i16_m256i(a: m256i, b: m256i) -> m256i {
1552 m256i(unsafe { _mm256_max_epi16(a.0, b.0) })
1553}
1554
1555/// Lanewise `max(a, b)` with lanes as `i32`.
1556/// ```
1557/// # use safe_arch::*;
1558/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
1559/// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]);
1560/// let c: [i32; 8] = max_i32_m256i(a, b).into();
1561/// assert_eq!(c, [0, 11, 2, 3, 4, 15, 6, 7]);
1562/// ```
1563/// * **Intrinsic:** [`_mm256_max_epi32`]
1564/// * **Assembly:** `vpmaxsd ymm, ymm, ymm`
1565#[must_use]
1566#[inline(always)]
1567#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1568pub fn max_i32_m256i(a: m256i, b: m256i) -> m256i {
1569 m256i(unsafe { _mm256_max_epi32(a.0, b.0) })
1570}
1571
/// Lanewise `max(a, b)` with lanes as `u8`.
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27,
///   28, 29, 30, 31, 32,
/// ]);
/// let b = m256i::from([
///   0_u8, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 1, 3, 4, 5, 1, 2, 4, 8, 12,
///   13, 14, 29, 30, 31, 32,
/// ]);
/// let c: [u8; 32] = max_u8_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     0, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 1, 3, 5, 7, 5, 3, 5, 12, 13, 16,
///     27, 28, 29, 30, 31, 32
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_max_epu8`]
/// * **Assembly:** `vpmaxub ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn max_u8_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: lanewise unsigned max on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_max_epu8(a.0, b.0) })
}
1600
1601/// Lanewise `max(a, b)` with lanes as `u16`.
1602/// ```
1603/// # use safe_arch::*;
1604/// let a = m256i::from([0_u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]);
1605/// let b = m256i::from([0_u16, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 25]);
1606/// let c: [u16; 16] = max_u16_m256i(a, b).into();
1607/// assert_eq!(c, [0, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]);
1608/// ```
1609/// * **Intrinsic:** [`_mm256_max_epu16`]
1610/// * **Assembly:** `vpmaxuw ymm, ymm, ymm`
1611#[must_use]
1612#[inline(always)]
1613#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1614pub fn max_u16_m256i(a: m256i, b: m256i) -> m256i {
1615 m256i(unsafe { _mm256_max_epu16(a.0, b.0) })
1616}
1617
1618/// Lanewise `max(a, b)` with lanes as `u32`.
1619/// ```
1620/// # use safe_arch::*;
1621/// let a = m256i::from([0_u32, 1, 2, 3, 4, 5, 6, 7]);
1622/// let b = m256i::from([0_u32, 11, 2, 13, 4, 15, 6, 17]);
1623/// let c: [u32; 8] = max_u32_m256i(a, b).into();
1624/// assert_eq!(c, [0, 11, 2, 13, 4, 15, 6, 17]);
1625/// ```
1626/// * **Intrinsic:** [`_mm256_max_epu32`]
1627/// * **Assembly:** `vpmaxud ymm, ymm, ymm`
1628#[must_use]
1629#[inline(always)]
1630#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1631pub fn max_u32_m256i(a: m256i, b: m256i) -> m256i {
1632 m256i(unsafe { _mm256_max_epu32(a.0, b.0) })
1633}
1634
/// Lanewise `min(a, b)` with lanes as `i8`.
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27,
///   28, 29, 30, 31, 32,
/// ]);
/// let b = m256i::from([
///   0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4,
///   -8, 12, 13, 14, 29, 30, -31, -32,
/// ]);
/// let c: [i8; 32] = min_i8_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     0, 1, 2, -13, 4, 5, 6, -17, -8, 9, -20, 11, 12, -23, 14, 127, 0, -1, 3, 4, 2, 1, -2, -4, -8,
///     12, 13, 14, 29, 30, -31, -32
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_min_epi8`]
/// * **Assembly:** `vpminsb ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn min_i8_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: lanewise signed min on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_min_epi8(a.0, b.0) })
}
1663
1664/// Lanewise `min(a, b)` with lanes as `i16`.
1665/// ```
1666/// # use safe_arch::*;
1667/// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]);
1668/// let b = m256i::from([0_i16, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, -24, 25]);
1669/// let c: [i16; 16] = min_i16_m256i(a, b).into();
1670/// assert_eq!(c, [0, 1, 2, -13, 4, 5, 6, -17, -8, 9, -20, 11, 12, -23, -24, 25]);
1671/// ```
1672/// * **Intrinsic:** [`_mm256_min_epi16`]
1673/// * **Assembly:** `vpminsw ymm, ymm, ymm`
1674#[must_use]
1675#[inline(always)]
1676#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1677pub fn min_i16_m256i(a: m256i, b: m256i) -> m256i {
1678 m256i(unsafe { _mm256_min_epi16(a.0, b.0) })
1679}
1680
1681/// Lanewise `min(a, b)` with lanes as `i32`.
1682/// ```
1683/// # use safe_arch::*;
1684/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
1685/// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]);
1686/// let c: [i32; 8] = min_i32_m256i(a, b).into();
1687/// assert_eq!(c, [0, 1, 2, -13, 4, 5, 6, -17]);
1688/// ```
1689/// * **Intrinsic:** [`_mm256_min_epi32`]
1690/// * **Assembly:** `vpminsd ymm, ymm, ymm`
1691#[must_use]
1692#[inline(always)]
1693#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1694pub fn min_i32_m256i(a: m256i, b: m256i) -> m256i {
1695 m256i(unsafe { _mm256_min_epi32(a.0, b.0) })
1696}
1697
/// Lanewise `min(a, b)` with lanes as `u8`.
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27,
///   28, 29, 30, 31, 32,
/// ]);
/// let b = m256i::from([
///   0_u8, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 1, 3, 4, 5, 1, 2, 4, 8, 12,
///   13, 14, 29, 30, 31, 32,
/// ]);
/// let c: [u8; 32] = min_u8_m256i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 0, 1, 3, 4, 2, 1, 2, 4, 8, 12, 13, 14,
///     29, 30, 31, 32
///   ]
/// );
/// ```
/// * **Intrinsic:** [`_mm256_min_epu8`]
/// * **Assembly:** `vpminub ymm, ymm, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn min_u8_m256i(a: m256i, b: m256i) -> m256i {
  // SAFETY: lanewise unsigned min on plain integer data; `avx2` is statically
  // enabled for this module.
  m256i(unsafe { _mm256_min_epu8(a.0, b.0) })
}
1726
1727/// Lanewise `min(a, b)` with lanes as `u16`.
1728/// ```
1729/// # use safe_arch::*;
1730/// let a = m256i::from([0_u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]);
1731/// let b = m256i::from([0_u16, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 25]);
1732/// let c: [u16; 16] = min_u16_m256i(a, b).into();
1733/// assert_eq!(c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 25]);
1734/// ```
1735/// * **Intrinsic:** [`_mm256_min_epu16`]
1736/// * **Assembly:** `vpminuw ymm, ymm, ymm`
1737#[must_use]
1738#[inline(always)]
1739#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1740pub fn min_u16_m256i(a: m256i, b: m256i) -> m256i {
1741 m256i(unsafe { _mm256_min_epu16(a.0, b.0) })
1742}
1743
1744/// Lanewise `min(a, b)` with lanes as `u32`.
1745/// ```
1746/// # use safe_arch::*;
1747/// let a = m256i::from([0_u32, 1, 2, 3, 4, 5, 6, 7]);
1748/// let b = m256i::from([0_u32, 11, 2, 13, 4, 15, 6, 17]);
1749/// let c: [u32; 8] = min_u32_m256i(a, b).into();
1750/// assert_eq!(c, [0, 1, 2, 3, 4, 5, 6, 7]);
1751/// ```
1752/// * **Intrinsic:** [`_mm256_min_epu32`]
1753/// * **Assembly:** `vpminud ymm, ymm, ymm`
1754#[must_use]
1755#[inline(always)]
1756#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1757pub fn min_u32_m256i(a: m256i, b: m256i) -> m256i {
1758 m256i(unsafe { _mm256_min_epu32(a.0, b.0) })
1759}
1760
/// Create an `i32` mask of each sign bit in the `i8` lanes.
///
/// Bit `N` of the output is the sign (high) bit of input lane `N`, so lane 0
/// maps to the least significant bit.
/// ```
/// # use safe_arch::*;
/// let a = m256i::from([
///   0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4,
///   -8, 12, 13, 14, 29, 30, -31, 32,
/// ]);
/// assert_eq!(0b01000001110000100010010110001000, move_mask_i8_m256i(a));
/// ```
/// * **Intrinsic:** [`_mm256_movemask_epi8`]
/// * **Assembly:** `vpmovmskb r32, ymm`
#[must_use]
#[inline(always)]
#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
pub fn move_mask_i8_m256i(a: m256i) -> i32 {
  // SAFETY: pure bit gathering from plain integer data; `avx2` is statically
  // enabled for this module.
  unsafe { _mm256_movemask_epi8(a.0) }
}
1778
1779/// Computes eight `u16` "sum of absolute difference" values according to the
1780/// bytes selected.
1781///
1782/// * This essentially works like two [`multi_packed_sum_abs_diff_u8_m128i`]
1783/// uses happening at once, the "low" portion works on the lower 128 bits, and
1784/// the "high" portion works on the upper 128 bits.
1785///
1786/// ```
1787/// # use safe_arch::*;
1788/// let a = m256i::from([5_u8; 32]);
1789/// let b = m256i::from([7_u8; 32]);
1790/// //
1791/// let c: [u16; 16] = multi_packed_sum_abs_diff_u8_m256i::<0b101000>(a, b).into();
1792/// assert_eq!(c, [8_u16; 16]);
1793/// ```
1794/// * **Intrinsic:** [`_mm256_mpsadbw_epu8`]
1795/// * **Assembly:** ``
1796#[must_use]
1797#[inline(always)]
1798#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1799pub fn multi_packed_sum_abs_diff_u8_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i {
1800 m256i(unsafe { _mm256_mpsadbw_epu8(a.0, b.0, IMM) })
1801}
1802
1803/// Multiply the lower `i32` within each `i64` lane, `i64` output.
1804/// ```
1805/// # use safe_arch::*;
1806/// let a = m256i::from([1_i64, 2, 3, 4]);
1807/// let b = m256i::from([5_i64, 6, 7, -8]);
1808/// let c: [i64; 4] = mul_i64_low_bits_m256i(a, b).into();
1809/// assert_eq!(c, [5_i64, 12, 21, -32]);
1810/// ```
1811/// * **Intrinsic:** [`_mm256_mul_epi32`]
1812/// * **Assembly:** `vpmuldq ymm, ymm, ymm`
1813#[must_use]
1814#[inline(always)]
1815#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1816pub fn mul_i64_low_bits_m256i(a: m256i, b: m256i) -> m256i {
1817 m256i(unsafe { _mm256_mul_epi32(a.0, b.0) })
1818}
1819
1820/// Multiply the lower `u32` within each `u64` lane, `u64` output.
1821/// ```
1822/// # use safe_arch::*;
1823/// let a = m256i::from([1_u64, 2, 3, 4]);
1824/// let b = m256i::from([5_u64, 6, 7, 8]);
1825/// let c: [u64; 4] = mul_u64_low_bits_m256i(a, b).into();
1826/// assert_eq!(c, [5_u64, 12, 21, 32]);
1827/// ```
1828/// * **Intrinsic:** [`_mm256_mul_epu32`]
1829/// * **Assembly:** `vpmuludq ymm, ymm, ymm`
1830#[must_use]
1831#[inline(always)]
1832#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1833pub fn mul_u64_low_bits_m256i(a: m256i, b: m256i) -> m256i {
1834 m256i(unsafe { _mm256_mul_epu32(a.0, b.0) })
1835}
1836
1837/// Multiply the `i16` lanes and keep the high half of each 32-bit output.
1838/// ```
1839/// # use safe_arch::*;
1840/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
1841/// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]);
1842/// let c: [i16; 16] = mul_i16_keep_high_m256i(a, b).into();
1843/// assert_eq!(c, [0_i16, 1, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0]);
1844/// ```
1845/// * **Intrinsic:** [`_mm256_mulhi_epi16`]
1846/// * **Assembly:** `vpmulhw ymm, ymm, ymm`
1847#[must_use]
1848#[inline(always)]
1849#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1850pub fn mul_i16_keep_high_m256i(a: m256i, b: m256i) -> m256i {
1851 m256i(unsafe { _mm256_mulhi_epi16(a.0, b.0) })
1852}
1853
1854/// Multiply the `u16` lanes and keep the high half of each 32-bit output.
1855/// ```
1856/// # use safe_arch::*;
1857/// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12000, 13, 56, 21, 8, 7, 6, 5]);
1858/// let b = m256i::from([12000_u16, 13000, 2000, 800, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]);
1859/// let c: [u16; 16] = mul_u16_keep_high_m256i(a, b).into();
1860/// assert_eq!(c, [0_u16, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]);
1861/// ```
1862/// * **Intrinsic:** [`_mm256_mulhi_epu16`]
1863/// * **Assembly:** `vpmulhuw ymm, ymm, ymm`
1864#[must_use]
1865#[inline(always)]
1866#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1867pub fn mul_u16_keep_high_m256i(a: m256i, b: m256i) -> m256i {
1868 m256i(unsafe { _mm256_mulhi_epu16(a.0, b.0) })
1869}
1870
1871/// Multiply `i16` lanes into `i32` intermediates, keep the high 18 bits, round
1872/// by adding 1, right shift by 1.
1873/// ```
1874/// # use safe_arch::*;
1875/// let a = m256i::from([
1876/// 0_i16, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1877/// ]);
1878/// let b = m256i::from([
1879/// 800_i16, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200,
1880/// 2300,
1881/// ]);
1882/// let c: [i16; 16] = mul_i16_scale_round_m256i(a, b).into();
1883/// assert_eq!(c, [0_i16, 3, 6, 10, 15, 20, 26, 32, 39, 47, 55, 64, 73, 83, 94, 105]);
1884/// ```
1885/// * **Intrinsic:** [`_mm256_mulhrs_epi16`]
1886/// * **Assembly:** `vpmulhrsw ymm, ymm, ymm`
1887#[must_use]
1888#[inline(always)]
1889#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1890pub fn mul_i16_scale_round_m256i(a: m256i, b: m256i) -> m256i {
1891 m256i(unsafe { _mm256_mulhrs_epi16(a.0, b.0) })
1892}
1893
1894/// Multiply the `i16` lanes and keep the low half of each 32-bit output.
1895/// ```
1896/// # use safe_arch::*;
1897/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
1898/// let b = m256i::from([-1_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]);
1899/// let c: [i16; 16] = mul_i16_keep_low_m256i(a, b).into();
1900/// assert_eq!(c, [-5, 12464, -4, -40, 0, 3, 2, 0, -96, 91, 336, 105, 1872, 4578, 738, 4890]);
1901/// ```
1902/// * **Intrinsic:** [`_mm256_mullo_epi16`]
1903/// * **Assembly:** `vpmullw ymm, ymm, ymm`
1904#[must_use]
1905#[inline(always)]
1906#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1907pub fn mul_i16_keep_low_m256i(a: m256i, b: m256i) -> m256i {
1908 m256i(unsafe { _mm256_mullo_epi16(a.0, b.0) })
1909}
1910
1911/// Multiply the `i32` lanes and keep the low half of each 64-bit output.
1912/// ```
1913/// # use safe_arch::*;
1914/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
1915/// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]);
1916/// let c: [i32; 8] = mul_i32_keep_low_m256i(a, b).into();
1917/// assert_eq!(c, [0, 11, 4, -39, 16, 75, 36, -119]);
1918/// ```
1919/// * **Intrinsic:** [`_mm256_mullo_epi32`]
1920/// * **Assembly:** `vpmulld ymm, ymm, ymm`
1921#[must_use]
1922#[inline(always)]
1923#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1924pub fn mul_i32_keep_low_m256i(a: m256i, b: m256i) -> m256i {
1925 m256i(unsafe { _mm256_mullo_epi32(a.0, b.0) })
1926}
1927
1928/// Bitwise `a | b`
1929/// ```
1930/// # use safe_arch::*;
1931/// let a = m256i::from([0_i64, 0, 1, 1]);
1932/// let b = m256i::from([0_i64, 1, 0, 1]);
1933/// let c: [i64; 4] = bitor_m256i(a, b).into();
1934/// assert_eq!(c, [0_i64, 1, 1, 1]);
1935/// ```
1936/// * **Intrinsic:** [`_mm256_or_si256`]
1937/// * **Assembly:** `vpor ymm, ymm, ymm`
1938#[must_use]
1939#[inline(always)]
1940#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1941pub fn bitor_m256i(a: m256i, b: m256i) -> m256i {
1942 m256i(unsafe { _mm256_or_si256(a.0, b.0) })
1943}
1944
1945/// Saturating convert `i16` to `i8`, and pack the values.
1946///
1947/// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`,
1948/// `b_high`
1949/// ```
1950/// # use safe_arch::*;
1951/// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1952/// let b = m256i::from([17_i16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]);
1953/// let c: [i8; 32] = pack_i16_to_i8_m256i(a, b).into();
1954/// assert_eq!(
1955/// c,
1956/// [
1957/// 1_i8, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, 25,
1958/// 26, 27, 28, 29, 30, 31, 32
1959/// ]
1960/// );
1961/// ```
1962/// * **Intrinsic:** [`_mm256_packs_epi16`]
1963/// * **Assembly:** `vpacksswb ymm, ymm, ymm`
1964#[must_use]
1965#[inline(always)]
1966#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1967pub fn pack_i16_to_i8_m256i(a: m256i, b: m256i) -> m256i {
1968 m256i(unsafe { _mm256_packs_epi16(a.0, b.0) })
1969}
1970
1971/// Saturating convert `i32` to `i16`, and pack the values.
1972///
1973/// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`,
1974/// `b_high`
1975/// ```
1976/// # use safe_arch::*;
1977/// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]);
1978/// let b = m256i::from([9_i32, 10, 11, 12, 13, 14, 15, 16]);
1979/// let c: [i16; 16] = pack_i32_to_i16_m256i(a, b).into();
1980/// assert_eq!(c, [1_i16, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16]);
1981/// ```
1982/// * **Intrinsic:** [`_mm256_packs_epi32`]
1983/// * **Assembly:** `vpackssdw ymm, ymm, ymm`
1984#[must_use]
1985#[inline(always)]
1986#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
1987pub fn pack_i32_to_i16_m256i(a: m256i, b: m256i) -> m256i {
1988 m256i(unsafe { _mm256_packs_epi32(a.0, b.0) })
1989}
1990
1991/// Saturating convert `i16` to `u8`, and pack the values.
1992///
1993/// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`,
1994/// `b_high`
1995/// ```
1996/// # use safe_arch::*;
1997/// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1998/// let b = m256i::from([17_i16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]);
1999/// let c: [u8; 32] = pack_i16_to_u8_m256i(a, b).into();
2000/// assert_eq!(
2001/// c,
2002/// [
2003/// 1_u8, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, 25,
2004/// 26, 27, 28, 29, 30, 31, 32
2005/// ]
2006/// );
2007/// ```
2008/// * **Intrinsic:** [`_mm256_packus_epi16`]
2009/// * **Assembly:** `vpackuswb ymm, ymm, ymm`
2010#[must_use]
2011#[inline(always)]
2012#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2013pub fn pack_i16_to_u8_m256i(a: m256i, b: m256i) -> m256i {
2014 m256i(unsafe { _mm256_packus_epi16(a.0, b.0) })
2015}
2016
2017/// Saturating convert `i32` to `u16`, and pack the values.
2018///
2019/// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`,
2020/// `b_high`
2021/// ```
2022/// # use safe_arch::*;
2023/// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]);
2024/// let b = m256i::from([9_i32, 10, 11, 12, 13, 14, 15, 16]);
2025/// let c: [u16; 16] = pack_i32_to_u16_m256i(a, b).into();
2026/// assert_eq!(c, [1_u16, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16]);
2027/// ```
2028/// * **Intrinsic:** [`_mm256_packus_epi32`]
2029/// * **Assembly:** `vpackusdw ymm, ymm, ymm`
2030#[must_use]
2031#[inline(always)]
2032#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2033pub fn pack_i32_to_u16_m256i(a: m256i, b: m256i) -> m256i {
2034 m256i(unsafe { _mm256_packs_epi32(a.0, b.0) })
2035}
2036
2037/// Shuffle 128 bits of integer data from `$a` and `$b` using an immediate
2038/// control value.
2039///
2040/// You can pass `A_Low`, `A_High`, `B_Low`, `B_High`, or `Zeroed`.
2041/// ```
2042/// # use safe_arch::*;
2043/// let a = m256i::from([1, 2, 3, 4, 5, 6, 7, 8]);
2044/// let b = m256i::from([9, 10, 11, 12, 13, 14, 15, 16]);
2045/// //
2046/// let c: [i32; 8] = shuffle_abi_i128z_all_m256i::<0b_1000_0010>(a, b).into();
2047/// assert_eq!(c, [9, 10, 11, 12, 0, 0, 0, 0]);
2048/// //
2049/// let c: [i32; 8] = shuffle_abi_i128z_all_m256i::<0b_0001_1000>(a, b).into();
2050/// assert_eq!(c, [0, 0, 0, 0, 5, 6, 7, 8]);
2051/// ```
2052/// * **Intrinsic:** [`_mm256_permute2x128_si256`]
2053/// * **Assembly:** `vperm2i128 ymm, ymm, ymm, imm8`
2054#[must_use]
2055#[inline(always)]
2056#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2057pub fn shuffle_abi_i128z_all_m256i<const MASK: i32>(a: m256i, b: m256i) -> m256i {
2058 m256i(unsafe { _mm256_permute2x128_si256(a.0, b.0, MASK) })
2059}
2060
2061/// Shuffle the `f64` lanes in `$a` using an immediate control value.
2062/// ```
2063/// # use safe_arch::*;
2064/// let a = m256i::from([5_i64, 6, 7, 8]);
2065/// let b: [i64; 4] = shuffle_ai_i64_all_m256i::<0b00_01_10_11>(a).into();
2066/// assert_eq!(b, [8_i64, 7, 6, 5]);
2067/// ```
2068/// * **Intrinsic:** [`_mm256_permute4x64_epi64`]
2069/// * **Assembly:** `vpermq ymm, ymm, imm8`
2070#[must_use]
2071#[inline(always)]
2072#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2073pub fn shuffle_ai_i64_all_m256i<const IMM: i32>(a: m256i) -> m256i {
2074 m256i(unsafe { _mm256_permute4x64_epi64(a.0, IMM) })
2075}
2076
2077/// Shuffle the `f64` lanes from `$a` using an immediate control value.
2078/// ```
2079/// # use safe_arch::*;
2080/// let a = m256d::from_array([5.0, 6.0, 7.0, 8.0]);
2081/// let b: [f64; 4] = shuffle_ai_f64_all_m256d::<0b00_01_10_11>(a).to_array();
2082/// assert_eq!(b, [8.0, 7.0, 6.0, 5.0]);
2083/// ```
2084/// * **Intrinsic:** [`_mm256_permute4x64_pd`]
2085/// * **Assembly:** `vpermpd ymm, ymm, imm8`
2086#[must_use]
2087#[inline(always)]
2088#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2089pub fn shuffle_ai_f64_all_m256d<const IMM: i32>(a: m256d) -> m256d {
2090 m256d(unsafe { _mm256_permute4x64_pd(a.0, IMM) })
2091}
2092
2093/// Shuffle `i32` lanes in `a` using `i32` values in `v`.
2094/// ```
2095/// # use safe_arch::*;
2096/// let a = m256i::from([8, 9, 10, 11, 12, 13, 14, 15]);
2097/// let v = m256i::from([7, 6, 5, 5, 3, 2, 2, 0]);
2098/// let c: [i32; 8] = shuffle_av_i32_all_m256i(a, v).into();
2099/// assert_eq!(c, [15, 14, 13, 13, 11, 10, 10, 8]);
2100/// ```
2101/// * **Intrinsic:** [`_mm256_permutevar8x32_epi32`]
2102/// * **Assembly:** `vpermd ymm, ymm, ymm`
2103#[must_use]
2104#[inline(always)]
2105#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2106pub fn shuffle_av_i32_all_m256i(a: m256i, v: m256i) -> m256i {
2107 m256i(unsafe { _mm256_permutevar8x32_epi32(a.0, v.0) })
2108}
2109
2110/// Shuffle `f32` lanes in `a` using `i32` values in `v`.
2111/// ```
2112/// # use safe_arch::*;
2113/// let a = m256::from_array([8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]);
2114/// let v = m256i::from([7, 6, 5, 5, 3, 2, 2, 0]);
2115/// let c: [f32; 8] = shuffle_av_f32_all_m256(a, v).to_array();
2116/// assert_eq!(c, [15.0, 14.0, 13.0, 13.0, 11.0, 10.0, 10.0, 8.0]);
2117/// ```
2118/// * **Intrinsic:** [`_mm256_permutevar8x32_ps`]
2119/// * **Assembly:** `vpermps ymm, ymm, ymm`
2120#[must_use]
2121#[inline(always)]
2122#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2123pub fn shuffle_av_f32_all_m256(a: m256, v: m256i) -> m256 {
2124 m256(unsafe { _mm256_permutevar8x32_ps(a.0, v.0) })
2125}
2126
2127/// Compute "sum of `u8` absolute differences".
2128///
2129/// * `u8` lanewise `abs(a - b)`, producing `u8` intermediate values.
2130/// * Sum the first eight and second eight values.
2131/// * Place into the low 16 bits of four `u64` lanes.
2132/// ```
2133/// # use safe_arch::*;
2134/// let a = m256i::from([
2135/// 0_u8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 11, 2, 13, 4, 15, 6, 17, 8,
2136/// 19, 20, 21, 22, 23, 24, 127,
2137/// ]);
2138/// let b = m256i::from([
2139/// 20_u8, 110, 250, 103, 34, 105, 60, 217, 8, 19, 210, 201, 202, 203, 204, 127, 2, 3, 4, 5, 6, 7,
2140/// 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
2141/// ]);
2142/// let c: [u64; 4] = sum_of_u8_abs_diff_m256i(a, b).into();
2143/// assert_eq!(c, [831_u64, 910, 40, 160]);
2144/// ```
2145#[must_use]
2146#[inline(always)]
2147#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2148pub fn sum_of_u8_abs_diff_m256i(a: m256i, b: m256i) -> m256i {
2149 m256i(unsafe { _mm256_sad_epu8(a.0, b.0) })
2150}
2151
2152/// Shuffle the `i32` lanes in `a` using an immediate control value.
2153///
2154/// Each lane selection value picks only within that 128-bit half of the overall
2155/// register.
2156/// ```
2157/// # use safe_arch::*;
2158/// let a = m256i::from([5, 6, 7, 8, 9, 10, 11, 12]);
2159/// let b: [i32; 8] = shuffle_ai_i32_half_m256i::<0b00_01_10_11>(a).into();
2160/// assert_eq!(b, [8, 7, 6, 5, 12, 11, 10, 9]);
2161/// ```
2162/// * **Intrinsic:** [`_mm256_shuffle_epi32`]
2163/// * **Assembly:** `vpshufd ymm, ymm, imm8`
2164#[must_use]
2165#[inline(always)]
2166#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2167pub fn shuffle_ai_i32_half_m256i<const IMM: i32>(a: m256i) -> m256i {
2168 m256i(unsafe { _mm256_shuffle_epi32(a.0, IMM) })
2169}
2170
2171/// Shuffle `i8` lanes in `a` using `i8` values in `v`.
2172///
2173/// Each lane selection value picks only within that 128-bit half of the overall
2174/// register.
2175///
2176/// If a lane in `v` is negative, that output is zeroed.
2177/// ```
2178/// # use safe_arch::*;
2179/// let a = m256i::from([
2180/// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8,
2181/// 19, 20, 21, 22, 23, 24, 127,
2182/// ]);
2183/// let b = m256i::from([
2184/// -1_i8, 1, 0, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 12, 11, 10, 9,
2185/// 8, 7, 6, 5, 4,
2186/// ]);
2187/// let c: [i8; 32] = shuffle_av_i8z_half_m256i(a, b).into();
2188/// assert_eq!(
2189/// c,
2190/// [
2191/// 0, 11, 3, 2, 2, 13, 4, 15, 6, 6, 17, 8, 8, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 22, 21,
2192/// 20, 19, 8, 17, 6, 15, 4
2193/// ]
2194/// );
2195/// ```
2196/// * **Intrinsic:** [`_mm256_shuffle_epi8`]
2197/// * **Assembly:** `vpshufb ymm, ymm, ymm`
2198#[must_use]
2199#[inline(always)]
2200#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2201pub fn shuffle_av_i8z_half_m256i(a: m256i, v: m256i) -> m256i {
2202 m256i(unsafe { _mm256_shuffle_epi8(a.0, v.0) })
2203}
2204
2205/// Shuffle the high `i16` lanes in `$a` using an immediate control value.
2206///
2207/// The lower 128 bits and upper 128 bits have this performed separately.
2208/// ```
2209/// # use safe_arch::*;
2210/// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
2211/// let b: [i16; 16] = shuffle_ai_i16_h64half_m256i::<0b_00_01_10_11>(a).into();
2212/// assert_eq!(b, [0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12]);
2213/// ```
2214/// * **Intrinsic:** [`_mm256_shufflehi_epi16`]
2215/// * **Assembly:** `vpshufhw ymm, ymm, imm8`
2216#[must_use]
2217#[inline(always)]
2218#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2219pub fn shuffle_ai_i16_h64half_m256i<const IMM: i32>(a: m256i) -> m256i {
2220 m256i(unsafe { _mm256_shufflehi_epi16(a.0, IMM) })
2221}
2222
2223/// Shuffle the low `i16` lanes in `$a` using an immediate control value.
2224///
2225/// The lower 128 bits and upper 128 bits have this performed separately.
2226/// ```
2227/// # use safe_arch::*;
2228/// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
2229/// let b: [i16; 16] = shuffle_ai_i16_l64half_m256i::<0b00_01_10_11>(a).into();
2230/// assert_eq!(b, [3, 2, 1, 0, 4, 5, 6, 7, 11, 10, 9, 8, 12, 13, 14, 15]);
2231/// ```
2232/// * **Intrinsic:** [`_mm256_shufflelo_epi16`]
2233/// * **Assembly:** `vpshuflw ymm, ymm, imm8`
2234#[must_use]
2235#[inline(always)]
2236#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2237pub fn shuffle_ai_i16_l64half_m256i<const IMM: i32>(a: m256i) -> m256i {
2238 m256i(unsafe { _mm256_shufflelo_epi16(a.0, IMM) })
2239}
2240
2241/// Lanewise `a * signum(b)` with lanes as `i8`
2242///
2243/// * If `b` is positive, the output is `a`.
2244/// * If `b` is zero, the output is 0.
2245/// * If `b` is negative, the output is `-a`.
2246/// ```
2247/// # use safe_arch::*;
2248/// let a = m256i::from([
2249/// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8,
2250/// 19, 20, 21, 22, 23, 24, 127,
2251/// ]);
2252/// let b = m256i::from([
2253/// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11,
2254/// -10, 9, 8, 7, 6, 5, -4,
2255/// ]);
2256/// let c: [i8; 32] = sign_apply_i8_m256i(a, b).into();
2257/// assert_eq!(
2258/// c,
2259/// [
2260/// -3, -11, 0, 13, 4, 15, 0, 17, 8, 19, -20, 21, 22, 0, 0, 127, 7, -11, 2, 13, 4, 15, 6, 17, 8,
2261/// -19, 20, 21, 22, 23, 24, -127
2262/// ]
2263/// );
2264/// ```
2265/// * **Intrinsic:** [`_mm256_sign_epi8`]
2266/// * **Assembly:** `vpsignb ymm, ymm, ymm`
2267#[must_use]
2268#[inline(always)]
2269#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2270pub fn sign_apply_i8_m256i(a: m256i, b: m256i) -> m256i {
2271 m256i(unsafe { _mm256_sign_epi8(a.0, b.0) })
2272}
2273
2274/// Lanewise `a * signum(b)` with lanes as `i16`
2275///
2276/// * If `b` is positive, the output is `a`.
2277/// * If `b` is zero, the output is 0.
2278/// * If `b` is negative, the output is `-a`.
2279/// ```
2280/// # use safe_arch::*;
2281/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
2282/// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]);
2283/// let c: [i16; 16] = sign_apply_i16_m256i(a, b).into();
2284/// assert_eq!(c, [5, 6, -2, -5, 0, 3, 1, 0, 12, -13, 56, 21, 0, 0, 0, 5]);
2285/// ```
2286/// * **Intrinsic:** [`_mm256_sign_epi16`]
2287/// * **Assembly:** `vpsignw ymm, ymm, ymm`
2288#[must_use]
2289#[inline(always)]
2290#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2291pub fn sign_apply_i16_m256i(a: m256i, b: m256i) -> m256i {
2292 m256i(unsafe { _mm256_sign_epi16(a.0, b.0) })
2293}
2294
2295/// Lanewise `a * signum(b)` with lanes as `i32`
2296///
2297/// * If `b` is positive, the output is `a`.
2298/// * If `b` is zero, the output is 0.
2299/// * If `b` is negative, the output is `-a`.
2300/// ```
2301/// # use safe_arch::*;
2302/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
2303/// let b = m256i::from([0_i32, 0, -2, -13, 4, 15, 6, -17]);
2304/// let c: [i32; 8] = sign_apply_i32_m256i(a, b).into();
2305/// assert_eq!(c, [0_i32, 0, -2, -3, 4, 5, 6, -7]);
2306/// ```
2307/// * **Intrinsic:** [`_mm256_sign_epi32`]
2308/// * **Assembly:** `vpsignd ymm, ymm, ymm`
2309#[must_use]
2310#[inline(always)]
2311#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2312pub fn sign_apply_i32_m256i(a: m256i, b: m256i) -> m256i {
2313 m256i(unsafe { _mm256_sign_epi32(a.0, b.0) })
2314}
2315
2316/// Lanewise `u16` shift left by the lower `u64` lane of `count`.
2317/// ```
2318/// # use safe_arch::*;
2319/// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12, 13, 56, 21, 8, 7, 6, 5]);
2320/// let count = m128i::from(1_u128);
2321/// let b: [u16; 16] = shl_all_u16_m256i(a, count).into();
2322/// assert_eq!(b, [10, 12, 4, 10, 8, 6, 2, 0, 24, 26, 112, 42, 16, 14, 12, 10]);
2323/// ```
2324/// * **Intrinsic:** [`_mm256_sll_epi16`]
2325/// * **Assembly:** `vpsllw ymm, ymm, xmm`
2326#[must_use]
2327#[inline(always)]
2328#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2329pub fn shl_all_u16_m256i(a: m256i, count: m128i) -> m256i {
2330 m256i(unsafe { _mm256_sll_epi16(a.0, count.0) })
2331}
2332
2333/// Shift all `u32` lanes left by the lower `u64` lane of `count`.
2334/// ```
2335/// # use safe_arch::*;
2336/// let a = m256i::from([0_u32, 1, 2, 13, 4, 15, 6, 17]);
2337/// let count = m128i::from(1_u128);
2338/// let b: [u32; 8] = shl_all_u32_m256i(a, count).into();
2339/// assert_eq!(b, [0, 2, 4, 26, 8, 30, 12, 34]);
2340/// ```
2341/// * **Intrinsic:** [`_mm256_sll_epi32`]
2342/// * **Assembly:** `vpslld ymm, ymm, xmm`
2343#[must_use]
2344#[inline(always)]
2345#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2346pub fn shl_all_u32_m256i(a: m256i, count: m128i) -> m256i {
2347 m256i(unsafe { _mm256_sll_epi32(a.0, count.0) })
2348}
2349
2350/// Shift all `u64` lanes left by the lower `u64` lane of `count`.
2351/// ```
2352/// # use safe_arch::*;
2353/// let a = m256i::from([0_u64, 1, 2, 13]);
2354/// let count = m128i::from(1_u128);
2355/// let b: [u64; 4] = shl_all_u64_m256i(a, count).into();
2356/// assert_eq!(b, [0, 2, 4, 26]);
2357/// ```
2358/// * **Intrinsic:** [`_mm256_sll_epi64`]
2359/// * **Assembly:** `vpsllq ymm, ymm, xmm`
2360#[must_use]
2361#[inline(always)]
2362#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2363pub fn shl_all_u64_m256i(a: m256i, count: m128i) -> m256i {
2364 m256i(unsafe { _mm256_sll_epi64(a.0, count.0) })
2365}
2366
2367/// Shifts all `u16` lanes left by an immediate.
2368///
2369/// ```
2370/// # use safe_arch::*;
2371/// let a = m256i::from([1_u16, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]);
2372/// let c: [u16; 16] = shl_imm_u16_m256i::<1>(a).into();
2373/// assert_eq!(c, [2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8]);
2374/// ```
2375/// * **Intrinsic:** [`_mm256_slli_epi16`]
2376/// * **Assembly:** `vpsllw ymm, ymm, imm8`
2377#[must_use]
2378#[inline(always)]
2379#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2380pub fn shl_imm_u16_m256i<const IMM: i32>(a: m256i) -> m256i {
2381 m256i(unsafe { _mm256_slli_epi16(a.0, IMM) })
2382}
2383
2384/// Shifts all `u32` lanes left by an immediate.
2385///
2386/// ```
2387/// # use safe_arch::*;
2388/// let a = m256i::from([1_u32, 2, 3, 4, 1, 2, 3, 4]);
2389/// let c: [u32; 8] = shl_imm_u32_m256i::<1>(a).into();
2390/// assert_eq!(c, [1_u32 << 1, 2 << 1, 3 << 1, 4 << 1, 1 << 1, 2 << 1, 3 << 1, 4 << 1]);
2391/// ```
2392/// * **Intrinsic:** [`_mm256_slli_epi32`]
2393/// * **Assembly:** `vpslld ymm, ymm, imm8`
2394#[must_use]
2395#[inline(always)]
2396#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2397pub fn shl_imm_u32_m256i<const IMM: i32>(a: m256i) -> m256i {
2398 m256i(unsafe { _mm256_slli_epi32(a.0, IMM) })
2399}
2400
2401/// Shifts all `u64` lanes left by an immediate.
2402///
2403/// ```
2404/// # use safe_arch::*;
2405/// let a = m256i::from([1_u64, 2, 3, 4]);
2406/// let c: [u64; 4] = shl_imm_u64_m256i::<1>(a).into();
2407/// assert_eq!(c, [1_u64 << 1, 2 << 1, 3 << 1, 4 << 1,]);
2408/// ```
2409/// * **Intrinsic:** [`_mm256_slli_epi64`]
2410/// * **Assembly:** `vpsllq ymm, ymm, imm8`
2411#[must_use]
2412#[inline(always)]
2413#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2414pub fn shl_imm_u64_m256i<const IMM: i32>(a: m256i) -> m256i {
2415 m256i(unsafe { _mm256_slli_epi64(a.0, IMM) })
2416}
2417
2418/// Lanewise `u32` shift left by the matching `i32` lane in `count`.
2419/// ```
2420/// # use safe_arch::*;
2421/// let a = m256i::from([0_u32, 1, 2, 13, 5, 6, 7, 1]);
2422/// let count = m256i::from([1_u32, 2, 3, 4, 5, 6, 7, 1]);
2423/// let b: [u32; 8] = shl_each_u32_m256i(a, count).into();
2424/// assert_eq!(b, [0, 4, 16, 208, 160, 384, 896, 2]);
2425/// ```
2426/// * **Intrinsic:** [`_mm256_sllv_epi32`]
2427/// * **Assembly:** `vpsllvd ymm, ymm, ymm`
2428#[must_use]
2429#[inline(always)]
2430#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2431pub fn shl_each_u32_m256i(a: m256i, count: m256i) -> m256i {
2432 m256i(unsafe { _mm256_sllv_epi32(a.0, count.0) })
2433}
2434
2435/// Lanewise `u64` shift left by the matching `u64` lane in `count`.
2436/// ```
2437/// # use safe_arch::*;
2438/// let a = m256i::from([0_u64, 1, 2, 13]);
2439/// let count = m256i::from([1_u64, 2, 3, 4]);
2440/// let b: [u64; 4] = shl_each_u64_m256i(a, count).into();
2441/// assert_eq!(b, [0, 4, 16, 208]);
2442/// ```
2443/// * **Intrinsic:** [`_mm256_sllv_epi64`]
2444/// * **Assembly:** `vpsllvq ymm, ymm, ymm`
2445#[must_use]
2446#[inline(always)]
2447#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2448pub fn shl_each_u64_m256i(a: m256i, count: m256i) -> m256i {
2449 m256i(unsafe { _mm256_sllv_epi64(a.0, count.0) })
2450}
2451
2452/// Lanewise `i16` shift right by the lower `i64` lane of `count`.
2453/// ```
2454/// # use safe_arch::*;
2455/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
2456/// let count = m128i::from(1_i128);
2457/// let b: [i16; 16] = shr_all_i16_m256i(a, count).into();
2458/// assert_eq!(b, [2, 3, 1, 2, 2, 1, 0, 0, -6, 6, 28, 10, 4, 3, 3, 2]);
2459/// ```
2460/// * **Intrinsic:** [`_mm256_sra_epi16`]
2461/// * **Assembly:** `vpsraw ymm, ymm, xmm`
2462#[must_use]
2463#[inline(always)]
2464#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2465pub fn shr_all_i16_m256i(a: m256i, count: m128i) -> m256i {
2466 m256i(unsafe { _mm256_sra_epi16(a.0, count.0) })
2467}
2468
2469/// Lanewise `i32` shift right by the lower `i64` lane of `count`.
2470/// ```
2471/// # use safe_arch::*;
2472/// let a = m256i::from([0_i32, 1, -2, -13, 4, 15, 6, -17]);
2473/// let count = m128i::from(1_i128);
2474/// let b: [i32; 8] = shr_all_i32_m256i(a, count).into();
2475/// assert_eq!(b, [0, 0, -1, -7, 2, 7, 3, -9]);
2476/// ```
2477/// * **Intrinsic:** [`_mm256_sra_epi32`]
2478/// * **Assembly:** `vpsrad ymm, ymm, xmm`
2479#[must_use]
2480#[inline(always)]
2481#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2482pub fn shr_all_i32_m256i(a: m256i, count: m128i) -> m256i {
2483 m256i(unsafe { _mm256_sra_epi32(a.0, count.0) })
2484}
2485
2486/// Shifts all `i16` lanes left by an immediate.
2487///
2488/// ```
2489/// # use safe_arch::*;
2490/// let a = m256i::from([1_i16, 2, 3, 4, -1, -2, -3, -4, 1, 2, 3, 4, -1, -2, -3, -4]);
2491/// let c: [i16; 16] = shr_imm_i16_m256i::<1>(a).into();
2492/// assert_eq!(c, [0_i16, 1, 1, 2, -1, -1, -2, -2, 0, 1, 1, 2, -1, -1, -2, -2]);
2493/// ```
2494/// * **Intrinsic:** [`_mm256_srai_epi16`]
2495/// * **Assembly:** `vpsraw ymm, ymm, imm8`
2496#[must_use]
2497#[inline(always)]
2498#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2499pub fn shr_imm_i16_m256i<const IMM: i32>(a: m256i) -> m256i {
2500 m256i(unsafe { _mm256_srai_epi16(a.0, IMM) })
2501}
2502
2503/// Shifts all `i32` lanes left by an immediate.
2504///
2505/// ```
2506/// # use safe_arch::*;
2507/// let a = m256i::from([1_i32, 2, 3, 4, -1, -2, -3, -4]);
2508/// let c: [i32; 8] = shr_imm_i32_m256i::<1>(a).into();
2509/// assert_eq!(c, [0, 1, 1, 2, -1, -1, -2, -2]);
2510/// ```
2511/// * **Intrinsic:** [`_mm256_srai_epi32`]
2512/// * **Assembly:** `vpsrad ymm, ymm, imm8`
2513#[must_use]
2514#[inline(always)]
2515#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2516pub fn shr_imm_i32_m256i<const IMM: i32>(a: m256i) -> m256i {
2517 m256i(unsafe { _mm256_srai_epi32(a.0, IMM) })
2518}
2519
2520/// Lanewise `i32` shift right by the matching `i32` lane in `count`.
2521/// ```
2522/// # use safe_arch::*;
2523/// let a = m256i::from([0_i32, 1111, -2999, -13888, 5444, 6222, 7333, -11111]);
2524/// let count = m256i::from([1_i32, 2, 3, 4, 5, 4, 3, 2]);
2525/// let b: [i32; 8] = shr_each_i32_m256i(a, count).into();
2526/// assert_eq!(b, [0, 277, -375, -868, 170, 388, 916, -2778]);
2527/// ```
2528/// * **Intrinsic:** [`_mm256_srav_epi32`]
2529/// * **Assembly:** `vpsravd ymm, ymm, ymm`
2530#[must_use]
2531#[inline(always)]
2532#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2533pub fn shr_each_i32_m256i(a: m256i, count: m256i) -> m256i {
2534 m256i(unsafe { _mm256_srav_epi32(a.0, count.0) })
2535}
2536
2537/// Lanewise `u16` shift right by the lower `u64` lane of `count`.
2538/// ```
2539/// # use safe_arch::*;
2540/// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12, 13, 56, 21, 8, 7, 6, 5]);
2541/// let count = m128i::from(1_u128);
2542/// let b: [u16; 16] = shr_all_u16_m256i(a, count).into();
2543/// assert_eq!(b, [2, 3, 1, 2, 2, 1, 0, 0, 6, 6, 28, 10, 4, 3, 3, 2]);
2544/// ```
2545/// * **Intrinsic:** [`_mm256_srl_epi16`]
2546/// * **Assembly:** `vpsrlw ymm, ymm, xmm`
2547#[must_use]
2548#[inline(always)]
2549#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2550pub fn shr_all_u16_m256i(a: m256i, count: m128i) -> m256i {
2551 m256i(unsafe { _mm256_srl_epi16(a.0, count.0) })
2552}
2553
2554/// Lanewise `u32` shift right by the lower `u64` lane of `count`.
2555/// ```
2556/// # use safe_arch::*;
2557/// let a = m256i::from([0_u32, 1, 2, 13, 4, 15, 6, 17]);
2558/// let count = m128i::from(1_u128);
2559/// let b: [u32; 8] = shr_all_u32_m256i(a, count).into();
2560/// assert_eq!(b, [0, 0, 1, 6, 2, 7, 3, 8]);
2561/// ```
2562/// * **Intrinsic:** [`_mm256_srl_epi32`]
2563/// * **Assembly:** `vpsrld ymm, ymm, xmm`
2564#[must_use]
2565#[inline(always)]
2566#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2567pub fn shr_all_u32_m256i(a: m256i, count: m128i) -> m256i {
2568 m256i(unsafe { _mm256_srl_epi32(a.0, count.0) })
2569}
2570
2571/// Lanewise `u64` shift right by the lower `u64` lane of `count`.
2572/// ```
2573/// # use safe_arch::*;
2574/// let a = m256i::from([0_u64, 1, 2, 13]);
2575/// let count = m128i::from(1_u128);
2576/// let b: [u64; 4] = shr_all_u64_m256i(a, count).into();
2577/// assert_eq!(b, [0, 0, 1, 6]);
2578/// ```
2579/// * **Intrinsic:** [`_mm256_srl_epi64`]
2580/// * **Assembly:** `vpsrlq ymm, ymm, xmm`
2581#[must_use]
2582#[inline(always)]
2583#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2584pub fn shr_all_u64_m256i(a: m256i, count: m128i) -> m256i {
2585 m256i(unsafe { _mm256_srl_epi64(a.0, count.0) })
2586}
2587
2588/// Shifts all `u16` lanes right by an immediate.
2589///
2590/// ```
2591/// # use safe_arch::*;
2592/// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
2593/// let c: [u16; 16] = shr_imm_u16_m256i::<1>(a).into();
2594/// assert_eq!(c, [0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8]);
2595/// ```
2596/// * **Intrinsic:** [`_mm256_srli_epi16`]
2597/// * **Assembly:** `vpsrlw ymm, ymm, imm8`
2598#[must_use]
2599#[inline(always)]
2600#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2601pub fn shr_imm_u16_m256i<const IMM: i32>(a: m256i) -> m256i {
2602 m256i(unsafe { _mm256_srli_epi16(a.0, IMM) })
2603}
2604
2605/// Shifts all `u32` lanes right by an immediate.
2606///
2607/// ```
2608/// # use safe_arch::*;
2609/// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]);
2610/// let c: [u32; 8] = shr_imm_u32_m256i::<1>(a).into();
2611/// assert_eq!(c, [0, 1, 1, 2, 2, 3, 3, 4]);
2612/// ```
2613/// * **Intrinsic:** [`_mm256_srli_epi32`]
2614/// * **Assembly:** `vpsrld ymm, ymm, imm8`
2615#[must_use]
2616#[inline(always)]
2617#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2618pub fn shr_imm_u32_m256i<const IMM: i32>(a: m256i) -> m256i {
2619 m256i(unsafe { _mm256_srli_epi32(a.0, IMM) })
2620}
2621
2622/// Shifts all `u64` lanes right by an immediate.
2623///
2624/// ```
2625/// # use safe_arch::*;
2626/// let a = m256i::from([1_u64, 2, 3, 4]);
2627/// let c: [u64; 4] = shr_imm_u64_m256i::<1>(a).into();
2628/// assert_eq!(c, [0, 1, 1, 2]);
2629/// ```
2630/// * **Intrinsic:** [`_mm256_srli_epi64`]
2631/// * **Assembly:** `vpsrlq ymm, ymm, imm8`
2632#[must_use]
2633#[inline(always)]
2634#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2635pub fn shr_imm_u64_m256i<const IMM: i32>(a: m256i) -> m256i {
2636 m256i(unsafe { _mm256_srli_epi64(a.0, IMM) })
2637}
2638
2639/// Lanewise `u32` shift right by the matching `u32` lane in `count`.
2640/// ```
2641/// # use safe_arch::*;
2642/// let a = m256i::from([0_u32, 1111, 2999, 13888, 5444, 6222, 7333, 11111]);
2643/// let count = m256i::from([1_i32, 2, 3, 4, 5, 4, 3, 2]);
2644/// let b: [u32; 8] = shr_each_u32_m256i(a, count).into();
2645/// assert_eq!(b, [0, 277, 374, 868, 170, 388, 916, 2777]);
2646/// ```
2647/// * **Intrinsic:** [`_mm256_srlv_epi32`]
2648/// * **Assembly:** `vpsrlvd ymm, ymm, ymm`
2649#[must_use]
2650#[inline(always)]
2651#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2652pub fn shr_each_u32_m256i(a: m256i, count: m256i) -> m256i {
2653 m256i(unsafe { _mm256_srlv_epi32(a.0, count.0) })
2654}
2655
2656/// Lanewise `u64` shift right by the matching `i64` lane in `count`.
2657/// ```
2658/// # use safe_arch::*;
2659/// let a = m256i::from([0_u64, 1111, 2999, 13888]);
2660/// let count = m256i::from([1_u64, 2, 3, 4]);
2661/// let b: [u64; 4] = shr_each_u64_m256i(a, count).into();
2662/// assert_eq!(b, [0, 277, 374, 868]);
2663/// ```
2664/// * **Intrinsic:** [`_mm256_srlv_epi64`]
2665/// * **Assembly:** `vpsrlvq ymm, ymm, ymm`
2666#[must_use]
2667#[inline(always)]
2668#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2669pub fn shr_each_u64_m256i(a: m256i, count: m256i) -> m256i {
2670 m256i(unsafe { _mm256_srlv_epi64(a.0, count.0) })
2671}
2672
2673/// Lanewise `a - b` with lanes as `i8`.
2674/// ```
2675/// # use safe_arch::*;
2676/// let a = m256i::from([5_i8; 32]);
2677/// let b = m256i::from([10_i8; 32]);
2678/// let c: [i8; 32] = sub_i8_m256i(a, b).into();
2679/// assert_eq!(c, [-5_i8; 32]);
2680/// ```
2681/// * **Intrinsic:** [`_mm256_sub_epi8`]
2682/// * **Assembly:** `vpsubb ymm, ymm, ymm`
2683#[must_use]
2684#[inline(always)]
2685#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2686pub fn sub_i8_m256i(a: m256i, b: m256i) -> m256i {
2687 m256i(unsafe { _mm256_sub_epi8(a.0, b.0) })
2688}
2689
2690/// Lanewise `a - b` with lanes as `i16`.
2691/// ```
2692/// # use safe_arch::*;
2693/// let a = m256i::from([5_i16; 16]);
2694/// let b = m256i::from([10_i16; 16]);
2695/// let c: [i16; 16] = sub_i16_m256i(a, b).into();
2696/// assert_eq!(c, [-5_i16; 16]);
2697/// ```
2698/// * **Intrinsic:** [`_mm256_sub_epi16`]
2699/// * **Assembly:** `vpsubw ymm, ymm, ymm`
2700#[must_use]
2701#[inline(always)]
2702#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2703pub fn sub_i16_m256i(a: m256i, b: m256i) -> m256i {
2704 m256i(unsafe { _mm256_sub_epi16(a.0, b.0) })
2705}
2706
2707/// Lanewise `a - b` with lanes as `i32`.
2708/// ```
2709/// # use safe_arch::*;
2710/// let a = m256i::from([5_i32; 8]);
2711/// let b = m256i::from([10_i32; 8]);
2712/// let c: [i32; 8] = sub_i32_m256i(a, b).into();
2713/// assert_eq!(c, [-5_i32; 8]);
2714/// ```
2715/// * **Intrinsic:** [`_mm256_sub_epi32`]
2716/// * **Assembly:** `vpsubd ymm, ymm, ymm`
2717#[must_use]
2718#[inline(always)]
2719#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2720pub fn sub_i32_m256i(a: m256i, b: m256i) -> m256i {
2721 m256i(unsafe { _mm256_sub_epi32(a.0, b.0) })
2722}
2723
2724/// Lanewise `a - b` with lanes as `i64`.
2725/// ```
2726/// # use safe_arch::*;
2727/// let a = m256i::from([5_i64; 4]);
2728/// let b = m256i::from([10_i64; 4]);
2729/// let c: [i64; 4] = sub_i64_m256i(a, b).into();
2730/// assert_eq!(c, [-5_i64; 4]);
2731/// ```
2732/// * **Intrinsic:** [`_mm256_sub_epi64`]
2733/// * **Assembly:** `vpsubq ymm, ymm, ymm`
2734#[must_use]
2735#[inline(always)]
2736#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2737pub fn sub_i64_m256i(a: m256i, b: m256i) -> m256i {
2738 m256i(unsafe { _mm256_sub_epi64(a.0, b.0) })
2739}
2740
2741/// Lanewise saturating `a - b` with lanes as `i8`.
2742/// ```
2743/// # use safe_arch::*;
2744/// let a = m256i::from([126_i8; 32]);
2745/// let b = m256i::from([125_i8; 32]);
2746/// let c: [i8; 32] = sub_saturating_i8_m256i(a, b).into();
2747/// assert_eq!(c, [1_i8; 32]);
2748/// ```
2749/// * **Intrinsic:** [`_mm256_subs_epi8`]
2750/// * **Assembly:** `vpsubsb ymm, ymm, ymm`
2751#[must_use]
2752#[inline(always)]
2753#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2754pub fn sub_saturating_i8_m256i(a: m256i, b: m256i) -> m256i {
2755 m256i(unsafe { _mm256_subs_epi8(a.0, b.0) })
2756}
2757
2758/// Lanewise saturating `a - b` with lanes as `i16`.
2759/// ```
2760/// # use safe_arch::*;
2761/// let a = m256i::from([32700_i16; 16]);
2762/// let b = m256i::from([32000_i16; 16]);
2763/// let c: [i16; 16] = sub_saturating_i16_m256i(a, b).into();
2764/// assert_eq!(c, [700_i16; 16]);
2765/// ```
2766/// * **Intrinsic:** [`_mm256_subs_epi16`]
2767/// * **Assembly:** `vpsubsw ymm, ymm, ymm`
2768#[must_use]
2769#[inline(always)]
2770#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2771pub fn sub_saturating_i16_m256i(a: m256i, b: m256i) -> m256i {
2772 m256i(unsafe { _mm256_subs_epi16(a.0, b.0) })
2773}
2774
2775/// Lanewise saturating `a - b` with lanes as `u8`.
2776/// ```
2777/// # use safe_arch::*;
2778/// let a = m256i::from([126_u8; 32]);
2779/// let b = m256i::from([125_u8; 32]);
2780/// let c: [u8; 32] = sub_saturating_u8_m256i(a, b).into();
2781/// assert_eq!(c, [1_u8; 32]);
2782/// ```
2783/// * **Intrinsic:** [`_mm256_subs_epu8`]
2784/// * **Assembly:** `vpsubusb ymm, ymm, ymm`
2785#[must_use]
2786#[inline(always)]
2787#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2788pub fn sub_saturating_u8_m256i(a: m256i, b: m256i) -> m256i {
2789 m256i(unsafe { _mm256_subs_epu8(a.0, b.0) })
2790}
2791
2792/// Lanewise saturating `a - b` with lanes as `u16`.
2793/// ```
2794/// # use safe_arch::*;
2795/// let a = m256i::from([32700_u16; 16]);
2796/// let b = m256i::from([32000_u16; 16]);
2797/// let c: [u16; 16] = sub_saturating_u16_m256i(a, b).into();
2798/// assert_eq!(c, [700_u16; 16]);
2799/// ```
2800/// * **Intrinsic:** [`_mm256_subs_epu16`]
2801/// * **Assembly:** `vpsubusw ymm, ymm, ymm`
2802#[must_use]
2803#[inline(always)]
2804#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2805pub fn sub_saturating_u16_m256i(a: m256i, b: m256i) -> m256i {
2806 m256i(unsafe { _mm256_subs_epu16(a.0, b.0) })
2807}
2808
2809/// Unpack and interleave high `i8` lanes of `a` and `b`.
2810///
2811/// * Operates on the high half of each 128 bit portion.
2812/// ```
2813/// # use safe_arch::*;
2814/// let a = m256i::from([
2815/// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8,
2816/// 19, 20, 21, 22, 23, 24, 127,
2817/// ]);
2818/// let b = m256i::from([
2819/// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11,
2820/// -10, 9, 8, 7, 6, 5, -4,
2821/// ]);
2822/// let c: [i8; 32] = unpack_high_i8_m256i(a, b).into();
2823/// assert_eq!(
2824/// c,
2825/// [
2826/// 8, 6, 19, 6, 20, -7, 21, 8, 22, 8, 23, 0, 24, 0, 127, 10, 8, 11, 19, -10, 20, 9, 21, 8, 22,
2827/// 7, 23, 6, 24, 5, 127, -4
2828/// ]
2829/// );
2830/// ```
2831/// * **Intrinsic:** [`_mm256_unpackhi_epi8`]
2832/// * **Assembly:** `vpunpckhbw ymm, ymm, ymm`
2833#[must_use]
2834#[inline(always)]
2835#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2836pub fn unpack_high_i8_m256i(a: m256i, b: m256i) -> m256i {
2837 m256i(unsafe { _mm256_unpackhi_epi8(a.0, b.0) })
2838}
2839
2840/// Unpack and interleave high `i16` lanes of `a` and `b`.
2841///
2842/// * Operates on the high half of each 128 bit portion.
2843/// ```
2844/// # use safe_arch::*;
2845/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
2846/// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]);
2847/// let c: [i16; 16] = unpack_high_i16_m256i(a, b).into();
2848/// assert_eq!(c, [4, 0, 3, 1, 1, 2, 0, 3, 8, 0, 7, 0, 6, 0, 5, 978]);
2849/// ```
2850/// * **Intrinsic:** [`_mm256_unpackhi_epi16`]
2851/// * **Assembly:** `vpunpckhwd ymm, ymm, ymm`
2852#[must_use]
2853#[inline(always)]
2854#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2855pub fn unpack_high_i16_m256i(a: m256i, b: m256i) -> m256i {
2856 m256i(unsafe { _mm256_unpackhi_epi16(a.0, b.0) })
2857}
2858
2859/// Unpack and interleave high `i32` lanes of `a` and `b`.
2860///
2861/// * Operates on the high half of each 128 bit portion.
2862/// ```
2863/// # use safe_arch::*;
2864/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
2865/// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]);
2866/// let c: [i32; 8] = unpack_high_i32_m256i(a, b).into();
2867/// assert_eq!(c, [2, 2, 3, -13, 6, 6, 7, -17]);
2868/// ```
2869/// * **Intrinsic:** [`_mm256_unpackhi_epi32`]
2870/// * **Assembly:** `vpunpckhdq ymm, ymm, ymm`
2871#[must_use]
2872#[inline(always)]
2873#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2874pub fn unpack_high_i32_m256i(a: m256i, b: m256i) -> m256i {
2875 m256i(unsafe { _mm256_unpackhi_epi32(a.0, b.0) })
2876}
2877
2878/// Unpack and interleave high `i64` lanes of `a` and `b`.
2879///
2880/// * Operates on the high half of each 128 bit portion.
2881/// ```
2882/// # use safe_arch::*;
2883/// let a = m256i::from([1_i64, 2, 3, 4]);
2884/// let b = m256i::from([5_i64, 6, 7, -8]);
2885/// let c: [i64; 4] = unpack_high_i64_m256i(a, b).into();
2886/// assert_eq!(c, [2, 6, 4, -8]);
2887/// ```
2888/// * **Intrinsic:** [`_mm256_unpackhi_epi64`]
2889/// * **Assembly:** `vpunpckhqdq ymm, ymm, ymm`
2890#[must_use]
2891#[inline(always)]
2892#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2893pub fn unpack_high_i64_m256i(a: m256i, b: m256i) -> m256i {
2894 m256i(unsafe { _mm256_unpackhi_epi64(a.0, b.0) })
2895}
2896
2897/// Unpack and interleave low `i8` lanes of `a` and `b`.
2898///
2899/// * Operates on the low half of each 128 bit portion.
2900/// ```
2901/// # use safe_arch::*;
2902/// let a = m256i::from([
2903/// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8,
2904/// 19, 20, 21, 22, 23, 24, 127,
2905/// ]);
2906/// let b = m256i::from([
2907/// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11,
2908/// -10, 9, 8, 7, 6, 5, -4,
2909/// ]);
2910/// let c: [i8; 32] = unpack_low_i8_m256i(a, b).into();
2911/// assert_eq!(
2912/// c,
2913/// [
2914/// 3, -1, 11, -1, 2, 0, 13, 2, 4, 2, 15, 3, 6, 0, 17, 5, 7, 10, 11, -11, 2, 11, 13, 12, 4, 12,
2915/// 15, 13, 6, 13, 17, 12
2916/// ]
2917/// );
2918/// ```
2919/// * **Intrinsic:** [`_mm256_unpacklo_epi8`]
2920/// * **Assembly:** `_mm256_unpacklo_epi8`
2921#[must_use]
2922#[inline(always)]
2923#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2924pub fn unpack_low_i8_m256i(a: m256i, b: m256i) -> m256i {
2925 m256i(unsafe { _mm256_unpacklo_epi8(a.0, b.0) })
2926}
2927
2928/// Unpack and interleave low `i16` lanes of `a` and `b`.
2929///
2930/// * Operates on the low half of each 128 bit portion.
2931/// ```
2932/// # use safe_arch::*;
2933/// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]);
2934/// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]);
2935/// let c: [i16; 16] = unpack_low_i16_m256i(a, b).into();
2936/// assert_eq!(c, [5, 12000, 6, 13000, 2, -2, 5, -8, -12, -8, 13, -7, 56, 6, 21, 5]);
2937/// ```
2938/// * **Intrinsic:** [`_mm256_unpacklo_epi16`]
2939/// * **Assembly:** `vpunpcklwd ymm, ymm, ymm`
2940#[must_use]
2941#[inline(always)]
2942#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2943pub fn unpack_low_i16_m256i(a: m256i, b: m256i) -> m256i {
2944 m256i(unsafe { _mm256_unpacklo_epi16(a.0, b.0) })
2945}
2946
2947/// Unpack and interleave low `i32` lanes of `a` and `b`.
2948///
2949/// * Operates on the low half of each 128 bit portion.
2950/// ```
2951/// # use safe_arch::*;
2952/// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]);
2953/// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]);
2954/// let c: [i32; 8] = unpack_low_i32_m256i(a, b).into();
2955/// assert_eq!(c, [0, 0, 1, 11, 4, 4, 5, 15]);
2956/// ```
2957/// * **Intrinsic:** [`_mm256_unpacklo_epi32`]
2958/// * **Assembly:** `vpunpckldq ymm, ymm, ymm`
2959#[must_use]
2960#[inline(always)]
2961#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2962pub fn unpack_low_i32_m256i(a: m256i, b: m256i) -> m256i {
2963 m256i(unsafe { _mm256_unpacklo_epi32(a.0, b.0) })
2964}
2965
2966/// Unpack and interleave low `i64` lanes of `a` and `b`.
2967///
2968/// * Operates on the low half of each 128 bit portion.
2969/// ```
2970/// # use safe_arch::*;
2971/// let a = m256i::from([1_i64, 2, 3, 4]);
2972/// let b = m256i::from([5_i64, 6, 7, -8]);
2973/// let c: [i64; 4] = unpack_low_i64_m256i(a, b).into();
2974/// assert_eq!(c, [1, 5, 3, 7]);
2975/// ```
2976/// * **Intrinsic:** [`_mm256_unpacklo_epi64`]
2977/// * **Assembly:** `vpunpcklqdq ymm, ymm, ymm`
2978#[must_use]
2979#[inline(always)]
2980#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2981pub fn unpack_low_i64_m256i(a: m256i, b: m256i) -> m256i {
2982 m256i(unsafe { _mm256_unpacklo_epi64(a.0, b.0) })
2983}
2984
2985/// Bitwise `a ^ b`.
2986/// ```
2987/// # use safe_arch::*;
2988/// let a = m256i::from([0_i64, 0, 1, 1]);
2989/// let b = m256i::from([0_i64, 1, 0, 1]);
2990/// let c: [i64; 4] = bitxor_m256i(a, b).into();
2991/// assert_eq!(c, [0_i64, 1, 1, 0]);
2992/// ```
2993/// * **Intrinsic:** [`_mm256_xor_si256`]
2994/// * **Assembly:** `vpxor ymm, ymm, ymm`
2995#[must_use]
2996#[inline(always)]
2997#[cfg_attr(docsrs, doc(cfg(target_feature = "avx2")))]
2998pub fn bitxor_m256i(a: m256i, b: m256i) -> m256i {
2999 m256i(unsafe { _mm256_xor_si256(a.0, b.0) })
3000}
3001
3002impl Not for m256i {
3003 type Output = Self;
3004 /// Not a direct intrinsic, but it's very useful and the implementation is
3005 /// simple enough.
3006 ///
3007 /// Negates the bits by performing an `xor` with an all-1s bit pattern.
3008 /// ```
3009 /// # use safe_arch::*;
3010 /// let a = m256i::from([0_u128, 0]);
3011 /// let c: [u128; 2] = (!a).into();
3012 /// assert_eq!(c, [u128::MAX, u128::MAX]);
3013 /// ```
3014 #[inline(always)]
3015 fn not(self) -> Self {
3016 let all_bits = set_splat_i16_m256i(-1);
3017 self ^ all_bits
3018 }
3019}
3020
3021impl BitAnd for m256i {
3022 type Output = Self;
3023 /// ```
3024 /// # use safe_arch::*;
3025 /// let a = m256i::from([0_i64, 0, 1, 1]);
3026 /// let b = m256i::from([0_i64, 1, 0, 1]);
3027 /// let c: [i64; 4] = (a & b).into();
3028 /// assert_eq!(c, [0_i64, 0, 0, 1]);
3029 /// ```
3030 #[inline(always)]
3031 fn bitand(self, rhs: Self) -> Self {
3032 bitand_m256i(self, rhs)
3033 }
3034}
3035impl BitAndAssign for m256i {
3036 #[inline(always)]
3037 fn bitand_assign(&mut self, rhs: Self) {
3038 *self = *self & rhs;
3039 }
3040}
3041
3042impl BitOr for m256i {
3043 type Output = Self;
3044 /// ```
3045 /// # use safe_arch::*;
3046 /// let a = m256i::from([0_i64, 0, 1, 1]);
3047 /// let b = m256i::from([0_i64, 1, 0, 1]);
3048 /// let c: [i64; 4] = (a | b).into();
3049 /// assert_eq!(c, [0_i64, 1, 1, 1]);
3050 /// ```
3051 #[inline(always)]
3052 fn bitor(self, rhs: Self) -> Self {
3053 bitor_m256i(self, rhs)
3054 }
3055}
3056impl BitOrAssign for m256i {
3057 #[inline(always)]
3058 fn bitor_assign(&mut self, rhs: Self) {
3059 *self = *self | rhs;
3060 }
3061}
3062
3063impl BitXor for m256i {
3064 type Output = Self;
3065 /// ```
3066 /// # use safe_arch::*;
3067 /// let a = m256i::from([0_i64, 0, 1, 1]);
3068 /// let b = m256i::from([0_i64, 1, 0, 1]);
3069 /// let c: [i64; 4] = (a ^ b).into();
3070 /// assert_eq!(c, [0_i64, 1, 1, 0]);
3071 /// ```
3072 #[inline(always)]
3073 fn bitxor(self, rhs: Self) -> Self {
3074 bitxor_m256i(self, rhs)
3075 }
3076}
3077impl BitXorAssign for m256i {
3078 #[inline(always)]
3079 fn bitxor_assign(&mut self, rhs: Self) {
3080 *self = *self ^ rhs;
3081 }
3082}
3083
3084impl PartialEq for m256i {
3085 /// ```
3086 /// # use safe_arch::*;
3087 /// let a = m256i::from([0_i64, 0, 1, 1]);
3088 /// let b = m256i::from([0_i64, 1, 0, 1]);
3089 /// assert_eq!(a, a);
3090 /// assert_ne!(a, b);
3091 /// ```
3092 #[inline(always)]
3093 fn eq(&self, other: &Self) -> bool {
3094 let mask = cmp_eq_mask_i8_m256i(*self, *other);
3095 move_mask_i8_m256i(mask) == -1_i32
3096 }
3097}
// `eq` above is a total bitwise comparison, so equality is reflexive and `Eq` holds.
impl Eq for m256i {}