// pulp 0.22.2
//
// safe generic SIMD
// Documentation
/// Micro-benchmarks for masked loads and stores through `pulp::x86::V4`.
///
/// Only compiled when the `x86-v4` feature is enabled on an `x86_64` target.
#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
mod x86 {
	use diol::prelude::*;
	use pulp::Simd;
	use pulp::x86::V4;
	use std::arch::x86_64::__m512d;

	/// Returns a pointer to the first `f32` lane of the 64-byte register `buf`.
	///
	/// `buf` is reinterpreted as a slice of `f32` (16 lanes for a 512-bit value). The returned
	/// pointer is only usable while `buf` is alive and is not accessed through any other path.
	fn lanes_ptr(buf: &mut __m512d) -> *mut f32 {
		let lanes: &mut [f32] = bytemuck::cast_slice_mut(core::slice::from_mut(buf));
		lanes.as_mut_ptr()
	}

	/// Benchmarks masked load/store round trips where the mask is rebuilt from its raw value.
	///
	/// The mask from `mask_between_m32s(3, 13)` is reduced to its raw value with `.mask()` and
	/// converted back with `.into()` before being handed to [`mask_mem`].
	/// NOTE(review): presumably the round trip discards any load/store data cached inside the
	/// `MemMask`, so this measures the plain-mask path — confirm against the pulp implementation.
	///
	/// # Panics
	///
	/// Panics if `V4` is unavailable; [`main`] checks this before registering the benchmark.
	fn bench_masked_store(bencher: Bencher, (): ()) {
		let simd = V4::try_new()
			.expect("AVX-512 (pulp V4) support is checked in x86::main before benchmarks are registered");

		// One zeroed 512-bit scratch register; the benchmark reads and writes it through `x`.
		let mut buf: __m512d = pulp::cast(simd.splat_f32s(0.0));
		let x = lanes_ptr(&mut buf);

		bencher.bench(|| {
			simd.vectorize(
				#[inline(always)]
				// SAFETY: `simd` proves the V4 features are available, and `x` points to the 16
				// `f32` lanes of `buf`, which outlives this closure and is not accessed elsewhere.
				|| unsafe {
					let mask = simd.mask_between_m32s(3, 13);
					let raw_mask = mask.mask();
					mask_mem(simd, raw_mask.into(), x);
				},
			)
		});
	}

	/// Performs 16 masked load / masked store round trips on the memory at `x`.
	///
	/// # Safety
	///
	/// - The CPU must support `avx512f` (required by `#[target_feature]`).
	/// - `x` must be valid for masked reads and writes of every lane selected by `mask`.
	#[inline]
	#[target_feature(enable = "avx512f")]
	unsafe fn mask_mem(simd: V4, mask: pulp::MemMask<pulp::b16>, x: *mut f32) {
		for _ in 0..16 {
			let y = simd.mask_load_ptr_f32s(mask, x);
			// Empty asm statement (only an assembler comment). It is not marked `nomem`, so the
			// compiler has to assume it may touch memory; presumably this keeps the load and the
			// store from being merged or removed across iterations.
			// NOTE(review): the operand is the pointer `x`, not the loaded value `y` — confirm
			// that this is intended.
			core::arch::asm!("/* */", in("zmm0") x);
			simd.mask_store_ptr_f32s(mask, x, y);
		}
	}

	/// Benchmarks masked load/store round trips using the `MemMask` exactly as returned by
	/// `mask_between_m32s(3, 13)`, with the 16 round trips issued inline in the closure.
	///
	/// # Panics
	///
	/// Panics if `V4` is unavailable; [`main`] checks this before registering the benchmark.
	fn bench_combined_stores(bencher: Bencher, (): ()) {
		let simd = V4::try_new()
			.expect("AVX-512 (pulp V4) support is checked in x86::main before benchmarks are registered");

		// One zeroed 512-bit scratch register; the benchmark reads and writes it through `x`.
		let mut buf: __m512d = pulp::cast(simd.splat_f32s(0.0));
		let x = lanes_ptr(&mut buf);

		bencher.bench(|| {
			simd.vectorize(
				#[inline(always)]
				// SAFETY: `x` points to the 16 `f32` lanes of `buf`, which outlives this closure
				// and is not accessed elsewhere, so every masked lane is in bounds.
				|| unsafe {
					let mask = simd.mask_between_m32s(3, 13);
					for _ in 0..16 {
						simd.mask_store_ptr_f32s(mask, x, simd.mask_load_ptr_f32s(mask, x));
					}
				},
			)
		});
	}

	/// Parses the benchmark configuration from the command line, then registers and runs the
	/// benchmarks in this module.
	///
	/// If the running CPU lacks the features required by `V4`, a notice is printed to stderr and
	/// the benchmarks are skipped instead of panicking inside each benchmark.
	///
	/// # Errors
	///
	/// Propagates any I/O error from parsing the arguments or from running the benchmarks.
	pub fn main() -> std::io::Result<()> {
		// Parse arguments first so invalid arguments are reported the same way on every machine.
		let config = BenchConfig::from_args()?;

		if V4::try_new().is_none() {
			eprintln!("skipping x86-v4 benchmarks: this CPU lacks the AVX-512 features required by pulp::x86::V4");
			return Ok(());
		}

		let mut bench = diol::Bench::new(config);
		bench.register_many(list![bench_masked_store, bench_combined_stores], [()]);

		bench.run()?;
		Ok(())
	}
}

/// Entry point: runs the x86 benchmarks when they are compiled in, otherwise does nothing.
///
/// # Errors
///
/// Returns whatever I/O error `x86::main` reports.
fn main() -> std::io::Result<()> {
	// Exactly one of the two bindings below survives cfg-stripping.
	#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
	let outcome = x86::main();
	#[cfg(not(all(feature = "x86-v4", target_arch = "x86_64")))]
	let outcome = Ok(());
	outcome
}