mask_store/
mask_store.rs

/// Micro-benchmarks of AVX-512 masked load/store round trips issued through
/// `pulp`'s `V4` SIMD token.
///
/// Only compiled when the `x86-v4` feature is enabled on an `x86_64` target.
#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
mod x86 {
	use diol::prelude::*;
	use pulp::Simd;
	use pulp::x86::V4;
	use std::arch::x86_64::__m512d;

	/// Number of masked load/store round trips performed per benchmark iteration.
	const ROUND_TRIPS: usize = 16;

	/// Returns the `V4` token used by the benchmarks.
	///
	/// # Panics
	///
	/// Panics if the token cannot be created. [`main`] verifies availability
	/// before registering any benchmark, so this indicates a broken invariant.
	fn v4_token() -> V4 {
		V4::try_new().expect("`x86::main` checks V4 availability before registering benchmarks")
	}

	/// Benchmarks [`ROUND_TRIPS`] masked load/store pairs performed by [`mask_mem`],
	/// using a mask rebuilt from the raw bitmask of `mask_between_m32s(3, 13)`.
	fn bench_masked_store(bencher: Bencher, (): ()) {
		let simd = v4_token();

		// 64 bytes of zeroed scratch storage, reinterpreted as `f32` lanes. `storage`
		// stays alive for the whole function, so `ptr` remains valid while benchmarking.
		let mut storage: __m512d = pulp::cast(simd.splat_f32s(0.0));
		let lanes: &mut [f32] = bytemuck::cast_slice_mut(core::slice::from_mut(&mut storage));
		let ptr = lanes.as_mut_ptr();

		bencher.bench(|| {
			simd.vectorize(
				#[inline(always)]
				// SAFETY: `ptr` addresses the 16 `f32` lanes of `storage`, which outlives
				// this closure, and a `V4` token only exists when pulp's runtime detection
				// succeeded, so the `avx512f` requirement of `mask_mem` is met.
				|| unsafe {
					let mask = simd.mask_between_m32s(3, 13);
					// Go through the raw bitmask so that `mask_mem` receives a `MemMask`
					// built via `From` rather than the one `mask_between_m32s` returned.
					let raw_mask = mask.mask();
					mask_mem(simd, raw_mask.into(), ptr);
				},
			)
		});
	}

	/// Performs [`ROUND_TRIPS`] masked loads from `x`, each followed by a masked
	/// store of the loaded value back to `x`.
	///
	/// # Safety
	///
	/// - The CPU must support `avx512f`.
	/// - `x` must be valid for the masked reads and writes selected by `mask`.
	#[inline]
	#[target_feature(enable = "avx512f")]
	unsafe fn mask_mem(simd: V4, mask: pulp::MemMask<pulp::b16>, x: *mut f32) {
		for _ in 0..ROUND_TRIPS {
			let y = simd.mask_load_ptr_f32s(mask, x);
			// Empty asm statement (the template is only an assembly comment). It takes
			// `x` as a `zmm0` input and, lacking `nomem`, has to be treated by the
			// compiler as possibly touching memory -- presumably there to keep the load
			// and the store from being merged or elided across iterations.
			core::arch::asm!("/* */", in("zmm0") x);
			simd.mask_store_ptr_f32s(mask, x, y);
		}
	}

	/// Benchmarks [`ROUND_TRIPS`] masked load/store pairs issued inline, passing the
	/// mask returned by `mask_between_m32s(3, 13)` directly to the load and the store.
	fn bench_combined_stores(bencher: Bencher, (): ()) {
		let simd = v4_token();

		// 64 bytes of zeroed scratch storage, reinterpreted as `f32` lanes. `storage`
		// stays alive for the whole function, so `ptr` remains valid while benchmarking.
		let mut storage: __m512d = pulp::cast(simd.splat_f32s(0.0));
		let lanes: &mut [f32] = bytemuck::cast_slice_mut(core::slice::from_mut(&mut storage));
		let ptr = lanes.as_mut_ptr();

		bencher.bench(|| {
			simd.vectorize(
				#[inline(always)]
				// SAFETY: `ptr` addresses the 16 `f32` lanes of `storage`, which outlives
				// this closure; the mask is produced by the same `V4` token that performs
				// the accesses.
				|| unsafe {
					let mask = simd.mask_between_m32s(3, 13);
					for _ in 0..ROUND_TRIPS {
						let loaded = simd.mask_load_ptr_f32s(mask, ptr);
						simd.mask_store_ptr_f32s(mask, ptr, loaded);
					}
				},
			)
		});
	}

	/// Registers and runs both benchmarks.
	///
	/// If the running CPU does not provide the feature set required by `V4`, a
	/// note is written to stderr and the function returns `Ok(())` without
	/// running anything, instead of panicking inside the benchmark bodies.
	///
	/// # Errors
	///
	/// Propagates I/O errors from parsing the benchmark configuration and from
	/// running the benchmarks.
	pub fn main() -> std::io::Result<()> {
		if V4::try_new().is_none() {
			eprintln!("mask_store: x86-v4 (AVX-512) is not available on this CPU; skipping benchmarks");
			return Ok(());
		}

		let mut bench = diol::Bench::new(BenchConfig::from_args()?);
		bench.register_many(list![bench_masked_store, bench_combined_stores], [()]);

		bench.run()?;
		Ok(())
	}
}
65
/// Binary entry point.
///
/// Delegates to `x86::main` when the `x86-v4` feature is enabled on an
/// `x86_64` target; in every other configuration it does nothing and
/// reports success.
fn main() -> std::io::Result<()> {
	// Exactly one of the two `return` statements survives cfg-stripping.
	#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
	return x86::main();

	#[cfg(not(all(feature = "x86-v4", target_arch = "x86_64")))]
	return Ok(());
}