#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
/// Micro-benchmarks for masked loads and stores issued through
/// `pulp::x86::V4`. Only compiled when the `x86-v4` feature is enabled on
/// an `x86_64` target.
mod x86 {
    use diol::prelude::*;
    use pulp::Simd;
    use pulp::x86::V4;
    use std::arch::x86_64::__m512d;

    /// Benchmarks `mask_mem`, handing it a mask that has been unwrapped to
    /// its raw form and converted back into a `pulp::MemMask`.
    ///
    /// # Panics
    ///
    /// Panics if `V4::try_new()` yields no instance (presumably when the
    /// running CPU lacks the required features).
    fn bench_masked_store(bencher: Bencher, (): ()) {
        let simd = V4::try_new().unwrap();

        // One 512-bit register's worth of zeroed storage, viewed as `f32`
        // lanes and then reduced to a raw pointer for the masked accesses.
        let mut zmm: __m512d = pulp::cast(simd.splat_f32s(0.0));
        let lanes: &mut [f32] = bytemuck::cast_slice_mut(core::slice::from_mut(&mut zmm));
        let ptr = lanes.as_mut_ptr();

        bencher.bench(|| {
            simd.vectorize(
                #[inline(always)]
                // SAFETY: `ptr` addresses the local `zmm`, which stays alive
                // for every invocation of this closure. The masked accesses
                // are assumed to stay within those 64 bytes, and holding a
                // `V4` is presumed to imply `avx512f` for `mask_mem` --
                // confirm against the pulp documentation.
                || unsafe {
                    // The mask is rebuilt inside the timed region on purpose:
                    // it is part of what is being measured.
                    let mask = simd.mask_between_m32s(3, 13);
                    mask_mem(simd, mask.mask().into(), ptr);
                },
            )
        });
    }

    /// Performs 16 rounds of masked load followed by masked store on `x`,
    /// with an empty inline-asm statement between the two.
    ///
    /// # Safety
    ///
    /// The CPU must support `avx512f`, and `x` must be valid for the reads
    /// and writes that `mask_load_ptr_f32s` / `mask_store_ptr_f32s` perform
    /// under `mask`.
    #[inline]
    #[target_feature(enable = "avx512f")]
    unsafe fn mask_mem(simd: V4, mask: pulp::MemMask<pulp::b16>, x: *mut f32) {
        for _ in 0..16 {
            let loaded = simd.mask_load_ptr_f32s(mask, x);
            // Empty asm that takes the pointer `x` as an input in `zmm0`.
            // NOTE(review): presumably acts as an optimization barrier
            // between the load and the store -- confirm whether the loaded
            // value was the intended operand.
            core::arch::asm!("/* */", in("zmm0") x);
            simd.mask_store_ptr_f32s(mask, x, loaded);
        }
    }

    /// Benchmarks 16 masked load/store round trips issued directly from the
    /// vectorized closure, using the mask exactly as returned by
    /// `mask_between_m32s`.
    ///
    /// # Panics
    ///
    /// Panics if `V4::try_new()` yields no instance (presumably when the
    /// running CPU lacks the required features).
    fn bench_combined_stores(bencher: Bencher, (): ()) {
        let simd = V4::try_new().unwrap();

        // Same zeroed 512-bit scratch buffer as in `bench_masked_store`.
        let mut zmm: __m512d = pulp::cast(simd.splat_f32s(0.0));
        let lanes: &mut [f32] = bytemuck::cast_slice_mut(core::slice::from_mut(&mut zmm));
        let ptr = lanes.as_mut_ptr();

        bencher.bench(|| {
            simd.vectorize(
                #[inline(always)]
                // SAFETY: `ptr` addresses the local `zmm`, which stays alive
                // for every invocation of this closure; the masked accesses
                // are assumed to stay within those 64 bytes.
                || unsafe {
                    let mask = simd.mask_between_m32s(3, 13);
                    for _ in 0..16 {
                        let loaded = simd.mask_load_ptr_f32s(mask, ptr);
                        simd.mask_store_ptr_f32s(mask, ptr, loaded);
                    }
                },
            )
        });
    }

    /// Builds the benchmark runner from the command-line arguments,
    /// registers both benchmarks with a single unit argument, and runs them.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `BenchConfig::from_args` or
    /// `Bench::run`.
    pub fn main() -> std::io::Result<()> {
        let config = BenchConfig::from_args()?;
        let mut bench = diol::Bench::new(config);
        bench.register_many(list![bench_masked_store, bench_combined_stores], [()]);

        bench.run()?;
        Ok(())
    }
}
65
/// Binary entry point.
///
/// Runs the `x86` benchmarks when the `x86-v4` feature is enabled on an
/// `x86_64` target; on any other configuration it returns `Ok(())` without
/// doing anything.
fn main() -> std::io::Result<()> {
    #[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
    {
        x86::main()?;
    }

    Ok(())
}