basic/
basic.rs

1#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
2mod x86 {
3	use aligned_vec::avec;
4	use diol::prelude::*;
5
6	use core::arch::x86_64::*;
7	use core::mem::transmute;
8	use pulp::Simd;
9	use pulp::x86::V4;
10
11	fn sum_scalar(bencher: Bencher, len: usize) {
12		let v = &*avec![0.0f64; len];
13
14		bencher.bench(|| black_box(v.iter().sum::<f64>()));
15	}
16
17	#[target_feature(enable = "avx512f")]
18	#[allow(clippy::missing_transmute_annotations)]
19	unsafe fn sum_stdarch_imp(v: &[f64]) -> f64 {
20		let mut acc0 = _mm512_set1_pd(0.0);
21		let mut acc1 = _mm512_set1_pd(0.0);
22		let mut acc2 = _mm512_set1_pd(0.0);
23		let mut acc3 = _mm512_set1_pd(0.0);
24
25		// 512 = 64 * 8
26		let (head, tail) = pulp::as_arrays::<8, _>(v);
27		let (head4, head1) = pulp::as_arrays::<4, _>(head);
28
29		for [x0, x1, x2, x3] in head4 {
30			let x0 = transmute(*x0);
31			let x1 = transmute(*x1);
32			let x2 = transmute(*x2);
33			let x3 = transmute(*x3);
34
35			acc0 = _mm512_add_pd(acc0, x0);
36			acc1 = _mm512_add_pd(acc1, x1);
37			acc2 = _mm512_add_pd(acc2, x2);
38			acc3 = _mm512_add_pd(acc3, x3);
39		}
40
41		for x0 in head1 {
42			let x0 = pulp::cast(*x0);
43			acc0 = _mm512_add_pd(acc0, x0);
44		}
45
46		acc0 = _mm512_add_pd(acc0, acc1);
47		acc2 = _mm512_add_pd(acc2, acc3);
48		acc0 = _mm512_add_pd(acc0, acc2);
49
50		let acc: [__m256d; 2] = pulp::cast(acc0);
51		let acc = _mm256_add_pd(acc[0], acc[1]);
52
53		let acc: [__m128d; 2] = pulp::cast(acc);
54		let acc = _mm_add_pd(acc[0], acc[1]);
55
56		let acc: [f64; 2] = pulp::cast(acc);
57		let mut acc = acc[0] + acc[1];
58
59		for x0 in tail {
60			acc += *x0;
61		}
62		acc
63	}
64
65	fn sum_stdarch(bencher: Bencher, len: usize) {
66		let v = &*avec![0.0f64; len];
67
68		bencher.bench(|| unsafe { black_box(sum_stdarch_imp(v)) });
69	}
70
71	fn sum_pulp(bencher: Bencher, len: usize) {
72		if let Some(simd) = V4::try_new() {
73			let v = &*avec![0.0f64; len];
74
75			bencher.bench(|| {
76				struct Imp<'a> {
77					simd: V4,
78					v: &'a [f64],
79				}
80
81				impl pulp::NullaryFnOnce for Imp<'_> {
82					type Output = f64;
83
84					#[inline(always)]
85					fn call(self) -> Self::Output {
86						let Self { simd, v } = self;
87
88						let (head, tail) = pulp::as_arrays::<8, _>(v);
89
90						let mut acc0 = simd.splat_f64x8(0.0);
91						let mut acc1 = simd.splat_f64x8(0.0);
92						let mut acc2 = simd.splat_f64x8(0.0);
93						let mut acc3 = simd.splat_f64x8(0.0);
94
95						let (head4, head1) = pulp::as_arrays::<4, _>(head);
96
97						for [x0, x1, x2, x3] in head4 {
98							let x0 = pulp::cast(*x0);
99							let x1 = pulp::cast(*x1);
100							let x2 = pulp::cast(*x2);
101							let x3 = pulp::cast(*x3);
102
103							acc0 = pulp::cast(
104								simd.avx512f._mm512_add_pd(pulp::cast(acc0), pulp::cast(x0)),
105							);
106
107							acc0 = simd.add_f64x8(acc0, x0);
108							acc1 = simd.add_f64x8(acc1, x1);
109							acc2 = simd.add_f64x8(acc2, x2);
110							acc3 = simd.add_f64x8(acc3, x3);
111						}
112						for x0 in head1 {
113							let x0 = pulp::cast(*x0);
114							acc0 = simd.add_f64x8(acc0, x0);
115						}
116
117						acc0 = simd.add_f64x8(acc0, acc1);
118						acc2 = simd.add_f64x8(acc2, acc3);
119						acc0 = simd.add_f64x8(acc0, acc2);
120
121						let tail = simd.partial_load_f64s(tail);
122
123						simd.reduce_sum_f64s(simd.add_f64x8(acc0, tail))
124					}
125				}
126
127				simd.vectorize(Imp { simd, v })
128			});
129		}
130	}
131
132	fn sum_pulp_dispatch(bencher: Bencher, len: usize) {
133		let v = &*avec![0.0f64; len];
134
135		bencher.bench(|| {
136			struct Imp<'a> {
137				v: &'a [f64],
138			}
139
140			impl pulp::WithSimd for Imp<'_> {
141				type Output = f64;
142
143				#[inline(always)]
144				fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
145					let Self { v } = self;
146
147					let (head, tail) = S::as_simd_f64s(v);
148
149					let mut acc0 = simd.splat_f64s(0.0);
150					let mut acc1 = simd.splat_f64s(0.0);
151					let mut acc2 = simd.splat_f64s(0.0);
152					let mut acc3 = simd.splat_f64s(0.0);
153
154					let (head4, head1) = pulp::as_arrays::<4, _>(head);
155
156					for &[x0, x1, x2, x3] in head4 {
157						acc0 = simd.add_f64s(acc0, x0);
158						acc1 = simd.add_f64s(acc1, x1);
159						acc2 = simd.add_f64s(acc2, x2);
160						acc3 = simd.add_f64s(acc3, x3);
161					}
162
163					for &x0 in head1 {
164						acc0 = simd.add_f64s(acc0, x0);
165					}
166
167					acc0 = simd.add_f64s(acc0, acc1);
168					acc2 = simd.add_f64s(acc2, acc3);
169					acc0 = simd.add_f64s(acc0, acc2);
170
171					simd.reduce_sum_f64s(simd.add_f64s(acc0, simd.partial_load_f64s(tail)))
172				}
173			}
174
175			pulp::Arch::new().dispatch(Imp { v })
176		});
177	}
178
179	pub fn main() -> std::io::Result<()> {
180		let mut bench = Bench::new(BenchConfig::from_args()?);
181
182		bench.register_many(
183			list![sum_scalar, sum_stdarch, sum_pulp, sum_pulp_dispatch],
184			[64, 256, 1024],
185		);
186		bench.run()?;
187		Ok(())
188	}
189}
190
/// Entry point: runs the x86 benchmarks when built with the `x86-v4` feature
/// for `x86_64`, and otherwise returns `Ok(())` without doing anything.
fn main() -> std::io::Result<()> {
	#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
	{
		x86::main()?;
	}

	Ok(())
}