1#[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
2mod x86 {
3 use aligned_vec::avec;
4 use diol::prelude::*;
5
6 use core::arch::x86_64::*;
7 use core::mem::transmute;
8 use pulp::Simd;
9 use pulp::x86::V4;
10
11 fn sum_scalar(bencher: Bencher, len: usize) {
12 let v = &*avec![0.0f64; len];
13
14 bencher.bench(|| black_box(v.iter().sum::<f64>()));
15 }
16
    /// Sums the elements of `v` with hand-written AVX-512 intrinsics.
    ///
    /// Uses four independent 8-lane accumulators so the floating-point adds
    /// can pipeline, then reduces 512 -> 256 -> 128 -> scalar and finishes the
    /// non-multiple-of-8 remainder with a plain loop. Note the result may
    /// differ from a strict left-to-right scalar sum in the last ULPs, since
    /// FP addition is not associative.
    ///
    /// # Safety
    ///
    /// The caller must ensure the running CPU supports `avx512f` before
    /// calling (e.g. via a runtime feature check).
    #[target_feature(enable = "avx512f")]
    #[allow(clippy::missing_transmute_annotations)]
    unsafe fn sum_stdarch_imp(v: &[f64]) -> f64 {
        let mut acc0 = _mm512_set1_pd(0.0);
        let mut acc1 = _mm512_set1_pd(0.0);
        let mut acc2 = _mm512_set1_pd(0.0);
        let mut acc3 = _mm512_set1_pd(0.0);

        // `head`: full `[f64; 8]` vectors, `tail`: scalar remainder (< 8 elems).
        let (head, tail) = pulp::as_arrays::<8, _>(v);
        // Group vectors by four so each iteration feeds all four accumulators.
        let (head4, head1) = pulp::as_arrays::<4, _>(head);

        for [x0, x1, x2, x3] in head4 {
            // `[f64; 8]` -> `__m512d`: same size and layout.
            let x0 = transmute(*x0);
            let x1 = transmute(*x1);
            let x2 = transmute(*x2);
            let x3 = transmute(*x3);

            acc0 = _mm512_add_pd(acc0, x0);
            acc1 = _mm512_add_pd(acc1, x1);
            acc2 = _mm512_add_pd(acc2, x2);
            acc3 = _mm512_add_pd(acc3, x3);
        }

        // Leftover full vectors (0..=3 of them) go through `acc0` alone.
        for x0 in head1 {
            let x0 = pulp::cast(*x0);
            acc0 = _mm512_add_pd(acc0, x0);
        }

        // Combine the four accumulators pairwise.
        acc0 = _mm512_add_pd(acc0, acc1);
        acc2 = _mm512_add_pd(acc2, acc3);
        acc0 = _mm512_add_pd(acc0, acc2);

        // Horizontal reduction: 512-bit -> two 256-bit halves.
        let acc: [__m256d; 2] = pulp::cast(acc0);
        let acc = _mm256_add_pd(acc[0], acc[1]);

        // 256-bit -> two 128-bit halves.
        let acc: [__m128d; 2] = pulp::cast(acc);
        let acc = _mm_add_pd(acc[0], acc[1]);

        // 128-bit -> two scalars.
        let acc: [f64; 2] = pulp::cast(acc);
        let mut acc = acc[0] + acc[1];

        // Finish the scalar remainder.
        for x0 in tail {
            acc += *x0;
        }
        acc
    }
64
65 fn sum_stdarch(bencher: Bencher, len: usize) {
66 let v = &*avec![0.0f64; len];
67
68 bencher.bench(|| unsafe { black_box(sum_stdarch_imp(v)) });
69 }
70
71 fn sum_pulp(bencher: Bencher, len: usize) {
72 if let Some(simd) = V4::try_new() {
73 let v = &*avec![0.0f64; len];
74
75 bencher.bench(|| {
76 struct Imp<'a> {
77 simd: V4,
78 v: &'a [f64],
79 }
80
81 impl pulp::NullaryFnOnce for Imp<'_> {
82 type Output = f64;
83
84 #[inline(always)]
85 fn call(self) -> Self::Output {
86 let Self { simd, v } = self;
87
88 let (head, tail) = pulp::as_arrays::<8, _>(v);
89
90 let mut acc0 = simd.splat_f64x8(0.0);
91 let mut acc1 = simd.splat_f64x8(0.0);
92 let mut acc2 = simd.splat_f64x8(0.0);
93 let mut acc3 = simd.splat_f64x8(0.0);
94
95 let (head4, head1) = pulp::as_arrays::<4, _>(head);
96
97 for [x0, x1, x2, x3] in head4 {
98 let x0 = pulp::cast(*x0);
99 let x1 = pulp::cast(*x1);
100 let x2 = pulp::cast(*x2);
101 let x3 = pulp::cast(*x3);
102
103 acc0 = pulp::cast(
104 simd.avx512f._mm512_add_pd(pulp::cast(acc0), pulp::cast(x0)),
105 );
106
107 acc0 = simd.add_f64x8(acc0, x0);
108 acc1 = simd.add_f64x8(acc1, x1);
109 acc2 = simd.add_f64x8(acc2, x2);
110 acc3 = simd.add_f64x8(acc3, x3);
111 }
112 for x0 in head1 {
113 let x0 = pulp::cast(*x0);
114 acc0 = simd.add_f64x8(acc0, x0);
115 }
116
117 acc0 = simd.add_f64x8(acc0, acc1);
118 acc2 = simd.add_f64x8(acc2, acc3);
119 acc0 = simd.add_f64x8(acc0, acc2);
120
121 let tail = simd.partial_load_f64s(tail);
122
123 simd.reduce_sum_f64s(simd.add_f64x8(acc0, tail))
124 }
125 }
126
127 simd.vectorize(Imp { simd, v })
128 });
129 }
130 }
131
132 fn sum_pulp_dispatch(bencher: Bencher, len: usize) {
133 let v = &*avec![0.0f64; len];
134
135 bencher.bench(|| {
136 struct Imp<'a> {
137 v: &'a [f64],
138 }
139
140 impl pulp::WithSimd for Imp<'_> {
141 type Output = f64;
142
143 #[inline(always)]
144 fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
145 let Self { v } = self;
146
147 let (head, tail) = S::as_simd_f64s(v);
148
149 let mut acc0 = simd.splat_f64s(0.0);
150 let mut acc1 = simd.splat_f64s(0.0);
151 let mut acc2 = simd.splat_f64s(0.0);
152 let mut acc3 = simd.splat_f64s(0.0);
153
154 let (head4, head1) = pulp::as_arrays::<4, _>(head);
155
156 for &[x0, x1, x2, x3] in head4 {
157 acc0 = simd.add_f64s(acc0, x0);
158 acc1 = simd.add_f64s(acc1, x1);
159 acc2 = simd.add_f64s(acc2, x2);
160 acc3 = simd.add_f64s(acc3, x3);
161 }
162
163 for &x0 in head1 {
164 acc0 = simd.add_f64s(acc0, x0);
165 }
166
167 acc0 = simd.add_f64s(acc0, acc1);
168 acc2 = simd.add_f64s(acc2, acc3);
169 acc0 = simd.add_f64s(acc0, acc2);
170
171 simd.reduce_sum_f64s(simd.add_f64s(acc0, simd.partial_load_f64s(tail)))
172 }
173 }
174
175 pulp::Arch::new().dispatch(Imp { v })
176 });
177 }
178
179 pub fn main() -> std::io::Result<()> {
180 let mut bench = Bench::new(BenchConfig::from_args()?);
181
182 bench.register_many(
183 list![sum_scalar, sum_stdarch, sum_pulp, sum_pulp_dispatch],
184 [64, 256, 1024],
185 );
186 bench.run()?;
187 Ok(())
188 }
189}
190
/// Entry point. The benchmarks only exist on x86_64 builds with the `x86-v4`
/// feature enabled; on any other target this program is a no-op.
fn main() -> std::io::Result<()> {
    #[cfg(all(feature = "x86-v4", target_arch = "x86_64"))]
    {
        x86::main()?;
    }
    Ok(())
}