hotloop_benchmark_avg_simd/
hotloop_benchmark_avg_simd.rs1#![feature(portable_simd)]
14
15#[cfg(feature = "cast_arrow")]
16use crate::avg_simd::run_benchmark;
17
18pub(crate) const N: usize = 1000000;
19pub(crate) const SIMD_LANES: usize = 4;
20pub(crate) const ITERATIONS: usize = 1000;
21
22#[cfg(feature = "cast_arrow")]
23mod avg_simd {
24 use std::hint::black_box;
25 use std::simd::{LaneCount, Simd, SupportedLaneCount};
26 use std::sync::Arc;
27 use std::time::Instant;
28
29 use crate::ITERATIONS;
30 use crate::SIMD_LANES;
31
32 use arrow::array::{
33 Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
34 Int64Array as ArrowI64Array,
35 };
36 use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};
37
38 #[inline(always)]
39 fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
40 where
41 LaneCount<LANES>: SupportedLaneCount,
42 {
43 let n = data.len();
44 let simd_width = LANES;
45 let simd_chunks = n / simd_width;
46
47 let mut acc_simd: Simd<i64, LANES>;
48
49 unsafe {
50 let data_ptr = data.as_ptr();
51 let mut acc1 = Simd::<i64, LANES>::splat(0);
52 let mut acc2 = Simd::<i64, LANES>::splat(0);
53 let mut acc3 = Simd::<i64, LANES>::splat(0);
54 let mut acc4 = Simd::<i64, LANES>::splat(0);
55
56 let unroll_factor = 4;
57 let unrolled_chunks = simd_chunks / unroll_factor;
58
59 for i in 0..unrolled_chunks {
60 let base_offset = i * unroll_factor * simd_width;
61 let v1 =
62 std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
63 let v2 = std::ptr::read_unaligned(
64 data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
65 );
66 let v3 = std::ptr::read_unaligned(
67 data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
68 );
69 let v4 = std::ptr::read_unaligned(
70 data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
71 );
72 acc1 += v1;
73 acc2 += v2;
74 acc3 += v3;
75 acc4 += v4;
76 }
77
78 acc_simd = acc1 + acc2 + acc3 + acc4;
79
80 let processed = unrolled_chunks * unroll_factor;
81 for i in processed..simd_chunks {
82 let offset = i * simd_width;
83 let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
84 acc_simd += v;
85 }
86 }
87
88 let mut result = 0i64;
89 for i in 0..LANES {
90 result += acc_simd[i];
91 }
92 let remainder_start = simd_chunks * simd_width;
93 for i in remainder_start..n {
94 result += data[i];
95 }
96
97 result
98 }
99
100 #[inline(always)]
101 fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
102 where
103 LaneCount<LANES>: SupportedLaneCount,
104 {
105 let n = data.len();
106 let simd_width = LANES;
107 let simd_chunks = n / simd_width;
108
109 let mut acc_simd: Simd<f64, LANES>;
110
111 unsafe {
112 let data_ptr = data.as_ptr();
113 let mut acc1 = Simd::<f64, LANES>::splat(0.0);
114 let mut acc2 = Simd::<f64, LANES>::splat(0.0);
115 let mut acc3 = Simd::<f64, LANES>::splat(0.0);
116 let mut acc4 = Simd::<f64, LANES>::splat(0.0);
117
118 let unroll_factor = 4;
119 let unrolled_chunks = simd_chunks / unroll_factor;
120
121 for i in 0..unrolled_chunks {
122 let base_offset = i * unroll_factor * simd_width;
123 let v1 =
124 std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
125 let v2 = std::ptr::read_unaligned(
126 data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
127 );
128 let v3 = std::ptr::read_unaligned(
129 data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
130 );
131 let v4 = std::ptr::read_unaligned(
132 data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
133 );
134 acc1 += v1;
135 acc2 += v2;
136 acc3 += v3;
137 acc4 += v4;
138 }
139
140 acc_simd = acc1 + acc2 + acc3 + acc4;
141
142 let processed = unrolled_chunks * unroll_factor;
143 for i in processed..simd_chunks {
144 let offset = i * simd_width;
145 let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
146 acc_simd += v;
147 }
148 }
149
150 let mut result = 0.0;
151 for i in 0..LANES {
152 result += acc_simd[i];
153 }
154 let remainder_start = simd_chunks * simd_width;
155 for i in remainder_start..n {
156 result += data[i];
157 }
158
159 result
160 }
161
162 fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
163 match lanes {
164 2 => simd_sum_f64::<2>(data),
165 4 => simd_sum_f64::<4>(data),
166 8 => simd_sum_f64::<8>(data),
167 16 => simd_sum_f64::<16>(data),
168 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
169 }
170 }
171
172 fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
173 match lanes {
174 2 => simd_sum_i64::<2>(data),
175 4 => simd_sum_i64::<4>(data),
176 8 => simd_sum_i64::<8>(data),
177 16 => simd_sum_i64::<16>(data),
178 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
179 }
180 }
181
    /// Runs the full i64 + f64 SIMD-sum benchmark matrix across six container
    /// flavours — raw `Vec`, `Vec64`, minarrow direct struct, arrow-rs struct,
    /// minarrow `Array` enum, arrow-rs `ArrayRef` (dyn) — printing per-flavour
    /// averages over `ITERATIONS` runs plus SIMD pointer-alignment diagnostics
    /// for each container.
    ///
    /// * `n` — element count per array.
    /// * `simd_lanes` — runtime SIMD width passed to the summation kernels.
    ///
    /// NOTE(review): the alignment diagnostics use the compile-time
    /// `SIMD_LANES` constant while the kernels use the runtime `simd_lanes`
    /// argument; they only describe the same width when callers pass
    /// `SIMD_LANES` — confirm at call sites.
    pub fn run_benchmark(n: usize, simd_lanes: usize) {
        // Accumulated wall time per container flavour, i64 kernels.
        let mut total_vec = std::time::Duration::ZERO;
        let mut total_vec64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct = std::time::Duration::ZERO;
        let mut total_arrow_struct = std::time::Duration::ZERO;
        let mut total_minarrow_enum = std::time::Duration::ZERO;
        let mut total_arrow_dyn = std::time::Duration::ZERO;

        // Accumulated wall time per container flavour, f64 kernels.
        let mut total_vec_f64 = std::time::Duration::ZERO;
        let mut total_vec64_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct_f64 = std::time::Duration::ZERO;
        let mut total_arrow_struct_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_enum_f64 = std::time::Duration::ZERO;
        let mut total_arrow_dyn_f64 = std::time::Duration::ZERO;

        // Nanosecond totals for the Vec/Vec64 construction-cost comparison.
        let mut sum_vec_i64 = 0u128;
        let mut sum_vec64_i64 = 0u128;

        // NOTE(review): these with_capacity buffers are immediately replaced
        // by the `collect` assignments inside the loop below, so the
        // preallocation is discarded each iteration.
        let mut v_int_data = Vec::with_capacity(n);
        let mut v64_int_data = Vec64::with_capacity(n);

        // Measure pure construction cost of Vec<i64> vs Vec64<i64>; each
        // `collect` allocates a fresh buffer and drops the previous one.
        for _ in 0..ITERATIONS {
            let t0 = Instant::now();
            v_int_data = (0..n as i64).collect();
            let dur_vec_i64 = t0.elapsed();

            let t1 = Instant::now();
            v64_int_data = (0..n as i64).collect();
            let dur_vec64_i64 = t1.elapsed();

            sum_vec_i64 += dur_vec_i64.as_nanos();
            sum_vec64_i64 += dur_vec64_i64.as_nanos();
        }

        let avg_vec_i64 = sum_vec_i64 as f64 / ITERATIONS as f64;
        let avg_vec64_i64 = sum_vec64_i64 as f64 / ITERATIONS as f64;

        println!(
            "Vec<i64> construction (avg): {}",
            fmt_duration_ns(avg_vec_i64)
        );
        println!(
            "Vec64<i64> construction (avg): {}",
            fmt_duration_ns(avg_vec64_i64)
        );
        println!("\n=> Keep the above Vec construction delta in mind when interpreting the below results,
 as it is not included in the benchmarks that follow.\n");

        // ---- Alignment diagnostics (i64 containers) ----
        // Each check builds a throwaway copy of the data and records whether
        // its data pointer satisfies the alignment of Simd<i64, SIMD_LANES>.
        let v_aligned = {
            (&v_int_data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let v64_aligned = {
            (&v64_int_data[0] as *const i64 as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let int_array_aligned = {
            let int_arr = IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None,
            };
            let slice = &int_arr[..];
            (slice.as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let i64_arrow_aligned = {
            let arr = ArrowI64Array::from(v_int_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let arr_int_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None,
            })));
            let int_arr = array.num().i64().unwrap();
            (int_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let array_ref_int_aligned = {
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(v_int_data.clone()));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            (int_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        // Source data for the f64 benchmarks (built once, cloned per run).
        let v_float_data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let v64_float_data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();

        // ---- Alignment diagnostics (f64 containers) ----
        let v_float_aligned = {
            (&v_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let v64_float_aligned = {
            (&v64_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let float_arr_aligned = {
            let float_arr = FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None,
            };
            (&float_arr.data.as_slice()[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_aligned = {
            let arr = ArrowF64Array::from(v_float_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0
        };

        let float_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None,
            })));
            let float_arr = array.num().f64().unwrap();
            (float_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_arr_aligned = {
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(v_float_data.clone()));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            (float_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        // ---- Main timed loop ----
        // The source data is cloned *before* the timer starts in every case,
        // but for the minarrow/arrow flavours the wrapper construction
        // (IntegerArray / ArrowI64Array / enum / ArrayRef + downcast) happens
        // *inside* the timed region, so those rows include wrapper overhead.
        for _ in 0..ITERATIONS {
            // i64: raw Vec
            let data = v_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec += dur;
            black_box(sum); // keep the result live so the sum isn't optimised away

            // i64: Vec64
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64 += dur;
            black_box(sum);

            // i64: minarrow direct struct (construction timed)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let int_arr = IntegerArray {
                data: Buffer::from(data),
                null_mask: None,
            };
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct += dur;
            black_box(sum);

            // i64: arrow-rs struct (construction timed)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr = ArrowI64Array::from(data);
            let sum = simd_sum_i64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct += dur;
            black_box(sum);

            // i64: minarrow enum wrapper (construction + unwrap timed)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(data),
                null_mask: None,
            })));
            let int_arr = array.num().i64().unwrap();
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum += dur;
            black_box(sum);

            // i64: arrow-rs dyn ArrayRef (construction + downcast timed)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            let sum = simd_sum_i64_runtime(int_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn += dur;
            black_box(sum);

            // f64: raw Vec
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec_f64 += dur;
            black_box(sum);

            // f64: Vec64
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64_f64 += dur;
            black_box(sum);

            // f64: minarrow direct struct (construction timed)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let float_arr = FloatArray {
                data: Buffer::from(data),
                null_mask: None,
            };
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct_f64 += dur;
            black_box(sum);

            // f64: arrow-rs struct (construction timed)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr = ArrowF64Array::from(data);
            let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct_f64 += dur;
            black_box(sum);

            // f64: minarrow enum wrapper (construction + unwrap timed)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(data),
                null_mask: None,
            })));
            let float_arr = array.num().f64().unwrap();
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum_f64 += dur;
            black_box(sum);

            // f64: arrow-rs dyn ArrayRef (construction + downcast timed)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            let sum = simd_sum_f64_runtime(float_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn_f64 += dur;
            black_box(sum);
        }

        // ---- Reporting ----
        println!("Averaged Results from {} runs:", ITERATIONS);
        println!("---------------------------------");

        let avg_vec = total_vec.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64 = total_vec64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct = total_minarrow_direct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct = total_arrow_struct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum = total_minarrow_enum.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn = total_arrow_dyn.as_nanos() as f64 / ITERATIONS as f64;

        let avg_vec_f64 = total_vec_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64_f64 = total_vec64_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct_f64 =
            total_minarrow_direct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct_f64 = total_arrow_struct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum_f64 = total_minarrow_enum_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn_f64 = total_arrow_dyn_f64.as_nanos() as f64 / ITERATIONS as f64;

        println!("|------------ Integer Tests (SIMD) ------------|");
        println!(
            "raw vec: Vec<i64> avg = {} (n={})",
            fmt_duration_ns(avg_vec),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<i64> avg = {} (n={})",
            fmt_duration_ns(avg_vec64),
            ITERATIONS
        );
        println!(
            "minarrow direct: IntegerArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Int64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct),
            ITERATIONS
        );
        println!(
            "minarrow enum: IntegerArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Int64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn),
            ITERATIONS
        );

        println!();
        println!("|------------ Float Tests (SIMD) --------------|");
        println!(
            "raw vec: Vec<f64> avg = {} (n={})",
            fmt_duration_ns(avg_vec_f64),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<f64> avg = {} (n={})",
            fmt_duration_ns(avg_vec64_f64),
            ITERATIONS
        );
        println!(
            "minarrow direct: FloatArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Float64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct_f64),
            ITERATIONS
        );
        println!(
            "minarrow enum: FloatArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Float64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn_f64),
            ITERATIONS
        );

        println!("\n=> Vec64 backs the above `Minarrow` types and `Vec` backs Arrow_Rs.");

        println!("\nVerify SIMD pointer alignment for Integer calculations (based on lane width):");
        println!("Vec<i64> is aligned: {}", v_aligned);
        println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
        println!(
            "Minarrow IntegerArray<i64> is aligned: {}",
            int_array_aligned
        );
        println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
        println!(
            "Minarrow Array::NumericArray<i64> is aligned: {}",
            arr_int_enum_aligned
        );
        println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);

        println!("\nVerify SIMD pointer alignment for Float calculations (based on lane width):");
        println!("Vec<f64> is aligned: {}", v_float_aligned);
        println!("Vec64<f64> is aligned: {}", v64_float_aligned);
        println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
        println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
        println!(
            "Array::NumericArray<f64> is aligned: {}",
            float_enum_aligned
        );
        println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);

        println!("\n---------------------- END OF SIMD AVG BENCHMARKS ---------------------------");
    }
566
567 fn fmt_duration_ns(avg_ns: f64) -> String {
568 if avg_ns < 1000.0 {
569 format!("{:.0} ns", avg_ns)
570 } else if avg_ns < 1_000_000.0 {
571 format!("{:.3} µs", avg_ns / 1000.0)
572 } else {
573 format!("{:.3} ms", avg_ns / 1_000_000.0)
574 }
575 }
576}
577
/// Entry point: runs the SIMD/Arrow/minarrow parity benchmarks when the
/// `cast_arrow` feature is enabled, otherwise prints how to enable it.
///
/// Fixes: dropped the redundant `use crate::N;` (N is declared at crate
/// root and already in scope here) and replaced the mixed runtime
/// `cfg!(...)` check + inner `#[cfg]` attribute with two plain
/// compile-time `#[cfg]` branches producing identical output.
fn main() {
    #[cfg(feature = "cast_arrow")]
    {
        println!(
            "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={}, iters={})",
            N, SIMD_LANES, ITERATIONS
        );
        run_benchmark(N, SIMD_LANES);
    }
    #[cfg(not(feature = "cast_arrow"))]
    println!(
        "The hotloop_benchmark_avg_simd example requires enabling the `cast_arrow` feature."
    );
}