// hotloop_benchmark_avg_simd.rs
#![feature(portable_simd)]
14
15#[cfg(feature = "cast_arrow")]
16use crate::avg_simd::run_benchmark;
17
// Number of elements in each benchmark data set.
pub (crate) const N: usize = 1000000;
// Compile-time SIMD lane width used for the alignment diagnostics
// (and passed to `run_benchmark` as the runtime lane count in `main`).
pub (crate) const SIMD_LANES: usize = 4;
// Number of timed repetitions each benchmark result is averaged over.
pub (crate) const ITERATIONS: usize = 1000;
21
22#[cfg(feature = "cast_arrow")]
23mod avg_simd {
24 use std::hint::black_box;
25 use std::simd::{LaneCount, Simd, SupportedLaneCount};
26 use std::sync::Arc;
27 use std::time::Instant;
28
29 use crate::ITERATIONS;
30 use crate::SIMD_LANES;
31
32 use arrow::array::{
33 Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
34 Int64Array as ArrowI64Array
35 };
36 use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};
37
    /// Sums a slice of `i64` with portable SIMD.
    ///
    /// Processes the slice in `LANES`-wide vectors using four independent
    /// accumulators (4x manual unroll), folds the leftover whole vectors,
    /// then adds the scalar tail. Unaligned vector loads are used, so the
    /// slice's base pointer needs no particular alignment.
    #[inline(always)]
    fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
    where
        LaneCount<LANES>: SupportedLaneCount
    {
        let n = data.len();
        let simd_width = LANES;
        // Number of full LANES-sized vectors that fit in the slice.
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<i64, LANES>;

        // SAFETY: every read covers `simd_width` elements starting at an
        // offset strictly below `simd_chunks * simd_width <= n`, so all
        // loads stay inside `data`. `read_unaligned` is used because the
        // slice is not guaranteed to be aligned to `Simd<i64, LANES>`.
        unsafe {
            let data_ptr = data.as_ptr();
            // Four independent accumulators break the loop-carried
            // dependency chain so the adds can execute in parallel.
            let mut acc1 = Simd::<i64, LANES>::splat(0);
            let mut acc2 = Simd::<i64, LANES>::splat(0);
            let mut acc3 = Simd::<i64, LANES>::splat(0);
            let mut acc4 = Simd::<i64, LANES>::splat(0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            // Main unrolled loop: four vectors per iteration.
            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole vectors left over after the 4x unroll (0..=3 of them).
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal reduction of the vector accumulator.
        let mut result = 0i64;
        for i in 0..LANES {
            result += acc_simd[i];
        }
        // Scalar tail: elements that did not fill a whole vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }
99
    /// Sums a slice of `f64` with portable SIMD.
    ///
    /// Mirror of `simd_sum_i64` for floats: `LANES`-wide vectors, four
    /// independent accumulators (4x manual unroll), leftover whole vectors,
    /// then a scalar tail. NOTE: the four-accumulator reduction reorders
    /// the floating-point additions relative to a sequential sum, so the
    /// result can differ from a naive loop by rounding.
    #[inline(always)]
    fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
    where
        LaneCount<LANES>: SupportedLaneCount
    {
        let n = data.len();
        let simd_width = LANES;
        // Number of full LANES-sized vectors that fit in the slice.
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<f64, LANES>;

        // SAFETY: every read covers `simd_width` elements starting at an
        // offset strictly below `simd_chunks * simd_width <= n`, so all
        // loads stay inside `data`. `read_unaligned` is used because the
        // slice is not guaranteed to be aligned to `Simd<f64, LANES>`.
        unsafe {
            let data_ptr = data.as_ptr();
            // Independent accumulators to break the dependency chain.
            let mut acc1 = Simd::<f64, LANES>::splat(0.0);
            let mut acc2 = Simd::<f64, LANES>::splat(0.0);
            let mut acc3 = Simd::<f64, LANES>::splat(0.0);
            let mut acc4 = Simd::<f64, LANES>::splat(0.0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            // Main unrolled loop: four vectors per iteration.
            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole vectors left over after the 4x unroll (0..=3 of them).
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal reduction of the vector accumulator.
        let mut result = 0.0;
        for i in 0..LANES {
            result += acc_simd[i];
        }
        // Scalar tail: elements that did not fill a whole vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }
161
162 fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
163 match lanes {
164 2 => simd_sum_f64::<2>(data),
165 4 => simd_sum_f64::<4>(data),
166 8 => simd_sum_f64::<8>(data),
167 16 => simd_sum_f64::<16>(data),
168 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported.")
169 }
170 }
171
172 fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
173 match lanes {
174 2 => simd_sum_i64::<2>(data),
175 4 => simd_sum_i64::<4>(data),
176 8 => simd_sum_i64::<8>(data),
177 16 => simd_sum_i64::<16>(data),
178 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported.")
179 }
180 }
181
    /// Runs the full SIMD sum benchmark suite and prints averaged results.
    ///
    /// For both `i64` and `f64` data of length `n`, it times the SIMD sum
    /// over six container variants: raw `Vec`, raw `Vec64`, minarrow typed
    /// array, arrow-rs typed array, minarrow `Array` enum, and arrow-rs
    /// `ArrayRef` (dyn downcast). It also reports whether each container's
    /// buffer happens to be aligned to the SIMD vector type.
    ///
    /// * `n` - number of elements per data set.
    /// * `simd_lanes` - runtime lane width forwarded to the sum kernels.
    ///   NOTE(review): the alignment diagnostics below use the compile-time
    ///   `SIMD_LANES` constant, not this parameter — they only agree when
    ///   callers pass `SIMD_LANES`, as `main` does. Confirm intended.
    pub fn run_benchmark(n: usize, simd_lanes: usize) {
        // Accumulated wall-clock totals per i64 container variant.
        let mut total_vec = std::time::Duration::ZERO;
        let mut total_vec64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct = std::time::Duration::ZERO;
        let mut total_arrow_struct = std::time::Duration::ZERO;
        let mut total_minarrow_enum = std::time::Duration::ZERO;
        let mut total_arrow_dyn = std::time::Duration::ZERO;

        // Accumulated wall-clock totals per f64 container variant.
        let mut total_vec_f64 = std::time::Duration::ZERO;
        let mut total_vec64_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct_f64 = std::time::Duration::ZERO;
        let mut total_arrow_struct_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_enum_f64 = std::time::Duration::ZERO;
        let mut total_arrow_dyn_f64 = std::time::Duration::ZERO;

        // Construction-cost totals (nanoseconds) for Vec vs Vec64.
        let mut sum_vec_i64 = 0u128;
        let mut sum_vec64_i64 = 0u128;

        // Reassigned by `collect()` each iteration below; the final
        // iteration's data is reused for the benchmarks that follow.
        let mut v_int_data = Vec::with_capacity(n);
        let mut v64_int_data = Vec64::with_capacity(n);

        // Measure construction cost of Vec<i64> vs Vec64<i64>.
        for _ in 0..ITERATIONS {
            let t0 = Instant::now();
            v_int_data = (0..n as i64).collect();
            let dur_vec_i64 = t0.elapsed();

            let t1 = Instant::now();
            v64_int_data = (0..n as i64).collect();
            let dur_vec64_i64 = t1.elapsed();

            sum_vec_i64 += dur_vec_i64.as_nanos();
            sum_vec64_i64 += dur_vec64_i64.as_nanos();
        }

        let avg_vec_i64 = sum_vec_i64 as f64 / ITERATIONS as f64;
        let avg_vec64_i64 = sum_vec64_i64 as f64 / ITERATIONS as f64;

        println!("Vec<i64> construction (avg): {}", fmt_duration_ns(avg_vec_i64));
        println!("Vec64<i64> construction (avg): {}", fmt_duration_ns(avg_vec64_i64));
        println!("\n=> Keep the above Vec construction delta in mind when interpreting the below results,
            as it is not included in the benchmarks that follow.\n");

        // --- Alignment diagnostics (i64). Each block builds the container,
        // checks its buffer pointer against the alignment of the SIMD
        // vector type, and drops it. Uses compile-time SIMD_LANES.
        let v_aligned = {
            (&v_int_data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let v64_aligned = {
            (&v64_int_data[0] as *const i64 as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let int_array_aligned = {
            let int_arr = IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None
            };
            let slice = &int_arr[..];
            (slice.as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let i64_arrow_aligned = {
            let arr = ArrowI64Array::from(v_int_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let arr_int_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None
            })));
            let int_arr = array.num().i64().unwrap();
            (int_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let array_ref_int_aligned = {
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(v_int_data.clone()));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            (int_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        // Float source data, shared by all f64 variants below.
        let v_float_data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let v64_float_data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();

        // --- Alignment diagnostics (f64), mirroring the i64 checks.
        let v_float_aligned = {
            (&v_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let v64_float_aligned = {
            (&v64_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let float_arr_aligned = {
            let float_arr = FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None
            };
            (&float_arr.data.as_slice()[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_aligned = {
            let arr = ArrowF64Array::from(v_float_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0
        };

        let float_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None
            })));
            let float_arr = array.num().f64().unwrap();
            (float_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_arr_aligned = {
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(v_float_data.clone()));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            (float_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        // --- Timed benchmark loop. In every case the source data is cloned
        // BEFORE the timer starts; for the array variants the container
        // construction (and any dyn downcast) is INSIDE the timed region,
        // so those averages include wrapping overhead. `black_box` keeps
        // the optimizer from eliding the sums.
        for _ in 0..ITERATIONS {
            // i64: raw Vec.
            let data = v_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec += dur;
            black_box(sum);

            // i64: raw Vec64.
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64 += dur;
            black_box(sum);

            // i64: minarrow typed IntegerArray (construction timed).
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let int_arr = IntegerArray {
                data: Buffer::from(data),
                null_mask: None
            };
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct += dur;
            black_box(sum);

            // i64: arrow-rs typed Int64Array (construction timed).
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr = ArrowI64Array::from(data);
            let sum = simd_sum_i64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct += dur;
            black_box(sum);

            // i64: minarrow Array enum + accessor unwrap (timed).
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(data),
                null_mask: None
            })));
            let int_arr = array.num().i64().unwrap();
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum += dur;
            black_box(sum);

            // i64: arrow-rs ArrayRef + dyn downcast (timed).
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            let sum = simd_sum_i64_runtime(int_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn += dur;
            black_box(sum);

            // f64: raw Vec.
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec_f64 += dur;
            black_box(sum);

            // f64: raw Vec64.
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64_f64 += dur;
            black_box(sum);

            // f64: minarrow typed FloatArray (construction timed).
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let float_arr = FloatArray {
                data: Buffer::from(data),
                null_mask: None
            };
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct_f64 += dur;
            black_box(sum);

            // f64: arrow-rs typed Float64Array (construction timed).
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr = ArrowF64Array::from(data);
            let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct_f64 += dur;
            black_box(sum);

            // f64: minarrow Array enum + accessor unwrap (timed).
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(data),
                null_mask: None
            })));
            let float_arr = array.num().f64().unwrap();
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum_f64 += dur;
            black_box(sum);

            // f64: arrow-rs ArrayRef + dyn downcast (timed).
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            let sum = simd_sum_f64_runtime(float_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn_f64 += dur;
            black_box(sum);
        }

        // --- Reporting: per-variant averages in nanoseconds.
        println!("Averaged Results from {} runs:", ITERATIONS);
        println!("---------------------------------");

        let avg_vec = total_vec.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64 = total_vec64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct = total_minarrow_direct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct = total_arrow_struct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum = total_minarrow_enum.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn = total_arrow_dyn.as_nanos() as f64 / ITERATIONS as f64;

        let avg_vec_f64 = total_vec_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64_f64 = total_vec64_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct_f64 =
            total_minarrow_direct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct_f64 = total_arrow_struct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum_f64 = total_minarrow_enum_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn_f64 = total_arrow_dyn_f64.as_nanos() as f64 / ITERATIONS as f64;

        println!("|------------ Integer Tests (SIMD) ------------|");
        println!(
            "raw vec: Vec<i64> avg = {} (n={})",
            fmt_duration_ns(avg_vec),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<i64> avg = {} (n={})",
            fmt_duration_ns(avg_vec64),
            ITERATIONS
        );
        println!(
            "minarrow direct: IntegerArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Int64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct),
            ITERATIONS
        );
        println!(
            "minarrow enum: IntegerArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Int64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn),
            ITERATIONS
        );

        println!();
        println!("|------------ Float Tests (SIMD) --------------|");
        println!(
            "raw vec: Vec<f64> avg = {} (n={})",
            fmt_duration_ns(avg_vec_f64),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<f64> avg = {} (n={})",
            fmt_duration_ns(avg_vec64_f64),
            ITERATIONS
        );
        println!(
            "minarrow direct: FloatArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Float64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct_f64),
            ITERATIONS
        );
        println!(
            "minarrow enum: FloatArray avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Float64Array avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn_f64),
            ITERATIONS
        );

        println!("\n=> Vec64 backs the above `Minarrow` types and `Vec` backs Arrow_Rs.");

        // Alignment summaries gathered before the timed loop.
        println!("\nVerify SIMD pointer alignment for Integer calculations (based on lane width):");
        println!("Vec<i64> is aligned: {}", v_aligned);
        println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
        println!("Minarrow IntegerArray<i64> is aligned: {}", int_array_aligned);
        println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
        println!("Minarrow Array::NumericArray<i64> is aligned: {}", arr_int_enum_aligned);
        println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);

        println!("\nVerify SIMD pointer alignment for Float calculations (based on lane width):");
        println!("Vec<f64> is aligned: {}", v_float_aligned);
        println!("Vec64<f64> is aligned: {}", v64_float_aligned);
        println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
        println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
        println!("Array::NumericArray<f64> is aligned: {}", float_enum_aligned);
        println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);

        println!("\n---------------------- END OF SIMD AVG BENCHMARKS ---------------------------");
    }
551
552 fn fmt_duration_ns(avg_ns: f64) -> String {
553 if avg_ns < 1000.0 {
554 format!("{:.0} ns", avg_ns)
555 } else if avg_ns < 1_000_000.0 {
556 format!("{:.3} µs", avg_ns / 1000.0)
557 } else {
558 format!("{:.3} ms", avg_ns / 1_000_000.0)
559 }
560 }
561}
562
563fn main() {
564 if cfg!(feature = "cast_arrow") {
565 use crate::N;
566 println!(
567 "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={}, iters={})",
568 N, SIMD_LANES, ITERATIONS
569 );
570 #[cfg(feature = "cast_arrow")]
571 run_benchmark(N, SIMD_LANES);
572 } else {
573 println!(
574 "The hotloop_benchmark_avg_simd example requires enabling the `cast_arrow` feature."
575 )
576 }
577}