hotloop_benchmark_avg_simd/
hotloop_benchmark_avg_simd.rs

//! ---------------------------------------------------------
//! Runs averaged sum benchmark comparisons on `Minarrow` and `Arrow-Rs`,
//! at various layers of library abstraction:
//!
//!     1. Raw Vec / Vec64
//!     2. Typed "inner" arrays
//!     3. Top-level unified `Array` type
//!
//! Run with:
//!     RUSTFLAGS="-C target-cpu=native" cargo run --release --example hotloop_benchmark_avg_simd
//! ---------------------------------------------------------

#![feature(portable_simd)]

#[cfg(feature = "cast_arrow")]
use crate::avg_simd::run_benchmark;

/// Number of elements in each benchmark array.
pub(crate) const N: usize = 1_000_000;
/// SIMD lane width used by the compile-time alignment checks (and passed to `run_benchmark`).
pub(crate) const SIMD_LANES: usize = 4;
/// Number of timed repetitions each measurement is averaged over.
pub(crate) const ITERATIONS: usize = 1000;

#[cfg(feature = "cast_arrow")]
mod avg_simd {
    use std::hint::black_box;
    use std::simd::{LaneCount, Simd, SupportedLaneCount};
    use std::sync::Arc;
    use std::time::Instant;

    use crate::ITERATIONS;
    use crate::SIMD_LANES;

    use arrow::array::{
        Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
        Int64Array as ArrowI64Array
    };
    use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};

38    #[inline(always)]
39    fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
40    where
41        LaneCount<LANES>: SupportedLaneCount
42    {
43        let n = data.len();
44        let simd_width = LANES;
45        let simd_chunks = n / simd_width;
46
47        let mut acc_simd: Simd<i64, LANES>;
48
49        unsafe {
50            let data_ptr = data.as_ptr();
51            let mut acc1 = Simd::<i64, LANES>::splat(0);
52            let mut acc2 = Simd::<i64, LANES>::splat(0);
53            let mut acc3 = Simd::<i64, LANES>::splat(0);
54            let mut acc4 = Simd::<i64, LANES>::splat(0);
55
56            let unroll_factor = 4;
57            let unrolled_chunks = simd_chunks / unroll_factor;
58
59            for i in 0..unrolled_chunks {
60                let base_offset = i * unroll_factor * simd_width;
61                let v1 =
62                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
63                let v2 = std::ptr::read_unaligned(
64                    data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
65                );
66                let v3 = std::ptr::read_unaligned(
67                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
68                );
69                let v4 = std::ptr::read_unaligned(
70                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
71                );
72                acc1 += v1;
73                acc2 += v2;
74                acc3 += v3;
75                acc4 += v4;
76            }
77
78            acc_simd = acc1 + acc2 + acc3 + acc4;
79
80            let processed = unrolled_chunks * unroll_factor;
81            for i in processed..simd_chunks {
82                let offset = i * simd_width;
83                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
84                acc_simd += v;
85            }
86        }
87
88        let mut result = 0i64;
89        for i in 0..LANES {
90            result += acc_simd[i];
91        }
92        let remainder_start = simd_chunks * simd_width;
93        for i in remainder_start..n {
94            result += data[i];
95        }
96
97        result
98    }
100    #[inline(always)]
101    fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
102    where
103        LaneCount<LANES>: SupportedLaneCount
104    {
105        let n = data.len();
106        let simd_width = LANES;
107        let simd_chunks = n / simd_width;
108
109        let mut acc_simd: Simd<f64, LANES>;
110
111        unsafe {
112            let data_ptr = data.as_ptr();
113            let mut acc1 = Simd::<f64, LANES>::splat(0.0);
114            let mut acc2 = Simd::<f64, LANES>::splat(0.0);
115            let mut acc3 = Simd::<f64, LANES>::splat(0.0);
116            let mut acc4 = Simd::<f64, LANES>::splat(0.0);
117
118            let unroll_factor = 4;
119            let unrolled_chunks = simd_chunks / unroll_factor;
120
121            for i in 0..unrolled_chunks {
122                let base_offset = i * unroll_factor * simd_width;
123                let v1 =
124                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
125                let v2 = std::ptr::read_unaligned(
126                    data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
127                );
128                let v3 = std::ptr::read_unaligned(
129                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
130                );
131                let v4 = std::ptr::read_unaligned(
132                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
133                );
134                acc1 += v1;
135                acc2 += v2;
136                acc3 += v3;
137                acc4 += v4;
138            }
139
140            acc_simd = acc1 + acc2 + acc3 + acc4;
141
142            let processed = unrolled_chunks * unroll_factor;
143            for i in processed..simd_chunks {
144                let offset = i * simd_width;
145                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
146                acc_simd += v;
147            }
148        }
149
150        let mut result = 0.0;
151        for i in 0..LANES {
152            result += acc_simd[i];
153        }
154        let remainder_start = simd_chunks * simd_width;
155        for i in remainder_start..n {
156            result += data[i];
157        }
158
159        result
160    }
162    fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
163        match lanes {
164            2 => simd_sum_f64::<2>(data),
165            4 => simd_sum_f64::<4>(data),
166            8 => simd_sum_f64::<8>(data),
167            16 => simd_sum_f64::<16>(data),
168            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported.")
169        }
170    }
172    fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
173        match lanes {
174            2 => simd_sum_i64::<2>(data),
175            4 => simd_sum_i64::<4>(data),
176            8 => simd_sum_i64::<8>(data),
177            16 => simd_sum_i64::<16>(data),
178            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported.")
179        }
180    }
181
    /// Runs the full averaged SIMD-sum benchmark suite over `i64` and `f64`
    /// data across six representations each: raw `Vec` / `Vec64`, the typed
    /// minarrow structs (`IntegerArray` / `FloatArray`), the arrow-rs structs
    /// (`Int64Array` / `Float64Array`), and the dynamic wrappers of both
    /// libraries (minarrow's `Array` enum vs arrow-rs's `ArrayRef`).
    ///
    /// * `n` - number of elements in each benchmark array.
    /// * `simd_lanes` - runtime SIMD lane width handed to the dispatchers
    ///   (must be 2, 4, 8 or 16).
    ///
    /// Prints per-representation averages over `ITERATIONS` runs plus SIMD
    /// pointer-alignment diagnostics. Vector construction is timed separately
    /// up front and excluded from the per-representation loops; the `.clone()`
    /// feeding each measurement is taken before its timer starts.
    ///
    /// # Panics
    /// - if `simd_lanes` is not 2, 4, 8 or 16 (inside the runtime dispatchers);
    /// - if `n == 0` (the alignment checks index element 0).
    ///
    /// NOTE(review): the alignment checks use the compile-time `SIMD_LANES`
    /// constant while the timed loops use the runtime `simd_lanes` argument;
    /// the printed alignment results only describe the timed runs when the
    /// two agree — TODO confirm intent.
    pub fn run_benchmark(n: usize, simd_lanes: usize) {
        // Running totals for the i64 benchmarks, one per representation.
        let mut total_vec = std::time::Duration::ZERO;
        let mut total_vec64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct = std::time::Duration::ZERO;
        let mut total_arrow_struct = std::time::Duration::ZERO;
        let mut total_minarrow_enum = std::time::Duration::ZERO;
        let mut total_arrow_dyn = std::time::Duration::ZERO;

        // Running totals for the f64 benchmarks, one per representation.
        let mut total_vec_f64 = std::time::Duration::ZERO;
        let mut total_vec64_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct_f64 = std::time::Duration::ZERO;
        let mut total_arrow_struct_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_enum_f64 = std::time::Duration::ZERO;
        let mut total_arrow_dyn_f64 = std::time::Duration::ZERO;

        // Data construction - This is the only part we
        // exclude from the overall benchmark, however, we time Vec
        // vs. Vec64 here as an indicative profile, given this is the
        // starting setup of all other reference points.
        let mut sum_vec_i64 = 0u128;
        let mut sum_vec64_i64 = 0u128;

        // for keeping scope alive
        // after the Vec benchmarks, we keep the last one each
        // (the initial capacity is discarded by the reassignment below)
        let mut v_int_data = Vec::with_capacity(n);
        let mut v64_int_data = Vec64::with_capacity(n);

        for _ in 0..ITERATIONS {
            let t0 = Instant::now();
            v_int_data = (0..n as i64).collect();
            let dur_vec_i64 = t0.elapsed();

            let t1 = Instant::now();
            v64_int_data = (0..n as i64).collect();
            let dur_vec64_i64 = t1.elapsed();

            sum_vec_i64 += dur_vec_i64.as_nanos();
            sum_vec64_i64 += dur_vec64_i64.as_nanos();
        }

        let avg_vec_i64 = sum_vec_i64 as f64 / ITERATIONS as f64;
        let avg_vec64_i64 = sum_vec64_i64 as f64 / ITERATIONS as f64;

        println!("Vec<i64> construction (avg):    {}", fmt_duration_ns(avg_vec_i64));
        println!("Vec64<i64> construction (avg):  {}", fmt_duration_ns(avg_vec64_i64));
        println!("\n=> Keep the above Vec construction delta in mind when interpreting the below results,
    as it is not included in the benchmarks that follow.\n");

        // Alignment checks - once, outside timing
        // Each checks whether element 0 of the backing buffer sits on a
        // `Simd<_, SIMD_LANES>` alignment boundary (panics if n == 0).

        let v_aligned = {
            (&v_int_data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let v64_aligned = {
            (&v64_int_data[0] as *const i64 as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let int_array_aligned = {
            let int_arr = IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None
            };
            let slice = &int_arr[..];
            (slice.as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let i64_arrow_aligned = {
            let arr = ArrowI64Array::from(v_int_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let arr_int_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None
            })));
            let int_arr = array.num().i64().unwrap();
            (int_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let array_ref_int_aligned = {
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(v_int_data.clone()));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            (int_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        // Float source data (same values as the integer arrays, cast to f64).
        let v_float_data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let v64_float_data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();

        let v_float_aligned = {
            (&v_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let v64_float_aligned = {
            (&v64_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let float_arr_aligned = {
            let float_arr = FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None
            };
            (&float_arr.data.as_slice()[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_aligned = {
            let arr = ArrowF64Array::from(v_float_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0
        };

        let float_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None
            })));
            let float_arr = array.num().f64().unwrap();
            (float_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_arr_aligned = {
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(v_float_data.clone()));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            (float_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        // Timed loops. Each measurement clones its source *before* starting
        // the timer; wrapper/array construction (where present) is inside the
        // timed region so the abstraction overhead is what gets measured.
        for _ in 0..ITERATIONS {
            // --- Integer (i64) tests ---
            // Raw Vec<i64>
            let data = v_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec += dur;
            black_box(sum);

            // Raw Vec64<i64>
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64 += dur;
            black_box(sum);

            // Minarrow i64 (direct struct)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let int_arr = IntegerArray {
                data: Buffer::from(data),
                null_mask: None
            };
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct += dur;
            black_box(sum);

            // Arrow i64 (struct direct)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr = ArrowI64Array::from(data);
            let sum = simd_sum_i64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct += dur;
            black_box(sum);

            // Minarrow i64 (enum)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(data),
                null_mask: None
            })));
            let int_arr = array.num().i64().unwrap();
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum += dur;
            black_box(sum);

            // Arrow i64 (dynamic)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            let sum = simd_sum_i64_runtime(int_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn += dur;
            black_box(sum);

            // --- Float (f64) tests ---

            // Raw Vec<f64>
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec_f64 += dur;
            black_box(sum);

            // Raw Vec64<f64>
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64_f64 += dur;
            black_box(sum);

            // Minarrow f64 (direct struct)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let float_arr = FloatArray {
                data: Buffer::from(data),
                null_mask: None
            };
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct_f64 += dur;
            black_box(sum);

            // Arrow f64 (struct direct)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr = ArrowF64Array::from(data);
            let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct_f64 += dur;
            black_box(sum);

            // Minarrow f64 (enum)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(data),
                null_mask: None
            })));
            let float_arr = array.num().f64().unwrap();
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum_f64 += dur;
            black_box(sum);

            // Arrow f64 (dynamic)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            let sum = simd_sum_f64_runtime(float_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn_f64 += dur;
            black_box(sum);
        }

        println!("Averaged Results from {} runs:", ITERATIONS);
        println!("---------------------------------");

        // Convert accumulated totals to per-iteration averages in nanoseconds.
        let avg_vec = total_vec.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64 = total_vec64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct = total_minarrow_direct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct = total_arrow_struct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum = total_minarrow_enum.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn = total_arrow_dyn.as_nanos() as f64 / ITERATIONS as f64;

        let avg_vec_f64 = total_vec_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64_f64 = total_vec64_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct_f64 =
            total_minarrow_direct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct_f64 = total_arrow_struct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum_f64 = total_minarrow_enum_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn_f64 = total_arrow_dyn_f64.as_nanos() as f64 / ITERATIONS as f64;

        println!("|------------ Integer Tests (SIMD) ------------|");
        println!(
            "raw vec: Vec<i64>                             avg = {} (n={})",
            fmt_duration_ns(avg_vec),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<i64>                         avg = {} (n={})",
            fmt_duration_ns(avg_vec64),
            ITERATIONS
        );
        println!(
            "minarrow direct: IntegerArray                  avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Int64Array                   avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct),
            ITERATIONS
        );
        println!(
            "minarrow enum: IntegerArray                   avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Int64Array                      avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn),
            ITERATIONS
        );

        println!();
        println!("|------------ Float Tests (SIMD) --------------|");
        println!(
            "raw vec: Vec<f64>                             avg = {} (n={})",
            fmt_duration_ns(avg_vec_f64),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<f64>                         avg = {} (n={})",
            fmt_duration_ns(avg_vec64_f64),
            ITERATIONS
        );
        println!(
            "minarrow direct: FloatArray                   avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Float64Array                 avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct_f64),
            ITERATIONS
        );
        println!(
            "minarrow enum: FloatArray                     avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Float64Array                    avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn_f64),
            ITERATIONS
        );

        println!("\n=> Vec64 backs the above `Minarrow` types and `Vec` backs Arrow_Rs.");

        println!("\nVerify SIMD pointer alignment for Integer calculations (based on lane width):");
        println!("Vec<i64> is aligned: {}", v_aligned);
        println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
        println!("Minarrow IntegerArray<i64> is aligned: {}", int_array_aligned);
        println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
        println!("Minarrow Array::NumericArray<i64> is aligned: {}", arr_int_enum_aligned);
        println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);

        println!("\nVerify SIMD pointer alignment for Float calculations (based on lane width):");
        println!("Vec<f64> is aligned: {}", v_float_aligned);
        println!("Vec64<f64> is aligned: {}", v64_float_aligned);
        println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
        println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
        println!("Array::NumericArray<f64> is aligned: {}", float_enum_aligned);
        println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);

        println!("\n---------------------- END OF SIMD AVG BENCHMARKS ---------------------------");
    }
552    fn fmt_duration_ns(avg_ns: f64) -> String {
553        if avg_ns < 1000.0 {
554            format!("{:.0} ns", avg_ns)
555        } else if avg_ns < 1_000_000.0 {
556            format!("{:.3} µs", avg_ns / 1000.0)
557        } else {
558            format!("{:.3} ms", avg_ns / 1_000_000.0)
559        }
560    }
}

/// Entry point: runs the SIMD average benchmarks when the `cast_arrow`
/// feature is enabled; otherwise explains how to enable it.
///
/// Uses compile-time `#[cfg]` blocks instead of the previous runtime
/// `cfg!(...)` branch combined with a `#[cfg]`-gated call — the same feature
/// test expressed once, with no dead branch compiled in.
fn main() {
    #[cfg(feature = "cast_arrow")]
    {
        println!(
            "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={}, iters={})",
            N, SIMD_LANES, ITERATIONS
        );
        run_benchmark(N, SIMD_LANES);
    }
    #[cfg(not(feature = "cast_arrow"))]
    println!(
        "The hotloop_benchmark_avg_simd example requires enabling the `cast_arrow` feature."
    )
}