hotloop_benchmark_avg_simd/
hotloop_benchmark_avg_simd.rs

1//! ---------------------------------------------------------
2//! Runs averaged sum benchmark comparisons on `Minarrow` and `Arrow-Rs`,
3//! at various layers of library abstraction:
4//!
5//!     1. Raw Vec / Vec64
6//!     2. Typed "inner" arrays
7//!     3. Top-level unified `Array` type
8//!
9//! Run with:
10//!     RUSTFLAGS="-C target-cpu=native" cargo run --release --example hotloop_benchmark_avg_simd
11//! ---------------------------------------------------------
12
13#![feature(portable_simd)]
14
15#[cfg(feature = "cast_arrow")]
16use crate::avg_simd::run_benchmark;
17
/// Number of elements in each benchmark array.
pub(crate) const N: usize = 1000000;
/// SIMD lane count passed to the runtime dispatchers and used (at compile
/// time) for the pointer-alignment diagnostics.
pub(crate) const SIMD_LANES: usize = 4;
/// Number of timed repetitions each benchmark is averaged over.
pub(crate) const ITERATIONS: usize = 1000;
21
22#[cfg(feature = "cast_arrow")]
23mod avg_simd {
24    use std::hint::black_box;
25    use std::simd::{LaneCount, Simd, SupportedLaneCount};
26    use std::sync::Arc;
27    use std::time::Instant;
28
29    use crate::ITERATIONS;
30    use crate::SIMD_LANES;
31
32    use arrow::array::{
33        Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
34        Int64Array as ArrowI64Array,
35    };
36    use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};
37
    /// Sums an `i64` slice with portable SIMD, using four independent vector
    /// accumulators (4-way unrolling) to hide add latency.
    ///
    /// Processing order: unrolled groups of 4 SIMD vectors, then any leftover
    /// whole SIMD vectors, then a scalar tail for the final `n % LANES`
    /// elements. NOTE(review): uses plain `+=`, so a sum exceeding `i64::MAX`
    /// would panic in debug builds / wrap in release — assumes inputs keep the
    /// total in range.
    #[inline(always)]
    fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        let n = data.len();
        let simd_width = LANES;
        // Number of complete SIMD vectors that fit in the slice.
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<i64, LANES>;

        // SAFETY: every `read_unaligned` below starts at an element offset
        // strictly below `simd_chunks * simd_width` and reads `LANES` i64s,
        // so all reads stay inside `data`. `read_unaligned` is used because
        // the slice start is not guaranteed to satisfy `Simd`'s alignment.
        unsafe {
            let data_ptr = data.as_ptr();
            // Four accumulators so consecutive vector adds don't serialize.
            let mut acc1 = Simd::<i64, LANES>::splat(0);
            let mut acc2 = Simd::<i64, LANES>::splat(0);
            let mut acc3 = Simd::<i64, LANES>::splat(0);
            let mut acc4 = Simd::<i64, LANES>::splat(0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            // Collapse the four accumulators into one vector.
            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole SIMD vectors left over after the unrolled groups.
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal reduction of the vector accumulator.
        let mut result = 0i64;
        for i in 0..LANES {
            result += acc_simd[i];
        }
        // Scalar tail: elements that didn't fill a whole SIMD vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }
99
    /// Sums an `f64` slice with portable SIMD, using four independent vector
    /// accumulators (4-way unrolling) to hide add latency.
    ///
    /// Processing order: unrolled groups of 4 SIMD vectors, then any leftover
    /// whole SIMD vectors, then a scalar tail for the final `n % LANES`
    /// elements. NOTE(review): the accumulation order differs from a naive
    /// sequential sum, so the float result can differ by rounding from a
    /// scalar loop — acceptable for a benchmark, but not bit-exact.
    #[inline(always)]
    fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        let n = data.len();
        let simd_width = LANES;
        // Number of complete SIMD vectors that fit in the slice.
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<f64, LANES>;

        // SAFETY: every `read_unaligned` below starts at an element offset
        // strictly below `simd_chunks * simd_width` and reads `LANES` f64s,
        // so all reads stay inside `data`. `read_unaligned` is used because
        // the slice start is not guaranteed to satisfy `Simd`'s alignment.
        unsafe {
            let data_ptr = data.as_ptr();
            // Four accumulators so consecutive vector adds don't serialize.
            let mut acc1 = Simd::<f64, LANES>::splat(0.0);
            let mut acc2 = Simd::<f64, LANES>::splat(0.0);
            let mut acc3 = Simd::<f64, LANES>::splat(0.0);
            let mut acc4 = Simd::<f64, LANES>::splat(0.0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            // Collapse the four accumulators into one vector.
            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole SIMD vectors left over after the unrolled groups.
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal reduction of the vector accumulator.
        let mut result = 0.0;
        for i in 0..LANES {
            result += acc_simd[i];
        }
        // Scalar tail: elements that didn't fill a whole SIMD vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }
161
162    fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
163        match lanes {
164            2 => simd_sum_f64::<2>(data),
165            4 => simd_sum_f64::<4>(data),
166            8 => simd_sum_f64::<8>(data),
167            16 => simd_sum_f64::<16>(data),
168            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
169        }
170    }
171
172    fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
173        match lanes {
174            2 => simd_sum_i64::<2>(data),
175            4 => simd_sum_i64::<4>(data),
176            8 => simd_sum_i64::<8>(data),
177            16 => simd_sum_i64::<16>(data),
178            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
179        }
180    }
181
    /// Runs the averaged sum benchmarks for `n` elements with `simd_lanes`
    /// lanes, timing each abstraction layer (raw `Vec`/`Vec64`, typed arrays,
    /// enum/dyn-wrapped arrays) for both `i64` and `f64`, then prints the
    /// per-variant averages and SIMD pointer-alignment diagnostics.
    ///
    /// NOTE(review): the alignment checks use the compile-time `SIMD_LANES`
    /// constant rather than the `simd_lanes` argument, so they only describe
    /// the timed runs when `simd_lanes == SIMD_LANES` — confirm intentional.
    ///
    /// NOTE(review): for the array-wrapper variants, the wrapper construction
    /// happens after `Instant::now()` and is therefore included in the timing,
    /// while for the raw-vec variants only the sum is timed — this looks
    /// deliberate (it measures abstraction cost) but is worth confirming.
    pub fn run_benchmark(n: usize, simd_lanes: usize) {
        // Accumulated wall-clock time per integer benchmark variant.
        let mut total_vec = std::time::Duration::ZERO;
        let mut total_vec64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct = std::time::Duration::ZERO;
        let mut total_arrow_struct = std::time::Duration::ZERO;
        let mut total_minarrow_enum = std::time::Duration::ZERO;
        let mut total_arrow_dyn = std::time::Duration::ZERO;

        // Accumulated wall-clock time per float benchmark variant.
        let mut total_vec_f64 = std::time::Duration::ZERO;
        let mut total_vec64_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_direct_f64 = std::time::Duration::ZERO;
        let mut total_arrow_struct_f64 = std::time::Duration::ZERO;
        let mut total_minarrow_enum_f64 = std::time::Duration::ZERO;
        let mut total_arrow_dyn_f64 = std::time::Duration::ZERO;

        // Data construction - This is the only part we
        // exclude from the overall benchmark, however, we time Vec
        // vs. Vec64 here as an indicative profile, given this is the
        // starting setup of all other reference points.
        let mut sum_vec_i64 = 0u128;
        let mut sum_vec64_i64 = 0u128;

        // for keeping scope alive
        // after the Vec benchmarks, we keep the last one each
        let mut v_int_data = Vec::with_capacity(n);
        let mut v64_int_data = Vec64::with_capacity(n);

        // Time construction of Vec vs Vec64; the last iteration's buffers are
        // reused as source data for all benchmarks below.
        for _ in 0..ITERATIONS {
            let t0 = Instant::now();
            v_int_data = (0..n as i64).collect();
            let dur_vec_i64 = t0.elapsed();

            let t1 = Instant::now();
            v64_int_data = (0..n as i64).collect();
            let dur_vec64_i64 = t1.elapsed();

            sum_vec_i64 += dur_vec_i64.as_nanos();
            sum_vec64_i64 += dur_vec64_i64.as_nanos();
        }

        let avg_vec_i64 = sum_vec_i64 as f64 / ITERATIONS as f64;
        let avg_vec64_i64 = sum_vec64_i64 as f64 / ITERATIONS as f64;

        println!(
            "Vec<i64> construction (avg):    {}",
            fmt_duration_ns(avg_vec_i64)
        );
        println!(
            "Vec64<i64> construction (avg):  {}",
            fmt_duration_ns(avg_vec64_i64)
        );
        println!("\n=> Keep the above Vec construction delta in mind when interpreting the below results,
    as it is not included in the benchmarks that follow.\n");

        // Alignment checks - once, outside timing

        let v_aligned = {
            (&v_int_data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let v64_aligned = {
            (&v64_int_data[0] as *const i64 as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let int_array_aligned = {
            let int_arr = IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None,
            };
            let slice = &int_arr[..];
            (slice.as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let i64_arrow_aligned = {
            let arr = ArrowI64Array::from(v_int_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0
        };

        let arr_int_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(v64_int_data.clone()),
                null_mask: None,
            })));
            let int_arr = array.num().i64().unwrap();
            (int_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        let array_ref_int_aligned = {
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(v_int_data.clone()));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            (int_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
                == 0
        };

        // Float source data (construction not timed, unlike the integer case).
        let v_float_data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let v64_float_data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();

        let v_float_aligned = {
            (&v_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let v64_float_aligned = {
            (&v64_float_data[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let float_arr_aligned = {
            let float_arr = FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None,
            };
            (&float_arr.data.as_slice()[0] as *const f64 as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_aligned = {
            let arr = ArrowF64Array::from(v_float_data.clone());
            (arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0
        };

        let float_enum_aligned = {
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(v64_float_data.clone()),
                null_mask: None,
            })));
            let float_arr = array.num().f64().unwrap();
            (float_arr.data.as_slice().as_ptr() as usize)
                % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        let arrow_f64_arr_aligned = {
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(v_float_data.clone()));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            (float_arr.values().as_ptr() as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
                == 0
        };

        // Timed benchmark loop. Each variant clones its source data outside
        // the timer; the timed region covers (any wrapper construction +) the
        // SIMD sum. `black_box` stops the optimizer eliding the sum.
        for _ in 0..ITERATIONS {
            // --- Integer (i64) tests ---
            // Raw Vec<i64>
            let data = v_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec += dur;
            black_box(sum);

            // Raw Vec64<i64>
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let sum = simd_sum_i64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64 += dur;
            black_box(sum);

            // Minarrow i64 (direct struct)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let int_arr = IntegerArray {
                data: Buffer::from(data),
                null_mask: None,
            };
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct += dur;
            black_box(sum);

            // Arrow i64 (struct direct)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr = ArrowI64Array::from(data);
            let sum = simd_sum_i64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct += dur;
            black_box(sum);

            // Minarrow i64 (enum)
            let data: Vec64<i64> = v64_int_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
                data: Buffer::from(data),
                null_mask: None,
            })));
            let int_arr = array.num().i64().unwrap();
            let sum = simd_sum_i64_runtime(&int_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum += dur;
            black_box(sum);

            // Arrow i64 (dynamic)
            let data: Vec<i64> = v_int_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
            let int_arr = arr.as_any().downcast_ref::<ArrowI64Array>().unwrap();
            let sum = simd_sum_i64_runtime(int_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn += dur;
            black_box(sum);

            // --- Float (f64) tests ---

            // Raw Vec<f64>
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec_f64 += dur;
            black_box(sum);

            // Raw Vec64<f64>
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
            let dur = start.elapsed();
            total_vec64_f64 += dur;
            black_box(sum);

            // Minarrow f64 (direct struct)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let float_arr = FloatArray {
                data: Buffer::from(data),
                null_mask: None,
            };
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_direct_f64 += dur;
            black_box(sum);

            // Arrow f64 (struct direct)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr = ArrowF64Array::from(data);
            let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_struct_f64 += dur;
            black_box(sum);

            // Minarrow f64 (enum)
            let data: Vec64<f64> = v64_float_data.clone();
            let start = Instant::now();
            let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
                data: Buffer::from(data),
                null_mask: None,
            })));
            let float_arr = array.num().f64().unwrap();
            let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
            let dur = start.elapsed();
            total_minarrow_enum_f64 += dur;
            black_box(sum);

            // Arrow f64 (dynamic)
            let data: Vec<f64> = v_float_data.clone();
            let start = Instant::now();
            let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
            let float_arr = arr.as_any().downcast_ref::<ArrowF64Array>().unwrap();
            let sum = simd_sum_f64_runtime(float_arr.values(), simd_lanes);
            let dur = start.elapsed();
            total_arrow_dyn_f64 += dur;
            black_box(sum);
        }

        println!("Averaged Results from {} runs:", ITERATIONS);
        println!("---------------------------------");

        // Per-iteration averages in nanoseconds.
        let avg_vec = total_vec.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64 = total_vec64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct = total_minarrow_direct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct = total_arrow_struct.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum = total_minarrow_enum.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn = total_arrow_dyn.as_nanos() as f64 / ITERATIONS as f64;

        let avg_vec_f64 = total_vec_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_vec64_f64 = total_vec64_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_direct_f64 =
            total_minarrow_direct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_struct_f64 = total_arrow_struct_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_minarrow_enum_f64 = total_minarrow_enum_f64.as_nanos() as f64 / ITERATIONS as f64;
        let avg_arrow_dyn_f64 = total_arrow_dyn_f64.as_nanos() as f64 / ITERATIONS as f64;

        println!("|------------ Integer Tests (SIMD) ------------|");
        println!(
            "raw vec: Vec<i64>                             avg = {} (n={})",
            fmt_duration_ns(avg_vec),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<i64>                         avg = {} (n={})",
            fmt_duration_ns(avg_vec64),
            ITERATIONS
        );
        println!(
            "minarrow direct: IntegerArray                  avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Int64Array                   avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct),
            ITERATIONS
        );
        println!(
            "minarrow enum: IntegerArray                   avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Int64Array                      avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn),
            ITERATIONS
        );

        println!();
        println!("|------------ Float Tests (SIMD) --------------|");
        println!(
            "raw vec: Vec<f64>                             avg = {} (n={})",
            fmt_duration_ns(avg_vec_f64),
            ITERATIONS
        );
        println!(
            "raw vec64: Vec64<f64>                         avg = {} (n={})",
            fmt_duration_ns(avg_vec64_f64),
            ITERATIONS
        );
        println!(
            "minarrow direct: FloatArray                   avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_direct_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs struct: Float64Array                 avg = {} (n={})",
            fmt_duration_ns(avg_arrow_struct_f64),
            ITERATIONS
        );
        println!(
            "minarrow enum: FloatArray                     avg = {} (n={})",
            fmt_duration_ns(avg_minarrow_enum_f64),
            ITERATIONS
        );
        println!(
            "arrow-rs dyn: Float64Array                    avg = {} (n={})",
            fmt_duration_ns(avg_arrow_dyn_f64),
            ITERATIONS
        );

        println!("\n=> Vec64 backs the above `Minarrow` types and `Vec` backs Arrow_Rs.");

        println!("\nVerify SIMD pointer alignment for Integer calculations (based on lane width):");
        println!("Vec<i64> is aligned: {}", v_aligned);
        println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
        println!(
            "Minarrow IntegerArray<i64> is aligned: {}",
            int_array_aligned
        );
        println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
        println!(
            "Minarrow Array::NumericArray<i64> is aligned: {}",
            arr_int_enum_aligned
        );
        println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);

        println!("\nVerify SIMD pointer alignment for Float calculations (based on lane width):");
        println!("Vec<f64> is aligned: {}", v_float_aligned);
        println!("Vec64<f64> is aligned: {}", v64_float_aligned);
        println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
        println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
        println!(
            "Array::NumericArray<f64> is aligned: {}",
            float_enum_aligned
        );
        println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);

        println!("\n---------------------- END OF SIMD AVG BENCHMARKS ---------------------------");
    }
566
567    fn fmt_duration_ns(avg_ns: f64) -> String {
568        if avg_ns < 1000.0 {
569            format!("{:.0} ns", avg_ns)
570        } else if avg_ns < 1_000_000.0 {
571            format!("{:.3} µs", avg_ns / 1000.0)
572        } else {
573            format!("{:.3} ms", avg_ns / 1_000_000.0)
574        }
575    }
576}
577
/// Entry point: prints the benchmark configuration and runs the suite when
/// the `cast_arrow` feature is enabled; otherwise explains how to enable it.
///
/// Previously this mixed a runtime `cfg!(...)` check with an inner `#[cfg]`
/// attribute (the attribute was the real gate) and re-imported `N`, which is
/// already in scope at the crate root. Using `#[cfg]` / `#[cfg(not)]` blocks
/// makes the single compile-time gate explicit; output is unchanged.
fn main() {
    #[cfg(feature = "cast_arrow")]
    {
        println!(
            "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={}, iters={})",
            N, SIMD_LANES, ITERATIONS
        );
        run_benchmark(N, SIMD_LANES);
    }

    #[cfg(not(feature = "cast_arrow"))]
    println!(
        "The hotloop_benchmark_avg_simd example requires enabling the `cast_arrow` feature."
    );
}