// hotloop_benchmark_simd/hotloop_benchmark_simd.rs

//! ---------------------------------------------------------
//! Runs sum benchmark comparisons on `Minarrow` and `Arrow-Rs`,
//! at various layers of library abstraction, using SIMD:
//!
//!     1. Raw Vec / Vec64
//!     2. Typed "inner" arrays
//!     3. Top-level unified `Array` type
//!
//! Run with:
//!     RUSTFLAGS="-C target-cpu=native" cargo run --release --example hotloop_benchmark_simd
//!
//! The *RUSTFLAGS* argument ensures it compiles to your host instruction-set.
//!
//! Use 2, 4, 8, or 16 LANES as per your processor's SIMD support.
//! ---------------------------------------------------------

17#![feature(portable_simd)]
18
19#[cfg(feature = "cast_arrow")]
20use crate::benchmarks_simd::run_benchmark;
21
22pub(crate) const N: usize = 1_000;
23pub(crate) const SIMD_LANES: usize = 4;
24
#[cfg(feature = "cast_arrow")]
mod benchmarks_simd {

    use std::hint::black_box;
    use std::simd::{LaneCount, Simd, SupportedLaneCount};
    use std::sync::Arc;
    use std::time::Instant;

    use crate::SIMD_LANES;

    use arrow::array::{
        Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
        Int64Array as ArrowI64Array,
    };
    use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};

    /// SIMD sum of an `i64` slice.
    ///
    /// The bulk of the slice is consumed with four independent vector
    /// accumulators (4-way unroll) to hide add latency; leftover whole
    /// vectors use a single accumulator; the final partial vector is
    /// summed scalar-wise. All loads are unaligned, so the slice may
    /// start at any address. Vector adds wrap on overflow.
    #[inline(always)]
    fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        let n = data.len();
        let simd_width = LANES;
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<i64, LANES>;

        // SAFETY: every `read_unaligned` below loads `LANES` elements at an
        // offset bounded by `simd_chunks * simd_width <= n`, so each load is
        // fully inside `data`; unaligned reads are valid at any address.
        unsafe {
            let data_ptr = data.as_ptr();
            let mut acc1 = Simd::<i64, LANES>::splat(0);
            let mut acc2 = Simd::<i64, LANES>::splat(0);
            let mut acc3 = Simd::<i64, LANES>::splat(0);
            let mut acc4 = Simd::<i64, LANES>::splat(0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole vectors that did not fit into the 4-way unroll.
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal sum of the vector accumulator.
        let mut result = 0i64;
        for i in 0..LANES {
            result += acc_simd[i];
        }

        // Scalar tail: elements beyond the last whole vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }

    /// SIMD sum of an `f64` slice; same structure as [`simd_sum_i64`].
    ///
    /// Because four accumulators are reduced at the end, the floating-point
    /// addition order differs from a naive left-to-right scalar sum, so the
    /// result may differ in the last ulps from a sequential sum.
    #[inline(always)]
    fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        let n = data.len();
        let simd_width = LANES;
        let simd_chunks = n / simd_width;

        let mut acc_simd: Simd<f64, LANES>;

        // SAFETY: same bounds argument as in `simd_sum_i64` — every load's
        // offset is bounded by `simd_chunks * simd_width <= n`.
        unsafe {
            let data_ptr = data.as_ptr();
            let mut acc1 = Simd::<f64, LANES>::splat(0.0);
            let mut acc2 = Simd::<f64, LANES>::splat(0.0);
            let mut acc3 = Simd::<f64, LANES>::splat(0.0);
            let mut acc4 = Simd::<f64, LANES>::splat(0.0);

            let unroll_factor = 4;
            let unrolled_chunks = simd_chunks / unroll_factor;

            for i in 0..unrolled_chunks {
                let base_offset = i * unroll_factor * simd_width;
                let v1 =
                    std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
                let v2 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
                );
                let v3 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
                );
                let v4 = std::ptr::read_unaligned(
                    data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
                );
                acc1 += v1;
                acc2 += v2;
                acc3 += v3;
                acc4 += v4;
            }

            acc_simd = acc1 + acc2 + acc3 + acc4;

            // Whole vectors that did not fit into the 4-way unroll.
            let processed = unrolled_chunks * unroll_factor;
            for i in processed..simd_chunks {
                let offset = i * simd_width;
                let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
                acc_simd += v;
            }
        }

        // Horizontal sum of the vector accumulator.
        let mut result = 0.0;
        for i in 0..LANES {
            result += acc_simd[i];
        }

        // Scalar tail: elements beyond the last whole vector.
        let remainder_start = simd_chunks * simd_width;
        for i in remainder_start..n {
            result += data[i];
        }

        result
    }

    /// Dispatches to the monomorphised `f64` kernel for a runtime lane count.
    ///
    /// # Panics
    /// Panics if `lanes` is not one of 2, 4, 8, 16.
    fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
        match lanes {
            2 => simd_sum_f64::<2>(data),
            4 => simd_sum_f64::<4>(data),
            8 => simd_sum_f64::<8>(data),
            16 => simd_sum_f64::<16>(data),
            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
        }
    }

    /// Dispatches to the monomorphised `i64` kernel for a runtime lane count.
    ///
    /// # Panics
    /// Panics if `lanes` is not one of 2, 4, 8, 16.
    fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
        match lanes {
            2 => simd_sum_i64::<2>(data),
            4 => simd_sum_i64::<4>(data),
            8 => simd_sum_i64::<8>(data),
            16 => simd_sum_i64::<16>(data),
            _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
        }
    }

    /// Runs the full i64/f64 sum benchmark matrix over `n` elements with
    /// `simd_lanes` SIMD lanes: raw `Vec`/`Vec64`, typed arrays, and the
    /// unified enum/dyn layers of both minarrow and arrow-rs, printing the
    /// timing for each plus pointer-alignment diagnostics at the end.
    ///
    /// NOTE(review): for the array-backed cases the timed region includes
    /// constructing the array from an existing vector, whereas the raw
    /// vector cases time only the sum — presumably intentional (charging
    /// each abstraction for its wrapping cost); confirm before comparing
    /// numbers across rows.
    pub fn run_benchmark(n: usize, simd_lanes: usize) {
        // ----------- Integer (i64) tests -----------

        let data: Vec<i64> = (0..n as i64).collect();
        black_box(simd_sum_i64_runtime(&data[..], simd_lanes)); // warmup, ignore result

        println!("|------------ Integer Tests ------------ |\n");
        // Raw Vec<i64>
        // Sometimes this will randomly align, other times it will not.
        let data: Vec<i64> = (0..n as i64).collect();
        let start = Instant::now();
        let slice = &data[..];
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_vec = start.elapsed();
        println!("raw vec: Vec<i64> sum = {}, {:?}", sum, dur_vec);
        let v_aligned =
            (&data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0;
        black_box(sum);

        // Raw Vec64<i64>
        let data: Vec64<i64> = (0..n as i64).collect();
        let start = Instant::now();
        let slice = &data[..];
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_vec = start.elapsed();
        println!("raw vec64: Vec64<i64> sum = {}, {:?}", sum, dur_vec);
        let v64_aligned =
            (&data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0;
        black_box(sum);

        // Minarrow i64 (direct struct)
        // `data_copy` keeps a handle on the buffer for the alignment probe
        // after `data` is moved into the array.
        let data: Vec64<i64> = (0..n as i64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let int_arr = IntegerArray {
            data: Buffer::from(data),
            null_mask: None,
        };
        let slice = &int_arr[..];
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_minarrow_direct = start.elapsed();
        println!(
            "minarrow direct: IntegerArray sum = {}, {:?}",
            sum, dur_minarrow_direct
        );
        let int_array_aligned = (&data_copy[0] as *const i64 as usize)
            % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Arrow i64 (struct direct)
        let data: Vec<i64> = (0..n as i64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let arr = ArrowI64Array::from(data);
        let slice = arr.values();
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_arrow_struct = start.elapsed();
        println!(
            "arrow-rs struct: Int64Array sum = {}, {:?}",
            sum, dur_arrow_struct
        );
        let i64_arrow_aligned = (&data_copy[0] as *const i64 as usize)
            % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Minarrow i64 (enum)
        let data: Vec64<i64> = (0..n as i64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
            data: Buffer::from(data),
            null_mask: None,
        })));
        let int_arr = array.num().i64().unwrap();
        let slice = &int_arr[..];
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_minarrow_enum = start.elapsed();
        println!(
            "minarrow enum: IntegerArray sum = {}, {:?}",
            sum, dur_minarrow_enum
        );
        let arr_int_enum_aligned = (&data_copy[0] as *const i64 as usize)
            % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Arrow i64 (dynamic)
        let data: Vec<i64> = (0..n as i64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
        let slice = if let Some(f) = arr.as_any().downcast_ref::<ArrowI64Array>() {
            f.values()
        } else {
            panic!("downcast failed")
        };
        let sum = simd_sum_i64_runtime(slice, simd_lanes);
        let dur_arrow_dyn_i64 = start.elapsed();
        println!(
            "arrow-rs dyn: Int64Array sum = {}, {:?}",
            sum, dur_arrow_dyn_i64
        );
        let array_ref_int_aligned = (&data_copy[0] as *const i64 as usize)
            % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
            == 0;
        black_box(sum);
        println!("\n");

        // ----------- Float (f64) tests ----------------

        println!("|------------ Float Tests ------------ |\n");

        // Raw Vec<f64>
        // Sometimes this will randomly align, other times it will not.
        let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let start = Instant::now();
        let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
        let dur_vec_f64 = start.elapsed();
        println!("raw vec: Vec<f64> sum = {}, {:?}", sum, dur_vec_f64);
        let v_float_aligned =
            (&data[0] as *const f64 as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0;

        black_box(sum);

        // Raw Vec64<f64>
        let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
        let start = Instant::now();
        let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
        let dur_vec_f64 = start.elapsed();
        println!("raw vec64: Vec64<f64> sum = {}, {:?}", sum, dur_vec_f64);
        let v64_float_aligned =
            (&data[0] as *const f64 as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0;

        black_box(sum);

        // Minarrow f64 (direct struct, SIMD)
        let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let float_arr = FloatArray {
            data: Buffer::from(data),
            null_mask: None,
        };
        let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
        let dur_minarrow_direct_f64 = start.elapsed();
        println!(
            "minarrow direct: FloatArray sum = {}, {:?}",
            sum, dur_minarrow_direct_f64
        );
        let float_arr_aligned = (&data_copy[0] as *const f64 as usize)
            % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Arrow f64 (struct direct)
        let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let arr = ArrowF64Array::from(data);
        let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
        let dur_arrow_struct_f64 = start.elapsed();
        println!(
            "arrow-rs struct: Float64Array sum = {}, {:?}",
            sum, dur_arrow_struct_f64
        );
        let arrow_f64_aligned = (&data_copy[0] as *const f64 as usize)
            % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Minarrow f64 (enum)
        let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
            data: Buffer::from(data),
            null_mask: None,
        })));
        let float_arr = array.num().f64().unwrap();
        let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
        let dur_minarrow_enum_f64 = start.elapsed();
        println!(
            "minarrow enum: FloatArray sum = {}, {:?}",
            sum, dur_minarrow_enum_f64
        );
        let float_enum_aligned = (&data_copy[0] as *const f64 as usize)
            % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
            == 0;
        black_box(sum);

        // Arrow f64 (dynamic)
        let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
        let data_copy = data.clone();

        let start = Instant::now();
        let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
        let slice = if let Some(f) = arr.as_any().downcast_ref::<ArrowF64Array>() {
            f.values()
        } else {
            panic!("downcast failed")
        };
        let sum = simd_sum_f64_runtime(slice, simd_lanes);
        let dur_arrow_dyn_f64 = start.elapsed();
        println!(
            "arrow-rs dyn: Float64Array sum = {}, {:?}",
            sum, dur_arrow_dyn_f64
        );
        let arrow_f64_arr_aligned = (&data_copy[0] as *const f64 as usize)
            % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
            == 0;
        black_box(sum);
        println!("\n");
        println!("Verify SIMD pointer alignment for Integer calculations (based on lane width):");
        println!("Vec<i64> is aligned: {}", v_aligned);
        println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
        println!(
            "Minarrow IntegerArray<i64> is aligned: {}",
            int_array_aligned
        );
        println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
        println!(
            "Minarrow Array::NumericArray<i64> is aligned: {}",
            arr_int_enum_aligned
        );
        println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);
        println!("\n");
        println!("Verify SIMD pointer alignment for Float calculations (based on lane width):");
        println!("Vec<f64> is aligned: {}", v_float_aligned);
        println!("Vec64<f64> is aligned: {}", v64_float_aligned);
        println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
        println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
        println!(
            "Array::NumericArray<f64> is aligned: {}",
            float_enum_aligned
        );
        println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);
        println!("\n");

        println!("---------------------- END OF SIMD BENCHMARKS ---------------------------");
    }
}
438
/// Entry point.
///
/// With the `cast_arrow` feature enabled, runs the SIMD benchmark suite
/// over `N` elements with `SIMD_LANES` lanes; otherwise prints a hint on
/// how to enable it.
fn main() {
    // Two cfg-gated branches instead of a runtime `if cfg!(...)`: the
    // original needed an extra inner `#[cfg]` anyway because
    // `run_benchmark` only exists when the feature is on, so gating the
    // whole branch removes the redundant double check.
    #[cfg(feature = "cast_arrow")]
    {
        println!(
            "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={})",
            N, SIMD_LANES
        );
        run_benchmark(N, SIMD_LANES);
    }
    #[cfg(not(feature = "cast_arrow"))]
    println!("The apache-FFI example requires enabling the `cast_arrow` feature.");
}