parallel_simd/
parallel_simd.rs

1//! ---------------------------------------------------------
2//! Runs sum benchmark on Minarrow with Rayon (Multi-core processing) and SIMD
3//!
4//! Run with:
5//!     RUSTFLAGS="-C target-cpu=native" cargo run --release --example parallel_simd --features parallel_proc
6//!
7//! The *RUSTFLAGS* argument ensures it compiles to your host instruction-set.
8//!
9//! Use 2, 4, 8, or 16 SIMD_LANES as per your processor's SIMD support.
10//! ---------------------------------------------------------
11
12#![feature(portable_simd)]
13
14use std::hint::black_box;
15use std::simd::num::{SimdFloat, SimdInt};
16use std::simd::{LaneCount, Simd, SupportedLaneCount};
17use std::time::Instant;
18
19use minarrow::{Buffer, Vec64};
20#[cfg(feature = "parallel_proc")]
21use rayon::iter::ParallelIterator;
22#[cfg(feature = "parallel_proc")]
23use rayon::slice::ParallelSlice;
24
25const N: usize = 1_000_000_000;
26const SIMD_LANES: usize = 4;
27
28// SIMD chunk sum for i64
29#[inline(always)]
30fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
31where
32    LaneCount<LANES>: SupportedLaneCount
33{
34    let n = data.len();
35    let simd_width = LANES;
36    let simd_chunks = n / simd_width;
37
38    let mut acc_simd = Simd::<i64, LANES>::splat(0);
39    for i in 0..simd_chunks {
40        let v = Simd::<i64, LANES>::from_slice(&data[i * simd_width..][..simd_width]);
41        acc_simd += v;
42    }
43    let mut result = acc_simd.reduce_sum();
44    for i in (simd_chunks * simd_width)..n {
45        result += data[i];
46    }
47    result
48}
49
50// SIMD chunk sum for f64
51#[inline(always)]
52fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
53where
54    LaneCount<LANES>: SupportedLaneCount
55{
56    let n = data.len();
57    let simd_width = LANES;
58    let simd_chunks = n / simd_width;
59
60    let mut acc_simd = Simd::<f64, LANES>::splat(0.0);
61    for i in 0..simd_chunks {
62        let v = Simd::<f64, LANES>::from_slice(&data[i * simd_width..][..simd_width]);
63        acc_simd += v;
64    }
65    let mut result = acc_simd.reduce_sum();
66    for i in (simd_chunks * simd_width)..n {
67        result += data[i];
68    }
69    result
70}
71
/// Sums an `i64` buffer by combining Rayon chunking with SIMD.
///
/// The buffer's slice is split with `par_chunks`, each chunk is summed by
/// `simd_sum_i64` using `SIMD_LANES` lanes, and the per-chunk sums are added together.
#[cfg(feature = "parallel_proc")]
fn rayon_simd_sum_i64(buffer: &Buffer<i64>) -> i64 {
    // 1M elements per chunk; tune if desired.
    const ELEMS_PER_CHUNK: usize = 1 << 20;

    buffer
        .as_slice()
        .par_chunks(ELEMS_PER_CHUNK)
        .map(simd_sum_i64::<SIMD_LANES>)
        .sum()
}
79
/// Sums an `f64` buffer by combining Rayon chunking with SIMD.
///
/// The buffer's slice is split with `par_chunks`, each chunk is summed by
/// `simd_sum_f64` using `SIMD_LANES` lanes, and the per-chunk sums are added together.
#[cfg(feature = "parallel_proc")]
fn rayon_simd_sum_f64(buffer: &Buffer<f64>) -> f64 {
    // 1M elements per chunk; tune if desired.
    const ELEMS_PER_CHUNK: usize = 1 << 20;

    let values = buffer.as_slice();
    let partial_sums = values
        .par_chunks(ELEMS_PER_CHUNK)
        .map(|part| simd_sum_f64::<SIMD_LANES>(part));
    partial_sums.sum()
}
/// Runs the `i64` and `f64` sum benchmarks and prints each sum with its elapsed time.
///
/// Each benchmark lives in its own scope so that its input is dropped before the next
/// input is allocated. With shadowed bindings in a single scope the first input would
/// stay alive until the function returns, so both inputs (each `N` 8-byte elements)
/// would be resident at the same time during the second measurement.
///
/// Only the call to the summing function is timed; building the input is excluded.
#[cfg(feature = "parallel_proc")]
fn run_benchmark() {
    println!("--- SIMD + Rayon Benchmark, N = {} ---", N);

    // IntegerArray<i64>
    {
        let data: Vec64<i64> = (0..N as i64).collect();
        let buffer = Buffer::from(data);

        let start = Instant::now();
        // `black_box` on both input and output keeps the optimizer from eliding the work.
        let sum = black_box(rayon_simd_sum_i64(black_box(&buffer)));
        let dur = start.elapsed();
        println!("SIMD + Rayon IntegerArray<i64>: sum = {}, time = {:?}", sum, dur);
    } // the i64 input is dropped here, before the f64 input is built

    // FloatArray<f64>
    {
        let data: Vec64<f64> = (0..N as i64).map(|x| x as f64).collect();
        let buffer = Buffer::from(data);

        let start = Instant::now();
        let sum = black_box(rayon_simd_sum_f64(black_box(&buffer)));
        let dur = start.elapsed();
        println!("SIMD + Rayon FloatArray<f64>: sum = {}, time = {:?}", sum, dur);
    }
}
109
/// Entry point: runs the benchmark when the `parallel_proc` feature is enabled,
/// otherwise prints a message saying the feature is required.
fn main() {
    // Exactly one of the two statements below is compiled in.
    #[cfg(feature = "parallel_proc")]
    run_benchmark();

    #[cfg(not(feature = "parallel_proc"))]
    println!("The parallel_simd example requires enabling the `parallel_proc` feature.");
}