1#![feature(portable_simd)]
18
19#[cfg(feature = "cast_arrow")]
20use crate::benchmarks_simd::run_benchmark;
21
/// Number of elements generated for each benchmark run.
pub(crate) const N: usize = 1_000;
/// Compile-time SIMD lane count; passed to `run_benchmark` and used to size
/// the `Simd<_, SIMD_LANES>` alignment checks inside the benchmarks.
pub(crate) const SIMD_LANES: usize = 4;
24
25#[cfg(feature = "cast_arrow")]
26mod benchmarks_simd {
27
28 use std::hint::black_box;
29 use std::simd::{LaneCount, Simd, SupportedLaneCount};
30 use std::sync::Arc;
31 use std::time::Instant;
32
33 use crate::SIMD_LANES;
34
35 use arrow::array::{
36 Array as ArrowArrayTrait, ArrayRef, Float64Array as ArrowF64Array,
37 Int64Array as ArrowI64Array,
38 };
39 use minarrow::{Array, Buffer, FloatArray, IntegerArray, NumericArray, Vec64};
40
41 #[inline(always)]
42 fn simd_sum_i64<const LANES: usize>(data: &[i64]) -> i64
43 where
44 LaneCount<LANES>: SupportedLaneCount,
45 {
46 let n = data.len();
47 let simd_width = LANES;
48 let simd_chunks = n / simd_width;
49
50 let mut acc_simd: Simd<i64, LANES>;
51
52 unsafe {
53 let data_ptr = data.as_ptr();
54 let mut acc1 = Simd::<i64, LANES>::splat(0);
55 let mut acc2 = Simd::<i64, LANES>::splat(0);
56 let mut acc3 = Simd::<i64, LANES>::splat(0);
57 let mut acc4 = Simd::<i64, LANES>::splat(0);
58
59 let unroll_factor = 4;
60 let unrolled_chunks = simd_chunks / unroll_factor;
61
62 for i in 0..unrolled_chunks {
63 let base_offset = i * unroll_factor * simd_width;
64 let v1 =
65 std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<i64, LANES>);
66 let v2 = std::ptr::read_unaligned(
67 data_ptr.add(base_offset + simd_width) as *const Simd<i64, LANES>
68 );
69 let v3 = std::ptr::read_unaligned(
70 data_ptr.add(base_offset + 2 * simd_width) as *const Simd<i64, LANES>
71 );
72 let v4 = std::ptr::read_unaligned(
73 data_ptr.add(base_offset + 3 * simd_width) as *const Simd<i64, LANES>
74 );
75 acc1 += v1;
76 acc2 += v2;
77 acc3 += v3;
78 acc4 += v4;
79 }
80
81 acc_simd = acc1 + acc2 + acc3 + acc4;
82
83 let processed = unrolled_chunks * unroll_factor;
84 for i in processed..simd_chunks {
85 let offset = i * simd_width;
86 let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<i64, LANES>);
87 acc_simd += v;
88 }
89 }
90
91 let mut result = 0i64;
93 for i in 0..LANES {
94 result += acc_simd[i];
95 }
96
97 let remainder_start = simd_chunks * simd_width;
98 for i in remainder_start..n {
99 result += data[i];
100 }
101
102 result
103 }
104
105 #[inline(always)]
106 fn simd_sum_f64<const LANES: usize>(data: &[f64]) -> f64
107 where
108 LaneCount<LANES>: SupportedLaneCount,
109 {
110 let n = data.len();
111 let simd_width = LANES;
112 let simd_chunks = n / simd_width;
113
114 let mut acc_simd: Simd<f64, LANES>;
115
116 unsafe {
117 let data_ptr = data.as_ptr();
118 let mut acc1 = Simd::<f64, LANES>::splat(0.0);
119 let mut acc2 = Simd::<f64, LANES>::splat(0.0);
120 let mut acc3 = Simd::<f64, LANES>::splat(0.0);
121 let mut acc4 = Simd::<f64, LANES>::splat(0.0);
122
123 let unroll_factor = 4;
124 let unrolled_chunks = simd_chunks / unroll_factor;
125
126 for i in 0..unrolled_chunks {
127 let base_offset = i * unroll_factor * simd_width;
128 let v1 =
129 std::ptr::read_unaligned(data_ptr.add(base_offset) as *const Simd<f64, LANES>);
130 let v2 = std::ptr::read_unaligned(
131 data_ptr.add(base_offset + simd_width) as *const Simd<f64, LANES>
132 );
133 let v3 = std::ptr::read_unaligned(
134 data_ptr.add(base_offset + 2 * simd_width) as *const Simd<f64, LANES>
135 );
136 let v4 = std::ptr::read_unaligned(
137 data_ptr.add(base_offset + 3 * simd_width) as *const Simd<f64, LANES>
138 );
139 acc1 += v1;
140 acc2 += v2;
141 acc3 += v3;
142 acc4 += v4;
143 }
144
145 acc_simd = acc1 + acc2 + acc3 + acc4;
146
147 let processed = unrolled_chunks * unroll_factor;
148 for i in processed..simd_chunks {
149 let offset = i * simd_width;
150 let v = std::ptr::read_unaligned(data_ptr.add(offset) as *const Simd<f64, LANES>);
151 acc_simd += v;
152 }
153 }
154
155 let mut result = 0.0;
156 for i in 0..LANES {
157 result += acc_simd[i];
158 }
159
160 let remainder_start = simd_chunks * simd_width;
161 for i in remainder_start..n {
162 result += data[i];
163 }
164
165 result
166 }
167
168 fn simd_sum_f64_runtime(data: &[f64], lanes: usize) -> f64 {
169 match lanes {
170 2 => simd_sum_f64::<2>(data),
171 4 => simd_sum_f64::<4>(data),
172 8 => simd_sum_f64::<8>(data),
173 16 => simd_sum_f64::<16>(data),
174 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
175 }
176 }
177
178 fn simd_sum_i64_runtime(data: &[i64], lanes: usize) -> i64 {
179 match lanes {
180 2 => simd_sum_i64::<2>(data),
181 4 => simd_sum_i64::<4>(data),
182 8 => simd_sum_i64::<8>(data),
183 16 => simd_sum_i64::<16>(data),
184 _ => panic!("Unsupported SIMD lanes. Only 2, 4, 8, 16 supported."),
185 }
186 }
187
188 pub fn run_benchmark(n: usize, simd_lanes: usize) {
189 let data: Vec<i64> = (0..n as i64).collect();
192 black_box(simd_sum_i64_runtime(&data[..], simd_lanes)); println!("|------------ Integer Tests ------------ |\n");
195 let data: Vec<i64> = (0..n as i64).collect();
198 let start = Instant::now();
199 let slice = &data[..];
200 let sum = simd_sum_i64_runtime(slice, simd_lanes);
201 let dur_vec = start.elapsed();
202 println!("raw vec: Vec<i64> sum = {}, {:?}", sum, dur_vec);
203 let v_aligned =
204 (&data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0;
205 black_box(sum);
206
207 let data: Vec64<i64> = (0..n as i64).collect();
209 let start = Instant::now();
210 let slice = &data[..];
211 let sum = simd_sum_i64_runtime(slice, simd_lanes);
212 let dur_vec = start.elapsed();
213 println!("raw vec64: Vec64<i64> sum = {}, {:?}", sum, dur_vec);
214 let v64_aligned =
215 (&data[0] as *const i64 as usize) % std::mem::align_of::<Simd<i64, SIMD_LANES>>() == 0;
216 black_box(sum);
217
218 let data: Vec64<i64> = (0..n as i64).collect();
220 let data_copy = data.clone();
221
222 let start = Instant::now();
223 let int_arr = IntegerArray {
224 data: Buffer::from(data),
225 null_mask: None,
226 };
227 let slice = &int_arr[..];
228 let sum = simd_sum_i64_runtime(slice, simd_lanes);
229 let dur_minarrow_direct = start.elapsed();
230 println!(
231 "minarrow direct: IntegerArray sum = {}, {:?}",
232 sum, dur_minarrow_direct
233 );
234 let int_array_aligned = (&data_copy[0] as *const i64 as usize)
235 % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
236 == 0;
237 black_box(sum);
238
239 let data: Vec<i64> = (0..n as i64).collect();
241 let data_copy = data.clone();
242
243 let start = Instant::now();
244 let arr = ArrowI64Array::from(data);
245 let slice = arr.values();
246 let sum = simd_sum_i64_runtime(slice, simd_lanes);
247 let dur_arrow_struct = start.elapsed();
248 println!(
249 "arrow-rs struct: Int64Array sum = {}, {:?}",
250 sum, dur_arrow_struct
251 );
252 let i64_arrow_aligned = (&data_copy[0] as *const i64 as usize)
253 % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
254 == 0;
255 black_box(sum);
256
257 let data: Vec64<i64> = (0..n as i64).collect();
259 let data_copy = data.clone();
260
261 let start = Instant::now();
262 let array = Array::NumericArray(NumericArray::Int64(Arc::new(IntegerArray {
263 data: Buffer::from(data),
264 null_mask: None,
265 })));
266 let int_arr = array.num().i64().unwrap();
267 let slice = &int_arr[..];
268 let sum = simd_sum_i64_runtime(slice, simd_lanes);
269 let dur_minarrow_enum = start.elapsed();
270 println!(
271 "minarrow enum: IntegerArray sum = {}, {:?}",
272 sum, dur_minarrow_enum
273 );
274 let arr_int_enum_aligned = (&data_copy[0] as *const i64 as usize)
275 % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
276 == 0;
277 black_box(sum);
278
279 let data: Vec<i64> = (0..n as i64).collect();
281 let data_copy = data.clone();
282
283 let start = Instant::now();
284 let arr: ArrayRef = Arc::new(ArrowI64Array::from(data));
285 let slice = if let Some(f) = arr.as_any().downcast_ref::<ArrowI64Array>() {
286 f.values()
287 } else {
288 panic!("downcast failed")
289 };
290 let sum = simd_sum_i64_runtime(slice, simd_lanes);
291 let dur_arrow_dyn_i64 = start.elapsed();
292 println!(
293 "arrow-rs dyn: Int64Array sum = {}, {:?}",
294 sum, dur_arrow_dyn_i64
295 );
296 let array_ref_int_aligned = (&data_copy[0] as *const i64 as usize)
297 % std::mem::align_of::<Simd<i64, SIMD_LANES>>()
298 == 0;
299 black_box(sum);
300 println!("\n");
301
302 println!("|------------ Float Tests ------------ |\n");
305
306 let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
309 let start = Instant::now();
310 let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
311 let dur_vec_f64 = start.elapsed();
312 println!("raw vec: Vec<f64> sum = {}, {:?}", sum, dur_vec_f64);
313 let v_float_aligned =
314 (&data[0] as *const f64 as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0;
315
316 black_box(sum);
317
318 let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
320 let start = Instant::now();
321 let sum = simd_sum_f64_runtime(&data[..], simd_lanes);
322 let dur_vec_f64 = start.elapsed();
323 println!("raw vec64: Vec64<f64> sum = {}, {:?}", sum, dur_vec_f64);
324 let v64_float_aligned =
325 (&data[0] as *const f64 as usize) % std::mem::align_of::<Simd<f64, SIMD_LANES>>() == 0;
326
327 black_box(sum);
328
329 let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
331 let data_copy = data.clone();
332
333 let start = Instant::now();
334 let float_arr = FloatArray {
335 data: Buffer::from(data),
336 null_mask: None,
337 };
338 let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
339 let dur_minarrow_direct_f64 = start.elapsed();
340 println!(
341 "minarrow direct: FloatArray sum = {}, {:?}",
342 sum, dur_minarrow_direct_f64
343 );
344 let float_arr_aligned = (&data_copy[0] as *const f64 as usize)
345 % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
346 == 0;
347 black_box(sum);
348
349 let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
351 let data_copy = data.clone();
352
353 let start = Instant::now();
354 let arr = ArrowF64Array::from(data);
355 let sum = simd_sum_f64_runtime(arr.values(), simd_lanes);
356 let dur_arrow_struct_f64 = start.elapsed();
357 println!(
358 "arrow-rs struct: Float64Array sum = {}, {:?}",
359 sum, dur_arrow_struct_f64
360 );
361 let arrow_f64_aligned = (&data_copy[0] as *const f64 as usize)
362 % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
363 == 0;
364 black_box(sum);
365
366 let data: Vec64<f64> = (0..n as i64).map(|x| x as f64).collect();
368 let data_copy = data.clone();
369
370 let start = Instant::now();
371 let array = Array::NumericArray(NumericArray::Float64(Arc::new(FloatArray {
372 data: Buffer::from(data),
373 null_mask: None,
374 })));
375 let float_arr = array.num().f64().unwrap();
376 let sum = simd_sum_f64_runtime(&float_arr[..], simd_lanes);
377 let dur_minarrow_enum_f64 = start.elapsed();
378 println!(
379 "minarrow enum: FloatArray sum = {}, {:?}",
380 sum, dur_minarrow_enum_f64
381 );
382 let float_enum_aligned = (&data_copy[0] as *const f64 as usize)
383 % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
384 == 0;
385 black_box(sum);
386
387 let data: Vec<f64> = (0..n as i64).map(|x| x as f64).collect();
389 let data_copy = data.clone();
390
391 let start = Instant::now();
392 let arr: ArrayRef = Arc::new(ArrowF64Array::from(data));
393 let slice = if let Some(f) = arr.as_any().downcast_ref::<ArrowF64Array>() {
394 f.values()
395 } else {
396 panic!("downcast failed")
397 };
398 let sum = simd_sum_f64_runtime(slice, simd_lanes);
399 let dur_arrow_dyn_f64 = start.elapsed();
400 println!(
401 "arrow-rs dyn: Float64Array sum = {}, {:?}",
402 sum, dur_arrow_dyn_f64
403 );
404 let arrow_f64_arr_aligned = (&data_copy[0] as *const f64 as usize)
405 % std::mem::align_of::<Simd<f64, SIMD_LANES>>()
406 == 0;
407 black_box(sum);
408 println!("\n");
409 println!("Verify SIMD pointer alignment for Integer calculations (based on lane width):");
410 println!("Vec<i64> is aligned: {}", v_aligned);
411 println!("Minarrow Vec64<i64> is aligned: {}", v64_aligned);
412 println!(
413 "Minarrow IntegerArray<i64> is aligned: {}",
414 int_array_aligned
415 );
416 println!("Arrow ArrowI64Array is aligned: {}", i64_arrow_aligned);
417 println!(
418 "Minarrow Array::NumericArray<i64> is aligned: {}",
419 arr_int_enum_aligned
420 );
421 println!("Arrow ArrayRef<int> is aligned: {}", array_ref_int_aligned);
422 println!("\n");
423 println!("Verify SIMD pointer alignment for Float calculations (based on lane width):");
424 println!("Vec<f64> is aligned: {}", v_float_aligned);
425 println!("Vec64<f64> is aligned: {}", v64_float_aligned);
426 println!("FloatArray<f64> is aligned: {}", float_arr_aligned);
427 println!("ArrowF64Array is aligned: {}", arrow_f64_aligned);
428 println!(
429 "Array::NumericArray<f64> is aligned: {}",
430 float_enum_aligned
431 );
432 println!("ArrayRef is aligned: {}", arrow_f64_arr_aligned);
433 println!("\n");
434
435 println!("---------------------- END OF SIMD BENCHMARKS ---------------------------");
436 }
437}
438
/// Entry point. With the `cast_arrow` feature enabled this runs the SIMD
/// parity benchmarks; otherwise it prints how to enable them.
///
/// Uses `#[cfg]` on both arms instead of the previous runtime `cfg!(...)`
/// test, so the disabled arm is not compiled at all and the benchmark call
/// is no longer guarded twice (runtime `cfg!` plus an inner `#[cfg]`
/// attribute). The redundant `use crate::N;` is gone — `N` is already in
/// scope at the crate root.
fn main() {
    #[cfg(feature = "cast_arrow")]
    {
        println!(
            "Running SIMD/Arrow/minarrow parity benchmarks (n={}, lanes={})",
            N, SIMD_LANES
        );
        run_benchmark(N, SIMD_LANES);
    }
    #[cfg(not(feature = "cast_arrow"))]
    {
        println!("The apache-FFI example requires enabling the `cast_arrow` feature.");
    }
}