scirs2_core/simd/basic.rs

//! Basic arithmetic operations with SIMD acceleration
//!
//! This module provides optimized implementations of fundamental array operations
//! including element-wise maximum, minimum, and addition with various optimization levels.

use ndarray::{Array1, ArrayView1};

/// Compute element-wise maximum of two f32 arrays
///
/// Currently implemented as a plain scalar loop; the unified SIMD interface is
/// intentionally not used here to avoid a circular dependency.
///
/// # Arguments
///
/// * `a` - First array
/// * `b` - Second array
///
/// # Returns
///
/// * Element-wise maximum array
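///
/// # Examples
///
/// A minimal usage sketch; it is marked `ignore` because the public import path
/// for this function depends on how the crate re-exports this module:
///
/// ```ignore
/// use ndarray::array;
///
/// let a = array![1.0_f32, 5.0, -2.0];
/// let b = array![3.0_f32, 4.0, -6.0];
/// let m = simd_maximum_f32(&a.view(), &b.view());
/// assert_eq!(m, array![3.0_f32, 5.0, -2.0]);
/// ```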
#[allow(dead_code)]
pub fn simd_maximum_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    // Direct implementation to avoid circular dependency on the unified SIMD interface
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].max(b[i]);
    }
    result
}

/// Compute element-wise maximum of two f64 arrays
///
/// Currently implemented as a plain scalar loop; the unified SIMD interface is
/// intentionally not used here to avoid a circular dependency.
///
/// # Arguments
///
/// * `a` - First array
/// * `b` - Second array
///
/// # Returns
///
/// * Element-wise maximum array
#[allow(dead_code)]
pub fn simd_maximum_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    // Direct implementation to avoid circular dependency on the unified SIMD interface
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].max(b[i]);
    }
    result
}

/// Compute element-wise minimum of two f32 arrays
///
/// Currently implemented as a plain scalar loop; the unified SIMD interface is
/// intentionally not used here to avoid a circular dependency.
///
/// # Arguments
///
/// * `a` - First array
/// * `b` - Second array
///
/// # Returns
///
/// * Element-wise minimum array
#[allow(dead_code)]
pub fn simd_minimum_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    // Direct implementation to avoid circular dependency on the unified SIMD interface
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].min(b[i]);
    }
    result
}

/// Compute element-wise minimum of two f64 arrays
///
/// Currently implemented as a plain scalar loop; the unified SIMD interface is
/// intentionally not used here to avoid a circular dependency.
///
/// # Arguments
///
/// * `a` - First array
/// * `b` - Second array
///
/// # Returns
///
/// * Element-wise minimum array
#[allow(dead_code)]
pub fn simd_minimum_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    // Direct implementation to avoid circular dependency on the unified SIMD interface
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].min(b[i]);
    }
    result
}

/// Compute element-wise addition of two f32 arrays using SIMD acceleration
///
/// Uses runtime CPU feature detection to select AVX2 or SSE intrinsics on x86_64
/// and NEON intrinsics on aarch64, with a scalar fallback for other architectures
/// or when no SIMD feature is available.
///
/// # Arguments
///
/// * `a` - First array
/// * `b` - Second array
///
/// # Panics
///
/// Panics if the arrays differ in length, or (on the SIMD code paths) if an input
/// is not contiguous in memory.
///
/// # Returns
///
/// * Element-wise sum array
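///
/// # Examples
///
/// A minimal usage sketch; it is marked `ignore` because the public import path
/// for this function depends on how the crate re-exports this module:
///
/// ```ignore
/// use ndarray::array;
///
/// let a = array![1.0_f32, 2.0, 3.0];
/// let b = array![10.0_f32, 20.0, 30.0];
/// let sum = simd_add_f32(&a.view(), &b.view());
/// assert_eq!(sum, array![11.0_f32, 22.0, 33.0]);
/// ```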
#[allow(dead_code)]
pub fn simd_add_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = Vec::with_capacity(len);

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if is_x86_feature_detected!("avx2") {
            unsafe {
                let mut i = 0;
                // Process 8 f32s at a time with AVX2
                while i + 8 <= len {
                    let a_slice = &a.as_slice().expect("array `a` must be contiguous")[i..i + 8];
                    let b_slice = &b.as_slice().expect("array `b` must be contiguous")[i..i + 8];

                    let a_vec = _mm256_loadu_ps(a_slice.as_ptr());
                    let b_vec = _mm256_loadu_ps(b_slice.as_ptr());
                    let result_vec = _mm256_add_ps(a_vec, b_vec);

                    let mut temp = [0.0f32; 8];
                    _mm256_storeu_ps(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 8;
                }

                // Handle remaining elements
                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else if is_x86_feature_detected!("sse") {
            unsafe {
                let mut i = 0;
                // Process 4 f32s at a time with SSE
                while i + 4 <= len {
                    let a_slice = &a.as_slice().expect("array `a` must be contiguous")[i..i + 4];
                    let b_slice = &b.as_slice().expect("array `b` must be contiguous")[i..i + 4];

                    let a_vec = _mm_loadu_ps(a_slice.as_ptr());
                    let b_vec = _mm_loadu_ps(b_slice.as_ptr());
                    let result_vec = _mm_add_ps(a_vec, b_vec);

                    let mut temp = [0.0f32; 4];
                    _mm_storeu_ps(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 4;
                }

                // Handle remaining elements
                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else {
            // Fallback to scalar implementation
            for i in 0..len {
                result.push(a[i] + b[i]);
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                let mut i = 0;
                // Process 4 f32s at a time with NEON
                while i + 4 <= len {
                    let a_slice = &a.as_slice().expect("array `a` must be contiguous")[i..i + 4];
                    let b_slice = &b.as_slice().expect("array `b` must be contiguous")[i..i + 4];

                    let a_vec = vld1q_f32(a_slice.as_ptr());
                    let b_vec = vld1q_f32(b_slice.as_ptr());
                    let result_vec = vaddq_f32(a_vec, b_vec);

                    let mut temp = [0.0f32; 4];
                    vst1q_f32(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 4;
                }

                // Handle remaining elements
                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else {
            // Fallback to scalar implementation
            for i in 0..len {
                result.push(a[i] + b[i]);
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        // Fallback to scalar implementation for other architectures
        for i in 0..len {
            result.push(a[i] + b[i]);
        }
    }

    Array1::from_vec(result)
}

/// OPTIMIZED: Compute element-wise addition of two f32 arrays using SIMD operations
///
/// This is the Phase 1 optimization version: the output buffer is pre-allocated to
/// its exact size and the kernels read and write through raw pointers instead of
/// temporary buffers and per-iteration slicing.
#[allow(dead_code)]
pub fn simd_add_f32_optimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();

    // Phase 1 optimization: pre-allocate the exact size, avoid dynamic growth
    let mut result = vec![0.0f32; len];

    // Get contiguous data pointers for direct access
    let a_ptr = a.as_slice().expect("array `a` must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array `b` must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if is_x86_feature_detected!("avx2") {
            unsafe {
                let mut i = 0;
                // Process 8 f32s at a time with AVX2 - direct memory access
                while i + 8 <= len {
                    // Direct pointer access - no bounds checking, no slicing
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);

                    // Direct store - no temporary buffer, no copying
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                // Handle remaining elements efficiently
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if is_x86_feature_detected!("sse") {
            unsafe {
                let mut i = 0;
                // Process 4 f32s at a time with SSE - direct memory access
                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);

                    // Direct store - no temporary buffer
                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                // Handle remaining elements
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            // Fallback to scalar implementation
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                let mut i = 0;
                // Process 4 f32s at a time with NEON - direct memory access
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a_ptr.add(i));
                    let b_vec = vld1q_f32(b_ptr.add(i));
                    let result_vec = vaddq_f32(a_vec, b_vec);

                    // Direct store - no temporary buffer
                    vst1q_f32(result_ptr.add(i), result_vec);
                    i += 4;
                }

                // Handle remaining elements
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            // Fallback to scalar implementation
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        // Fallback to scalar implementation for other architectures
        unsafe {
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
        }
    }

    Array1::from_vec(result)
}

// PHASE 2: CPU feature caching for optimal performance
use std::sync::OnceLock;

/// Cached CPU feature flags, detected once per process.
#[allow(dead_code)]
struct CpuFeatures {
    has_avx512f: bool,
    has_avx2: bool,
    has_sse: bool,
    has_fma: bool,
    has_neon: bool,
}

static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

fn get_cpu_features() -> &'static CpuFeatures {
    CPU_FEATURES.get_or_init(|| {
        #[cfg(target_arch = "x86_64")]
        {
            CpuFeatures {
                has_avx512f: std::arch::is_x86_feature_detected!("avx512f"),
                has_avx2: std::arch::is_x86_feature_detected!("avx2"),
                has_sse: std::arch::is_x86_feature_detected!("sse"),
                has_fma: std::arch::is_x86_feature_detected!("fma"),
                has_neon: false,
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            CpuFeatures {
                has_avx512f: false,
                has_avx2: false,
                has_sse: false,
                has_fma: false, // ARM uses different FMA instructions
                has_neon: std::arch::is_aarch64_feature_detected!("neon"),
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            CpuFeatures {
                has_avx512f: false,
                has_avx2: false,
                has_sse: false,
                has_fma: false,
                has_neon: false,
            }
        }
    })
}

/// PHASE 2: Advanced SIMD addition with CPU feature caching and loop unrolling
#[allow(dead_code)]
pub fn simd_add_f32_fast(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = vec![0.0f32; len];

    let a_ptr = a.as_slice().expect("array `a` must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array `b` must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    let features = get_cpu_features();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if features.has_avx2 {
            unsafe {
                let mut i = 0;

                // Phase 2: Loop unrolling - process 32 elements (4x AVX2) at once
                while i + 32 <= len {
                    // Load 4 AVX2 registers worth of data
                    let a_vec1 = _mm256_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm256_loadu_ps(a_ptr.add(i + 8));
                    let a_vec3 = _mm256_loadu_ps(a_ptr.add(i + 16));
                    let a_vec4 = _mm256_loadu_ps(a_ptr.add(i + 24));

                    let b_vec1 = _mm256_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm256_loadu_ps(b_ptr.add(i + 8));
                    let b_vec3 = _mm256_loadu_ps(b_ptr.add(i + 16));
                    let b_vec4 = _mm256_loadu_ps(b_ptr.add(i + 24));

                    // Parallel execution of 4 SIMD operations
                    let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                    let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                    let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);

                    // Store results
                    _mm256_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm256_storeu_ps(result_ptr.add(i + 8), result_vec2);
                    _mm256_storeu_ps(result_ptr.add(i + 16), result_vec3);
                    _mm256_storeu_ps(result_ptr.add(i + 24), result_vec4);

                    i += 32;
                }

                // Handle remaining 8-element chunks
                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                // Handle remaining elements
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if features.has_sse {
            unsafe {
                let mut i = 0;

                // SSE version with 2x unrolling
                while i + 8 <= len {
                    let a_vec1 = _mm_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm_loadu_ps(a_ptr.add(i + 4));
                    let b_vec1 = _mm_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm_loadu_ps(b_ptr.add(i + 4));

                    let result_vec1 = _mm_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm_add_ps(a_vec2, b_vec2);

                    _mm_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm_storeu_ps(result_ptr.add(i + 4), result_vec2);
                    i += 8;
                }

                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);
                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if features.has_neon {
            unsafe {
                let mut i = 0;

                // NEON version with 2x unrolling
                while i + 8 <= len {
                    let a_vec1 = vld1q_f32(a_ptr.add(i));
                    let a_vec2 = vld1q_f32(a_ptr.add(i + 4));
                    let b_vec1 = vld1q_f32(b_ptr.add(i));
                    let b_vec2 = vld1q_f32(b_ptr.add(i + 4));

                    let result_vec1 = vaddq_f32(a_vec1, b_vec1);
                    let result_vec2 = vaddq_f32(a_vec2, b_vec2);

                    vst1q_f32(result_ptr.add(i), result_vec1);
                    vst1q_f32(result_ptr.add(i + 4), result_vec2);
                    i += 8;
                }

                while i + 4 <= len {
                    let a_vec = vld1q_f32(a_ptr.add(i));
                    let b_vec = vld1q_f32(b_ptr.add(i));
                    let result_vec = vaddq_f32(a_vec, b_vec);
                    vst1q_f32(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        unsafe {
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
        }
    }

    Array1::from_vec(result)
}

/// PHASE 3: Memory-aligned SIMD with prefetching for maximum performance
#[allow(dead_code)]
pub fn simd_add_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = vec![0.0f32; len];

    let a_ptr = a.as_slice().expect("array `a` must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array `b` must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    let features = get_cpu_features();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if features.has_avx2 {
            unsafe {
                let mut i = 0;

                // Phase 3: Advanced optimizations with prefetching
                const PREFETCH_DISTANCE: usize = 256; // 256 f32s = 1 KiB (16 cache lines) ahead

                // Check if we can use aligned loads (rare but optimal)
                let a_aligned = (a_ptr as usize) % 32 == 0;
                let b_aligned = (b_ptr as usize) % 32 == 0;
                let result_aligned = (result_ptr as usize) % 32 == 0;

                if a_aligned && b_aligned && result_aligned && len >= 64 {
                    // Optimal path: All data is aligned - use fastest instructions
                    while i + 64 <= len {
                        // Prefetch future data
                        if i + PREFETCH_DISTANCE < len {
                            _mm_prefetch(
                                a_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                            _mm_prefetch(
                                b_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                        }

                        // Process 64 elements (8x AVX2 registers) with aligned loads
                        let a_vec1 = _mm256_load_ps(a_ptr.add(i));
                        let a_vec2 = _mm256_load_ps(a_ptr.add(i + 8));
                        let a_vec3 = _mm256_load_ps(a_ptr.add(i + 16));
                        let a_vec4 = _mm256_load_ps(a_ptr.add(i + 24));
                        let a_vec5 = _mm256_load_ps(a_ptr.add(i + 32));
                        let a_vec6 = _mm256_load_ps(a_ptr.add(i + 40));
                        let a_vec7 = _mm256_load_ps(a_ptr.add(i + 48));
                        let a_vec8 = _mm256_load_ps(a_ptr.add(i + 56));

                        let b_vec1 = _mm256_load_ps(b_ptr.add(i));
                        let b_vec2 = _mm256_load_ps(b_ptr.add(i + 8));
                        let b_vec3 = _mm256_load_ps(b_ptr.add(i + 16));
                        let b_vec4 = _mm256_load_ps(b_ptr.add(i + 24));
                        let b_vec5 = _mm256_load_ps(b_ptr.add(i + 32));
                        let b_vec6 = _mm256_load_ps(b_ptr.add(i + 40));
                        let b_vec7 = _mm256_load_ps(b_ptr.add(i + 48));
                        let b_vec8 = _mm256_load_ps(b_ptr.add(i + 56));

                        // Parallel execution of 8 SIMD operations
                        let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                        let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                        let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                        let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);
                        let result_vec5 = _mm256_add_ps(a_vec5, b_vec5);
                        let result_vec6 = _mm256_add_ps(a_vec6, b_vec6);
                        let result_vec7 = _mm256_add_ps(a_vec7, b_vec7);
                        let result_vec8 = _mm256_add_ps(a_vec8, b_vec8);

                        // Aligned stores
                        _mm256_store_ps(result_ptr.add(i), result_vec1);
                        _mm256_store_ps(result_ptr.add(i + 8), result_vec2);
                        _mm256_store_ps(result_ptr.add(i + 16), result_vec3);
                        _mm256_store_ps(result_ptr.add(i + 24), result_vec4);
                        _mm256_store_ps(result_ptr.add(i + 32), result_vec5);
                        _mm256_store_ps(result_ptr.add(i + 40), result_vec6);
                        _mm256_store_ps(result_ptr.add(i + 48), result_vec7);
                        _mm256_store_ps(result_ptr.add(i + 56), result_vec8);

                        i += 64;
                    }
                } else {
                    // Standard path: Unaligned data with prefetching
                    while i + 32 <= len {
                        // Prefetch future data for better cache performance
                        if i + PREFETCH_DISTANCE < len {
                            _mm_prefetch(
                                a_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                            _mm_prefetch(
                                b_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                        }

                        // 4x unrolled loop with unaligned loads
                        let a_vec1 = _mm256_loadu_ps(a_ptr.add(i));
                        let a_vec2 = _mm256_loadu_ps(a_ptr.add(i + 8));
                        let a_vec3 = _mm256_loadu_ps(a_ptr.add(i + 16));
                        let a_vec4 = _mm256_loadu_ps(a_ptr.add(i + 24));

                        let b_vec1 = _mm256_loadu_ps(b_ptr.add(i));
                        let b_vec2 = _mm256_loadu_ps(b_ptr.add(i + 8));
                        let b_vec3 = _mm256_loadu_ps(b_ptr.add(i + 16));
                        let b_vec4 = _mm256_loadu_ps(b_ptr.add(i + 24));

                        let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                        let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                        let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                        let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);

                        _mm256_storeu_ps(result_ptr.add(i), result_vec1);
                        _mm256_storeu_ps(result_ptr.add(i + 8), result_vec2);
                        _mm256_storeu_ps(result_ptr.add(i + 16), result_vec3);
                        _mm256_storeu_ps(result_ptr.add(i + 24), result_vec4);

                        i += 32;
                    }
                }

                // Handle remaining 8-element chunks
                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                // Handle remaining elements
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if features.has_sse {
            unsafe {
                let mut i = 0;

                // SSE version with prefetching
                while i + 16 <= len {
                    if i + 128 < len {
                        _mm_prefetch(a_ptr.add(i + 128) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 128) as *const i8, _MM_HINT_T0);
                    }

                    // 4x unrolled SSE
                    let a_vec1 = _mm_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm_loadu_ps(a_ptr.add(i + 4));
                    let a_vec3 = _mm_loadu_ps(a_ptr.add(i + 8));
                    let a_vec4 = _mm_loadu_ps(a_ptr.add(i + 12));

                    let b_vec1 = _mm_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm_loadu_ps(b_ptr.add(i + 4));
                    let b_vec3 = _mm_loadu_ps(b_ptr.add(i + 8));
                    let b_vec4 = _mm_loadu_ps(b_ptr.add(i + 12));

                    let result_vec1 = _mm_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm_add_ps(a_vec2, b_vec2);
                    let result_vec3 = _mm_add_ps(a_vec3, b_vec3);
                    let result_vec4 = _mm_add_ps(a_vec4, b_vec4);

                    _mm_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm_storeu_ps(result_ptr.add(i + 4), result_vec2);
                    _mm_storeu_ps(result_ptr.add(i + 8), result_vec3);
                    _mm_storeu_ps(result_ptr.add(i + 12), result_vec4);

                    i += 16;
                }

                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);
                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            // Scalar with prefetching
            unsafe {
                let mut i = 0;
                while i + 64 <= len {
                    if i + 256 < len {
                        _mm_prefetch(a_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                    }

                    // Unrolled scalar loop
                    for j in 0..64 {
                        *result_ptr.add(i + j) = *a_ptr.add(i + j) + *b_ptr.add(i + j);
                    }
                    i += 64;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        Array1::from_vec(result)
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        // Use Phase 2 implementation for other architectures
        simd_add_f32_fast(a, b)
    }
}

// PHASE 3: AlignedVec integration for maximum SIMD performance
use crate::simd_aligned::AlignedVec;

/// PHASE 3: Element-wise addition of two `AlignedVec<f32>` buffers, relying on the
/// alignment guarantee of `AlignedVec` to use aligned AVX2 loads and stores.
#[allow(dead_code)]
pub fn simd_add_aligned_ultra(
    a: &AlignedVec<f32>,
    b: &AlignedVec<f32>,
) -> Result<AlignedVec<f32>, &'static str> {
    if a.len() != b.len() {
        return Err("Arrays must have the same length");
    }

    let len = a.len();
    let mut result =
        AlignedVec::with_capacity(len).map_err(|_| "Failed to allocate aligned memory")?;

    unsafe {
        let a_ptr = a.as_slice().as_ptr();
        let b_ptr = b.as_slice().as_ptr();
        let result_ptr = result.as_slice().as_ptr() as *mut f32;

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                let mut i = 0;

                // Guaranteed aligned access - use fastest instructions
                while i + 64 <= len {
                    // Prefetch ahead
                    if i + 256 < len {
                        _mm_prefetch(a_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                    }

                    // 8x AVX2 operations with aligned loads/stores
                    for j in (0..64).step_by(8) {
                        let a_vec = _mm256_load_ps(a_ptr.add(i + j)); // Aligned load
                        let b_vec = _mm256_load_ps(b_ptr.add(i + j)); // Aligned load
                        let result_vec = _mm256_add_ps(a_vec, b_vec);
                        _mm256_store_ps(result_ptr.add(i + j), result_vec); // Aligned store
                    }

                    i += 64;
                }

                // Handle remaining elements
                while i + 8 <= len {
                    let a_vec = _mm256_load_ps(a_ptr.add(i));
                    let b_vec = _mm256_load_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_store_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }

                result.set_len(len);
            } else {
                // Scalar fallback so the result is still fully written (and its
                // length set) when AVX2 is not available at runtime.
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
                result.set_len(len);
            }
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            // Fallback for other architectures
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
            result.set_len(len);
        }
    }

    Ok(result)
}

/// Compute element-wise addition of two f64 arrays
///
/// Relies on `ndarray`'s element-wise addition; kept simple to avoid a circular
/// dependency on the unified SIMD interface.
#[allow(dead_code)]
pub fn simd_add_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    // Direct implementation to avoid circular dependency
    a + b
}
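
// A minimal illustrative test sketch (not an exhaustive suite): it checks that the
// SIMD add variants agree with ndarray's own element-wise arithmetic on a small
// input whose length is not a multiple of the vector widths, so the remainder loops
// are exercised as well, and that the maximum/minimum helpers are element-wise.
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    #[test]
    fn add_variants_match_scalar_addition() {
        // Length deliberately not a multiple of 4, 8, 32, or 64.
        let n = 37;
        let a: Array1<f32> = (0..n).map(|i| i as f32 * 0.5).collect();
        let b: Array1<f32> = (0..n).map(|i| 100.0 - i as f32).collect();
        let expected = &a + &b;

        assert_eq!(simd_add_f32(&a.view(), &b.view()), expected);
        assert_eq!(simd_add_f32_optimized(&a.view(), &b.view()), expected);
        assert_eq!(simd_add_f32_fast(&a.view(), &b.view()), expected);
        assert_eq!(simd_add_f32_ultra(&a.view(), &b.view()), expected);
    }

    #[test]
    fn maximum_and_minimum_are_elementwise() {
        let a = Array1::from(vec![1.0f32, 5.0, -2.0, 4.0]);
        let b = Array1::from(vec![3.0f32, 4.0, -6.0, 4.0]);

        assert_eq!(
            simd_maximum_f32(&a.view(), &b.view()),
            Array1::from(vec![3.0f32, 5.0, -2.0, 4.0])
        );
        assert_eq!(
            simd_minimum_f32(&a.view(), &b.view()),
            Array1::from(vec![1.0f32, 4.0, -6.0, 4.0])
        );
    }
}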