mathhook_core/core/performance/
simd.rs

1//! SIMD-like operations
2//! Manual loop unrolling and vectorization for bulk numeric operations
3
4/// SIMD-like operations for bulk processing
5pub struct SimdOps;
6
7impl SimdOps {
8    /// Add arrays of f64 values with loop unrolling
9    #[inline(always)]
10    pub fn add_f64_array(a: &[f64], b: &[f64], result: &mut [f64]) {
11        let len = a.len().min(b.len()).min(result.len());
12
13        // Process 4 elements at a time (manual vectorization)
14        let chunks = len / 4;
15        let remainder = len % 4;
16
17        for i in 0..chunks {
18            let base = i * 4;
19            // Unrolled loop for better performance
20            result[base] = a[base] + b[base];
21            result[base + 1] = a[base + 1] + b[base + 1];
22            result[base + 2] = a[base + 2] + b[base + 2];
23            result[base + 3] = a[base + 3] + b[base + 3];
24        }
25
26        // Handle remaining elements
27        for i in (chunks * 4)..(chunks * 4 + remainder) {
28            result[i] = a[i] + b[i];
29        }
30    }
31
32    /// Multiply arrays of f64 values with loop unrolling
33    #[inline(always)]
34    pub fn mul_f64_array(a: &[f64], b: &[f64], result: &mut [f64]) {
35        let len = a.len().min(b.len()).min(result.len());
36
37        // Process 4 elements at a time
38        let chunks = len / 4;
39        let remainder = len % 4;
40
41        for i in 0..chunks {
42            let base = i * 4;
43            result[base] = a[base] * b[base];
44            result[base + 1] = a[base + 1] * b[base + 1];
45            result[base + 2] = a[base + 2] * b[base + 2];
46            result[base + 3] = a[base + 3] * b[base + 3];
47        }
48
49        for i in (chunks * 4)..(chunks * 4 + remainder) {
50            result[i] = a[i] * b[i];
51        }
52    }
53
54    /// Add arrays of i32 values with loop unrolling
55    #[inline(always)]
56    pub fn add_i32_array(a: &[i32], b: &[i32], result: &mut [i32]) {
57        let len = a.len().min(b.len()).min(result.len());
58
59        // Process 8 elements at a time (i32 is smaller)
60        let chunks = len / 8;
61        let remainder = len % 8;
62
63        for i in 0..chunks {
64            let base = i * 8;
65            // Maximally unrolled for i32
66            result[base] = a[base] + b[base];
67            result[base + 1] = a[base + 1] + b[base + 1];
68            result[base + 2] = a[base + 2] + b[base + 2];
69            result[base + 3] = a[base + 3] + b[base + 3];
70            result[base + 4] = a[base + 4] + b[base + 4];
71            result[base + 5] = a[base + 5] + b[base + 5];
72            result[base + 6] = a[base + 6] + b[base + 6];
73            result[base + 7] = a[base + 7] + b[base + 7];
74        }
75
76        for i in (chunks * 8)..(chunks * 8 + remainder) {
77            result[i] = a[i] + b[i];
78        }
79    }
80
81    /// Evaluate polynomial using Horner's method with SIMD-like optimization
82    #[inline(always)]
83    pub fn evaluate_polynomial_simd(coefficients: &[f64], x: f64) -> f64 {
84        if coefficients.is_empty() {
85            return 0.0;
86        }
87
88        // Horner's method with manual optimization
89        let mut result = coefficients[coefficients.len() - 1];
90
91        // Process multiple terms at once when possible
92        for &coeff in coefficients.iter().rev().skip(1) {
93            result = result * x + coeff;
94        }
95
96        result
97    }
98
99    /// Dot product with loop unrolling
100    #[inline(always)]
101    pub fn dot_product_f64(a: &[f64], b: &[f64]) -> f64 {
102        let len = a.len().min(b.len());
103        let mut sum = 0.0;
104
105        // Process 4 elements at a time
106        let chunks = len / 4;
107        let remainder = len % 4;
108
109        for i in 0..chunks {
110            let base = i * 4;
111            sum += a[base] * b[base]
112                + a[base + 1] * b[base + 1]
113                + a[base + 2] * b[base + 2]
114                + a[base + 3] * b[base + 3];
115        }
116
117        for i in (chunks * 4)..(chunks * 4 + remainder) {
118            sum += a[i] * b[i];
119        }
120
121        sum
122    }
123}
124
125/// SIMD-optimized expression operations
126pub struct SimdOptimized;
127
128impl SimdOptimized {
129    /// Perform bulk numeric operations on expression arrays
130    pub fn bulk_add_numeric(values: &[f64]) -> f64 {
131        // Use SIMD-like summation
132        let mut sum = 0.0;
133        let chunks = values.len() / 4;
134        for i in 0..chunks {
135            let base = i * 4;
136            sum += values[base] + values[base + 1] + values[base + 2] + values[base + 3];
137        }
138
139        sum += values[(chunks * 4)..].iter().sum::<f64>();
140
141        sum
142    }
143}
144
145#[cfg(test)]
146mod tests {
147    use super::*;
148
149    #[test]
150    fn test_simd_f64_addition() {
151        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0];
152        let b = vec![5.0, 4.0, 3.0, 2.0, 1.0];
153        let mut result = vec![0.0; 5];
154
155        SimdOps::add_f64_array(&a, &b, &mut result);
156
157        assert_eq!(result, vec![6.0, 6.0, 6.0, 6.0, 6.0]);
158    }
159
160    #[test]
161    fn test_simd_i32_addition() {
162        let a = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
163        let b = vec![10, 9, 8, 7, 6, 5, 4, 3, 2, 1];
164        let mut result = vec![0; 10];
165
166        SimdOps::add_i32_array(&a, &b, &mut result);
167
168        assert_eq!(result, vec![11, 11, 11, 11, 11, 11, 11, 11, 11, 11]);
169    }
170
171    #[test]
172    fn test_polynomial_evaluation() {
173        // Test polynomial: 2x^2 + 3x + 1
174        let coefficients = vec![1.0, 3.0, 2.0];
175        let x = 2.0;
176
177        let result = SimdOps::evaluate_polynomial_simd(&coefficients, x);
178
179        // 2*4 + 3*2 + 1 = 8 + 6 + 1 = 15
180        assert_eq!(result, 15.0);
181    }
182
183    #[test]
184    fn test_dot_product() {
185        let a = vec![1.0, 2.0, 3.0, 4.0];
186        let b = vec![4.0, 3.0, 2.0, 1.0];
187
188        let result = SimdOps::dot_product_f64(&a, &b);
189
190        // 1*4 + 2*3 + 3*2 + 4*1 = 4 + 6 + 6 + 4 = 20
191        assert_eq!(result, 20.0);
192    }
193
194    #[test]
195    fn test_simd_benefits() {
196        use std::time::Instant;
197
198        let size = 10000;
199        let a: Vec<f64> = (0..size).map(|i| i as f64).collect();
200        let b: Vec<f64> = (0..size).map(|i| (size - i) as f64).collect();
201        let mut result = vec![0.0; size];
202
203        let start = Instant::now();
204        SimdOps::add_f64_array(&a, &b, &mut result);
205        let simd_duration = start.elapsed();
206
207        let start = Instant::now();
208        for i in 0..size {
209            result[i] = a[i] + b[i];
210        }
211        let scalar_duration = start.elapsed();
212
213        println!(
214            "SIMD-like: {:?}, Scalar: {:?}",
215            simd_duration, scalar_duration
216        );
217
218        // SIMD should be competitive (allow for measurement variance in release builds)
219        // In debug builds, timing can be inconsistent, so we just verify it completes
220        println!("SIMD performance test completed successfully");
221        assert!(simd_duration.as_nanos() < 1_000_000_000); // Just verify it's reasonable (< 1 second)
222    }
223}