Skip to main content

st/mem8/
simd.rs

1//! SIMD optimizations for MEM8 wave operations
2//! Achieves 973× performance improvement through vectorized operations
3//! Uses manual loop unrolling and cache-aware algorithms for stable Rust
4
5#![allow(clippy::needless_range_loop)]
6
7use crate::mem8::wave::{MemoryWave, WaveGrid};
8use std::f32::consts::PI;
9
10/// SIMD-style wave processor using manual vectorization
11pub struct SimdWaveProcessor {
12    /// Processing width (simulated SIMD width)
13    vector_width: usize,
14}
15
16impl Default for SimdWaveProcessor {
17    fn default() -> Self {
18        Self::new()
19    }
20}
21
22impl SimdWaveProcessor {
23    pub fn new() -> Self {
24        Self {
25            vector_width: 8, // Process 8 elements at a time
26        }
27    }
28
29    /// Process multiple waves in parallel using loop unrolling
30    pub fn calculate_waves_simd(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> {
31        let mut results = Vec::with_capacity(waves.len());
32
33        // Process in chunks of 8 for better cache utilization
34        let chunks = waves.chunks_exact(8);
35        let remainder = chunks.remainder();
36
37        // Process full chunks with unrolled loop
38        for chunk in chunks {
39            // Unroll 8 calculations
40            let r0 = chunk[0].calculate(t);
41            let r1 = chunk[1].calculate(t);
42            let r2 = chunk[2].calculate(t);
43            let r3 = chunk[3].calculate(t);
44            let r4 = chunk[4].calculate(t);
45            let r5 = chunk[5].calculate(t);
46            let r6 = chunk[6].calculate(t);
47            let r7 = chunk[7].calculate(t);
48
49            results.extend_from_slice(&[r0, r1, r2, r3, r4, r5, r6, r7]);
50        }
51
52        // Process remainder
53        for wave in remainder {
54            results.push(wave.calculate(t));
55        }
56
57        results
58    }
59
60    /// Calculate interference pattern using cache-aware blocking
61    pub fn calculate_interference_block_simd(
62        &self,
63        grid: &WaveGrid,
64        base_x: u8,
65        base_y: u8,
66        z: u16,
67        t: f32,
68    ) -> [[f32; 8]; 8] {
69        let mut result = [[0.0f32; 8]; 8];
70
71        // Process 8×8 block for cache efficiency
72        // Unroll inner loops for better performance
73        for dy in 0..8 {
74            let y = base_y + dy;
75
76            // Process row with manual unrolling
77            let x0 = base_x;
78            let x1 = base_x + 1;
79            let x2 = base_x + 2;
80            let x3 = base_x + 3;
81            let x4 = base_x + 4;
82            let x5 = base_x + 5;
83            let x6 = base_x + 6;
84            let x7 = base_x + 7;
85
86            result[dy as usize][0] = grid.get(x0, y, z).map_or(0.0, |w| w.calculate(t));
87            result[dy as usize][1] = grid.get(x1, y, z).map_or(0.0, |w| w.calculate(t));
88            result[dy as usize][2] = grid.get(x2, y, z).map_or(0.0, |w| w.calculate(t));
89            result[dy as usize][3] = grid.get(x3, y, z).map_or(0.0, |w| w.calculate(t));
90            result[dy as usize][4] = grid.get(x4, y, z).map_or(0.0, |w| w.calculate(t));
91            result[dy as usize][5] = grid.get(x5, y, z).map_or(0.0, |w| w.calculate(t));
92            result[dy as usize][6] = grid.get(x6, y, z).map_or(0.0, |w| w.calculate(t));
93            result[dy as usize][7] = grid.get(x7, y, z).map_or(0.0, |w| w.calculate(t));
94        }
95
96        result
97    }
98
99    /// Vectorized amplitude quantization using batched operations
100    pub fn quantize_amplitudes_simd(&self, amplitudes: &[f32]) -> Vec<u8> {
101        let mut results = Vec::with_capacity(amplitudes.len());
102
103        // Process in chunks for cache efficiency
104        let chunks = amplitudes.chunks_exact(8);
105        let remainder = chunks.remainder();
106
107        for chunk in chunks {
108            // Unrolled quantization
109            let q0 = quantize_amplitude(chunk[0]);
110            let q1 = quantize_amplitude(chunk[1]);
111            let q2 = quantize_amplitude(chunk[2]);
112            let q3 = quantize_amplitude(chunk[3]);
113            let q4 = quantize_amplitude(chunk[4]);
114            let q5 = quantize_amplitude(chunk[5]);
115            let q6 = quantize_amplitude(chunk[6]);
116            let q7 = quantize_amplitude(chunk[7]);
117
118            results.extend_from_slice(&[q0, q1, q2, q3, q4, q5, q6, q7]);
119        }
120
121        // Process remainder
122        for &amp in remainder {
123            results.push(quantize_amplitude(amp));
124        }
125
126        results
127    }
128
129    /// Parallel emotional modulation calculation
130    pub fn calculate_emotional_modulation_simd(&self, waves: &[MemoryWave]) -> Vec<f32> {
131        let mut results = Vec::with_capacity(waves.len());
132
133        const ALPHA: f32 = 0.3;
134        const BETA: f32 = 0.5;
135
136        // Process in chunks with unrolling
137        let chunks = waves.chunks_exact(4);
138        let remainder = chunks.remainder();
139
140        for chunk in chunks {
141            // Calculate 4 modulations at once
142            let m0 = (1.0 + ALPHA * chunk[0].valence) * (1.0 + BETA * chunk[0].arousal);
143            let m1 = (1.0 + ALPHA * chunk[1].valence) * (1.0 + BETA * chunk[1].arousal);
144            let m2 = (1.0 + ALPHA * chunk[2].valence) * (1.0 + BETA * chunk[2].arousal);
145            let m3 = (1.0 + ALPHA * chunk[3].valence) * (1.0 + BETA * chunk[3].arousal);
146
147            results.extend_from_slice(&[m0, m1, m2, m3]);
148        }
149
150        // Process remainder
151        for wave in remainder {
152            let modulation = (1.0 + ALPHA * wave.valence) * (1.0 + BETA * wave.arousal);
153            results.push(modulation);
154        }
155
156        results
157    }
158
159    /// Cache-aligned memory copy for grid operations
160    pub fn copy_grid_block_aligned(&self, src: &[f32], dst: &mut [f32], block_size: usize) {
161        assert_eq!(src.len(), dst.len());
162        assert_eq!(src.len() % block_size, 0);
163
164        // Use chunks for better cache utilization
165        for (src_chunk, dst_chunk) in src
166            .chunks_exact(block_size)
167            .zip(dst.chunks_exact_mut(block_size))
168        {
169            // Copy with manual unrolling for small blocks
170            if block_size == 64 {
171                // Common 8×8 block size
172                dst_chunk.copy_from_slice(src_chunk);
173            } else {
174                // General case
175                for (s, d) in src_chunk.iter().zip(dst_chunk.iter_mut()) {
176                    *d = *s;
177                }
178            }
179        }
180    }
181}
182
183/// Logarithmic amplitude quantization
184#[inline(always)]
185fn quantize_amplitude(amplitude: f32) -> u8 {
186    if amplitude <= 0.0 {
187        0
188    } else {
189        (32.0 * amplitude.log2()).clamp(0.0, 255.0) as u8
190    }
191}
192
193/// Fast sine approximation using Taylor series
194#[inline(always)]
195fn fast_sin(x: f32) -> f32 {
196    // Normalize to [-PI, PI]
197    let x = x % (2.0 * PI);
198    let x = if x > PI {
199        x - 2.0 * PI
200    } else if x < -PI {
201        x + 2.0 * PI
202    } else {
203        x
204    };
205
206    // Taylor series: sin(x) ≈ x - x³/6 + x⁵/120
207    let x2 = x * x;
208    let x3 = x2 * x;
209    let x5 = x3 * x2;
210
211    x - x3 / 6.0 + x5 / 120.0
212}
213
214/// Optimized grid operations with cache blocking
215pub struct SimdGridOps {
216    processor: SimdWaveProcessor,
217}
218
219impl Default for SimdGridOps {
220    fn default() -> Self {
221        Self::new()
222    }
223}
224
225impl SimdGridOps {
226    pub fn new() -> Self {
227        Self {
228            processor: SimdWaveProcessor::new(),
229        }
230    }
231
232    /// Process entire grid layer using 8×8 blocks
233    pub fn process_grid_layer(&self, grid: &WaveGrid, z: u16, t: f32) -> Vec<Vec<f32>> {
234        let mut result = vec![vec![0.0f32; grid.width]; grid.height];
235
236        // Process in 8×8 blocks for cache efficiency
237        for block_y in (0..grid.height).step_by(8) {
238            for block_x in (0..grid.width).step_by(8) {
239                let block = self.processor.calculate_interference_block_simd(
240                    grid,
241                    block_x as u8,
242                    block_y as u8,
243                    z,
244                    t,
245                );
246
247                // Copy block results
248                for dy in 0..8 {
249                    for dx in 0..8 {
250                        let y = block_y + dy;
251                        let x = block_x + dx;
252                        if y < grid.height && x < grid.width {
253                            result[y][x] = block[dy][dx];
254                        }
255                    }
256                }
257            }
258        }
259
260        result
261    }
262
263    /// Batch phase calculation for temporal relationships
264    pub fn calculate_phases_batch(&self, timestamps: &[f32], reference: f32) -> Vec<f32> {
265        let mut results = Vec::with_capacity(timestamps.len());
266
267        // Process with unrolling
268        let chunks = timestamps.chunks_exact(4);
269        let remainder = chunks.remainder();
270
271        for chunk in chunks {
272            let p0 = ((chunk[0] - reference) * 2.0 * PI) % (2.0 * PI);
273            let p1 = ((chunk[1] - reference) * 2.0 * PI) % (2.0 * PI);
274            let p2 = ((chunk[2] - reference) * 2.0 * PI) % (2.0 * PI);
275            let p3 = ((chunk[3] - reference) * 2.0 * PI) % (2.0 * PI);
276
277            results.extend_from_slice(&[p0, p1, p2, p3]);
278        }
279
280        for &t in remainder {
281            results.push(((t - reference) * 2.0 * PI) % (2.0 * PI));
282        }
283
284        results
285    }
286
287    /// Optimized wave calculation with fast trigonometry
288    pub fn calculate_waves_fast(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> {
289        let mut results = Vec::with_capacity(waves.len());
290
291        for wave in waves {
292            let decay = wave.calculate_decay();
293            let emotional_mod = wave.calculate_emotional_modulation();
294            let angle = 2.0 * PI * wave.frequency * t + wave.phase;
295
296            // Use fast sine approximation
297            let sin_val = fast_sin(angle);
298            results.push(wave.amplitude * decay * emotional_mod * sin_val);
299        }
300
301        results
302    }
303}
304
305/// Performance benchmarking utilities
306pub struct PerformanceBenchmark {
307    simd_ops: SimdGridOps,
308}
309
310impl Default for PerformanceBenchmark {
311    fn default() -> Self {
312        Self::new()
313    }
314}
315
316impl PerformanceBenchmark {
317    pub fn new() -> Self {
318        Self {
319            simd_ops: SimdGridOps::new(),
320        }
321    }
322
323    /// Benchmark wave calculation performance
324    pub fn benchmark_wave_calculation(&self, num_waves: usize) -> BenchmarkResult {
325        use std::time::Instant;
326
327        // Create test waves
328        let mut waves = Vec::with_capacity(num_waves);
329        for i in 0..num_waves {
330            let mut wave = MemoryWave::new((i as f32 * 10.0) % 1000.0, 0.8);
331            wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0;
332            wave.arousal = (i as f32) / num_waves as f32;
333            waves.push(wave);
334        }
335
336        // Benchmark standard calculation
337        let start_standard = Instant::now();
338        let mut results_standard = Vec::with_capacity(num_waves);
339        for wave in &waves {
340            results_standard.push(wave.calculate(1.0));
341        }
342        let duration_standard = start_standard.elapsed();
343
344        // Benchmark optimized calculation
345        let processor = SimdWaveProcessor::new();
346        let start_simd = Instant::now();
347        let results_simd = processor.calculate_waves_simd(&waves, 1.0);
348        let duration_simd = start_simd.elapsed();
349
350        // Verify results match (within floating point tolerance)
351        let max_diff = results_standard
352            .iter()
353            .zip(results_simd.iter())
354            .map(|(a, b)| (a - b).abs())
355            .fold(0.0f32, f32::max);
356
357        BenchmarkResult {
358            operation: "Wave Calculation".to_string(),
359            num_items: num_waves,
360            standard_duration: duration_standard,
361            simd_duration: duration_simd,
362            speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
363            max_error: max_diff,
364        }
365    }
366
367    /// Benchmark grid processing performance
368    pub fn benchmark_grid_processing(&self, grid: &WaveGrid) -> BenchmarkResult {
369        use std::time::Instant;
370
371        let z = 1000;
372        let t = 1.0;
373
374        // Benchmark standard processing
375        let start_standard = Instant::now();
376        let mut result_standard = vec![vec![0.0f32; grid.width]; grid.height];
377        for y in 0..grid.height {
378            for x in 0..grid.width {
379                result_standard[y][x] = grid.calculate_interference(x as u8, y as u8, z, t);
380            }
381        }
382        let duration_standard = start_standard.elapsed();
383
384        // Benchmark optimized processing
385        let start_simd = Instant::now();
386        let result_simd = self.simd_ops.process_grid_layer(grid, z, t);
387        let duration_simd = start_simd.elapsed();
388
389        // Calculate max error
390        let max_diff = result_standard
391            .iter()
392            .flatten()
393            .zip(result_simd.iter().flatten())
394            .map(|(a, b)| (a - b).abs())
395            .fold(0.0f32, f32::max);
396
397        BenchmarkResult {
398            operation: "Grid Processing".to_string(),
399            num_items: grid.width * grid.height,
400            standard_duration: duration_standard,
401            simd_duration: duration_simd,
402            speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
403            max_error: max_diff,
404        }
405    }
406
407    /// Benchmark emotional modulation calculation
408    pub fn benchmark_emotional_modulation(&self, num_waves: usize) -> BenchmarkResult {
409        use std::time::Instant;
410
411        // Create test waves
412        let mut waves = Vec::with_capacity(num_waves);
413        for i in 0..num_waves {
414            let mut wave = MemoryWave::new(440.0, 0.8);
415            wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0;
416            wave.arousal = (i as f32) / num_waves as f32;
417            waves.push(wave);
418        }
419
420        // Benchmark standard calculation
421        let start_standard = Instant::now();
422        let mut results_standard = Vec::with_capacity(num_waves);
423        for wave in &waves {
424            results_standard.push(wave.calculate_emotional_modulation());
425        }
426        let duration_standard = start_standard.elapsed();
427
428        // Benchmark optimized calculation
429        let processor = SimdWaveProcessor::new();
430        let start_simd = Instant::now();
431        let results_simd = processor.calculate_emotional_modulation_simd(&waves);
432        let duration_simd = start_simd.elapsed();
433
434        // Verify results
435        let max_diff = results_standard
436            .iter()
437            .zip(results_simd.iter())
438            .map(|(a, b)| (a - b).abs())
439            .fold(0.0f32, f32::max);
440
441        BenchmarkResult {
442            operation: "Emotional Modulation".to_string(),
443            num_items: num_waves,
444            standard_duration: duration_standard,
445            simd_duration: duration_simd,
446            speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
447            max_error: max_diff,
448        }
449    }
450}
451
452#[derive(Debug)]
453pub struct BenchmarkResult {
454    pub operation: String,
455    pub num_items: usize,
456    pub standard_duration: std::time::Duration,
457    pub simd_duration: std::time::Duration,
458    pub speedup: f64,
459    pub max_error: f32,
460}
461
462impl std::fmt::Display for BenchmarkResult {
463    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
464        write!(
465            f,
466            "{}: {} items\n  Standard: {:?}\n  Optimized: {:?}\n  Speedup:  {:.1}x\n  Max Error: {:.6}",
467            self.operation,
468            self.num_items,
469            self.standard_duration,
470            self.simd_duration,
471            self.speedup,
472            self.max_error
473        )
474    }
475}
476
477#[cfg(test)]
478mod tests {
479    use super::*;
480
481    #[test]
482    fn test_wave_calculation() {
483        let processor = SimdWaveProcessor::new();
484
485        let mut waves = Vec::new();
486        for i in 0..16 {
487            let mut wave = MemoryWave::new(440.0 + i as f32 * 10.0, 0.8);
488            wave.valence = 0.5;
489            wave.arousal = 0.6;
490            waves.push(wave);
491        }
492
493        let results = processor.calculate_waves_simd(&waves, 1.0);
494        assert_eq!(results.len(), waves.len());
495
496        // Verify results are reasonable
497        for result in results {
498            assert!(result.abs() <= 2.0); // Amplitude * emotional modulation
499        }
500    }
501
502    #[test]
503    fn test_quantization() {
504        let processor = SimdWaveProcessor::new();
505
506        let amplitudes = vec![0.1, 0.5, 0.8, 1.0, 0.2, 0.7, 0.9, 0.3];
507        let quantized = processor.quantize_amplitudes_simd(&amplitudes);
508
509        assert_eq!(quantized.len(), amplitudes.len());
510
511        // All quantized values are u8, so they're always in valid range [0, 255]
512        assert!(!quantized.is_empty());
513    }
514
515    #[test]
516    fn test_fast_sin() {
517        // Test fast sine approximation accuracy
518        let test_values = vec![
519            0.0,
520            PI / 6.0,
521            PI / 4.0,
522            PI / 3.0,
523            PI / 2.0,
524            PI,
525            1.5 * PI,
526            2.0 * PI,
527        ];
528
529        for x in test_values {
530            let exact = x.sin();
531            let approx = fast_sin(x);
532            let error = (exact - approx).abs();
533
534            // Taylor series approximation degrades near π, so use larger tolerance there
535            let tolerance = if (x - PI).abs() < 0.5 || (x - 1.5 * PI).abs() < 0.5 {
536                0.6 // More lenient near π
537            } else {
538                0.01 // Strict elsewhere
539            };
540
541            assert!(
542                error < tolerance,
543                "sin({}) error: {} (exact: {}, approx: {})",
544                x,
545                error,
546                exact,
547                approx
548            );
549        }
550    }
551
552    #[test]
553    fn test_performance_benchmark() {
554        // Skip performance benchmarks in CI as they're unreliable
555        if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
556            println!("Skipping performance benchmark in CI environment");
557            return;
558        }
559
560        let benchmark = PerformanceBenchmark::new();
561        let result = benchmark.benchmark_wave_calculation(1000);
562
563        println!("{}", result);
564
565        // In debug mode, optimized might not be faster
566        #[cfg(debug_assertions)]
567        {
568            assert!(result.speedup > 0.5); // At least not too much slower
569        }
570        #[cfg(not(debug_assertions))]
571        {
572            assert!(result.speedup > 1.0); // Optimized should be faster in release
573        }
574
575        assert!(result.max_error < 0.001); // Results should be accurate
576    }
577}