lambdust 0.1.1

A Scheme dialect with gradual typing and effect systems
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
//! SIMD performance benchmarking and analysis suite
//!
//! This module provides comprehensive benchmarking tools for measuring
//! the performance impact of SIMD optimizations in Lambdust's numeric system.

#![allow(clippy::uninlined_format_args)] // Allow format! with {} placeholders for readability

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use super::{NumericValue, simd_optimization::*};

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
use super::{NumericValue, simd_optimization_stub::*};
use std::time::{Duration, Instant};

/// Comprehensive benchmark suite for SIMD operations
///
/// Times the SIMD numeric kernels against plain scalar loops and packages the
/// measurements as [`SimdBenchmarkResults`] / [`BenchmarkSuiteResults`].
pub struct SimdBenchmarkSuite {
    // SIMD backend used by the dot-product and numeric-tower benchmarks.
    ops: SimdNumericOps,
}

/// Results for a single SIMD benchmark operation
///
/// One value of this type is produced per timed comparison of a SIMD kernel
/// against its scalar counterpart.
#[derive(Debug, Clone)]
pub struct SimdBenchmarkResults {
    /// Operation name
    pub operation: String,
    /// Array size tested
    pub array_size: usize,
    /// SIMD execution time in nanoseconds
    /// (total wall-clock time of the timed SIMD section, not a per-call figure)
    pub simd_time_ns: u64,
    /// Scalar execution time in nanoseconds
    /// (total wall-clock time of the timed scalar section, not a per-call figure)
    pub scalar_time_ns: u64,
    /// Calculated speedup (scalar_time / simd_time)
    pub speedup: f64,
    /// Memory bandwidth utilization
    ///
    /// A fraction in `0.0..=1.0`. The benchmarks in this module fill this with a
    /// fixed estimate; it is not measured.
    pub bandwidth_utilization: f64,
}

/// Detailed benchmark results with performance analysis
///
/// `format_detailed_report` pairs `array_sizes[i]` with `size_results[i]`, so
/// entries without a counterpart at the same index are not reported.
#[derive(Debug, Clone)]
pub struct BenchmarkSuiteResults {
    /// Array sizes tested
    pub array_sizes: Vec<usize>,
    /// Results for each array size
    pub size_results: Vec<SimdBenchmarkResults>,
    /// Overall performance summary
    pub summary: PerformanceSummary,
}

/// Performance summary across all benchmarks
///
/// Every field is derived from the list of per-benchmark speedups; the two
/// utilization figures are model-based estimates rather than measurements.
#[derive(Debug, Clone)]
pub struct PerformanceSummary {
    /// Average speedup across all tests
    pub avg_speedup: f64,
    /// Best case speedup
    pub max_speedup: f64,
    /// Worst case speedup
    pub min_speedup: f64,
    /// Recommended minimum array size for SIMD
    ///
    /// The first tested size whose speedup exceeded 1.5x, or 512 when none did.
    pub recommended_threshold: usize,
    /// Estimated memory bandwidth utilization
    ///
    /// A fraction capped at 1.0, computed from the average speedup with fixed
    /// bandwidth constants.
    pub memory_bandwidth_utilization: f64,
    /// SIMD instruction utilization efficiency
    ///
    /// A fraction capped at 1.0: average speedup divided by an assumed
    /// theoretical maximum speedup of 4x.
    pub simd_utilization: f64,
}

impl Default for SimdBenchmarkSuite {
    fn default() -> Self {
        Self::new()
    }
}

impl SimdBenchmarkSuite {
    /// Builds a benchmark suite backed by a freshly constructed `SimdNumericOps`.
    pub fn new() -> Self {
        Self {
            ops: SimdNumericOps::new(),
        }
    }

    /// Creates a benchmark suite intended for the current CPU.
    ///
    /// No CPU-specific tuning is applied at present: the returned suite is the
    /// same one `new` produces.
    pub fn with_optimal_config() -> Self {
        // `SimdConfig` was removed, so the default suite is the only configuration.
        Self::default()
    }

    /// Runs a comprehensive benchmark across multiple array sizes and operations
    pub fn run_comprehensive_benchmark(&mut self) -> BenchmarkSuiteResults {
        let array_sizes = vec![8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384];
        let mut size_results = Vec::new();
        let mut speedups = Vec::new();

        for &size in &array_sizes {
            println!("Benchmarking array size: {}", size);
            
            // Test dot product operation
            let dot_result = self.benchmark_dot_product(size);
            let best_result = dot_result;
            
            speedups.push(best_result.speedup);
            size_results.push(best_result);
        }

        let summary = self.calculate_summary(&array_sizes, &speedups);
        
        BenchmarkSuiteResults {
            array_sizes,
            size_results,
            summary,
        }
    }

    /// Benchmarks element-wise addition of two `size`-element `f64` arrays.
    ///
    /// The left operand is `i * 1.5` and the right operand is `(size - i) * 0.5`
    /// for `i` in `0..size`. The SIMD side calls `add_f64_arrays`; the scalar
    /// reference adds the operands one element at a time.
    pub fn benchmark_addition(&self, size: usize) -> SimdBenchmarkResults {
        let lhs: Vec<f64> = (0..size).map(|idx| idx as f64 * 1.5).collect();
        let rhs: Vec<f64> = (0..size).map(|idx| (size - idx) as f64 * 0.5).collect();

        // Scalar reference: plain element-by-element sum into the output slice.
        let scalar_add = |xs: &[f64], ys: &[f64], out: &mut [f64]| {
            for (slot, (x, y)) in out.iter_mut().zip(xs.iter().zip(ys.iter())) {
                *slot = x + y;
            }
        };

        self.benchmark_f64_operation(
            &lhs,
            &rhs,
            |ops, xs, ys, out| ops.add_f64_arrays(xs, ys, out),
            scalar_add,
            "addition",
        )
    }

    /// Benchmarks element-wise multiplication of two `size`-element `f64` arrays.
    ///
    /// The left operand cycles through `1.0..=100.0` (`i % 100 + 1`) and the right
    /// operand through values in `1.0..=50.0` (`(i * 7) % 50 + 1`). The SIMD side
    /// calls `multiply_f64_arrays`; the scalar reference multiplies the operands
    /// one element at a time.
    pub fn benchmark_multiplication(&self, size: usize) -> SimdBenchmarkResults {
        let lhs: Vec<f64> = (0..size).map(|idx| (idx % 100) as f64 + 1.0).collect();
        let rhs: Vec<f64> = (0..size).map(|idx| ((idx * 7) % 50) as f64 + 1.0).collect();

        // Scalar reference: plain element-by-element product into the output slice.
        let scalar_mul = |xs: &[f64], ys: &[f64], out: &mut [f64]| {
            for (slot, (x, y)) in out.iter_mut().zip(xs.iter().zip(ys.iter())) {
                *slot = x * y;
            }
        };

        self.benchmark_f64_operation(
            &lhs,
            &rhs,
            |ops, xs, ys, out| ops.multiply_f64_arrays(xs, ys, out),
            scalar_mul,
            "multiplication",
        )
    }

    /// Benchmark dot product operations
    ///
    /// Builds two `size`-element inputs (`sin(i)` and `cos(i)`), then times 1000
    /// calls of the SIMD dot product against 1000 evaluations of a scalar
    /// `zip`/`map`/`sum` loop over the same data.
    ///
    /// The inputs and outputs of both timed loops are passed through
    /// `std::hint::black_box`; without it the loops are loop-invariant and the
    /// optimizer is free to hoist or drop the work, which makes the timings
    /// meaningless.
    ///
    /// A warning is printed to stderr when the two results disagree. A failing
    /// SIMD call is treated as a result of `0.0`, so it normally shows up as such
    /// a disagreement.
    ///
    /// The returned `bandwidth_utilization` is a fixed estimate (0.98 below 1024
    /// elements, 0.85 otherwise), not a measurement.
    pub fn benchmark_dot_product(&mut self, size: usize) -> SimdBenchmarkResults {
        let a: Vec<f64> = (0..size).map(|i| (i as f64).sin()).collect();
        let b: Vec<f64> = (0..size).map(|i| (i as f64).cos()).collect();

        let iterations = 1000;

        // Benchmark SIMD dot product
        let start = Instant::now();
        let mut simd_result = 0.0;
        for _ in 0..iterations {
            simd_result = std::hint::black_box(
                self.ops
                    .dot_product_f64(std::hint::black_box(&a), std::hint::black_box(&b))
                    .unwrap_or(0.0),
            );
        }
        let simd_duration = start.elapsed();

        // Benchmark scalar dot product
        let start = Instant::now();
        let mut scalar_result: f64 = 0.0;
        for _ in 0..iterations {
            scalar_result = std::hint::black_box(
                std::hint::black_box(&a)
                    .iter()
                    .zip(std::hint::black_box(&b).iter())
                    .map(|(x, y)| x * y)
                    .sum::<f64>(),
            );
        }
        let scalar_duration = start.elapsed();

        // Verify results are approximately equal. The tolerance is relative for
        // large magnitudes and absolute near zero, so a reference value of exactly
        // 0.0 (e.g. size 0 or 1) does not cause a division by zero.
        let abs_error = (simd_result - scalar_result).abs();
        let tolerance = 1e-10 * scalar_result.abs().max(1.0);
        if abs_error > tolerance {
            eprintln!("Warning: SIMD and scalar dot product results differ: {} vs {}",
                     simd_result, scalar_result);
        }

        // Clamp both durations to at least 1 ns so the ratio is always finite,
        // even when the clock resolution reports a zero-length interval.
        let simd_ns = simd_duration.as_nanos().max(1);
        let scalar_ns = scalar_duration.as_nanos().max(1);
        let speedup = scalar_ns as f64 / simd_ns as f64;

        SimdBenchmarkResults {
            operation: "dot_product".to_string(),
            array_size: size,
            simd_time_ns: simd_duration.as_nanos() as u64,
            scalar_time_ns: scalar_duration.as_nanos() as u64,
            speedup,
            bandwidth_utilization: if size < 1024 { 0.98 } else { 0.85 },
        }
    }

    /// Generic benchmark for f64 operations
    ///
    /// Runs `simd_fn` and `scalar_fn` 1000 times each over the same inputs and
    /// compares total wall-clock time.
    ///
    /// * `a`, `b` - input operands; the output buffers are sized to `a.len()`.
    /// * `simd_fn` - SIMD kernel under test; receives a dedicated `SimdNumericOps`.
    /// * `scalar_fn` - scalar reference implementation writing into the output slice.
    /// * `operation_name` - label stored in the result and used in warnings.
    ///
    /// The `SimdNumericOps` instance is constructed before the timer starts so
    /// that its setup cost is not charged to the SIMD kernel. Inputs and outputs
    /// are passed through `std::hint::black_box` to keep the optimizer from
    /// hoisting or eliminating the repeated, loop-invariant work.
    ///
    /// Warnings are printed to stderr when any SIMD call returns an error and
    /// when the final SIMD and scalar outputs differ by more than `1e-10` in any
    /// element.
    ///
    /// The returned `bandwidth_utilization` is a fixed estimate (0.85).
    fn benchmark_f64_operation<F, G>(&self,
                                     a: &[f64],
                                     b: &[f64],
                                     mut simd_fn: F,
                                     scalar_fn: G,
                                     operation_name: &str) -> SimdBenchmarkResults
    where
        F: FnMut(&mut SimdNumericOps, &[f64], &[f64], &mut [f64]) -> crate::diagnostics::Result<()>,
        G: Fn(&[f64], &[f64], &mut [f64]),
    {
        let size = a.len();
        let iterations = 1000;

        // A separate ops instance is needed because the kernels take `&mut` while
        // this method only has `&self`. Build it outside the timed region.
        let mut simd_ops = SimdNumericOps::new();

        // Benchmark SIMD operation
        let mut simd_result = vec![0.0; size];
        let mut simd_failures = 0usize;
        let start = Instant::now();
        for _ in 0..iterations {
            let outcome = simd_fn(
                &mut simd_ops,
                std::hint::black_box(a),
                std::hint::black_box(b),
                &mut simd_result,
            );
            if outcome.is_err() {
                simd_failures += 1;
            }
            let _ = std::hint::black_box(&simd_result);
        }
        let simd_duration = start.elapsed();

        // Benchmark scalar operation
        let mut scalar_result = vec![0.0; size];
        let start = Instant::now();
        for _ in 0..iterations {
            scalar_fn(std::hint::black_box(a), std::hint::black_box(b), &mut scalar_result);
            let _ = std::hint::black_box(&scalar_result);
        }
        let scalar_duration = start.elapsed();

        // Surface kernel failures instead of silently timing an error path.
        if simd_failures > 0 {
            eprintln!("Warning: SIMD {} returned an error in {} of {} iterations",
                     operation_name, simd_failures, iterations);
        }

        // Verify results are approximately equal
        let max_error = simd_result.iter()
            .zip(scalar_result.iter())
            .map(|(s, c)| (s - c).abs())
            .fold(0.0, f64::max);

        if max_error > 1e-10 {
            eprintln!("Warning: SIMD and scalar {} results differ by up to {}",
                     operation_name, max_error);
        }

        // Clamp both durations to at least 1 ns so the ratio is always finite
        // (empty inputs or a coarse clock can yield a zero-length interval).
        let simd_ns = simd_duration.as_nanos().max(1);
        let scalar_ns = scalar_duration.as_nanos().max(1);
        let speedup = scalar_ns as f64 / simd_ns as f64;

        SimdBenchmarkResults {
            operation: operation_name.to_string(),
            array_size: size,
            simd_time_ns: simd_duration.as_nanos() as u64,
            scalar_time_ns: scalar_duration.as_nanos() as u64,
            speedup,
            bandwidth_utilization: 0.85, // Estimated memory bandwidth utilization
        }
    }

    /// Calculate performance summary from benchmark results
    ///
    /// * `sizes` - array sizes, paired index-by-index with `speedups` when
    ///   searching for the recommended threshold.
    /// * `speedups` - measured speedup of each benchmark.
    ///
    /// The recommended threshold is the first size whose speedup exceeds 1.5x,
    /// falling back to 512. The two utilization figures are rough model-based
    /// estimates derived from the average speedup and fixed constants, each
    /// capped at 1.0.
    ///
    /// With no speedups at all there is nothing to average, so an all-zero
    /// summary (with the default threshold) is returned. Without this guard the
    /// average would be NaN, the minimum would be infinite, and both utilization
    /// figures would come out as 100% because `f64::min` ignores NaN.
    fn calculate_summary(&self, sizes: &[usize], speedups: &[f64]) -> PerformanceSummary {
        // Default SIMD threshold used when no size reaches the target speedup.
        const DEFAULT_THRESHOLD: usize = 512;
        // Speedup a size must exceed to be recommended for SIMD.
        const THRESHOLD_SPEEDUP: f64 = 1.5;
        // Default estimate of peak memory bandwidth, in GB/s.
        const PEAK_BANDWIDTH_GB_S: f64 = 100.0;
        // Rough estimate of achieved GB/s per unit of speedup.
        const BANDWIDTH_PER_SPEEDUP_GB_S: f64 = 25.0;
        // Default AVX2 estimate of the best achievable speedup.
        const THEORETICAL_MAX_SPEEDUP: f64 = 4.0;

        if speedups.is_empty() {
            return PerformanceSummary {
                avg_speedup: 0.0,
                max_speedup: 0.0,
                min_speedup: 0.0,
                recommended_threshold: DEFAULT_THRESHOLD,
                memory_bandwidth_utilization: 0.0,
                simd_utilization: 0.0,
            };
        }

        let avg_speedup = speedups.iter().sum::<f64>() / speedups.len() as f64;
        let max_speedup = speedups.iter().fold(0.0f64, |a, &b| a.max(b));
        let min_speedup = speedups.iter().fold(f64::INFINITY, |a: f64, &b| a.min(b));

        // Find recommended threshold (where speedup > 1.5x)
        let recommended_threshold = sizes.iter().zip(speedups.iter())
            .find(|&(_, &speedup)| speedup > THRESHOLD_SPEEDUP)
            .map(|(&size, _)| size)
            .unwrap_or(DEFAULT_THRESHOLD);

        // Estimate memory bandwidth utilization (simplified model)
        let avg_achieved_bandwidth = avg_speedup * BANDWIDTH_PER_SPEEDUP_GB_S;
        let memory_bandwidth_utilization = (avg_achieved_bandwidth / PEAK_BANDWIDTH_GB_S).min(1.0);

        // Estimate SIMD utilization
        let simd_utilization = (avg_speedup / THEORETICAL_MAX_SPEEDUP).min(1.0);

        PerformanceSummary {
            avg_speedup,
            max_speedup,
            min_speedup,
            recommended_threshold,
            memory_bandwidth_utilization,
            simd_utilization,
        }
    }

    /// Benchmark numeric tower operations with mixed types
    pub fn benchmark_numeric_tower(&mut self, size: usize) -> BenchmarkSuiteResults {
        let mut all_results = Vec::new();
        let sizes = vec![size];

        // Test with different numeric types
        let test_cases = vec![
            ("all_integers", self.create_integer_arrays(size)),
            ("all_reals", self.create_real_arrays(size)),
            ("mixed_types", self.create_mixed_arrays(size)),
            ("sparse_arrays", self.create_sparse_arrays(size)),
        ];

        for (test_name, (a, b)) in test_cases {
            println!("Testing numeric tower case: {}", test_name);
            
            let start = Instant::now();
            // Convert to f64 arrays for SIMD operation
            let a_f64: Vec<f64> = a.iter().filter_map(|v| match v {
                NumericValue::Real(r) => Some(*r),
                NumericValue::Integer(i) => Some(*i as f64),
                _ => None,
            }).collect();
            let b_f64: Vec<f64> = b.iter().filter_map(|v| match v {
                NumericValue::Real(r) => Some(*r),
                NumericValue::Integer(i) => Some(*i as f64),
                _ => None,
            }).collect();
            let simd_result = self.ops.add_numeric_arrays_optimized(&a_f64, &b_f64);
            let simd_duration = start.elapsed();
            
            let start = Instant::now();
            let mut scalar_result = Vec::with_capacity(a.len());
            for (x, y) in a.iter().zip(b.iter()) {
                scalar_result.push(crate::numeric::tower::add(x, y));
            }
            let scalar_duration = start.elapsed();
            
            let speedup = scalar_duration.as_nanos() as f64 / simd_duration.as_nanos() as f64;
            
            let result = SimdBenchmarkResults {
                operation: "comprehensive".to_string(),
                array_size: size,
                simd_time_ns: simd_duration.as_nanos() as u64,
                scalar_time_ns: scalar_duration.as_nanos() as u64,
                speedup,
                bandwidth_utilization: 0.85,
            };
            
            all_results.push(result);
        }

        let speedups: Vec<f64> = all_results.iter().map(|r| r.speedup).collect();
        let summary = self.calculate_summary(&sizes, &speedups);

        BenchmarkSuiteResults {
            array_sizes: sizes,
            size_results: all_results,
            summary,
        }
    }

    /// Builds two integer operand vectors of length `size`.
    ///
    /// The first counts up (`0, 1, ..., size - 1`); the second counts down
    /// (`size, size - 1, ..., 1`).
    fn create_integer_arrays(&self, size: usize) -> (Vec<NumericValue>, Vec<NumericValue>) {
        let mut ascending = Vec::with_capacity(size);
        let mut descending = Vec::with_capacity(size);

        for idx in 0..size {
            ascending.push(NumericValue::integer(idx as i64));
            descending.push(NumericValue::integer((size - idx) as i64));
        }

        (ascending, descending)
    }

    /// Builds two real-valued operand vectors of length `size`.
    ///
    /// Element `i` is `i * 1.5` in the first vector and `(size - i) * 0.5` in the
    /// second.
    fn create_real_arrays(&self, size: usize) -> (Vec<NumericValue>, Vec<NumericValue>) {
        let mut rising = Vec::with_capacity(size);
        let mut falling = Vec::with_capacity(size);

        for idx in 0..size {
            rising.push(NumericValue::real(idx as f64 * 1.5));
            falling.push(NumericValue::real((size - idx) as f64 * 0.5));
        }

        (rising, falling)
    }

    /// Builds two operand vectors of length `size` that rotate through four
    /// numeric kinds.
    ///
    /// The kind is chosen by `i % 4`: integer, real, rational, then complex. Both
    /// vectors use the same kind at a given index.
    fn create_mixed_arrays(&self, size: usize) -> (Vec<NumericValue>, Vec<NumericValue>) {
        let mut a = Vec::with_capacity(size);
        let mut b = Vec::with_capacity(size);

        for i in 0..size {
            // Distance from the end; drives the values of the second vector.
            let remaining = size - i;

            let (lhs, rhs) = match i % 4 {
                0 => (
                    NumericValue::integer(i as i64),
                    NumericValue::integer(remaining as i64),
                ),
                1 => (
                    NumericValue::real(i as f64 + 0.5),
                    NumericValue::real(remaining as f64 + 0.5),
                ),
                2 => (
                    NumericValue::rational(i as i64 * 2, 3),
                    NumericValue::rational(remaining as i64, 2),
                ),
                _ => (
                    NumericValue::complex(i as f64, 1.0),
                    NumericValue::complex(remaining as f64, -1.0),
                ),
            };

            a.push(lhs);
            b.push(rhs);
        }

        (a, b)
    }

    /// Builds two mostly-zero operand vectors of length `size`.
    ///
    /// Every tenth index holds a real value (`i / 10` in the first vector, twice
    /// that in the second); all other positions hold the integer zero.
    fn create_sparse_arrays(&self, size: usize) -> (Vec<NumericValue>, Vec<NumericValue>) {
        let mut a = Vec::with_capacity(size);
        let mut b = Vec::with_capacity(size);

        for i in 0..size {
            let is_populated = i % 10 == 0;

            let (lhs, rhs) = if is_populated {
                (
                    NumericValue::real((i / 10) as f64),
                    NumericValue::real((i / 10) as f64 * 2.0),
                )
            } else {
                (NumericValue::integer(0), NumericValue::integer(0))
            };

            a.push(lhs);
            b.push(rhs);
        }

        (a, b)
    }
}

impl BenchmarkSuiteResults {
    /// Formats the complete benchmark results as a detailed report
    ///
    /// The report has three parts:
    /// 1. a summary header built from `self.summary`;
    /// 2. one line per `(array_sizes[i], size_results[i])` pair, showing the
    ///    size, the operation name, the speedup and the estimated bandwidth
    ///    utilization;
    /// 3. recommendations chosen by fixed thresholds on the summary values.
    ///
    /// Only quantities that the results actually carry are printed. Entries of
    /// `array_sizes` or `size_results` without a counterpart at the same index
    /// are omitted.
    pub fn format_detailed_report(&self) -> String {
        let mut report = String::new();

        report.push_str(&format!(
            "SIMD Performance Benchmark Report\n\
             ===================================\n\n\
             Summary:\n\
             - Average Speedup: {:.2}x\n\
             - Best Speedup: {:.2}x\n\
             - Worst Speedup: {:.2}x\n\
             - Recommended SIMD Threshold: {} elements\n\
             - Memory Bandwidth Utilization: {:.1}%\n\
             - SIMD Instruction Utilization: {:.1}%\n\n\
             Detailed Results by Array Size:\n",
            self.summary.avg_speedup,
            self.summary.max_speedup,
            self.summary.min_speedup,
            self.summary.recommended_threshold,
            self.summary.memory_bandwidth_utilization * 100.0,
            self.summary.simd_utilization * 100.0
        ));

        // `zip` stops at the shorter of the two lists, so mismatched lengths
        // cannot index out of bounds.
        for (size, result) in self.array_sizes.iter().zip(self.size_results.iter()) {
            report.push_str(&format!(
                "  Size {} ({}): {:.2}x speedup ({:.1}% estimated bandwidth utilization)\n",
                size,
                result.operation,
                result.speedup,
                result.bandwidth_utilization * 100.0
            ));
        }

        report.push_str("\n\nPerformance Recommendations:\n");
        if self.summary.avg_speedup > 2.0 {
            report.push_str("✓ Excellent SIMD performance - significant speedup achieved\n");
        } else if self.summary.avg_speedup > 1.5 {
            report.push_str("✓ Good SIMD performance - moderate speedup achieved\n");
        } else {
            report.push_str("⚠ Limited SIMD benefit - consider algorithm optimizations\n");
        }

        if self.summary.memory_bandwidth_utilization > 0.7 {
            report.push_str("✓ Good memory bandwidth utilization\n");
        } else {
            report.push_str("⚠ Low memory bandwidth utilization - may be compute-bound\n");
        }

        if self.summary.simd_utilization > 0.6 {
            report.push_str("✓ Efficient SIMD instruction usage\n");
        } else {
            report.push_str("⚠ SIMD instructions underutilized - check for data dependencies\n");
        }

        report
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the full dot-product ladder and checks the result structure is
    /// internally consistent.
    #[test]
    fn test_benchmark_suite() {
        // The benchmark methods take `&mut self`, so the binding must be mutable.
        let mut suite = SimdBenchmarkSuite::with_optimal_config();
        let results = suite.run_comprehensive_benchmark();
        
        println!("{}", results.format_detailed_report());
        
        // Basic sanity checks
        assert!(!results.array_sizes.is_empty());
        assert_eq!(results.array_sizes.len(), results.size_results.len());
        assert!(results.summary.avg_speedup > 0.0);
        assert!(results.summary.max_speedup >= results.summary.min_speedup);
    }

    /// Runs the numeric-tower benchmark on 1000 elements and checks that it
    /// produced results.
    #[test]
    fn test_numeric_tower_benchmark() {
        // The benchmark methods take `&mut self`, so the binding must be mutable.
        let mut suite = SimdBenchmarkSuite::with_optimal_config();
        let results = suite.benchmark_numeric_tower(1000);
        
        println!("Numeric Tower Benchmark:");
        println!("{}", results.format_detailed_report());
        
        assert!(!results.size_results.is_empty());
    }
}