cuda_rust_wasm/neural_integration/
benchmarks.rs

1//! Comprehensive Benchmarks for Neural Integration
2//!
3//! This module provides extensive benchmarking capabilities to measure
4//! and compare performance between CPU and GPU implementations.
5
6use super::{
7    ActivationFunction, BridgeConfig, NeuralBridge, NeuralOperation, NeuralResult,
8    GpuDevice, Precision,
9};
10use std::time::{Duration, Instant};
11
/// Benchmark results for a single operation.
///
/// One record pairs the GPU and CPU timings measured for the same workload
/// with the figures derived from those timings.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Label of the benchmarked operation, e.g. `"vector_add"`.
    pub operation_name: String,
    /// Workload size as reported by the benchmark that produced the record
    /// (element count, matrix element count, weight count or batch size).
    pub input_size: usize,
    /// Wall-clock time measured on the GPU-enabled bridge.
    pub gpu_time: Duration,
    /// Wall-clock time measured on the CPU-only bridge.
    pub cpu_time: Duration,
    /// Work units per second on the GPU bridge; the unit depends on the
    /// benchmark (elements, FLOPs, weights or batch items).
    pub gpu_throughput: f64,
    /// Work units per second on the CPU bridge, in the same unit as
    /// `gpu_throughput`.
    pub cpu_throughput: f64,
    /// `cpu_time / gpu_time`; values above 1.0 mean the GPU bridge was faster.
    pub speedup_factor: f64,
    /// Estimated number of bytes involved, computed from the workload
    /// dimensions rather than measured.
    pub memory_usage: usize,
}
24
/// Comprehensive benchmark suite.
///
/// Owns one GPU-enabled and one CPU-only bridge so that every workload can be
/// timed on both, and stores the results gathered by the comprehensive run.
pub struct BenchmarkSuite {
    /// Bridge created with `enable_gpu: true`.
    gpu_bridge: NeuralBridge,
    /// Bridge created with `enable_gpu: false`.
    cpu_bridge: NeuralBridge,
    /// Results collected so far, in the order the benchmarks were executed.
    results: Vec<BenchmarkResult>,
}
31
32impl BenchmarkSuite {
33    /// Create a new benchmark suite
34    pub fn new() -> NeuralResult<Self> {
35        // Create GPU-enabled bridge
36        let gpu_config = BridgeConfig {
37            enable_gpu: true,
38            gpu_device: GpuDevice::HighPerformance,
39            memory_pool_size: 1024, // 1GB
40            enable_monitoring: true,
41            auto_fallback: false, // Don't fallback to CPU
42            batch_size: 64,
43            precision: Precision::Float32,
44        };
45        
46        // Create CPU-only bridge
47        let cpu_config = BridgeConfig {
48            enable_gpu: false,
49            auto_fallback: false,
50            enable_monitoring: true,
51            ..gpu_config.clone()
52        };
53        
54        let gpu_bridge = NeuralBridge::with_config(gpu_config)?;
55        let cpu_bridge = NeuralBridge::with_config(cpu_config)?;
56        
57        Ok(Self {
58            gpu_bridge,
59            cpu_bridge,
60            results: Vec::new(),
61        })
62    }
63    
64    /// Run comprehensive benchmarks
65    pub fn run_comprehensive_benchmarks(&mut self) -> NeuralResult<()> {
66        println!("Running Comprehensive Neural Integration Benchmarks");
67        println!("=================================================");
68        
69        self.benchmark_vector_operations()?;
70        self.benchmark_matrix_operations()?;
71        self.benchmark_activation_functions()?;
72        self.benchmark_neural_networks()?;
73        self.benchmark_batch_operations()?;
74        
75        self.print_summary();
76        
77        Ok(())
78    }
79    
80    /// Benchmark vector operations
81    fn benchmark_vector_operations(&mut self) -> NeuralResult<()> {
82        println!("\n--- Vector Operations Benchmark ---");
83        
84        let sizes = vec![1_000, 10_000, 100_000, 1_000_000, 10_000_000];
85        
86        for size in sizes {
87            let result = self.benchmark_vector_add(size)?;
88            self.results.push(result.clone());
89            
90            println!(
91                "Vector Add ({}): GPU {:.2}ms ({:.0} Mops/s), CPU {:.2}ms ({:.0} Mops/s), Speedup: {:.2}x",
92                format_size(size),
93                result.gpu_time.as_secs_f64() * 1000.0,
94                result.gpu_throughput / 1e6,
95                result.cpu_time.as_secs_f64() * 1000.0,
96                result.cpu_throughput / 1e6,
97                result.speedup_factor
98            );
99        }
100        
101        Ok(())
102    }
103    
104    /// Benchmark matrix operations
105    fn benchmark_matrix_operations(&mut self) -> NeuralResult<()> {
106        println!("\n--- Matrix Operations Benchmark ---");
107        
108        let sizes = vec![64, 128, 256, 512, 1024];
109        
110        for size in sizes {
111            let result = self.benchmark_matrix_multiply(size)?;
112            self.results.push(result.clone());
113            
114            let gflops_gpu = calculate_matrix_gflops(size, result.gpu_time);
115            let gflops_cpu = calculate_matrix_gflops(size, result.cpu_time);
116            
117            println!(
118                "Matrix {}x{}: GPU {:.2}ms ({:.1} GFLOPS), CPU {:.2}ms ({:.1} GFLOPS), Speedup: {:.2}x",
119                size, size,
120                result.gpu_time.as_secs_f64() * 1000.0,
121                gflops_gpu,
122                result.cpu_time.as_secs_f64() * 1000.0,
123                gflops_cpu,
124                result.speedup_factor
125            );
126        }
127        
128        Ok(())
129    }
130    
131    /// Benchmark activation functions
132    fn benchmark_activation_functions(&mut self) -> NeuralResult<()> {
133        println!("\n--- Activation Functions Benchmark ---");
134        
135        let functions = vec![
136            ("Sigmoid", ActivationFunction::Sigmoid),
137            ("ReLU", ActivationFunction::ReLU),
138            ("Tanh", ActivationFunction::Tanh),
139            ("GELU", ActivationFunction::GELU),
140            ("Swish", ActivationFunction::Swish),
141        ];
142        
143        let size = 1_000_000;
144        
145        for (name, function) in functions {
146            let result = self.benchmark_activation_function(function, size)?;
147            self.results.push(result.clone());
148            
149            println!(
150                "{:>8} ({}): GPU {:.2}ms ({:.0} Mops/s), CPU {:.2}ms ({:.0} Mops/s), Speedup: {:.2}x",
151                name,
152                format_size(size),
153                result.gpu_time.as_secs_f64() * 1000.0,
154                result.gpu_throughput / 1e6,
155                result.cpu_time.as_secs_f64() * 1000.0,
156                result.cpu_throughput / 1e6,
157                result.speedup_factor
158            );
159        }
160        
161        Ok(())
162    }
163    
164    /// Benchmark neural network operations
165    fn benchmark_neural_networks(&mut self) -> NeuralResult<()> {
166        println!("\n--- Neural Network Benchmark ---");
167        
168        let networks = vec![
169            ("Small", vec![10, 20, 10]),
170            ("Medium", vec![100, 200, 100, 50]),
171            ("Large", vec![784, 1000, 500, 250, 10]),
172            ("Deep", vec![100, 100, 100, 100, 100, 100, 10]),
173        ];
174        
175        for (name, layer_sizes) in networks {
176            let result = self.benchmark_neural_network(&layer_sizes)?;
177            self.results.push(result.clone());
178            
179            println!(
180                "{:>6} ({:?}): GPU {:.2}ms, CPU {:.2}ms, Speedup: {:.2}x",
181                name,
182                layer_sizes,
183                result.gpu_time.as_secs_f64() * 1000.0,
184                result.cpu_time.as_secs_f64() * 1000.0,
185                result.speedup_factor
186            );
187        }
188        
189        Ok(())
190    }
191    
192    /// Benchmark batch operations
193    fn benchmark_batch_operations(&mut self) -> NeuralResult<()> {
194        println!("\n--- Batch Operations Benchmark ---");
195        
196        let batch_sizes = vec![1, 8, 32, 128, 512];
197        let operation_size = 10_000;
198        
199        for batch_size in batch_sizes {
200            let result = self.benchmark_batch_processing(batch_size, operation_size)?;
201            self.results.push(result.clone());
202            
203            println!(
204                "Batch size {:3}: GPU {:.2}ms ({:.0} ops/s), CPU {:.2}ms ({:.0} ops/s), Speedup: {:.2}x",
205                batch_size,
206                result.gpu_time.as_secs_f64() * 1000.0,
207                result.gpu_throughput,
208                result.cpu_time.as_secs_f64() * 1000.0,
209                result.cpu_throughput,
210                result.speedup_factor
211            );
212        }
213        
214        Ok(())
215    }
216    
217    /// Benchmark vector addition
218    fn benchmark_vector_add(&self, size: usize) -> NeuralResult<BenchmarkResult> {
219        let a: Vec<f32> = (0..size).map(|i| i as f32).collect();
220        let b: Vec<f32> = (0..size).map(|i| (i * 2) as f32).collect();
221        let mut input_data = a;
222        input_data.extend(b);
223        
224        let operation = NeuralOperation::VectorAdd { size, _phantom: std::marker::PhantomData };
225        
226        // GPU benchmark
227        let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
228        
229        // CPU benchmark
230        let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
231        
232        let gpu_throughput = size as f64 / gpu_time.as_secs_f64();
233        let cpu_throughput = size as f64 / cpu_time.as_secs_f64();
234        let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
235        
236        Ok(BenchmarkResult {
237            operation_name: "vector_add".to_string(),
238            input_size: size,
239            gpu_time,
240            cpu_time,
241            gpu_throughput,
242            cpu_throughput,
243            speedup_factor,
244            memory_usage: size * 4 * 3, // 3 vectors of f32
245        })
246    }
247    
248    /// Benchmark matrix multiplication
249    fn benchmark_matrix_multiply(&self, size: usize) -> NeuralResult<BenchmarkResult> {
250        let matrix_a: Vec<f32> = (0..size * size).map(|i| i as f32).collect();
251        let matrix_b: Vec<f32> = (0..size * size).map(|i| (i * 2) as f32).collect();
252        let mut input_data = matrix_a;
253        input_data.extend(matrix_b);
254        
255        let operation = NeuralOperation::MatrixMultiply {
256            a_rows: size,
257            a_cols: size,
258            b_cols: size,
259            _phantom: std::marker::PhantomData,
260        };
261        
262        // GPU benchmark
263        let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
264        
265        // CPU benchmark
266        let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
267        
268        let operations = 2.0 * (size as f64).powi(3); // 2 * n^3 FLOPs
269        let gpu_throughput = operations / gpu_time.as_secs_f64();
270        let cpu_throughput = operations / cpu_time.as_secs_f64();
271        let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
272        
273        Ok(BenchmarkResult {
274            operation_name: "matrix_multiply".to_string(),
275            input_size: size * size,
276            gpu_time,
277            cpu_time,
278            gpu_throughput,
279            cpu_throughput,
280            speedup_factor,
281            memory_usage: size * size * 4 * 3, // 3 matrices of f32
282        })
283    }
284    
285    /// Benchmark activation function
286    fn benchmark_activation_function(&self, function: ActivationFunction, size: usize) -> NeuralResult<BenchmarkResult> {
287        let input_data: Vec<f32> = (0..size).map(|i| (i as f32) / 1000.0 - 5.0).collect();
288        
289        let operation = NeuralOperation::ActivationFunction { function, size, _phantom: std::marker::PhantomData };
290        
291        // GPU benchmark
292        let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
293        
294        // CPU benchmark
295        let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
296        
297        let gpu_throughput = size as f64 / gpu_time.as_secs_f64();
298        let cpu_throughput = size as f64 / cpu_time.as_secs_f64();
299        let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
300        
301        Ok(BenchmarkResult {
302            operation_name: format!("activation_{function:?}"),
303            input_size: size,
304            gpu_time,
305            cpu_time,
306            gpu_throughput,
307            cpu_throughput,
308            speedup_factor,
309            memory_usage: size * 4 * 2, // Input and output f32 vectors
310        })
311    }
312    
313    /// Benchmark neural network forward propagation
314    fn benchmark_neural_network(&self, layer_sizes: &[usize]) -> NeuralResult<BenchmarkResult> {
315        let input_size = layer_sizes[0];
316        let input_data: Vec<f32> = (0..input_size).map(|i| (i as f32) / input_size as f32).collect();
317        
318        let operation = NeuralOperation::ForwardPropagation {
319            layer_sizes: layer_sizes.to_vec(),
320            _phantom: std::marker::PhantomData,
321        };
322        
323        // GPU benchmark
324        let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
325        
326        // CPU benchmark
327        let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
328        
329        let total_params: usize = layer_sizes.windows(2).map(|w| w[0] * w[1]).sum();
330        let gpu_throughput = total_params as f64 / gpu_time.as_secs_f64();
331        let cpu_throughput = total_params as f64 / cpu_time.as_secs_f64();
332        let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
333        
334        Ok(BenchmarkResult {
335            operation_name: "neural_network".to_string(),
336            input_size: total_params,
337            gpu_time,
338            cpu_time,
339            gpu_throughput,
340            cpu_throughput,
341            speedup_factor,
342            memory_usage: total_params * 4, // Approximate memory usage
343        })
344    }
345    
346    /// Benchmark batch processing
347    fn benchmark_batch_processing(&self, batch_size: usize, operation_size: usize) -> NeuralResult<BenchmarkResult> {
348        let operations: Vec<_> = (0..batch_size)
349            .map(|_| NeuralOperation::VectorAdd { size: operation_size, _phantom: std::marker::PhantomData })
350            .collect();
351        
352        let inputs: Vec<_> = (0..batch_size)
353            .map(|_| {
354                let mut data: Vec<f32> = (0..operation_size).map(|i| i as f32).collect();
355                data.extend((0..operation_size).map(|i| (i * 2) as f32));
356                data
357            })
358            .collect();
359        
360        // GPU benchmark
361        let gpu_start = Instant::now();
362        let gpu_processor = self.gpu_bridge.create_batch_processor();
363        let _gpu_results = gpu_processor.process_batch(operations.clone(), inputs.clone())?;
364        let gpu_time = gpu_start.elapsed();
365        
366        // CPU benchmark  
367        let cpu_start = Instant::now();
368        let cpu_processor = self.cpu_bridge.create_batch_processor();
369        let _cpu_results = cpu_processor.process_batch(operations, inputs)?;
370        let cpu_time = cpu_start.elapsed();
371        
372        let gpu_throughput = batch_size as f64 / gpu_time.as_secs_f64();
373        let cpu_throughput = batch_size as f64 / cpu_time.as_secs_f64();
374        let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
375        
376        Ok(BenchmarkResult {
377            operation_name: "batch_processing".to_string(),
378            input_size: batch_size,
379            gpu_time,
380            cpu_time,
381            gpu_throughput,
382            cpu_throughput,
383            speedup_factor,
384            memory_usage: batch_size * operation_size * 4 * 3,
385        })
386    }
387    
388    /// Time a single operation
389    fn time_operation(
390        &self,
391        bridge: &NeuralBridge,
392        operation: NeuralOperation<f32>,
393        input_data: &[f32],
394    ) -> NeuralResult<Duration> {
395        // Warm up
396        for _ in 0..3 {
397            let _ = bridge.execute_neural_operation(operation.clone(), input_data)?;
398        }
399        
400        // Actual timing
401        let iterations = 10;
402        let start = Instant::now();
403        
404        for _ in 0..iterations {
405            let _ = bridge.execute_neural_operation(operation.clone(), input_data)?;
406        }
407        
408        let total_time = start.elapsed();
409        Ok(total_time / iterations)
410    }
411    
412    /// Print benchmark summary
413    fn print_summary(&self) {
414        println!("\n=== Benchmark Summary ===");
415        
416        let mut operation_groups: std::collections::HashMap<String, Vec<&BenchmarkResult>> = std::collections::HashMap::new();
417        
418        for result in &self.results {
419            let base_name = result.operation_name.split('_').next().unwrap_or(&result.operation_name);
420            operation_groups.entry(base_name.to_string()).or_default().push(result);
421        }
422        
423        for (operation_type, results) in operation_groups {
424            let avg_speedup: f64 = results.iter().map(|r| r.speedup_factor).sum::<f64>() / results.len() as f64;
425            let max_speedup = results.iter().map(|r| r.speedup_factor).fold(0.0, f64::max);
426            let min_speedup = results.iter().map(|r| r.speedup_factor).fold(f64::INFINITY, f64::min);
427            
428            println!(
429                "{:>15}: Avg {:.2}x, Max {:.2}x, Min {:.2}x speedup ({} tests)",
430                operation_type, avg_speedup, max_speedup, min_speedup, results.len()
431            );
432        }
433        
434        let overall_avg_speedup: f64 = self.results.iter().map(|r| r.speedup_factor).sum::<f64>() / self.results.len() as f64;
435        println!("\nOverall Average Speedup: {overall_avg_speedup:.2}x");
436        
437        // Memory usage summary
438        let total_memory: usize = self.results.iter().map(|r| r.memory_usage).sum();
439        println!("Total Memory Tested: {}", format_bytes(total_memory));
440        
441        // Performance recommendations
442        println!("\n=== Performance Recommendations ===");
443        let best_operations: Vec<_> = self.results.iter()
444            .filter(|r| r.speedup_factor > 5.0)
445            .collect();
446        
447        if !best_operations.is_empty() {
448            println!("Best GPU operations (>5x speedup):");
449            for result in best_operations {
450                println!("  - {} ({:.1}x speedup)", result.operation_name, result.speedup_factor);
451            }
452        }
453        
454        let poor_operations: Vec<_> = self.results.iter()
455            .filter(|r| r.speedup_factor < 1.5)
456            .collect();
457        
458        if !poor_operations.is_empty() {
459            println!("Operations better on CPU (<1.5x speedup):");
460            for result in poor_operations {
461                println!("  - {} ({:.1}x speedup)", result.operation_name, result.speedup_factor);
462            }
463        }
464    }
465    
466    /// Export results as CSV
467    pub fn export_csv(&self, filename: &str) -> Result<(), std::io::Error> {
468        use std::fs::File;
469        use std::io::Write;
470        
471        let mut file = File::create(filename)?;
472        
473        // CSV header
474        writeln!(
475            file,
476            "Operation,InputSize,GPUTimeMs,CPUTimeMs,GPUThroughput,CPUThroughput,SpeedupFactor,MemoryUsage"
477        )?;
478        
479        // CSV data
480        for result in &self.results {
481            writeln!(
482                file,
483                "{},{},{:.6},{:.6},{:.2},{:.2},{:.2},{}",
484                result.operation_name,
485                result.input_size,
486                result.gpu_time.as_secs_f64() * 1000.0,
487                result.cpu_time.as_secs_f64() * 1000.0,
488                result.gpu_throughput,
489                result.cpu_throughput,
490                result.speedup_factor,
491                result.memory_usage
492            )?;
493        }
494        
495        println!("Benchmark results exported to {filename}");
496        Ok(())
497    }
498}
499
/// Compute the GFLOPS achieved by a `size` x `size` matrix multiplication
/// that took `time`, counting `2 * size^3` floating-point operations.
fn calculate_matrix_gflops(size: usize, time: Duration) -> f64 {
    let dimension = size as f64;
    let flops = 2.0 * dimension.powi(3); // 2 * n^3 operations
    let seconds = time.as_secs_f64();
    flops / seconds / 1e9
}
505
/// Format an element count compactly: whole millions as `"<n>M"`, whole
/// thousands as `"<n>K"`, anything smaller as the plain number.
///
/// The division truncates, so `1_500_000` is rendered as `"1M"`.
fn format_size(size: usize) -> String {
    match size {
        millions if millions >= 1_000_000 => format!("{}M", millions / 1_000_000),
        thousands if thousands >= 1_000 => format!("{}K", thousands / 1_000),
        small => small.to_string(),
    }
}
516
/// Format a byte count with one decimal place using 1024-based units from
/// `B` up to `GB`; values beyond that remain expressed in `GB`.
fn format_bytes(bytes: usize) -> String {
    const UNITS: [&str; 4] = ["B", "KB", "MB", "GB"];

    let mut value = bytes as f64;
    let mut unit = UNITS[0];

    // Step up one unit at a time while the value still reaches 1024 and a
    // larger unit is available.
    for larger_unit in &UNITS[1..] {
        if value < 1024.0 {
            break;
        }
        value /= 1024.0;
        unit = larger_unit;
    }

    format!("{:.1} {}", value, unit)
}
530
531/// Run quick benchmark (for testing)
532pub fn run_quick_benchmark() -> NeuralResult<()> {
533    println!("Running Quick Benchmark...");
534    
535    let suite = BenchmarkSuite::new()?;
536    
537    // Run a subset of benchmarks
538    let result = suite.benchmark_vector_add(10_000)?;
539    println!("Vector Add 10K: {:.2}x speedup", result.speedup_factor);
540    
541    let result = suite.benchmark_matrix_multiply(128)?;
542    println!("Matrix 128x128: {:.2}x speedup", result.speedup_factor);
543    
544    let result = suite.benchmark_activation_function(ActivationFunction::ReLU, 100_000)?;
545    println!("ReLU 100K: {:.2}x speedup", result.speedup_factor);
546    
547    Ok(())
548}
549
#[cfg(test)]
mod tests {
    use super::*;

    /// The suite can be constructed in the test environment.
    #[test]
    fn test_benchmark_suite_creation() {
        assert!(BenchmarkSuite::new().is_ok(), "Failed to create benchmark suite");
    }

    /// The quick benchmark runs to completion without an error.
    #[test]
    fn test_quick_benchmark() {
        let outcome = run_quick_benchmark();
        assert!(outcome.is_ok(), "Quick benchmark failed: {:?}", outcome);
    }

    /// The formatting helpers produce the expected labels.
    #[test]
    fn test_format_functions() {
        let size_cases = [(1_000, "1K"), (1_500_000, "1M")];
        for (input, expected) in size_cases {
            assert_eq!(format_size(input), expected);
        }

        let byte_cases = [(1024, "1.0 KB"), (1024 * 1024, "1.0 MB")];
        for (input, expected) in byte_cases {
            assert_eq!(format_bytes(input), expected);
        }
    }
}