1use super::{
7 ActivationFunction, BridgeConfig, NeuralBridge, NeuralOperation, NeuralResult,
8 GpuDevice, Precision,
9};
10use std::time::{Duration, Instant};
11
/// Outcome of one GPU-versus-CPU benchmark case.
///
/// The meaning of `input_size` and of the throughput unit depends on the
/// benchmark that produced the record (elements, floating-point operations,
/// weight count, or batch items).
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Identifier of the benchmarked operation, e.g. `"vector_add"`.
    pub operation_name: String,
    /// Problem size as reported by the producing benchmark.
    pub input_size: usize,
    /// Time measured on the GPU-enabled bridge.
    pub gpu_time: Duration,
    /// Time measured on the CPU-only bridge.
    pub cpu_time: Duration,
    /// Work units per second achieved on the GPU-enabled bridge.
    pub gpu_throughput: f64,
    /// Work units per second achieved on the CPU-only bridge.
    pub cpu_throughput: f64,
    /// `cpu_time / gpu_time`; values above 1.0 mean the GPU run was faster.
    pub speedup_factor: f64,
    /// Estimated number of bytes touched by the benchmark case.
    pub memory_usage: usize,
}
24
25pub struct BenchmarkSuite {
27 gpu_bridge: NeuralBridge,
28 cpu_bridge: NeuralBridge,
29 results: Vec<BenchmarkResult>,
30}
31
32impl BenchmarkSuite {
33 pub fn new() -> NeuralResult<Self> {
35 let gpu_config = BridgeConfig {
37 enable_gpu: true,
38 gpu_device: GpuDevice::HighPerformance,
39 memory_pool_size: 1024, enable_monitoring: true,
41 auto_fallback: false, batch_size: 64,
43 precision: Precision::Float32,
44 };
45
46 let cpu_config = BridgeConfig {
48 enable_gpu: false,
49 auto_fallback: false,
50 enable_monitoring: true,
51 ..gpu_config.clone()
52 };
53
54 let gpu_bridge = NeuralBridge::with_config(gpu_config)?;
55 let cpu_bridge = NeuralBridge::with_config(cpu_config)?;
56
57 Ok(Self {
58 gpu_bridge,
59 cpu_bridge,
60 results: Vec::new(),
61 })
62 }
63
64 pub fn run_comprehensive_benchmarks(&mut self) -> NeuralResult<()> {
66 println!("Running Comprehensive Neural Integration Benchmarks");
67 println!("=================================================");
68
69 self.benchmark_vector_operations()?;
70 self.benchmark_matrix_operations()?;
71 self.benchmark_activation_functions()?;
72 self.benchmark_neural_networks()?;
73 self.benchmark_batch_operations()?;
74
75 self.print_summary();
76
77 Ok(())
78 }
79
80 fn benchmark_vector_operations(&mut self) -> NeuralResult<()> {
82 println!("\n--- Vector Operations Benchmark ---");
83
84 let sizes = vec![1_000, 10_000, 100_000, 1_000_000, 10_000_000];
85
86 for size in sizes {
87 let result = self.benchmark_vector_add(size)?;
88 self.results.push(result.clone());
89
90 println!(
91 "Vector Add ({}): GPU {:.2}ms ({:.0} Mops/s), CPU {:.2}ms ({:.0} Mops/s), Speedup: {:.2}x",
92 format_size(size),
93 result.gpu_time.as_secs_f64() * 1000.0,
94 result.gpu_throughput / 1e6,
95 result.cpu_time.as_secs_f64() * 1000.0,
96 result.cpu_throughput / 1e6,
97 result.speedup_factor
98 );
99 }
100
101 Ok(())
102 }
103
104 fn benchmark_matrix_operations(&mut self) -> NeuralResult<()> {
106 println!("\n--- Matrix Operations Benchmark ---");
107
108 let sizes = vec![64, 128, 256, 512, 1024];
109
110 for size in sizes {
111 let result = self.benchmark_matrix_multiply(size)?;
112 self.results.push(result.clone());
113
114 let gflops_gpu = calculate_matrix_gflops(size, result.gpu_time);
115 let gflops_cpu = calculate_matrix_gflops(size, result.cpu_time);
116
117 println!(
118 "Matrix {}x{}: GPU {:.2}ms ({:.1} GFLOPS), CPU {:.2}ms ({:.1} GFLOPS), Speedup: {:.2}x",
119 size, size,
120 result.gpu_time.as_secs_f64() * 1000.0,
121 gflops_gpu,
122 result.cpu_time.as_secs_f64() * 1000.0,
123 gflops_cpu,
124 result.speedup_factor
125 );
126 }
127
128 Ok(())
129 }
130
131 fn benchmark_activation_functions(&mut self) -> NeuralResult<()> {
133 println!("\n--- Activation Functions Benchmark ---");
134
135 let functions = vec![
136 ("Sigmoid", ActivationFunction::Sigmoid),
137 ("ReLU", ActivationFunction::ReLU),
138 ("Tanh", ActivationFunction::Tanh),
139 ("GELU", ActivationFunction::GELU),
140 ("Swish", ActivationFunction::Swish),
141 ];
142
143 let size = 1_000_000;
144
145 for (name, function) in functions {
146 let result = self.benchmark_activation_function(function, size)?;
147 self.results.push(result.clone());
148
149 println!(
150 "{:>8} ({}): GPU {:.2}ms ({:.0} Mops/s), CPU {:.2}ms ({:.0} Mops/s), Speedup: {:.2}x",
151 name,
152 format_size(size),
153 result.gpu_time.as_secs_f64() * 1000.0,
154 result.gpu_throughput / 1e6,
155 result.cpu_time.as_secs_f64() * 1000.0,
156 result.cpu_throughput / 1e6,
157 result.speedup_factor
158 );
159 }
160
161 Ok(())
162 }
163
164 fn benchmark_neural_networks(&mut self) -> NeuralResult<()> {
166 println!("\n--- Neural Network Benchmark ---");
167
168 let networks = vec![
169 ("Small", vec![10, 20, 10]),
170 ("Medium", vec![100, 200, 100, 50]),
171 ("Large", vec![784, 1000, 500, 250, 10]),
172 ("Deep", vec![100, 100, 100, 100, 100, 100, 10]),
173 ];
174
175 for (name, layer_sizes) in networks {
176 let result = self.benchmark_neural_network(&layer_sizes)?;
177 self.results.push(result.clone());
178
179 println!(
180 "{:>6} ({:?}): GPU {:.2}ms, CPU {:.2}ms, Speedup: {:.2}x",
181 name,
182 layer_sizes,
183 result.gpu_time.as_secs_f64() * 1000.0,
184 result.cpu_time.as_secs_f64() * 1000.0,
185 result.speedup_factor
186 );
187 }
188
189 Ok(())
190 }
191
192 fn benchmark_batch_operations(&mut self) -> NeuralResult<()> {
194 println!("\n--- Batch Operations Benchmark ---");
195
196 let batch_sizes = vec![1, 8, 32, 128, 512];
197 let operation_size = 10_000;
198
199 for batch_size in batch_sizes {
200 let result = self.benchmark_batch_processing(batch_size, operation_size)?;
201 self.results.push(result.clone());
202
203 println!(
204 "Batch size {:3}: GPU {:.2}ms ({:.0} ops/s), CPU {:.2}ms ({:.0} ops/s), Speedup: {:.2}x",
205 batch_size,
206 result.gpu_time.as_secs_f64() * 1000.0,
207 result.gpu_throughput,
208 result.cpu_time.as_secs_f64() * 1000.0,
209 result.cpu_throughput,
210 result.speedup_factor
211 );
212 }
213
214 Ok(())
215 }
216
217 fn benchmark_vector_add(&self, size: usize) -> NeuralResult<BenchmarkResult> {
219 let a: Vec<f32> = (0..size).map(|i| i as f32).collect();
220 let b: Vec<f32> = (0..size).map(|i| (i * 2) as f32).collect();
221 let mut input_data = a;
222 input_data.extend(b);
223
224 let operation = NeuralOperation::VectorAdd { size, _phantom: std::marker::PhantomData };
225
226 let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
228
229 let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
231
232 let gpu_throughput = size as f64 / gpu_time.as_secs_f64();
233 let cpu_throughput = size as f64 / cpu_time.as_secs_f64();
234 let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
235
236 Ok(BenchmarkResult {
237 operation_name: "vector_add".to_string(),
238 input_size: size,
239 gpu_time,
240 cpu_time,
241 gpu_throughput,
242 cpu_throughput,
243 speedup_factor,
244 memory_usage: size * 4 * 3, })
246 }
247
248 fn benchmark_matrix_multiply(&self, size: usize) -> NeuralResult<BenchmarkResult> {
250 let matrix_a: Vec<f32> = (0..size * size).map(|i| i as f32).collect();
251 let matrix_b: Vec<f32> = (0..size * size).map(|i| (i * 2) as f32).collect();
252 let mut input_data = matrix_a;
253 input_data.extend(matrix_b);
254
255 let operation = NeuralOperation::MatrixMultiply {
256 a_rows: size,
257 a_cols: size,
258 b_cols: size,
259 _phantom: std::marker::PhantomData,
260 };
261
262 let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
264
265 let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
267
268 let operations = 2.0 * (size as f64).powi(3); let gpu_throughput = operations / gpu_time.as_secs_f64();
270 let cpu_throughput = operations / cpu_time.as_secs_f64();
271 let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
272
273 Ok(BenchmarkResult {
274 operation_name: "matrix_multiply".to_string(),
275 input_size: size * size,
276 gpu_time,
277 cpu_time,
278 gpu_throughput,
279 cpu_throughput,
280 speedup_factor,
281 memory_usage: size * size * 4 * 3, })
283 }
284
285 fn benchmark_activation_function(&self, function: ActivationFunction, size: usize) -> NeuralResult<BenchmarkResult> {
287 let input_data: Vec<f32> = (0..size).map(|i| (i as f32) / 1000.0 - 5.0).collect();
288
289 let operation = NeuralOperation::ActivationFunction { function, size, _phantom: std::marker::PhantomData };
290
291 let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
293
294 let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
296
297 let gpu_throughput = size as f64 / gpu_time.as_secs_f64();
298 let cpu_throughput = size as f64 / cpu_time.as_secs_f64();
299 let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
300
301 Ok(BenchmarkResult {
302 operation_name: format!("activation_{function:?}"),
303 input_size: size,
304 gpu_time,
305 cpu_time,
306 gpu_throughput,
307 cpu_throughput,
308 speedup_factor,
309 memory_usage: size * 4 * 2, })
311 }
312
313 fn benchmark_neural_network(&self, layer_sizes: &[usize]) -> NeuralResult<BenchmarkResult> {
315 let input_size = layer_sizes[0];
316 let input_data: Vec<f32> = (0..input_size).map(|i| (i as f32) / input_size as f32).collect();
317
318 let operation = NeuralOperation::ForwardPropagation {
319 layer_sizes: layer_sizes.to_vec(),
320 _phantom: std::marker::PhantomData,
321 };
322
323 let gpu_time = self.time_operation(&self.gpu_bridge, operation.clone(), &input_data)?;
325
326 let cpu_time = self.time_operation(&self.cpu_bridge, operation, &input_data)?;
328
329 let total_params: usize = layer_sizes.windows(2).map(|w| w[0] * w[1]).sum();
330 let gpu_throughput = total_params as f64 / gpu_time.as_secs_f64();
331 let cpu_throughput = total_params as f64 / cpu_time.as_secs_f64();
332 let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
333
334 Ok(BenchmarkResult {
335 operation_name: "neural_network".to_string(),
336 input_size: total_params,
337 gpu_time,
338 cpu_time,
339 gpu_throughput,
340 cpu_throughput,
341 speedup_factor,
342 memory_usage: total_params * 4, })
344 }
345
346 fn benchmark_batch_processing(&self, batch_size: usize, operation_size: usize) -> NeuralResult<BenchmarkResult> {
348 let operations: Vec<_> = (0..batch_size)
349 .map(|_| NeuralOperation::VectorAdd { size: operation_size, _phantom: std::marker::PhantomData })
350 .collect();
351
352 let inputs: Vec<_> = (0..batch_size)
353 .map(|_| {
354 let mut data: Vec<f32> = (0..operation_size).map(|i| i as f32).collect();
355 data.extend((0..operation_size).map(|i| (i * 2) as f32));
356 data
357 })
358 .collect();
359
360 let gpu_start = Instant::now();
362 let gpu_processor = self.gpu_bridge.create_batch_processor();
363 let _gpu_results = gpu_processor.process_batch(operations.clone(), inputs.clone())?;
364 let gpu_time = gpu_start.elapsed();
365
366 let cpu_start = Instant::now();
368 let cpu_processor = self.cpu_bridge.create_batch_processor();
369 let _cpu_results = cpu_processor.process_batch(operations, inputs)?;
370 let cpu_time = cpu_start.elapsed();
371
372 let gpu_throughput = batch_size as f64 / gpu_time.as_secs_f64();
373 let cpu_throughput = batch_size as f64 / cpu_time.as_secs_f64();
374 let speedup_factor = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
375
376 Ok(BenchmarkResult {
377 operation_name: "batch_processing".to_string(),
378 input_size: batch_size,
379 gpu_time,
380 cpu_time,
381 gpu_throughput,
382 cpu_throughput,
383 speedup_factor,
384 memory_usage: batch_size * operation_size * 4 * 3,
385 })
386 }
387
388 fn time_operation(
390 &self,
391 bridge: &NeuralBridge,
392 operation: NeuralOperation<f32>,
393 input_data: &[f32],
394 ) -> NeuralResult<Duration> {
395 for _ in 0..3 {
397 let _ = bridge.execute_neural_operation(operation.clone(), input_data)?;
398 }
399
400 let iterations = 10;
402 let start = Instant::now();
403
404 for _ in 0..iterations {
405 let _ = bridge.execute_neural_operation(operation.clone(), input_data)?;
406 }
407
408 let total_time = start.elapsed();
409 Ok(total_time / iterations)
410 }
411
412 fn print_summary(&self) {
414 println!("\n=== Benchmark Summary ===");
415
416 let mut operation_groups: std::collections::HashMap<String, Vec<&BenchmarkResult>> = std::collections::HashMap::new();
417
418 for result in &self.results {
419 let base_name = result.operation_name.split('_').next().unwrap_or(&result.operation_name);
420 operation_groups.entry(base_name.to_string()).or_default().push(result);
421 }
422
423 for (operation_type, results) in operation_groups {
424 let avg_speedup: f64 = results.iter().map(|r| r.speedup_factor).sum::<f64>() / results.len() as f64;
425 let max_speedup = results.iter().map(|r| r.speedup_factor).fold(0.0, f64::max);
426 let min_speedup = results.iter().map(|r| r.speedup_factor).fold(f64::INFINITY, f64::min);
427
428 println!(
429 "{:>15}: Avg {:.2}x, Max {:.2}x, Min {:.2}x speedup ({} tests)",
430 operation_type, avg_speedup, max_speedup, min_speedup, results.len()
431 );
432 }
433
434 let overall_avg_speedup: f64 = self.results.iter().map(|r| r.speedup_factor).sum::<f64>() / self.results.len() as f64;
435 println!("\nOverall Average Speedup: {overall_avg_speedup:.2}x");
436
437 let total_memory: usize = self.results.iter().map(|r| r.memory_usage).sum();
439 println!("Total Memory Tested: {}", format_bytes(total_memory));
440
441 println!("\n=== Performance Recommendations ===");
443 let best_operations: Vec<_> = self.results.iter()
444 .filter(|r| r.speedup_factor > 5.0)
445 .collect();
446
447 if !best_operations.is_empty() {
448 println!("Best GPU operations (>5x speedup):");
449 for result in best_operations {
450 println!(" - {} ({:.1}x speedup)", result.operation_name, result.speedup_factor);
451 }
452 }
453
454 let poor_operations: Vec<_> = self.results.iter()
455 .filter(|r| r.speedup_factor < 1.5)
456 .collect();
457
458 if !poor_operations.is_empty() {
459 println!("Operations better on CPU (<1.5x speedup):");
460 for result in poor_operations {
461 println!(" - {} ({:.1}x speedup)", result.operation_name, result.speedup_factor);
462 }
463 }
464 }
465
466 pub fn export_csv(&self, filename: &str) -> Result<(), std::io::Error> {
468 use std::fs::File;
469 use std::io::Write;
470
471 let mut file = File::create(filename)?;
472
473 writeln!(
475 file,
476 "Operation,InputSize,GPUTimeMs,CPUTimeMs,GPUThroughput,CPUThroughput,SpeedupFactor,MemoryUsage"
477 )?;
478
479 for result in &self.results {
481 writeln!(
482 file,
483 "{},{},{:.6},{:.6},{:.2},{:.2},{:.2},{}",
484 result.operation_name,
485 result.input_size,
486 result.gpu_time.as_secs_f64() * 1000.0,
487 result.cpu_time.as_secs_f64() * 1000.0,
488 result.gpu_throughput,
489 result.cpu_throughput,
490 result.speedup_factor,
491 result.memory_usage
492 )?;
493 }
494
495 println!("Benchmark results exported to {filename}");
496 Ok(())
497 }
498}
499
/// Converts the time taken by a `size`×`size` matrix multiplication into
/// GFLOPS, counting `2 * size^3` floating-point operations.
///
/// A zero `time` yields infinity (or NaN when `size` is also zero), as
/// dictated by floating-point division.
fn calculate_matrix_gflops(size: usize, time: Duration) -> f64 {
    let dimension = size as f64;
    let total_flops = 2.0 * dimension.powi(3);
    let seconds = time.as_secs_f64();

    total_flops / seconds / 1e9
}
505
/// Renders an element count compactly: whole millions as `"<n>M"`, whole
/// thousands as `"<n>K"`, anything smaller as the plain number.
///
/// The division truncates, so 1 500 000 is rendered as `"1M"`.
fn format_size(size: usize) -> String {
    match size {
        0..=999 => size.to_string(),
        1_000..=999_999 => format!("{}K", size / 1_000),
        _ => format!("{}M", size / 1_000_000),
    }
}
516
/// Renders a byte count with one decimal place using 1024-based units
/// (`B`, `KB`, `MB`, `GB`).
///
/// `GB` is the largest unit, so larger values are still expressed in `GB`.
fn format_bytes(bytes: usize) -> String {
    const UNITS: [&str; 4] = ["B", "KB", "MB", "GB"];

    let mut value = bytes as f64;
    let mut unit = UNITS[0];

    // Step up one unit at a time while the value still needs scaling down;
    // the loop ends on its own once the largest unit has been reached.
    for &larger_unit in &UNITS[1..] {
        if value < 1024.0 {
            break;
        }
        value /= 1024.0;
        unit = larger_unit;
    }

    format!("{:.1} {}", value, unit)
}
530
531pub fn run_quick_benchmark() -> NeuralResult<()> {
533 println!("Running Quick Benchmark...");
534
535 let suite = BenchmarkSuite::new()?;
536
537 let result = suite.benchmark_vector_add(10_000)?;
539 println!("Vector Add 10K: {:.2}x speedup", result.speedup_factor);
540
541 let result = suite.benchmark_matrix_multiply(128)?;
542 println!("Matrix 128x128: {:.2}x speedup", result.speedup_factor);
543
544 let result = suite.benchmark_activation_function(ActivationFunction::ReLU, 100_000)?;
545 println!("ReLU 100K: {:.2}x speedup", result.speedup_factor);
546
547 Ok(())
548}
549
#[cfg(test)]
mod tests {
    use super::*;

    /// Both bridges must be constructible with the benchmark configuration.
    #[test]
    fn test_benchmark_suite_creation() {
        assert!(BenchmarkSuite::new().is_ok(), "Failed to create benchmark suite");
    }

    /// The quick benchmark must run to completion without returning an error.
    #[test]
    fn test_quick_benchmark() {
        let result = run_quick_benchmark();
        assert!(result.is_ok(), "Quick benchmark failed: {result:?}");
    }

    /// Pins the output of the formatting helpers, including the truncating
    /// division in `format_size`.
    #[test]
    fn test_format_functions() {
        // Element counts.
        assert_eq!(format_size(1_000), "1K");
        assert_eq!(format_size(1_500_000), "1M");

        // Byte counts.
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
    }
}