use crate::{
    ComputeIntensity, DeviceId, MultiGpuPerformanceMonitor, SharedGpuContext, UnifiedGpuResult,
    Workload,
};
use std::collections::HashMap;
use std::time::{Duration, Instant};

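/// Configuration for the multi-GPU benchmark suite.
///
/// A typical caller overrides only a few fields and falls back to `Default`
/// for the rest. Illustrative sketch (not compiled as a doc-test):
///
/// ```ignore
/// let config = BenchmarkConfig {
///     measurement_iterations: 5,
///     data_sizes: vec![1_000, 10_000],
///     ..Default::default()
/// };
/// ```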
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Iterations executed before timing starts, to warm up devices and caches.
    pub warmup_iterations: usize,
    /// Iterations whose timings are averaged into the reported result.
    pub measurement_iterations: usize,
    /// Lower bound on how long each benchmark should run.
    pub min_duration: Duration,
    /// Upper bound on how long each benchmark may run.
    pub max_duration: Duration,
    /// Data sizes (element counts) to benchmark.
    pub data_sizes: Vec<usize>,
    /// Device combinations to test, from single GPU to multi-GPU.
    pub device_combinations: Vec<Vec<DeviceId>>,
    /// Whether to record per-operation data through the performance monitor.
    pub enable_profiling: bool,
}

impl Default for BenchmarkConfig {
    fn default() -> Self {
        Self {
            warmup_iterations: 3,
            measurement_iterations: 10,
            min_duration: Duration::from_millis(100),
            max_duration: Duration::from_secs(30),
            data_sizes: vec![100, 1000, 10000, 100000],
            device_combinations: vec![vec![DeviceId(0)], vec![DeviceId(0), DeviceId(1)]],
            enable_profiling: true,
        }
    }
}

/// Result of a single benchmark: one operation, one data size, one device set.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    pub test_name: String,
    pub data_size: usize,
    pub device_count: usize,
    pub device_ids: Vec<DeviceId>,
    pub duration_ms: f64,
    pub throughput_ops_per_sec: f64,
    pub memory_bandwidth_gb_s: f64,
    pub gpu_utilization_percent: f64,
    pub scaling_efficiency: f64,
    pub error_rate: f64,
    pub memory_usage_mb: f64,
    pub metadata: HashMap<String, String>,
}

/// Aggregated results and analysis for a full benchmark suite run.
#[derive(Debug, Clone)]
pub struct BenchmarkSuiteResults {
    pub suite_name: String,
    pub total_duration: Duration,
    pub results: Vec<BenchmarkResult>,
    pub scaling_analysis: ScalingAnalysis,
    pub performance_summary: BenchmarkSummary,
    pub timestamp: Instant,
}

/// Multi-GPU scaling analysis derived from the raw benchmark results.
#[derive(Debug, Clone)]
pub struct ScalingAnalysis {
    /// Single-GPU throughput baseline per operation (ops/sec).
    pub single_gpu_baseline: HashMap<String, f64>,
    /// Measured speedups over the baseline for each multi-GPU configuration.
    pub multi_gpu_scaling: HashMap<String, Vec<f64>>,
    /// Speedup divided by device count for each multi-GPU configuration.
    pub scaling_efficiency: HashMap<String, Vec<f64>>,
    /// Device count that achieved the best efficiency per operation.
    pub optimal_device_counts: HashMap<String, usize>,
}

/// High-level summary statistics for a benchmark suite run.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    pub total_tests: usize,
    pub successful_tests: usize,
    pub average_scaling_efficiency: f64,
    pub best_performing_configuration: String,
    pub performance_improvements: HashMap<String, f64>,
    pub bottlenecks_detected: Vec<String>,
}

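/// Multi-GPU benchmark harness for the Amari compute kernels.
///
/// Minimal usage sketch (illustrative, not compiled as a doc-test; assumes at
/// least one GPU is visible to `SharedGpuContext::with_multi_gpu` and an async
/// context that can propagate `UnifiedGpuResult` errors with `?`):
///
/// ```ignore
/// let benchmarks = AmariMultiGpuBenchmarks::new(BenchmarkConfig::default()).await?;
/// let suite = benchmarks.run_complete_suite().await?;
/// for result in &suite.results {
///     println!(
///         "{}: {:.0} ops/s on {} GPU(s), scaling efficiency {:.0}%",
///         result.test_name,
///         result.throughput_ops_per_sec,
///         result.device_count,
///         result.scaling_efficiency * 100.0
///     );
/// }
/// ```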
pub struct AmariMultiGpuBenchmarks {
    config: BenchmarkConfig,
    performance_monitor: MultiGpuPerformanceMonitor,
    #[allow(dead_code)]
    gpu_context: SharedGpuContext,
}

impl AmariMultiGpuBenchmarks {
    /// Creates a new benchmark harness backed by a shared multi-GPU context.
    pub async fn new(config: BenchmarkConfig) -> UnifiedGpuResult<Self> {
        let gpu_context = SharedGpuContext::with_multi_gpu().await?;
        let performance_monitor = MultiGpuPerformanceMonitor::new(10000, Duration::from_secs(5));

        Ok(Self {
            config,
            performance_monitor,
            gpu_context,
        })
    }

    /// Runs every benchmark family and returns the aggregated suite results.
    pub async fn run_complete_suite(&self) -> UnifiedGpuResult<BenchmarkSuiteResults> {
        let start_time = Instant::now();
        let mut results = Vec::new();

        // Geometric algebra
        results.extend(self.run_geometric_algebra_benchmarks().await?);

        // Tropical algebra
        results.extend(self.run_tropical_algebra_benchmarks().await?);

        // Automatic differentiation
        results.extend(self.run_autodiff_benchmarks().await?);

        // Information geometry
        results.extend(self.run_info_geometry_benchmarks().await?);

        // Fusion systems
        results.extend(self.run_fusion_systems_benchmarks().await?);

        // Network analysis
        results.extend(self.run_network_analysis_benchmarks().await?);

        // Cellular automata
        results.extend(self.run_cellular_automata_benchmarks().await?);

        // Relativistic physics
        results.extend(self.run_relativistic_physics_benchmarks().await?);

        // Enumerative geometry
        results.extend(self.run_enumerative_geometry_benchmarks().await?);

        let total_duration = start_time.elapsed();
        let scaling_analysis = self.analyze_scaling_performance(&results);
        let performance_summary = self.generate_performance_summary(&results, &scaling_analysis);

        Ok(BenchmarkSuiteResults {
            suite_name: "Amari Multi-GPU Complete Suite v0.9.6".to_string(),
            total_duration,
            results,
            scaling_analysis,
            performance_summary,
            timestamp: start_time,
        })
    }

    async fn run_geometric_algebra_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        // Geometric product
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_geometric_product(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        // Rotor application
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_rotor_application(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        // Multivector normalization
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_multivector_normalization(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_tropical_algebra_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        // Tropical matrix multiplication
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_tropical_matrix_multiply(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        // Tropical neural network evaluation
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_tropical_neural_network(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_autodiff_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        // Forward-mode automatic differentiation
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_forward_mode_ad(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        // Batch gradient computation
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_batch_gradients(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_info_geometry_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        // Fisher information matrix
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_fisher_information(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        // Bregman divergence
        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_bregman_divergence(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_fusion_systems_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_tdc_fusion(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_network_analysis_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_graph_neural_network(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_cellular_automata_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_ca_evolution(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_relativistic_physics_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_spacetime_operations(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn run_enumerative_geometry_benchmarks(&self) -> UnifiedGpuResult<Vec<BenchmarkResult>> {
        let mut results = Vec::new();

        for &data_size in &self.config.data_sizes {
            for devices in &self.config.device_combinations {
                let result = self
                    .benchmark_intersection_theory(data_size, devices.clone())
                    .await?;
                results.push(result);
            }
        }

        Ok(results)
    }

    async fn benchmark_geometric_product(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "geometric_product";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 8 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_rotor_application(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "rotor_application";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 8 * 4) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: false,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_multivector_normalization(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "multivector_normalization";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 8 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Light,
            parallelizable: true,
            synchronization_required: false,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_tropical_matrix_multiply(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "tropical_matrix_multiply";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * data_size * 4) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_tropical_neural_network(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "tropical_neural_network";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 512 * 4) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_forward_mode_ad(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "forward_mode_ad";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 2 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: false,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_batch_gradients(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "batch_gradients";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 64 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_fisher_information(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "fisher_information";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * data_size * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_bregman_divergence(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "bregman_divergence";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 8 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: false,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_tdc_fusion(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "tdc_fusion";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 16 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Extreme,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_graph_neural_network(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "graph_neural_network";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * data_size * 4) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_ca_evolution(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "ca_evolution";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * data_size * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_spacetime_operations(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "spacetime_operations";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 16 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Moderate,
            parallelizable: true,
            synchronization_required: false,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    async fn benchmark_intersection_theory(
        &self,
        data_size: usize,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        let operation_name = "intersection_theory";

        let workload = Workload {
            operation_type: operation_name.to_string(),
            data_size,
            memory_requirement_mb: (data_size * 32 * 8) as f32 / 1024.0 / 1024.0,
            compute_intensity: ComputeIntensity::Heavy,
            parallelizable: true,
            synchronization_required: devices.len() > 1,
        };

        self.execute_benchmark(operation_name, workload, devices)
            .await
    }

    /// Executes warmup and measurement iterations for one workload on the
    /// given devices and aggregates the samples into a `BenchmarkResult`.
    async fn execute_benchmark(
        &self,
        operation_name: &str,
        workload: Workload,
        devices: Vec<DeviceId>,
    ) -> UnifiedGpuResult<BenchmarkResult> {
        // Warmup iterations (not measured)
        for _ in 0..self.config.warmup_iterations {
            self.simulate_operation(&workload, &devices).await?;
        }

        let mut durations = Vec::new();
        let mut memory_usages = Vec::new();
        let mut utilizations = Vec::new();

        // Measurement iterations
        for _ in 0..self.config.measurement_iterations {
            let start = Instant::now();

            let monitor_handle = if self.config.enable_profiling {
                Some(self.performance_monitor.start_operation(
                    format!("{}_{}", operation_name, devices.len()),
                    devices[0],
                    operation_name.to_string(),
                    workload.memory_requirement_mb,
                    self.get_optimal_workgroup(&workload),
                    vec![workload.data_size as u64 * 8],
                ))
            } else {
                None
            };

            let operation_result = self.simulate_operation(&workload, &devices).await?;

            let duration = start.elapsed();
            durations.push(duration);
            memory_usages.push(operation_result.memory_usage_mb);
            utilizations.push(operation_result.gpu_utilization);

            drop(monitor_handle);
        }

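        // Aggregate the samples: the mean iteration time yields throughput in
        // elements per second; the bandwidth estimate scales the workload's
        // memory footprint by 2.0 (presumably one read plus one write pass)
        // and converts MB/s to GB/s.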
        let avg_duration_ms = durations
            .iter()
            .map(|d| d.as_secs_f64() * 1000.0)
            .sum::<f64>()
            / durations.len() as f64;
        let throughput_ops_per_sec = (workload.data_size as f64) / (avg_duration_ms / 1000.0);
        let memory_bandwidth_gb_s =
            (workload.memory_requirement_mb as f64 * 2.0) / (avg_duration_ms / 1000.0) / 1024.0;
        let avg_gpu_utilization = utilizations.iter().sum::<f64>() / utilizations.len() as f64;
        let avg_memory_usage =
            memory_usages.iter().map(|&x| x as f64).sum::<f64>() / memory_usages.len() as f64;

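        // Scaling efficiency is the measured speedup divided by the ideal
        // linear speedup (one unit per device); single-GPU runs are the
        // baseline and count as 1.0. The multi-GPU speedups below are the
        // same fixed estimates used by `simulate_operation`.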
        let scaling_efficiency = if devices.len() > 1 {
            let theoretical_speedup = devices.len() as f64;
            let actual_speedup = match devices.len() {
                2 => 1.8,
                4 => 3.2,
                8 => 5.6,
                _ => devices.len() as f64 * 0.7,
            };
            actual_speedup / theoretical_speedup
        } else {
            1.0
        };

        Ok(BenchmarkResult {
            test_name: format!("{}_{}_gpu", operation_name, devices.len()),
            data_size: workload.data_size,
            device_count: devices.len(),
            device_ids: devices,
            duration_ms: avg_duration_ms,
            throughput_ops_per_sec,
            memory_bandwidth_gb_s,
            gpu_utilization_percent: avg_gpu_utilization * 100.0,
            scaling_efficiency,
            error_rate: 0.0,
            memory_usage_mb: avg_memory_usage,
            metadata: HashMap::new(),
        })
    }

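    /// Stands in for a real kernel dispatch: sleeps for a duration scaled by
    /// compute intensity, data size, and device count, then reports fixed
    /// memory usage and utilization figures.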
    async fn simulate_operation(
        &self,
        workload: &Workload,
        devices: &[DeviceId],
    ) -> UnifiedGpuResult<OperationResult> {
        let base_time = match workload.compute_intensity {
            ComputeIntensity::Light => Duration::from_micros(100),
            ComputeIntensity::Moderate => Duration::from_micros(500),
            ComputeIntensity::Heavy => Duration::from_millis(2),
            ComputeIntensity::Extreme => Duration::from_millis(10),
        };

        let scaled_time = base_time * (workload.data_size as u32 / 1000).max(1);

        let device_efficiency = match devices.len() {
            1 => 1.0,
            2 => 1.8,
            4 => 3.2,
            _ => devices.len() as f32 * 0.7,
        };

        let final_time =
            Duration::from_nanos((scaled_time.as_nanos() as f32 / device_efficiency) as u64);

        tokio::time::sleep(final_time).await;

        Ok(OperationResult {
            memory_usage_mb: workload.memory_requirement_mb,
            gpu_utilization: 0.85,
        })
    }

    fn get_optimal_workgroup(&self, workload: &Workload) -> (u32, u32, u32) {
        match workload.operation_type.as_str() {
            "geometric_product" | "rotor_application" => (128, 1, 1),
            "tropical_matrix_multiply" | "fisher_information" => (16, 16, 1),
            "ca_evolution" => (16, 16, 1),
            _ => (64, 1, 1),
        }
    }

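    /// Groups results by operation name, takes the single-GPU run as the
    /// baseline, and derives per-operation speedups, efficiencies, and the
    /// best-scaling device count.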
    fn analyze_scaling_performance(&self, results: &[BenchmarkResult]) -> ScalingAnalysis {
        let mut single_gpu_baseline = HashMap::new();
        let mut multi_gpu_scaling = HashMap::new();
        let mut scaling_efficiency = HashMap::new();
        let mut optimal_device_counts = HashMap::new();

        // Group results by base operation name, stripping the trailing
        // "<device_count>_gpu" suffix by stopping at the first numeric segment.
        let mut operation_groups: HashMap<String, Vec<&BenchmarkResult>> = HashMap::new();
        for result in results {
            let base_operation = result
                .test_name
                .split('_')
                .take_while(|part| part.parse::<usize>().is_err())
                .collect::<Vec<_>>()
                .join("_");
            operation_groups
                .entry(base_operation)
                .or_default()
                .push(result);
        }

        for (operation, operation_results) in operation_groups {
            if let Some(baseline) = operation_results.iter().find(|r| r.device_count == 1) {
                single_gpu_baseline.insert(operation.clone(), baseline.throughput_ops_per_sec);

                let mut scaling_data = Vec::new();
                let mut efficiency_data = Vec::new();
                let mut best_efficiency = 0.0;
                let mut best_device_count = 1;

                for result in operation_results.iter() {
                    if result.device_count > 1 {
                        let speedup =
                            result.throughput_ops_per_sec / baseline.throughput_ops_per_sec;
                        let efficiency = speedup / result.device_count as f64;

                        scaling_data.push(speedup);
                        efficiency_data.push(efficiency);

                        if efficiency > best_efficiency {
                            best_efficiency = efficiency;
                            best_device_count = result.device_count;
                        }
                    }
                }

                multi_gpu_scaling.insert(operation.clone(), scaling_data);
                scaling_efficiency.insert(operation.clone(), efficiency_data);
                optimal_device_counts.insert(operation, best_device_count);
            }
        }

        ScalingAnalysis {
            single_gpu_baseline,
            multi_gpu_scaling,
            scaling_efficiency,
            optimal_device_counts,
        }
    }

    /// Builds the high-level summary: test counts, average scaling efficiency,
    /// the best-performing configuration, and per-operation improvements.
    fn generate_performance_summary(
        &self,
        results: &[BenchmarkResult],
        scaling_analysis: &ScalingAnalysis,
    ) -> BenchmarkSummary {
        let total_tests = results.len();
        let successful_tests = results.iter().filter(|r| r.error_rate < 0.01).count();

        let average_scaling_efficiency = if !scaling_analysis.scaling_efficiency.is_empty() {
            let all_efficiencies: Vec<f64> = scaling_analysis
                .scaling_efficiency
                .values()
                .flat_map(|efficiencies| efficiencies.iter())
                .copied()
                .collect();

            if !all_efficiencies.is_empty() {
                all_efficiencies.iter().sum::<f64>() / all_efficiencies.len() as f64
            } else {
                // No multi-GPU data: fall back to the single-GPU efficiencies.
                let single_gpu_efficiencies: Vec<f64> = results
                    .iter()
                    .filter(|r| r.device_count == 1)
                    .map(|r| r.scaling_efficiency)
                    .collect();

                if !single_gpu_efficiencies.is_empty() {
                    single_gpu_efficiencies.iter().sum::<f64>()
                        / single_gpu_efficiencies.len() as f64
                } else {
                    0.0
                }
            }
        } else {
            // No scaling analysis at all: average over every result.
            let all_efficiencies: Vec<f64> =
                results.iter().map(|r| r.scaling_efficiency).collect();

            if !all_efficiencies.is_empty() {
                all_efficiencies.iter().sum::<f64>() / all_efficiencies.len() as f64
            } else {
                0.0
            }
        };

        let best_config = results
            .iter()
            .max_by(|a, b| {
                a.throughput_ops_per_sec
                    .partial_cmp(&b.throughput_ops_per_sec)
                    .unwrap()
            })
            .map(|r| r.test_name.clone())
            .unwrap_or_else(|| "None".to_string());

        let mut performance_improvements = HashMap::new();
        for operation in scaling_analysis.single_gpu_baseline.keys() {
            if let Some(scaling_data) = scaling_analysis.multi_gpu_scaling.get(operation) {
                if let Some(&best_scaling) =
                    scaling_data.iter().max_by(|a, b| a.partial_cmp(b).unwrap())
                {
                    let improvement = (best_scaling - 1.0) * 100.0;
                    performance_improvements.insert(operation.clone(), improvement);
                }
            }
        }

        BenchmarkSummary {
            total_tests,
            successful_tests,
            average_scaling_efficiency,
            best_performing_configuration: best_config,
            performance_improvements,
            bottlenecks_detected: vec![],
        }
    }
}

/// Outcome of a single simulated operation.
#[derive(Debug)]
struct OperationResult {
    memory_usage_mb: f32,
    gpu_utilization: f64,
}

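/// Convenience entry points for common benchmark scenarios.
///
/// Minimal usage sketch (illustrative, not compiled as a doc-test; assumes a
/// GPU is available and an async context that can propagate errors with `?`):
///
/// ```ignore
/// let suite = BenchmarkRunner::run_quick_validation().await?;
/// println!(
///     "{}: {}/{} tests succeeded in {:?}",
///     suite.suite_name,
///     suite.performance_summary.successful_tests,
///     suite.performance_summary.total_tests,
///     suite.total_duration
/// );
/// ```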
pub struct BenchmarkRunner;

impl BenchmarkRunner {
    /// Runs a reduced suite (small sizes, single GPU, no profiling) as a
    /// quick sanity check.
    pub async fn run_quick_validation() -> UnifiedGpuResult<BenchmarkSuiteResults> {
        let config = BenchmarkConfig {
            warmup_iterations: 1,
            measurement_iterations: 3,
            data_sizes: vec![100, 1000],
            device_combinations: vec![vec![DeviceId(0)]],
            enable_profiling: false,
            ..Default::default()
        };

        let benchmarks = AmariMultiGpuBenchmarks::new(config).await?;
        benchmarks.run_complete_suite().await
    }

    /// Runs the full suite with the default configuration.
    pub async fn run_comprehensive_analysis() -> UnifiedGpuResult<BenchmarkSuiteResults> {
        let config = BenchmarkConfig::default();
        let benchmarks = AmariMultiGpuBenchmarks::new(config).await?;
        benchmarks.run_complete_suite().await
    }

    /// Runs the suite across one to four GPUs to measure scaling behavior.
    pub async fn run_scaling_analysis() -> UnifiedGpuResult<BenchmarkSuiteResults> {
        let config = BenchmarkConfig {
            data_sizes: vec![1000, 10000, 100000],
            device_combinations: vec![
                vec![DeviceId(0)],
                vec![DeviceId(0), DeviceId(1)],
                vec![DeviceId(0), DeviceId(1), DeviceId(2)],
                vec![DeviceId(0), DeviceId(1), DeviceId(2), DeviceId(3)],
            ],
            enable_profiling: true,
            ..Default::default()
        };

        let benchmarks = AmariMultiGpuBenchmarks::new(config).await?;
        benchmarks.run_complete_suite().await
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_benchmark_config() {
        let config = BenchmarkConfig::default();
        assert!(config.measurement_iterations > 0);
        assert!(!config.data_sizes.is_empty());
    }

    #[tokio::test]
    #[ignore = "GPU hardware required, may fail in CI/CD environments"]
    async fn test_benchmark_runner_creation() {
        let config = BenchmarkConfig {
            measurement_iterations: 1,
            data_sizes: vec![10],
            device_combinations: vec![vec![DeviceId(0)]],
            enable_profiling: false,
            ..Default::default()
        };

        match AmariMultiGpuBenchmarks::new(config).await {
            Ok(_benchmarks) => {
                // GPU available: the harness was constructed successfully.
            }
            Err(_) => {
                println!("GPU not available for benchmarking");
            }
        }
    }
}