scirs2_optimize/distributed_gpu.rs

//! Distributed GPU optimization combining MPI and GPU acceleration
//!
//! This module provides optimization algorithms that leverage both distributed
//! computing (MPI) and GPU acceleration, enabling massively parallel optimization
//! across multiple nodes with GPU acceleration on each node.
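//!
//! # Example
//!
//! A minimal usage sketch (not compiled here): `MyMpi` is a hypothetical type
//! implementing [`MPIInterface`] and is not provided by this module, the import
//! path shown is indicative, and MPI plus a GPU are assumed to be available at
//! runtime.
//!
//! ```ignore
//! use scirs2_optimize::distributed_gpu::{DistributedGpuConfig, DistributedGpuOptimizer};
//!
//! // Hypothetical MPI handle implementing `MPIInterface`.
//! let mpi = MyMpi::init()?;
//! let mut optimizer = DistributedGpuOptimizer::new(mpi, DistributedGpuConfig::default())?;
//!
//! // Minimize a 2-D sphere function with a global population of 64 for up to 200 iterations.
//! let results = optimizer.differential_evolution(
//!     |x| x.iter().map(|v| v * v).sum::<f64>(),
//!     &[(-5.0, 5.0), (-5.0, 5.0)],
//!     64,
//!     200,
//! )?;
//! results.print_summary();
//! ```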

use crate::error::ScirsResult;
use ndarray::{Array1, Array2, ArrayView1};

use crate::distributed::{
    DistributedConfig, DistributedOptimizationContext, DistributedStats, MPIInterface,
};
use crate::gpu::{
    acceleration::{AccelerationConfig, AccelerationManager},
    cuda_kernels::DifferentialEvolutionKernel,
    tensor_core_optimization::{AMPManager, TensorCoreOptimizationConfig, TensorCoreOptimizer},
    GpuOptimizationConfig, GpuOptimizationContext,
};
use crate::result::OptimizeResults;

/// Configuration for distributed GPU optimization
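///
/// # Example
///
/// A sketch of overriding selected fields; the values shown are illustrative,
/// not tuned recommendations.
///
/// ```ignore
/// let config = DistributedGpuConfig {
///     use_tensor_cores: false,
///     gpu_cpu_load_balance: 0.5,
///     gpu_communication_strategy: GpuCommunicationStrategy::Staged,
///     ..DistributedGpuConfig::default()
/// };
/// ```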
#[derive(Clone)]
pub struct DistributedGpuConfig {
    /// Distributed computing configuration
    pub distributed_config: DistributedConfig,
    /// GPU optimization configuration
    pub gpu_config: GpuOptimizationConfig,
    /// GPU acceleration configuration
    pub acceleration_config: AccelerationConfig,
    /// Whether to use Tensor Cores if available
    pub use_tensor_cores: bool,
    /// Tensor Core configuration
    pub tensor_config: Option<TensorCoreOptimizationConfig>,
    /// Communication strategy for GPU data
    pub gpu_communication_strategy: GpuCommunicationStrategy,
    /// Load balancing between GPU and CPU work
    pub gpu_cpu_load_balance: f64, // 0.0 = all CPU, 1.0 = all GPU
}

impl Default for DistributedGpuConfig {
    fn default() -> Self {
        Self {
            distributed_config: DistributedConfig::default(),
            gpu_config: GpuOptimizationConfig::default(),
            acceleration_config: AccelerationConfig::default(),
            use_tensor_cores: true,
            tensor_config: Some(TensorCoreOptimizationConfig::default()),
            gpu_communication_strategy: GpuCommunicationStrategy::Direct,
            gpu_cpu_load_balance: 0.8, // Prefer GPU but keep some CPU work
        }
    }
}

/// GPU communication strategies for distributed systems
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum GpuCommunicationStrategy {
    /// Direct GPU-to-GPU communication via GPUDirect
    Direct,
    /// GPU-to-CPU-to-MPI-to-CPU-to-GPU
    Staged,
    /// Asynchronous overlapped communication
    AsyncOverlapped,
    /// Hierarchical communication (intra-node GPU, inter-node MPI)
    Hierarchical,
}

/// Distributed GPU optimization context
pub struct DistributedGpuOptimizer<M: MPIInterface> {
    distributed_context: DistributedOptimizationContext<M>,
    gpu_context: GpuOptimizationContext,
    acceleration_manager: AccelerationManager,
    tensor_optimizer: Option<TensorCoreOptimizer>,
    amp_manager: Option<AMPManager>,
    config: DistributedGpuConfig,
    performance_stats: DistributedGpuStats,
}

impl<M: MPIInterface> DistributedGpuOptimizer<M> {
    /// Create a new distributed GPU optimizer
    pub fn new(mpi: M, config: DistributedGpuConfig) -> ScirsResult<Self> {
        let distributed_context =
            DistributedOptimizationContext::new(mpi, config.distributed_config.clone());
        let gpu_context = GpuOptimizationContext::new(config.gpu_config.clone())?;
        let acceleration_manager = AccelerationManager::new(config.acceleration_config.clone());

        let tensor_optimizer = if config.use_tensor_cores {
            match config.tensor_config.as_ref() {
                Some(tensor_config) => {
                    match TensorCoreOptimizer::new(
                        gpu_context.context().clone(),
                        tensor_config.clone(),
                    ) {
                        Ok(optimizer) => Some(optimizer),
                        Err(_) => {
                            // Tensor Cores not available, continue without them
                            None
                        }
                    }
                }
                None => None,
            }
        } else {
            None
        };

        let amp_manager = if config
            .tensor_config
            .as_ref()
            .map(|tc| tc.use_amp)
            .unwrap_or(false)
        {
            Some(AMPManager::new())
        } else {
            None
        };

        Ok(Self {
            distributed_context,
            gpu_context,
            acceleration_manager,
            tensor_optimizer,
            amp_manager,
            config,
            performance_stats: DistributedGpuStats::new(),
        })
    }

    /// Optimize using distributed differential evolution with GPU acceleration
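    ///
    /// `bounds` supplies one `(lower, upper)` pair per dimension, `population_size`
    /// is the global population size that is partitioned across MPI ranks, and
    /// `max_nit` caps the number of evolution iterations.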
    pub fn differential_evolution<F>(
        &mut self,
        function: F,
        bounds: &[(f64, f64)],
        population_size: usize,
        max_nit: usize,
    ) -> ScirsResult<DistributedGpuResults>
    where
        F: Fn(&ArrayView1<f64>) -> f64 + Clone + Send + Sync,
    {
        let start_time = std::time::Instant::now();

        // Distribute work across MPI processes
        let work_assignment = self.distributed_context.distribute_work(population_size);
        let local_pop_size = work_assignment.count;

        if local_pop_size == 0 {
            return Ok(DistributedGpuResults::empty()); // Worker with no assigned work
        }

        // Initialize population on GPU
        let dims = bounds.len();
        let mut local_population = self.initialize_gpu_population(local_pop_size, bounds)?;
        let mut local_fitness = self.evaluate_population_gpu(&function, &local_population)?;

        // GPU kernels for evolution operations
        // TODO: Fix GpuContext type mismatch between local alias and scirs2_core::GpuContext
        // let evolution_kernel =
        //     DifferentialEvolutionKernel::new(
        //         Arc::new(CoreGpuContext::from_device(self.gpu_context.context().as_ref().clone()))
        //     ).map_err(|e| scirs2_core::error::CoreError::General(e.to_string()))?;
        let evolution_kernel = todo!("Fix GpuContext type conversion");

        #[allow(unreachable_code)]
        let mut best_individual = Array1::zeros(dims);
        let mut best_fitness = f64::INFINITY;
        let mut total_evaluations = local_pop_size;

        // Main evolution loop
        for iteration in 0..max_nit {
            // Generate trial population using GPU
            let trial_population = self.gpu_mutation_crossover(
                &evolution_kernel,
                &local_population,
                0.8, // F scale
                0.7, // Crossover rate
            )?;

            // Evaluate trial population
            let trial_fitness = self.evaluate_population_gpu(&function, &trial_population)?;
            total_evaluations += local_pop_size;

            // GPU-accelerated selection
            self.gpu_selection(
                &evolution_kernel,
                &mut local_population,
                &trial_population,
                &mut local_fitness,
                &trial_fitness,
            )?;

            // Find local best
            let (local_best_idx, local_best_fitness) = self.find_local_best(&local_fitness)?;

            if local_best_fitness < best_fitness {
                best_fitness = local_best_fitness;
                best_individual = local_population.row(local_best_idx).to_owned();
            }

            // Periodic communication and migration
            if iteration % 10 == 0 {
                let global_best =
                    self.communicate_best_individuals(&best_individual, best_fitness)?;

                if let Some((global_best_individual, global_best_fitness)) = global_best {
                    if global_best_fitness < best_fitness {
                        best_individual = global_best_individual;
                        best_fitness = global_best_fitness;
                    }
                }

                // GPU-to-GPU migration if supported
                self.gpu_migration(&mut local_population, &mut local_fitness)?;
            }

            // Update performance statistics
            self.performance_stats.record_iteration(
                iteration,
                local_pop_size,
                best_fitness,
                start_time.elapsed().as_secs_f64(),
            );

            // Check convergence
            if self.check_convergence(&local_fitness, iteration)? {
                break;
            }
        }

        // Final global best communication
        let final_global_best =
            self.communicate_best_individuals(&best_individual, best_fitness)?;

        if let Some((final_best_individual, final_best_fitness)) = final_global_best {
            best_individual = final_best_individual;
            best_fitness = final_best_fitness;
        }

        let total_time = start_time.elapsed().as_secs_f64();

        Ok(DistributedGpuResults {
            base_result: OptimizeResults::<f64> {
                x: best_individual,
                fun: best_fitness,
                success: true,
                message: "Distributed GPU differential evolution completed".to_string(),
                nit: max_nit,
                nfev: total_evaluations,
                ..OptimizeResults::default()
            },
            gpu_stats: crate::gpu::acceleration::PerformanceStats::default(),
            distributed_stats: self.distributed_context.stats().clone(),
            performance_stats: self.performance_stats.clone(),
            total_time,
        })
    }

    /// Initialize population on GPU
    fn initialize_gpu_population(
        &self,
        pop_size: usize,
        bounds: &[(f64, f64)],
    ) -> ScirsResult<Array2<f64>> {
        use rand::Rng;
        let mut rng = rand::rng();

        let dims = bounds.len();
        let mut population = Array2::zeros((pop_size, dims));

        for i in 0..pop_size {
            for j in 0..dims {
                let (low, high) = bounds[j];
                population[[i, j]] = rng.gen_range(low..=high);
            }
        }

        Ok(population)
    }

    /// Evaluate population using GPU acceleration
    fn evaluate_population_gpu<F>(
        &mut self,
        function: &F,
        population: &Array2<f64>,
    ) -> ScirsResult<Array1<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let pop_size = population.nrows();
        let mut fitness = Array1::zeros(pop_size);

        // Decide between GPU and CPU based on problem size and configuration
        let use_gpu = pop_size >= 100 && self.config.gpu_cpu_load_balance > 0.5;

        if use_gpu {
            // Use GPU acceleration for function evaluation
            self.performance_stats.gpu_evaluations += pop_size;

            // For now, use CPU evaluation since GPU function interface needs adaptation
            // TODO: Implement proper GPU function evaluation interface
            for i in 0..pop_size {
                fitness[i] = function(&population.row(i));
            }
        } else {
            // Use CPU evaluation
            self.performance_stats.cpu_evaluations += pop_size;
            for i in 0..pop_size {
                fitness[i] = function(&population.row(i));
            }
        }

        Ok(fitness)
    }

    /// Perform GPU-accelerated mutation and crossover
    fn gpu_mutation_crossover(
        &self,
        _kernel: &DifferentialEvolutionKernel,
        population: &Array2<f64>,
        f_scale: f64,
        crossover_rate: f64,
    ) -> ScirsResult<Array2<f64>> {
        let (pop_size, dims) = population.dim();
        let mut trial_population = Array2::zeros((pop_size, dims));

        // For now, implement CPU-based mutation and crossover
        // TODO: Use actual GPU kernels when properly implemented
        use rand::Rng;
        let mut rng = rand::rng();

        for i in 0..pop_size {
            // Select three random individuals different from current
            let mut indices = Vec::new();
            while indices.len() < 3 {
                let idx = rng.gen_range(0..pop_size);
                if idx != i && !indices.contains(&idx) {
                    indices.push(idx);
                }
            }

            let a = indices[0];
            let b = indices[1];
            let c = indices[2];

            // Mutation and crossover
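            // DE/rand/1/bin: when the crossover draw fires (or j == j_rand, which
            // guarantees at least one mutated gene), the trial gene becomes
            // population[a][j] + F * (population[b][j] - population[c][j]);
            // otherwise the current individual's gene is kept unchanged.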
            let j_rand = rng.gen_range(0..dims);
            for j in 0..dims {
                if rng.gen_range(0.0..1.0) < crossover_rate || j == j_rand {
                    trial_population[[i, j]] =
                        population[[a, j]] + f_scale * (population[[b, j]] - population[[c, j]]);
                } else {
                    trial_population[[i, j]] = population[[i, j]];
                }
            }
        }

        Ok(trial_population)
    }

    /// Perform GPU-accelerated selection
    fn gpu_selection(
        &self,
        _kernel: &DifferentialEvolutionKernel,
        population: &mut Array2<f64>,
        trial_population: &Array2<f64>,
        fitness: &mut Array1<f64>,
        trial_fitness: &Array1<f64>,
    ) -> ScirsResult<()> {
        // For now, implement CPU-based selection
        // TODO: Use actual GPU kernels when properly implemented
        for i in 0..population.nrows() {
            if trial_fitness[i] <= fitness[i] {
                for j in 0..population.ncols() {
                    population[[i, j]] = trial_population[[i, j]];
                }
                fitness[i] = trial_fitness[i];
            }
        }

        Ok(())
    }

    /// Find local best individual and fitness
    fn find_local_best(&self, fitness: &Array1<f64>) -> ScirsResult<(usize, f64)> {
        let mut best_idx = 0;
        let mut best_fitness = fitness[0];

        for (i, &f) in fitness.iter().enumerate() {
            if f < best_fitness {
                best_fitness = f;
                best_idx = i;
            }
        }

        Ok((best_idx, best_fitness))
    }

    /// Communicate best individuals across MPI processes
    fn communicate_best_individuals(
        &mut self,
        local_best: &Array1<f64>,
        local_best_fitness: f64,
    ) -> ScirsResult<Option<(Array1<f64>, f64)>> {
        if self.distributed_context.size() <= 1 {
            return Ok(None);
        }

        // For simplicity, we'll use a basic approach
        // In a full implementation, this would use MPI all-reduce operations
        // to find the global best across all processes
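        //
        // A typical pattern (not implemented here) would be an all-reduce of
        // (fitness, rank) pairs with a MINLOC-style reduction, followed by a
        // broadcast of the winning rank's parameter vector.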

        // Placeholder implementation
        Ok(Some((local_best.clone(), local_best_fitness)))
    }

    /// Perform GPU-to-GPU migration between processes
    fn gpu_migration(
        &mut self,
        population: &mut Array2<f64>,
        fitness: &mut Array1<f64>,
    ) -> ScirsResult<()> {
        match self.config.gpu_communication_strategy {
            GpuCommunicationStrategy::Direct => {
                // Use GPUDirect for direct GPU-to-GPU communication
                self.gpu_direct_migration(population, fitness)
            }
            GpuCommunicationStrategy::Staged => {
                // Stage through CPU memory
                self.staged_migration(population, fitness)
            }
            GpuCommunicationStrategy::AsyncOverlapped => {
                // Asynchronous communication with computation overlap
                self.async_migration(population, fitness)
            }
            GpuCommunicationStrategy::Hierarchical => {
                // Hierarchical intra-node GPU, inter-node MPI
                self.hierarchical_migration(population, fitness)
            }
        }
    }

    /// Direct GPU-to-GPU migration using GPUDirect
    fn gpu_direct_migration(
        &mut self,
        _population: &mut Array2<f64>,
        _fitness: &mut Array1<f64>,
    ) -> ScirsResult<()> {
        // Placeholder for GPUDirect implementation
        // This would use CUDA-aware MPI or similar technology
        Ok(())
    }

    /// Staged migration through CPU memory
    fn staged_migration(
        &mut self,
        _population: &mut Array2<f64>,
        _fitness: &mut Array1<f64>,
    ) -> ScirsResult<()> {
        // Download from GPU, perform MPI communication, upload back to GPU
        // This is less efficient but more compatible
        Ok(())
    }

    /// Asynchronous migration with computation overlap
    fn async_migration(
        &mut self,
        _population: &mut Array2<f64>,
        _fitness: &mut Array1<f64>,
    ) -> ScirsResult<()> {
        // Use asynchronous MPI operations to overlap communication with computation
        Ok(())
    }

    /// Hierarchical migration (intra-node GPU, inter-node MPI)
    fn hierarchical_migration(
        &mut self,
        _population: &mut Array2<f64>,
        _fitness: &mut Array1<f64>,
    ) -> ScirsResult<()> {
        // First migrate within node between GPUs, then between nodes via MPI
        Ok(())
    }

    /// Check convergence criteria
    fn check_convergence(&self, fitness: &Array1<f64>, iteration: usize) -> ScirsResult<bool> {
        if fitness.len() < 2 {
            return Ok(false);
        }

        let mean = fitness.mean().unwrap_or(0.0);
        let variance =
            fitness.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / fitness.len() as f64;

        let std_dev = variance.sqrt();

        // Simple convergence criterion
        Ok(std_dev < 1e-12 || iteration >= 1000)
    }

    /// Generate random indices for differential evolution mutation
    fn generate_random_indices(&self, pop_size: usize) -> ScirsResult<Array2<i32>> {
        use rand::Rng;
        let mut rng = rand::rng();
        let mut indices = Array2::zeros((pop_size, 3));

        for i in 0..pop_size {
            let mut selected = std::collections::HashSet::new();
            selected.insert(i);

            for j in 0..3 {
                loop {
                    let idx = rng.gen_range(0..pop_size);
                    if !selected.contains(&idx) {
                        indices[[i, j]] = idx as i32;
                        selected.insert(idx);
                        break;
                    }
                }
            }
        }

        Ok(indices)
    }

    /// Generate random values for crossover
    fn generate_random_values(&self, count: usize) -> ScirsResult<Array1<f64>> {
        use rand::Rng;
        let mut rng = rand::rng();
        let mut values = Array1::zeros(count);

        for i in 0..count {
            values[i] = rng.gen_range(0.0..1.0);
        }

        Ok(values)
    }

    /// Generate j_rand values for crossover
    fn generate_j_rand(&self, pop_size: usize, dims: usize) -> ScirsResult<Array1<i32>> {
        use rand::Rng;
        let mut rng = rand::rng();
        let mut j_rand = Array1::zeros(pop_size);

        for i in 0..pop_size {
            j_rand[i] = rng.gen_range(0..dims) as i32;
        }

        Ok(j_rand)
    }

    /// Get performance statistics
    pub fn stats(&self) -> &DistributedGpuStats {
        &self.performance_stats
    }
}

/// Performance statistics for distributed GPU optimization
#[derive(Debug, Clone)]
pub struct DistributedGpuStats {
    /// Total GPU function evaluations
    pub gpu_evaluations: usize,
    /// Total CPU function evaluations
    pub cpu_evaluations: usize,
    /// GPU utilization percentage
    pub gpu_utilization: f64,
    /// Communication overhead time
    pub communication_time: f64,
    /// GPU memory usage statistics
    pub gpu_memory_usage: f64,
    /// Iteration statistics
    pub nit: Vec<IterationStats>,
}

impl DistributedGpuStats {
    fn new() -> Self {
        Self {
            gpu_evaluations: 0,
            cpu_evaluations: 0,
            gpu_utilization: 0.0,
            communication_time: 0.0,
            gpu_memory_usage: 0.0,
            nit: Vec::new(),
        }
    }

    fn record_iteration(
        &mut self,
        iteration: usize,
        pop_size: usize,
        best_fitness: f64,
        elapsed_time: f64,
    ) {
        self.nit.push(IterationStats {
            iteration,
            population_size: pop_size,
            best_fitness,
            elapsed_time,
        });
    }

    /// Generate comprehensive performance report
    pub fn generate_report(&self) -> String {
        let mut report = String::from("Distributed GPU Optimization Performance Report\n");
        report.push_str("==============================================\n\n");

        report.push_str(&format!(
            "GPU Function Evaluations: {}\n",
            self.gpu_evaluations
        ));
        report.push_str(&format!(
            "CPU Function Evaluations: {}\n",
            self.cpu_evaluations
        ));

        let total_evaluations = self.gpu_evaluations + self.cpu_evaluations;
        if total_evaluations > 0 {
            let gpu_percentage = (self.gpu_evaluations as f64 / total_evaluations as f64) * 100.0;
            report.push_str(&format!("GPU Usage: {:.1}%\n", gpu_percentage));
        }

        report.push_str(&format!(
            "GPU Utilization: {:.1}%\n",
            self.gpu_utilization * 100.0
        ));
        report.push_str(&format!(
            "Communication Overhead: {:.3}s\n",
            self.communication_time
        ));
        report.push_str(&format!(
            "GPU Memory Usage: {:.1}%\n",
            self.gpu_memory_usage * 100.0
        ));

        if let Some(last_iteration) = self.nit.last() {
            report.push_str(&format!(
                "Final Best Fitness: {:.6e}\n",
                last_iteration.best_fitness
            ));
            report.push_str(&format!(
                "Total Time: {:.3}s\n",
                last_iteration.elapsed_time
            ));
        }

        report
    }
}

/// Statistics for individual iterations
#[derive(Debug, Clone)]
pub struct IterationStats {
    pub iteration: usize,
    pub population_size: usize,
    pub best_fitness: f64,
    pub elapsed_time: f64,
}

/// Results from distributed GPU optimization
#[derive(Debug, Clone)]
pub struct DistributedGpuResults {
    /// Base optimization results
    pub base_result: OptimizeResults<f64>,
    /// GPU-specific performance statistics
    pub gpu_stats: crate::gpu::acceleration::PerformanceStats,
    /// Distributed computing statistics
    pub distributed_stats: DistributedStats,
    /// Combined performance statistics
    pub performance_stats: DistributedGpuStats,
    /// Total optimization time
    pub total_time: f64,
}

impl DistributedGpuResults {
    fn empty() -> Self {
        Self {
            base_result: OptimizeResults::<f64> {
                x: Array1::zeros(0),
                fun: 0.0,
                success: false,
                message: "No work assigned to this process".to_string(),
                nit: 0,
                nfev: 0,
                ..OptimizeResults::default()
            },
            gpu_stats: crate::gpu::acceleration::PerformanceStats::default(),
            distributed_stats: DistributedStats {
                communication_time: 0.0,
                computation_time: 0.0,
                load_balance_ratio: 1.0,
                synchronizations: 0,
                bytes_transferred: 0,
            },
            performance_stats: DistributedGpuStats::new(),
            total_time: 0.0,
        }
    }

    /// Print comprehensive results summary
    pub fn print_summary(&self) {
        println!("Distributed GPU Optimization Results");
        println!("===================================");
        println!("Success: {}", self.base_result.success);
        println!("Final function value: {:.6e}", self.base_result.fun);
        println!("Iterations: {}", self.base_result.nit);
        println!("Function evaluations: {}", self.base_result.nfev);
        println!("Total time: {:.3}s", self.total_time);
        println!();

        println!("GPU Performance:");
        println!("{}", self.gpu_stats.generate_report());
        println!();

        println!("Distributed Performance:");
        println!("{}", self.distributed_stats.generate_report());
        println!();

        println!("Combined Performance:");
        println!("{}", self.performance_stats.generate_report());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_distributed_gpu_config() {
        let config = DistributedGpuConfig::default();
        assert!(config.use_tensor_cores);
        assert_eq!(config.gpu_cpu_load_balance, 0.8);
        assert_eq!(
            config.gpu_communication_strategy,
            GpuCommunicationStrategy::Direct
        );
    }

    #[test]
    fn test_gpu_communication_strategies() {
        let strategies = [
            GpuCommunicationStrategy::Direct,
            GpuCommunicationStrategy::Staged,
            GpuCommunicationStrategy::AsyncOverlapped,
            GpuCommunicationStrategy::Hierarchical,
        ];

        for strategy in &strategies {
            let config = DistributedGpuConfig {
                gpu_communication_strategy: *strategy,
                ..DistributedGpuConfig::default()
            };
            // Test that configuration is valid
            assert_eq!(config.gpu_communication_strategy, *strategy);
        }
    }

    #[test]
    fn test_performance_stats() {
        let mut stats = DistributedGpuStats::new();
        stats.gpu_evaluations = 1000;
        stats.cpu_evaluations = 200;
        stats.gpu_utilization = 0.85;

        let report = stats.generate_report();
        assert!(report.contains("GPU Function Evaluations: 1000"));
        assert!(report.contains("CPU Function Evaluations: 200"));
        assert!(report.contains("GPU Usage: 83.3%")); // 1000/(1000+200) * 100
    }
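
    // An additional small test sketch exercising the (module-private)
    // iteration-recording path; the numeric values are illustrative only.
    #[test]
    fn test_record_iteration() {
        let mut stats = DistributedGpuStats::new();
        stats.record_iteration(0, 128, 1.5, 0.01);
        stats.record_iteration(1, 128, 0.9, 0.02);
        assert_eq!(stats.nit.len(), 2);
        assert_eq!(stats.nit.last().unwrap().best_fitness, 0.9);
    }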

    #[test]
    #[ignore = "Requires MPI and GPU"]
    fn test_distributed_gpu_optimization() {
        // This would test the actual distributed GPU optimization
        // Implementation depends on having both MPI and GPU available
    }
}