Skip to main content

scirs2_optimize/
distributed_gpu.rs

1//! Distributed GPU optimization combining MPI and GPU acceleration
2//!
3//! This module provides optimization algorithms that leverage both distributed
4//! computing (MPI) and GPU acceleration, enabling massive parallel optimization
5//! across multiple nodes with GPU acceleration on each node.
6
7use crate::error::ScirsResult;
8use scirs2_core::ndarray::{Array1, Array2, ArrayView1};
9use std::sync::Arc;
10
11use crate::distributed::{
12    DistributedConfig, DistributedOptimizationContext, DistributedStats, MPIInterface,
13};
14use crate::gpu::{
15    acceleration::{AccelerationConfig, AccelerationManager},
16    cuda_kernels::DifferentialEvolutionKernel,
17    tensor_core_optimization::{AMPManager, TensorCoreOptimizationConfig, TensorCoreOptimizer},
18    GpuOptimizationConfig, GpuOptimizationContext,
19};
20use crate::result::OptimizeResults;
21use statrs::statistics::Statistics;
22
23/// Configuration for distributed GPU optimization
24#[derive(Clone)]
25pub struct DistributedGpuConfig {
26    /// Distributed computing configuration
27    pub distributed_config: DistributedConfig,
28    /// GPU optimization configuration
29    pub gpu_config: GpuOptimizationConfig,
30    /// GPU acceleration configuration
31    pub acceleration_config: AccelerationConfig,
32    /// Whether to use Tensor Cores if available
33    pub use_tensor_cores: bool,
34    /// Tensor Core configuration
35    pub tensor_config: Option<TensorCoreOptimizationConfig>,
36    /// Communication strategy for GPU data
37    pub gpu_communication_strategy: GpuCommunicationStrategy,
38    /// Load balancing between GPU and CPU work
39    pub gpu_cpu_load_balance: f64, // 0.0 = all CPU, 1.0 = all GPU
40}
41
42impl Default for DistributedGpuConfig {
43    fn default() -> Self {
44        Self {
45            distributed_config: DistributedConfig::default(),
46            gpu_config: GpuOptimizationConfig::default(),
47            acceleration_config: AccelerationConfig::default(),
48            use_tensor_cores: true,
49            tensor_config: Some(TensorCoreOptimizationConfig::default()),
50            gpu_communication_strategy: GpuCommunicationStrategy::Direct,
51            gpu_cpu_load_balance: 0.8, // Prefer GPU but keep some CPU work
52        }
53    }
54}
55
56/// GPU communication strategies for distributed systems
57#[derive(Debug, Clone, Copy, PartialEq)]
58pub enum GpuCommunicationStrategy {
59    /// Direct GPU-to-GPU communication via GPUDirect
60    Direct,
61    /// GPU-to-CPU-to-MPI-to-CPU-to-GPU
62    Staged,
63    /// Asynchronous overlapped communication
64    AsyncOverlapped,
65    /// Hierarchical communication (intra-node GPU, inter-node MPI)
66    Hierarchical,
67}
68
69/// Distributed GPU optimization context
70pub struct DistributedGpuOptimizer<M: MPIInterface> {
71    distributed_context: DistributedOptimizationContext<M>,
72    gpu_context: GpuOptimizationContext,
73    acceleration_manager: AccelerationManager,
74    tensor_optimizer: Option<TensorCoreOptimizer>,
75    amp_manager: Option<AMPManager>,
76    config: DistributedGpuConfig,
77    performance_stats: DistributedGpuStats,
78}
79
80impl<M: MPIInterface> DistributedGpuOptimizer<M> {
81    /// Create a new distributed GPU optimizer
82    pub fn new(mpi: M, config: DistributedGpuConfig) -> ScirsResult<Self> {
83        let distributed_context =
84            DistributedOptimizationContext::new(mpi, config.distributed_config.clone());
85        let gpu_context = GpuOptimizationContext::new(config.gpu_config.clone())?;
86        let acceleration_manager = AccelerationManager::new(config.acceleration_config.clone());
87
88        let tensor_optimizer = if config.use_tensor_cores {
89            match config.tensor_config.as_ref() {
90                Some(tensor_config) => {
91                    match TensorCoreOptimizer::new(
92                        gpu_context.context().clone(),
93                        tensor_config.clone(),
94                    ) {
95                        Ok(optimizer) => Some(optimizer),
96                        Err(_) => {
97                            // Tensor Cores not available, continue without them
98                            None
99                        }
100                    }
101                }
102                None => None,
103            }
104        } else {
105            None
106        };
107
108        let amp_manager = if config
109            .tensor_config
110            .as_ref()
111            .map(|tc| tc.use_amp)
112            .unwrap_or(false)
113        {
114            Some(AMPManager::new())
115        } else {
116            None
117        };
118
119        Ok(Self {
120            distributed_context,
121            gpu_context,
122            acceleration_manager,
123            tensor_optimizer,
124            amp_manager,
125            config,
126            performance_stats: DistributedGpuStats::new(),
127        })
128    }
129
130    /// Optimize using distributed differential evolution with GPU acceleration
131    pub fn differential_evolution<F>(
132        &mut self,
133        function: F,
134        bounds: &[(f64, f64)],
135        population_size: usize,
136        max_nit: usize,
137    ) -> ScirsResult<DistributedGpuResults>
138    where
139        F: Fn(&ArrayView1<f64>) -> f64 + Clone + Send + Sync,
140    {
141        let start_time = std::time::Instant::now();
142
143        // Distribute work across MPI processes
144        let work_assignment = self.distributed_context.distribute_work(population_size);
145        let local_pop_size = work_assignment.count;
146
147        if local_pop_size == 0 {
148            return Ok(DistributedGpuResults::empty()); // Worker with no assigned work
149        }
150
151        // Initialize population on GPU
152        let dims = bounds.len();
153        let mut local_population = self.initialize_gpu_population(local_pop_size, bounds)?;
154        let mut local_fitness = self.evaluate_population_gpu(&function, &local_population)?;
155
156        // GPU kernels for evolution operations.
157        //
158        // `GpuOptimizationContext::context()` returns `&Arc<scirs2_core::gpu::GpuContext>`,
159        // which is the exact type `DifferentialEvolutionKernel::new` expects (an
160        // `Arc<GpuContext>`). The earlier comment about a type mismatch referred to a
161        // path that tried to `.clone()` the inner `GpuContext` and rewrap it; that path
162        // is unsound because `GpuContext` is not `Clone` (it owns a `KernelRegistry`).
163        // The right approach is to share ownership via `Arc::clone`, matching the
164        // pattern used by `SwarmKernelCache` in `gpu/acceleration.rs`.
165        let evolution_kernel =
166            DifferentialEvolutionKernel::new(Arc::clone(self.gpu_context.context()))?;
167
168        let mut best_individual = Array1::zeros(dims);
169        let mut best_fitness = f64::INFINITY;
170        let mut total_evaluations = local_pop_size;
171
172        // Main evolution loop
173        for iteration in 0..max_nit {
174            // Generate trial population using GPU
175            let trial_population = self.gpu_mutation_crossover(
176                &evolution_kernel,
177                &local_population,
178                0.8, // F scale
179                0.7, // Crossover rate
180            )?;
181
182            // Evaluate trial population
183            let trial_fitness = self.evaluate_population_gpu(&function, &trial_population)?;
184            total_evaluations += local_pop_size;
185
186            // GPU-accelerated selection
187            self.gpu_selection(
188                &evolution_kernel,
189                &mut local_population,
190                &trial_population,
191                &mut local_fitness,
192                &trial_fitness,
193            )?;
194
195            // Find local best
196            let (local_best_idx, local_best_fitness) = self.find_local_best(&local_fitness)?;
197
198            if local_best_fitness < best_fitness {
199                best_fitness = local_best_fitness;
200                best_individual = local_population.row(local_best_idx).to_owned();
201            }
202
203            // Periodic communication and migration
204            if iteration % 10 == 0 {
205                let global_best =
206                    self.communicate_best_individuals(&best_individual, best_fitness)?;
207
208                if let Some((global_best_individual, global_best_fitness)) = global_best {
209                    if global_best_fitness < best_fitness {
210                        best_individual = global_best_individual;
211                        best_fitness = global_best_fitness;
212                    }
213                }
214
215                // GPU-to-GPU migration if supported
216                self.gpu_migration(&mut local_population, &mut local_fitness)?;
217            }
218
219            // Update performance statistics
220            self.performance_stats.record_iteration(
221                iteration,
222                local_pop_size,
223                best_fitness,
224                start_time.elapsed().as_secs_f64(),
225            );
226
227            // Check convergence
228            if self.check_convergence(&local_fitness, iteration)? {
229                break;
230            }
231        }
232
233        // Final global best communication
234        let final_global_best =
235            self.communicate_best_individuals(&best_individual, best_fitness)?;
236
237        if let Some((final_best_individual, final_best_fitness)) = final_global_best {
238            best_individual = final_best_individual;
239            best_fitness = final_best_fitness;
240        }
241
242        let total_time = start_time.elapsed().as_secs_f64();
243
244        Ok(DistributedGpuResults {
245            base_result: OptimizeResults::<f64> {
246                x: best_individual,
247                fun: best_fitness,
248                success: true,
249                message: "Distributed GPU differential evolution completed".to_string(),
250                nit: max_nit,
251                nfev: total_evaluations,
252                ..OptimizeResults::default()
253            },
254            gpu_stats: crate::gpu::acceleration::PerformanceStats::default(),
255            distributed_stats: self.distributed_context.stats().clone(),
256            performance_stats: self.performance_stats.clone(),
257            total_time,
258        })
259    }
260
261    /// Initialize population on GPU
262    fn initialize_gpu_population(
263        &self,
264        pop_size: usize,
265        bounds: &[(f64, f64)],
266    ) -> ScirsResult<Array2<f64>> {
267        use scirs2_core::random::{Rng, RngExt};
268        let mut rng = scirs2_core::random::rng();
269
270        let dims = bounds.len();
271        let mut population = Array2::zeros((pop_size, dims));
272
273        for i in 0..pop_size {
274            for j in 0..dims {
275                let (low, high) = bounds[j];
276                population[[i, j]] = rng.random_range(low..=high);
277            }
278        }
279
280        Ok(population)
281    }
282
283    /// Evaluate population using GPU acceleration
284    fn evaluate_population_gpu<F>(
285        &mut self,
286        function: &F,
287        population: &Array2<f64>,
288    ) -> ScirsResult<Array1<f64>>
289    where
290        F: Fn(&ArrayView1<f64>) -> f64,
291    {
292        let pop_size = population.nrows();
293        let mut fitness = Array1::zeros(pop_size);
294
295        // Decide between GPU and CPU based on problem size and configuration
296        let use_gpu = pop_size >= 100 && self.config.gpu_cpu_load_balance > 0.5;
297
298        if use_gpu {
299            // Use GPU acceleration for function evaluation
300            self.performance_stats.gpu_evaluations += pop_size;
301
302            // For now, use CPU evaluation since GPU function interface needs adaptation
303            // TODO: Implement proper GPU function evaluation interface
304            for i in 0..pop_size {
305                fitness[i] = function(&population.row(i));
306            }
307        } else {
308            // Use CPU evaluation
309            self.performance_stats.cpu_evaluations += pop_size;
310            for i in 0..pop_size {
311                fitness[i] = function(&population.row(i));
312            }
313        }
314
315        Ok(fitness)
316    }
317
318    /// Perform GPU-accelerated mutation and crossover
319    fn gpu_mutation_crossover(
320        &self,
321        _kernel: &DifferentialEvolutionKernel,
322        population: &Array2<f64>,
323        f_scale: f64,
324        crossover_rate: f64,
325    ) -> ScirsResult<Array2<f64>> {
326        let (pop_size, dims) = population.dim();
327        let mut trial_population = Array2::zeros((pop_size, dims));
328
329        // For now, implement CPU-based mutation and crossover
330        // TODO: Use actual GPU kernels when properly implemented
331        use scirs2_core::random::{Rng, RngExt};
332        let mut rng = scirs2_core::random::rng();
333
334        for i in 0..pop_size {
335            // Select three random individuals different from current
336            let mut indices = Vec::new();
337            while indices.len() < 3 {
338                let idx = rng.random_range(0..pop_size);
339                if idx != i && !indices.contains(&idx) {
340                    indices.push(idx);
341                }
342            }
343
344            let a = indices[0];
345            let b = indices[1];
346            let c = indices[2];
347
348            // Mutation and crossover
349            let j_rand = rng.random_range(0..dims);
350            for j in 0..dims {
351                if rng.random_range(0.0..1.0) < crossover_rate || j == j_rand {
352                    trial_population[[i, j]] =
353                        population[[a, j]] + f_scale * (population[[b, j]] - population[[c, j]]);
354                } else {
355                    trial_population[[i, j]] = population[[i, j]];
356                }
357            }
358        }
359
360        Ok(trial_population)
361    }
362
363    /// Perform GPU-accelerated selection
364    fn gpu_selection(
365        &self,
366        _kernel: &DifferentialEvolutionKernel,
367        population: &mut Array2<f64>,
368        trial_population: &Array2<f64>,
369        fitness: &mut Array1<f64>,
370        trial_fitness: &Array1<f64>,
371    ) -> ScirsResult<()> {
372        // For now, implement CPU-based selection
373        // TODO: Use actual GPU kernels when properly implemented
374        for i in 0..population.nrows() {
375            if trial_fitness[i] <= fitness[i] {
376                for j in 0..population.ncols() {
377                    population[[i, j]] = trial_population[[i, j]];
378                }
379                fitness[i] = trial_fitness[i];
380            }
381        }
382
383        Ok(())
384    }
385
386    /// Find local best individual and fitness
387    fn find_local_best(&self, fitness: &Array1<f64>) -> ScirsResult<(usize, f64)> {
388        let mut best_idx = 0;
389        let mut best_fitness = fitness[0];
390
391        for (i, &f) in fitness.iter().enumerate() {
392            if f < best_fitness {
393                best_fitness = f;
394                best_idx = i;
395            }
396        }
397
398        Ok((best_idx, best_fitness))
399    }
400
401    /// Communicate best individuals across MPI processes
402    fn communicate_best_individuals(
403        &mut self,
404        local_best: &Array1<f64>,
405        local_best_fitness: f64,
406    ) -> ScirsResult<Option<(Array1<f64>, f64)>> {
407        if self.distributed_context.size() <= 1 {
408            return Ok(None);
409        }
410
411        // For simplicity, we'll use a basic approach
412        // In a full implementation, this would use MPI all-reduce operations
413        // to find the global best across all processes
414
415        // Placeholder implementation
416        Ok(Some((local_best.clone(), local_best_fitness)))
417    }
418
419    /// Perform GPU-to-GPU migration between processes
420    fn gpu_migration(
421        &mut self,
422        population: &mut Array2<f64>,
423        fitness: &mut Array1<f64>,
424    ) -> ScirsResult<()> {
425        match self.config.gpu_communication_strategy {
426            GpuCommunicationStrategy::Direct => {
427                // Use GPUDirect for direct GPU-to-GPU communication
428                self.gpu_direct_migration(population, fitness)
429            }
430            GpuCommunicationStrategy::Staged => {
431                // Stage through CPU memory
432                self.staged_migration(population, fitness)
433            }
434            GpuCommunicationStrategy::AsyncOverlapped => {
435                // Asynchronous communication with computation overlap
436                self.async_migration(population, fitness)
437            }
438            GpuCommunicationStrategy::Hierarchical => {
439                // Hierarchical intra-node GPU, inter-node MPI
440                self.hierarchical_migration(population, fitness)
441            }
442        }
443    }
444
445    /// Direct GPU-to-GPU migration using GPUDirect
446    fn gpu_direct_migration(
447        &mut self,
448        population: &mut Array2<f64>,
449        _fitness: &mut Array1<f64>,
450    ) -> ScirsResult<()> {
451        // Placeholder for GPUDirect implementation
452        // This would use CUDA-aware MPI or similar technology
453        Ok(())
454    }
455
456    /// Staged migration through CPU memory
457    fn staged_migration(
458        &mut self,
459        population: &mut Array2<f64>,
460        _fitness: &mut Array1<f64>,
461    ) -> ScirsResult<()> {
462        // Download from GPU, perform MPI communication, upload back to GPU
463        // This is less efficient but more compatible
464        Ok(())
465    }
466
467    /// Asynchronous migration with computation overlap
468    fn async_migration(
469        &mut self,
470        population: &mut Array2<f64>,
471        _fitness: &mut Array1<f64>,
472    ) -> ScirsResult<()> {
473        // Use asynchronous MPI operations to overlap communication with computation
474        Ok(())
475    }
476
477    /// Hierarchical migration (intra-node GPU, inter-node MPI)
478    fn hierarchical_migration(
479        &mut self,
480        population: &mut Array2<f64>,
481        _fitness: &mut Array1<f64>,
482    ) -> ScirsResult<()> {
483        // First migrate within node between GPUs, then between nodes via MPI
484        Ok(())
485    }
486
487    /// Check convergence criteria
488    fn check_convergence(&self, fitness: &Array1<f64>, iteration: usize) -> ScirsResult<bool> {
489        if fitness.len() < 2 {
490            return Ok(false);
491        }
492
493        let mean = fitness.view().mean();
494        let variance =
495            fitness.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / fitness.len() as f64;
496
497        let std_dev = variance.sqrt();
498
499        // Simple convergence criterion
500        Ok(std_dev < 1e-12 || iteration >= 1000)
501    }
502
503    /// Generate random indices for differential evolution mutation
504    fn generate_random_indices(&self, pop_size: usize) -> ScirsResult<Array2<i32>> {
505        use scirs2_core::random::{Rng, RngExt};
506        let mut rng = scirs2_core::random::rng();
507        let mut indices = Array2::zeros((pop_size, 3));
508
509        for i in 0..pop_size {
510            let mut selected = std::collections::HashSet::new();
511            selected.insert(i);
512
513            for j in 0..3 {
514                loop {
515                    let idx = rng.random_range(0..pop_size);
516                    if !selected.contains(&idx) {
517                        indices[[i, j]] = idx as i32;
518                        selected.insert(idx);
519                        break;
520                    }
521                }
522            }
523        }
524
525        Ok(indices)
526    }
527
528    /// Generate random values for crossover
529    fn generate_random_values(&self, count: usize) -> ScirsResult<Array1<f64>> {
530        use scirs2_core::random::{Rng, RngExt};
531        let mut rng = scirs2_core::random::rng();
532        let mut values = Array1::zeros(count);
533
534        for i in 0..count {
535            values[i] = rng.random_range(0.0..1.0);
536        }
537
538        Ok(values)
539    }
540
541    /// Generate j_rand values for crossover
542    fn generate_j_rand(&self, pop_size: usize, dims: usize) -> ScirsResult<Array1<i32>> {
543        use scirs2_core::random::{Rng, RngExt};
544        let mut rng = scirs2_core::random::rng();
545        let mut j_rand = Array1::zeros(pop_size);
546
547        for i in 0..pop_size {
548            j_rand[i] = rng.random_range(0..dims) as i32;
549        }
550
551        Ok(j_rand)
552    }
553
554    /// Get performance statistics
555    pub fn stats(&self) -> &DistributedGpuStats {
556        &self.performance_stats
557    }
558}
559
560/// Performance statistics for distributed GPU optimization
561#[derive(Debug, Clone)]
562pub struct DistributedGpuStats {
563    /// Total GPU function evaluations
564    pub gpu_evaluations: usize,
565    /// Total CPU function evaluations
566    pub cpu_evaluations: usize,
567    /// GPU utilization percentage
568    pub gpu_utilization: f64,
569    /// Communication overhead time
570    pub communication_time: f64,
571    /// GPU memory usage statistics
572    pub gpu_memory_usage: f64,
573    /// Iteration statistics
574    pub nit: Vec<IterationStats>,
575}
576
577impl DistributedGpuStats {
578    fn new() -> Self {
579        Self {
580            gpu_evaluations: 0,
581            cpu_evaluations: 0,
582            gpu_utilization: 0.0,
583            communication_time: 0.0,
584            gpu_memory_usage: 0.0,
585            nit: Vec::new(),
586        }
587    }
588
589    fn record_iteration(
590        &mut self,
591        iteration: usize,
592        pop_size: usize,
593        best_fitness: f64,
594        elapsed_time: f64,
595    ) {
596        self.nit.push(IterationStats {
597            iteration,
598            population_size: pop_size,
599            best_fitness,
600            elapsed_time,
601        });
602    }
603
604    /// Generate comprehensive performance report
605    pub fn generate_report(&self) -> String {
606        let mut report = String::from("Distributed GPU Optimization Performance Report\n");
607        report.push_str("==============================================\n\n");
608
609        report.push_str(&format!(
610            "GPU Function Evaluations: {}\n",
611            self.gpu_evaluations
612        ));
613        report.push_str(&format!(
614            "CPU Function Evaluations: {}\n",
615            self.cpu_evaluations
616        ));
617
618        let total_evaluations = self.gpu_evaluations + self.cpu_evaluations;
619        if total_evaluations > 0 {
620            let gpu_percentage = (self.gpu_evaluations as f64 / total_evaluations as f64) * 100.0;
621            report.push_str(&format!("GPU Usage: {:.1}%\n", gpu_percentage));
622        }
623
624        report.push_str(&format!(
625            "GPU Utilization: {:.1}%\n",
626            self.gpu_utilization * 100.0
627        ));
628        report.push_str(&format!(
629            "Communication Overhead: {:.3}s\n",
630            self.communication_time
631        ));
632        report.push_str(&format!(
633            "GPU Memory Usage: {:.1}%\n",
634            self.gpu_memory_usage * 100.0
635        ));
636
637        if let Some(last_iteration) = self.nit.last() {
638            report.push_str(&format!(
639                "Final Best Fitness: {:.6e}\n",
640                last_iteration.best_fitness
641            ));
642            report.push_str(&format!(
643                "Total Time: {:.3}s\n",
644                last_iteration.elapsed_time
645            ));
646        }
647
648        report
649    }
650}
651
652/// Statistics for individual iterations
653#[derive(Debug, Clone)]
654pub struct IterationStats {
655    pub iteration: usize,
656    pub population_size: usize,
657    pub best_fitness: f64,
658    pub elapsed_time: f64,
659}
660
661/// Results from distributed GPU optimization
662#[derive(Debug, Clone)]
663pub struct DistributedGpuResults {
664    /// Base optimization results
665    pub base_result: OptimizeResults<f64>,
666    /// GPU-specific performance statistics
667    pub gpu_stats: crate::gpu::acceleration::PerformanceStats,
668    /// Distributed computing statistics
669    pub distributed_stats: DistributedStats,
670    /// Combined performance statistics
671    pub performance_stats: DistributedGpuStats,
672    /// Total optimization time
673    pub total_time: f64,
674}
675
676impl DistributedGpuResults {
677    fn empty() -> Self {
678        Self {
679            base_result: OptimizeResults::<f64> {
680                x: Array1::zeros(0),
681                fun: 0.0,
682                success: false,
683                message: "No work assigned to this process".to_string(),
684                nit: 0,
685                nfev: 0,
686                ..OptimizeResults::default()
687            },
688            gpu_stats: crate::gpu::acceleration::PerformanceStats::default(),
689            distributed_stats: DistributedStats {
690                communication_time: 0.0,
691                computation_time: 0.0,
692                load_balance_ratio: 1.0,
693                synchronizations: 0,
694                bytes_transferred: 0,
695            },
696            performance_stats: DistributedGpuStats::new(),
697            total_time: 0.0,
698        }
699    }
700
701    /// Print comprehensive results summary
702    pub fn print_summary(&self) {
703        println!("Distributed GPU Optimization Results");
704        println!("===================================");
705        println!("Success: {}", self.base_result.success);
706        println!("Final function value: {:.6e}", self.base_result.fun);
707        println!("Iterations: {}", self.base_result.nit);
708        println!("Function evaluations: {}", self.base_result.nfev);
709        println!("Total time: {:.3}s", self.total_time);
710        println!();
711
712        println!("GPU Performance:");
713        println!("{}", self.gpu_stats.generate_report());
714        println!();
715
716        println!("Distributed Performance:");
717        println!("{}", self.distributed_stats.generate_report());
718        println!();
719
720        println!("Combined Performance:");
721        println!("{}", self.performance_stats.generate_report());
722    }
723}
724
725#[cfg(test)]
726mod tests {
727    use super::*;
728
729    #[test]
730    fn test_distributed_gpu_config() {
731        let config = DistributedGpuConfig::default();
732        assert!(config.use_tensor_cores);
733        assert_eq!(config.gpu_cpu_load_balance, 0.8);
734        assert_eq!(
735            config.gpu_communication_strategy,
736            GpuCommunicationStrategy::Direct
737        );
738    }
739
740    #[test]
741    fn test_gpu_communication_strategies() {
742        let strategies = [
743            GpuCommunicationStrategy::Direct,
744            GpuCommunicationStrategy::Staged,
745            GpuCommunicationStrategy::AsyncOverlapped,
746            GpuCommunicationStrategy::Hierarchical,
747        ];
748
749        for strategy in &strategies {
750            let mut config = DistributedGpuConfig::default();
751            config.gpu_communication_strategy = *strategy;
752            // Test that configuration is valid
753            assert_eq!(config.gpu_communication_strategy, *strategy);
754        }
755    }
756
757    #[test]
758    fn test_performance_stats() {
759        let mut stats = DistributedGpuStats::new();
760        stats.gpu_evaluations = 1000;
761        stats.cpu_evaluations = 200;
762        stats.gpu_utilization = 0.85;
763
764        let report = stats.generate_report();
765        assert!(report.contains("GPU Function Evaluations: 1000"));
766        assert!(report.contains("CPU Function Evaluations: 200"));
767        assert!(report.contains("GPU Usage: 83.3%")); // 1000/(1000+200) * 100
768    }
769
770    #[test]
771    #[ignore = "Requires MPI and GPU"]
772    fn test_distributed_gpu_optimization() {
773        // This would test the actual distributed GPU optimization
774        // Implementation depends on having both MPI and GPU available
775    }
776}