1use crate::error::{ScirsError, ScirsResult};
8use ndarray::{Array1, Array2, ArrayView1};
9use scirs2_core::gpu::{GpuBackend, GpuBuffer, GpuContext};
10use statrs::statistics::Statistics;
11use std::sync::Arc;
12
13pub type OptimGpuArray<T> = GpuBuffer<T>;
18
19pub mod acceleration;
20pub mod cuda_kernels;
21pub mod memory_management;
22pub mod tensor_core_optimization;
23
24#[derive(Clone)]
26pub struct GpuOptimizationConfig {
27 pub context: Arc<GpuContext>,
29 pub batch_size: usize,
31 pub memory_limit: Option<usize>,
33 pub use_tensor_cores: bool,
35 pub precision: GpuPrecision,
37 pub stream_count: usize,
39}
40
41impl Default for GpuOptimizationConfig {
42 fn default() -> Self {
43 Self {
44 context: Arc::new(GpuContext::new(GpuBackend::default()).unwrap_or_else(|_| {
45 GpuContext::new(GpuBackend::Cpu).expect("CPU backend should always work")
47 })),
48 batch_size: 1024,
49 memory_limit: None,
50 use_tensor_cores: true,
51 precision: GpuPrecision::F64,
52 stream_count: 4,
53 }
54 }
55}
56
57#[derive(Debug, Clone, Copy, PartialEq)]
59pub enum GpuPrecision {
60 F32,
61 F64,
62 Mixed, }
64
65pub trait GpuFunction {
67 fn evaluate_batch_gpu(&self, points: &OptimGpuArray<f64>) -> ScirsResult<OptimGpuArray<f64>>;
69
70 fn gradient_batch_gpu(&self, points: &OptimGpuArray<f64>) -> ScirsResult<OptimGpuArray<f64>>;
72
73 fn hessian_batch_gpu(&self, points: &OptimGpuArray<f64>) -> ScirsResult<OptimGpuArray<f64>> {
75 Err(ScirsError::NotImplementedError(
76 scirs2_core::error::ErrorContext::new("Hessian evaluation not implemented".to_string()),
77 ))
78 }
79
80 fn supports_gpu(&self) -> bool {
82 true
83 }
84}
85
86pub struct GpuOptimizationContext {
88 config: GpuOptimizationConfig,
89 context: Arc<GpuContext>,
90 memory_pool: memory_management::GpuMemoryPool,
91}
92
93impl GpuOptimizationContext {
94 pub fn new(config: GpuOptimizationConfig) -> ScirsResult<Self> {
96 let context = config.context.clone();
97 let memory_pool = memory_management::GpuMemoryPool::new_stub();
98
99 Ok(Self {
100 config,
101 context,
102 memory_pool,
103 })
104 }
105
106 pub fn config(&self) -> &GpuOptimizationConfig {
108 &self.config
109 }
110
111 pub fn context(&self) -> &Arc<GpuContext> {
113 &self.context
114 }
115
116 pub fn allocate_workspace(
118 &mut self,
119 size: usize,
120 ) -> ScirsResult<memory_management::GpuWorkspace> {
121 self.memory_pool.allocate_workspace(size)
122 }
123
124 pub fn transfer_to_gpu<T>(&self, data: &Array2<T>) -> ScirsResult<OptimGpuArray<T>>
126 where
127 T: Clone + Send + Sync + 'static + scirs2_core::GpuDataType,
128 {
129 let shape = data.dim();
131 let total_size = shape.0 * shape.1;
132 let flat_data = data.as_slice().unwrap();
133
134 Err(ScirsError::NotImplementedError(
138 scirs2_core::error::ErrorContext::new(
139 "GPU buffer creation not yet implemented".to_string(),
140 ),
141 ))
142 }
143
144 pub fn transfer_from_gpu<T>(&self, gpu_data: &OptimGpuArray<T>) -> ScirsResult<Array2<T>>
146 where
147 T: Clone + Send + Sync + Default + 'static + scirs2_core::GpuDataType,
148 {
149 let total_size = gpu_data.len();
153 let dims = (total_size as f64).sqrt() as usize;
154
155 let mut host_data = vec![T::default(); total_size];
157 gpu_data.copy_to_host(&mut host_data)?;
158
159 Array2::from_shape_vec((dims, dims), host_data).map_err(|e| {
161 ScirsError::ComputationError(scirs2_core::error::ErrorContext::new(format!(
162 "Shape error: {}",
163 e
164 )))
165 })
166 }
167
168 pub fn upload_array<T>(&self, data: &Array2<T>) -> ScirsResult<OptimGpuArray<T>>
170 where
171 T: Clone + Send + Sync + 'static + scirs2_core::GpuDataType,
172 {
173 self.transfer_to_gpu(data)
174 }
175
176 pub fn download_array<T>(&self, gpu_data: &OptimGpuArray<T>) -> ScirsResult<Array2<T>>
178 where
179 T: Clone + Send + Sync + Default + 'static + scirs2_core::GpuDataType,
180 {
181 self.transfer_from_gpu(gpu_data)
182 }
183
184 pub fn evaluate_function_batch<F>(
186 &self,
187 function: &F,
188 points: &Array2<f64>,
189 ) -> ScirsResult<Array1<f64>>
190 where
191 F: GpuFunction,
192 {
193 if !function.supports_gpu() {
194 return Err(ScirsError::InvalidInput(
195 scirs2_core::error::ErrorContext::new(
196 "Function does not support GPU acceleration".to_string(),
197 ),
198 ));
199 }
200
201 let gpu_points = self.transfer_to_gpu(points)?;
202 let gpu_results = function.evaluate_batch_gpu(&gpu_points)?;
203 let cpu_results = self.transfer_from_gpu(&gpu_results)?;
204
205 Ok(cpu_results.column(0).to_owned())
207 }
208
209 pub fn evaluate_gradient_batch<F>(
211 &self,
212 function: &F,
213 points: &Array2<f64>,
214 ) -> ScirsResult<Array2<f64>>
215 where
216 F: GpuFunction,
217 {
218 if !function.supports_gpu() {
219 return Err(ScirsError::InvalidInput(
220 scirs2_core::error::ErrorContext::new(
221 "Function does not support GPU acceleration".to_string(),
222 ),
223 ));
224 }
225
226 let gpu_points = self.transfer_to_gpu(points)?;
227 let gpu_gradients = function.gradient_batch_gpu(&gpu_points)?;
228 self.transfer_from_gpu(&gpu_gradients)
229 }
230
231 pub fn compute_gradient_finite_diff<F>(
233 &self,
234 function: &F,
235 x: &Array1<f64>,
236 h: f64,
237 ) -> ScirsResult<Array1<f64>>
238 where
239 F: Fn(&ArrayView1<f64>) -> f64,
240 {
241 let n = x.len();
242 let mut gradient = Array1::zeros(n);
243
244 for i in 0..n {
246 let mut x_plus = x.clone();
247 let mut x_minus = x.clone();
248 x_plus[i] += h;
249 x_minus[i] -= h;
250
251 gradient[i] = (function(&x_plus.view()) - function(&x_minus.view())) / (2.0 * h);
252 }
253
254 Ok(gradient)
255 }
256
257 pub fn compute_search_direction(&self, gradient: &Array1<f64>) -> ScirsResult<Array1<f64>> {
259 Ok(-gradient.clone())
261 }
262}
263
264pub mod algorithms {
266 use super::*;
267 use crate::result::OptimizeResults;
268
269 pub struct GpuDifferentialEvolution {
271 context: GpuOptimizationContext,
272 population_size: usize,
273 max_nit: usize,
274 f_scale: f64,
275 crossover_rate: f64,
276 }
277
278 impl GpuDifferentialEvolution {
279 pub fn new(
281 context: GpuOptimizationContext,
282 population_size: usize,
283 max_nit: usize,
284 ) -> Self {
285 Self {
286 context,
287 population_size,
288 max_nit,
289 f_scale: 0.8,
290 crossover_rate: 0.7,
291 }
292 }
293
294 pub fn with_f_scale(mut self, f_scale: f64) -> Self {
296 self.f_scale = f_scale;
297 self
298 }
299
300 pub fn with_crossover_rate(mut self, crossover_rate: f64) -> Self {
302 self.crossover_rate = crossover_rate;
303 self
304 }
305
306 pub fn optimize<F>(
308 &mut self,
309 function: &F,
310 bounds: &[(f64, f64)],
311 ) -> ScirsResult<OptimizeResults<f64>>
312 where
313 F: GpuFunction,
314 {
315 let dims = bounds.len();
316
317 let mut population = self.initialize_population_gpu(bounds)?;
319 let mut fitness = self.evaluate_population_gpu(function, &population)?;
320
321 let mut best_idx = 0;
322 let mut best_fitness = fitness[0];
323 for (i, &f) in fitness.iter().enumerate() {
324 if f < best_fitness {
325 best_fitness = f;
326 best_idx = i;
327 }
328 }
329
330 let mut function_evaluations = self.population_size;
331
332 for iteration in 0..self.max_nit {
333 let trial_population = self.generate_trial_population_gpu(&population)?;
335 let trial_fitness = self.evaluate_population_gpu(function, &trial_population)?;
336
337 self.selection_gpu(
339 &mut population,
340 &mut fitness,
341 &trial_population,
342 &trial_fitness,
343 )?;
344
345 function_evaluations += self.population_size;
346
347 for (i, &f) in fitness.iter().enumerate() {
349 if f < best_fitness {
350 best_fitness = f;
351 best_idx = i;
352 }
353 }
354
355 if iteration % 10 == 0 {
357 let fitness_std = self.calculate_fitness_std(&fitness);
358 if fitness_std < 1e-12 {
359 break;
360 }
361 }
362 }
363
364 let best_x = population.row(best_idx).to_owned();
366
367 Ok(OptimizeResults::<f64> {
368 x: best_x,
369 fun: best_fitness,
370 success: true,
371 message: "GPU differential evolution completed".to_string(),
372 nit: self.max_nit,
373 nfev: function_evaluations,
374 ..OptimizeResults::default()
375 })
376 }
377
378 fn initialize_population_gpu(&self, bounds: &[(f64, f64)]) -> ScirsResult<Array2<f64>> {
379 use rand::Rng;
380 let mut rng = rand::rng();
381
382 let dims = bounds.len();
383 let mut population = Array2::zeros((self.population_size, dims));
384
385 for i in 0..self.population_size {
386 for j in 0..dims {
387 let (low, high) = bounds[j];
388 population[[i, j]] = rng.gen_range(low..=high);
389 }
390 }
391
392 Ok(population)
393 }
394
395 fn evaluate_population_gpu<F>(
396 &self,
397 function: &F,
398 population: &Array2<f64>,
399 ) -> ScirsResult<Array1<f64>>
400 where
401 F: GpuFunction,
402 {
403 self.context.evaluate_function_batch(function, population)
404 }
405
406 fn generate_trial_population_gpu(
407 &self,
408 population: &Array2<f64>,
409 ) -> ScirsResult<Array2<f64>> {
410 use rand::Rng;
413 let mut rng = rand::rng();
414
415 let (pop_size, dims) = population.dim();
416 let mut trial_population = Array2::zeros((pop_size, dims));
417
418 for i in 0..pop_size {
419 let mut indices = Vec::new();
421 while indices.len() < 3 {
422 let idx = rng.gen_range(0..pop_size);
423 if idx != i && !indices.contains(&idx) {
424 indices.push(idx);
425 }
426 }
427
428 let [a, b, c] = [indices[0], indices[1], indices[2]];
429
430 let j_rand = rng.gen_range(0..dims);
432 for j in 0..dims {
433 if rng.gen_range(0.0..1.0) < self.crossover_rate || j == j_rand {
434 trial_population[[i, j]] = population[[a, j]]
435 + self.f_scale * (population[[b, j]] - population[[c, j]]);
436 } else {
437 trial_population[[i, j]] = population[[i, j]];
438 }
439 }
440 }
441
442 Ok(trial_population)
443 }
444
445 fn selection_gpu(
446 &self,
447 population: &mut Array2<f64>,
448 fitness: &mut Array1<f64>,
449 trial_population: &Array2<f64>,
450 trial_fitness: &Array1<f64>,
451 ) -> ScirsResult<()> {
452 for i in 0..population.nrows() {
453 if trial_fitness[i] <= fitness[i] {
454 for j in 0..population.ncols() {
455 population[[i, j]] = trial_population[[i, j]];
456 }
457 fitness[i] = trial_fitness[i];
458 }
459 }
460 Ok(())
461 }
462
463 fn calculate_fitness_std(&self, fitness: &Array1<f64>) -> f64 {
464 let mean = fitness.view().mean();
465 let variance =
466 fitness.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / fitness.len() as f64;
467 variance.sqrt()
468 }
469 }
470
471 pub struct GpuParticleSwarm {
473 context: GpuOptimizationContext,
474 swarm_size: usize,
475 max_nit: usize,
476 w: f64, c1: f64, c2: f64, }
480
481 impl GpuParticleSwarm {
482 pub fn new(context: GpuOptimizationContext, swarm_size: usize, max_nit: usize) -> Self {
484 Self {
485 context,
486 swarm_size,
487 max_nit,
488 w: 0.729,
489 c1: 1.49445,
490 c2: 1.49445,
491 }
492 }
493
494 pub fn with_parameters(mut self, w: f64, c1: f64, c2: f64) -> Self {
496 self.w = w;
497 self.c1 = c1;
498 self.c2 = c2;
499 self
500 }
501
502 pub fn optimize<F>(
504 &mut self,
505 function: &F,
506 bounds: &[(f64, f64)],
507 ) -> ScirsResult<OptimizeResults<f64>>
508 where
509 F: GpuFunction,
510 {
511 let dims = bounds.len();
512
513 let mut positions = self.initialize_positions_gpu(bounds)?;
515 let mut velocities = Array2::zeros((self.swarm_size, dims));
516 let mut personal_best = positions.clone();
517 let mut personal_best_fitness = self.evaluate_population_gpu(function, &positions)?;
518
519 let mut global_best_idx = 0;
521 let mut global_best_fitness = personal_best_fitness[0];
522 for (i, &fitness) in personal_best_fitness.iter().enumerate() {
523 if fitness < global_best_fitness {
524 global_best_fitness = fitness;
525 global_best_idx = i;
526 }
527 }
528 let mut global_best = personal_best.row(global_best_idx).to_owned();
529
530 let mut function_evaluations = self.swarm_size;
531
532 for iteration in 0..self.max_nit {
533 self.update_swarm_gpu(
535 &mut positions,
536 &mut velocities,
537 &personal_best,
538 &global_best,
539 bounds,
540 )?;
541
542 let fitness = self.evaluate_population_gpu(function, &positions)?;
544 function_evaluations += self.swarm_size;
545
546 for i in 0..self.swarm_size {
548 if fitness[i] < personal_best_fitness[i] {
549 personal_best_fitness[i] = fitness[i];
550 for j in 0..dims {
551 personal_best[[i, j]] = positions[[i, j]];
552 }
553
554 if fitness[i] < global_best_fitness {
556 global_best_fitness = fitness[i];
557 global_best = positions.row(i).to_owned();
558 }
559 }
560 }
561
562 if iteration % 10 == 0 {
564 let fitness_std = self.calculate_fitness_std(&personal_best_fitness);
565 if fitness_std < 1e-12 {
566 break;
567 }
568 }
569 }
570
571 Ok(OptimizeResults::<f64> {
572 x: global_best,
573 fun: global_best_fitness,
574 success: true,
575 message: "GPU particle swarm optimization completed".to_string(),
576 nit: self.max_nit,
577 nfev: function_evaluations,
578 ..OptimizeResults::default()
579 })
580 }
581
582 fn initialize_positions_gpu(&self, bounds: &[(f64, f64)]) -> ScirsResult<Array2<f64>> {
583 use rand::Rng;
584 let mut rng = rand::rng();
585
586 let dims = bounds.len();
587 let mut positions = Array2::zeros((self.swarm_size, dims));
588
589 for i in 0..self.swarm_size {
590 for j in 0..dims {
591 let (low, high) = bounds[j];
592 positions[[i, j]] = rng.gen_range(low..=high);
593 }
594 }
595
596 Ok(positions)
597 }
598
599 fn evaluate_population_gpu<F>(
600 &self,
601 function: &F,
602 population: &Array2<f64>,
603 ) -> ScirsResult<Array1<f64>>
604 where
605 F: GpuFunction,
606 {
607 self.context.evaluate_function_batch(function, population)
608 }
609
610 fn update_swarm_gpu(
611 &self,
612 positions: &mut Array2<f64>,
613 velocities: &mut Array2<f64>,
614 personal_best: &Array2<f64>,
615 global_best: &Array1<f64>,
616 bounds: &[(f64, f64)],
617 ) -> ScirsResult<()> {
618 use rand::Rng;
619 let mut rng = rand::rng();
620
621 let (swarm_size, dims) = positions.dim();
622
623 for i in 0..swarm_size {
624 for j in 0..dims {
625 let r1: f64 = rng.gen_range(0.0..1.0);
626 let r2: f64 = rng.gen_range(0.0..1.0);
627
628 velocities[[i, j]] = self.w * velocities[[i, j]]
630 + self.c1 * r1 * (personal_best[[i, j]] - positions[[i, j]])
631 + self.c2 * r2 * (global_best[j] - positions[[i, j]]);
632
633 positions[[i, j]] += velocities[[i, j]];
635
636 let (low, high) = bounds[j];
638 if positions[[i, j]] < low {
639 positions[[i, j]] = low;
640 velocities[[i, j]] = 0.0;
641 } else if positions[[i, j]] > high {
642 positions[[i, j]] = high;
643 velocities[[i, j]] = 0.0;
644 }
645 }
646 }
647
648 Ok(())
649 }
650
651 fn calculate_fitness_std(&self, fitness: &Array1<f64>) -> f64 {
652 let mean = fitness.view().mean();
653 let variance =
654 fitness.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / fitness.len() as f64;
655 variance.sqrt()
656 }
657 }
658}
659
660pub mod utils {
662 use super::*;
663
664 pub fn should_use_gpu(_problem_size: usize, batch_size: usize) -> bool {
666 _problem_size * batch_size > 10000
668 }
669
670 pub fn estimate_optimal_batch_size(
672 problem_dims: usize,
673 available_memory: usize,
674 precision: GpuPrecision,
675 ) -> usize {
676 let element_size = match precision {
677 GpuPrecision::F32 => 4,
678 GpuPrecision::F64 => 8,
679 GpuPrecision::Mixed => 6, };
681
682 let memory_per_point = problem_dims * element_size * 3; let batch_size = available_memory / memory_per_point;
684
685 batch_size.max(1).min(65536)
687 }
688
689 pub fn create_optimal_config(
691 problem_dims: usize,
692 expected_evaluations: usize,
693 ) -> ScirsResult<GpuOptimizationConfig> {
694 let context = GpuContext::new(scirs2_core::gpu::GpuBackend::Cuda)?;
695 let available_memory = 1024 * 1024 * 1024; let batch_size = estimate_optimal_batch_size(
698 problem_dims,
699 available_memory / 2, GpuPrecision::F64,
701 );
702
703 Ok(GpuOptimizationConfig {
704 context: Arc::new(context),
705 batch_size,
706 memory_limit: Some(available_memory / 2),
707 use_tensor_cores: true,
708 precision: GpuPrecision::F64,
709 stream_count: 4,
710 })
711 }
712}
713
714#[cfg(test)]
715mod tests {
716 use super::*;
717
718 #[test]
719 fn test_gpu_config_creation() {
720 let config = GpuOptimizationConfig::default();
721 assert_eq!(config.batch_size, 1024);
722 assert_eq!(config.precision, GpuPrecision::F64);
723 assert!(config.use_tensor_cores);
724 }
725
726 #[test]
727 fn test_optimal_batch_size_estimation() {
728 let batch_size = utils::estimate_optimal_batch_size(
729 10, 1024 * 1024, GpuPrecision::F64,
732 );
733 assert!(batch_size > 0);
734 assert!(batch_size <= 65536);
735 }
736
737 #[test]
738 fn test_gpu_usage_heuristic() {
739 assert!(!utils::should_use_gpu(10, 10)); assert!(utils::should_use_gpu(1000, 100)); assert!(utils::should_use_gpu(100, 1000)); }
743}