1use crate::error::{ClusteringError, Result};
11use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis};
12use scirs2_core::numeric::{Float, FromPrimitive};
13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::fmt;
16use std::time::{Duration, Instant};
17
18use super::core::{DeviceSelection, GpuBackend, GpuConfig, GpuContext, GpuDevice};
19use super::memory::{GpuMemoryBlock, GpuMemoryManager, MemoryStats, MemoryStrategy};
20
/// Memory-management policy used by `AdvancedGpuMemoryManager`.
///
/// The variant decides which allocation path `AdvancedGpuMemoryManager::allocate` takes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AdvancedMemoryStrategy {
    /// Clears the base manager's pools before allocating whenever the request would push
    /// the tracked usage past the available memory.
    Conservative,
    /// Allocates straight from the base manager without any pre-check.
    Aggressive,
    /// Takes the conservative path when memory pressure exceeds the manager's threshold
    /// and the aggressive path otherwise.
    Adaptive,
    /// Caps every allocation at `chunk_size`.
    Streaming {
        /// Upper bound for a single allocation (rendered in MB by `Display`, so presumably
        /// bytes — TODO confirm).
        chunk_size: usize,
    },
    /// Currently allocates directly from the base manager; no unified-memory specific
    /// handling is visible in this file.
    Unified,
    /// Delegates allocation to the base manager's pool.
    Pool {
        /// Pool size (rendered in MB by `Display`, so presumably bytes — TODO confirm).
        pool_size: usize,
    },
}
47
48impl Default for AdvancedMemoryStrategy {
49 fn default() -> Self {
50 AdvancedMemoryStrategy::Adaptive
51 }
52}
53
54impl fmt::Display for AdvancedMemoryStrategy {
55 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
56 match self {
57 AdvancedMemoryStrategy::Conservative => write!(f, "Conservative"),
58 AdvancedMemoryStrategy::Aggressive => write!(f, "Aggressive"),
59 AdvancedMemoryStrategy::Adaptive => write!(f, "Adaptive"),
60 AdvancedMemoryStrategy::Streaming { chunk_size } => {
61 write!(f, "Streaming({}MB)", chunk_size / (1024 * 1024))
62 }
63 AdvancedMemoryStrategy::Unified => write!(f, "Unified"),
64 AdvancedMemoryStrategy::Pool { pool_size } => {
65 write!(f, "Pool({}MB)", pool_size / (1024 * 1024))
66 }
67 }
68 }
69}
70
/// GPU memory manager that layers a selectable `AdvancedMemoryStrategy` and usage
/// statistics on top of a `GpuMemoryManager`.
#[derive(Debug)]
pub struct AdvancedGpuMemoryManager {
    /// Underlying manager that performs the actual allocations and deallocations.
    base_manager: GpuMemoryManager,
    /// Strategy that selects the allocation path.
    strategy: AdvancedMemoryStrategy,
    /// Memory budget used as the denominator of the memory-pressure ratio
    /// (presumably bytes — TODO confirm).
    available_memory: usize,
    /// One record per allocation attempt, successful or not.
    allocation_history: Vec<AllocationRecord>,
    /// Pressure ratio above which the adaptive strategy takes the conservative path.
    pressure_threshold: f64,
    /// Defragmentation flag; stored and exposed through accessors.
    /// NOTE(review): the allocation paths in this file never consult it.
    enable_defrag: bool,
    /// Running allocation statistics.
    usage_stats: MemoryUsageStats,
}
89
/// A single entry of the memory manager's allocation history.
#[derive(Debug, Clone)]
pub struct AllocationRecord {
    /// Size that was requested for the allocation.
    pub size: usize,
    /// Instant at which the record was created.
    pub timestamp: Instant,
    /// Optional duration associated with the allocation.
    /// NOTE(review): records created in this file always store `None`.
    pub duration: Option<Duration>,
    /// Whether the allocation attempt succeeded.
    pub success: bool,
}
102
/// Counters maintained by `AdvancedGpuMemoryManager` across allocations.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MemoryUsageStats {
    /// Number of allocation attempts (successful or failed).
    pub total_allocations: usize,
    /// Number of allocation attempts that succeeded.
    pub successful_allocations: usize,
    /// Number of allocation attempts that failed.
    pub failed_allocations: usize,
    /// Cumulative size of all successful allocations.
    pub total_bytes_allocated: usize,
    /// Size currently tracked as in use (increased on allocation, decreased on deallocation).
    pub current_bytes_in_use: usize,
    /// Highest value `current_bytes_in_use` has reached.
    pub peak_bytes_in_use: usize,
    /// `total_bytes_allocated` divided by the number of successful allocations.
    pub avg_allocation_size: f64,
    /// Fraction of allocation attempts that succeeded (successful / total).
    pub efficiency: f64,
}
123
124impl AdvancedGpuMemoryManager {
125 pub fn new(strategy: AdvancedMemoryStrategy, available_memory: usize) -> Self {
127 let alignment = 256; let max_pool_size = match strategy {
129 AdvancedMemoryStrategy::Pool { pool_size } => pool_size / (1024 * 1024),
130 _ => 100, };
132
133 Self {
134 base_manager: GpuMemoryManager::new(alignment, max_pool_size),
135 strategy,
136 available_memory,
137 allocation_history: Vec::new(),
138 pressure_threshold: 0.85,
139 enable_defrag: true,
140 usage_stats: MemoryUsageStats::default(),
141 }
142 }
143
144 pub fn allocate(&mut self, size: usize) -> Result<GpuMemoryBlock> {
146 self.usage_stats.total_allocations += 1;
147
148 let memory_pressure = self.calculate_memory_pressure();
150
151 let result = match self.strategy {
153 AdvancedMemoryStrategy::Conservative => self.allocate_conservative(size),
154 AdvancedMemoryStrategy::Aggressive => self.allocate_aggressive(size),
155 AdvancedMemoryStrategy::Adaptive => self.allocate_adaptive(size, memory_pressure),
156 AdvancedMemoryStrategy::Streaming { chunk_size } => {
157 self.allocate_streaming(size, chunk_size)
158 }
159 AdvancedMemoryStrategy::Unified => self.allocate_unified(size),
160 AdvancedMemoryStrategy::Pool { .. } => self.base_manager.allocate(size),
161 };
162
163 let success = result.is_ok();
165 self.allocation_history.push(AllocationRecord {
166 size,
167 timestamp: Instant::now(),
168 duration: None,
169 success,
170 });
171
172 if success {
173 self.usage_stats.successful_allocations += 1;
174 self.usage_stats.total_bytes_allocated += size;
175 self.usage_stats.current_bytes_in_use += size;
176 self.usage_stats.peak_bytes_in_use = self
177 .usage_stats
178 .peak_bytes_in_use
179 .max(self.usage_stats.current_bytes_in_use);
180 } else {
181 self.usage_stats.failed_allocations += 1;
182 }
183
184 self.update_efficiency();
185 result
186 }
187
188 pub fn deallocate(&mut self, block: GpuMemoryBlock) -> Result<()> {
190 let size = block.size;
191 self.base_manager.deallocate(block)?;
192 self.usage_stats.current_bytes_in_use =
193 self.usage_stats.current_bytes_in_use.saturating_sub(size);
194 Ok(())
195 }
196
197 fn allocate_conservative(&mut self, size: usize) -> Result<GpuMemoryBlock> {
199 if self.usage_stats.current_bytes_in_use + size > self.available_memory {
201 self.compact_memory()?;
203 }
204 self.base_manager.allocate(size)
205 }
206
207 fn allocate_aggressive(&mut self, size: usize) -> Result<GpuMemoryBlock> {
209 self.base_manager.allocate(size)
211 }
212
213 fn allocate_adaptive(&mut self, size: usize, memory_pressure: f64) -> Result<GpuMemoryBlock> {
215 if memory_pressure > self.pressure_threshold {
216 self.allocate_conservative(size)
218 } else {
219 self.allocate_aggressive(size)
221 }
222 }
223
224 fn allocate_streaming(&mut self, size: usize, chunk_size: usize) -> Result<GpuMemoryBlock> {
226 let actual_size = size.min(chunk_size);
227 self.base_manager.allocate(actual_size)
228 }
229
230 fn allocate_unified(&mut self, size: usize) -> Result<GpuMemoryBlock> {
232 self.base_manager.allocate(size)
234 }
235
236 fn calculate_memory_pressure(&self) -> f64 {
238 if self.available_memory == 0 {
239 return 1.0;
240 }
241 self.usage_stats.current_bytes_in_use as f64 / self.available_memory as f64
242 }
243
244 fn compact_memory(&mut self) -> Result<()> {
246 self.base_manager.clear_pools()
247 }
248
249 fn update_efficiency(&mut self) {
251 if self.usage_stats.total_allocations > 0 {
252 self.usage_stats.efficiency = self.usage_stats.successful_allocations as f64
253 / self.usage_stats.total_allocations as f64;
254 self.usage_stats.avg_allocation_size = self.usage_stats.total_bytes_allocated as f64
255 / self.usage_stats.successful_allocations.max(1) as f64;
256 }
257 }
258
259 pub fn get_stats(&self) -> &MemoryUsageStats {
261 &self.usage_stats
262 }
263
264 pub fn strategy(&self) -> AdvancedMemoryStrategy {
266 self.strategy
267 }
268
269 pub fn set_strategy(&mut self, strategy: AdvancedMemoryStrategy) {
271 self.strategy = strategy;
272 }
273
274 pub fn pressure_threshold(&self) -> f64 {
276 self.pressure_threshold
277 }
278
279 pub fn set_pressure_threshold(&mut self, threshold: f64) {
281 self.pressure_threshold = threshold.clamp(0.0, 1.0);
282 }
283
284 pub fn is_defrag_enabled(&self) -> bool {
286 self.enable_defrag
287 }
288
289 pub fn set_defrag_enabled(&mut self, enabled: bool) {
291 self.enable_defrag = enabled;
292 }
293}
294
/// Numeric precision requested for tensor-core style computation.
///
/// The parenthesised types below are the ones this file's `Display` implementation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PrecisionMode {
    /// Full precision (displayed as `f64`).
    Full,
    /// Single precision (displayed as `f32`).
    Single,
    /// Half precision (displayed as `f16`).
    Half,
    /// Mixed precision (displayed as `f16/f32`).
    Mixed,
    /// BFloat16 format.
    BFloat16,
    /// TensorFloat-32 format (displayed as `TF32`).
    TensorFloat32,
    /// Automatic choice; how the choice is made is not visible in this file.
    Auto,
}
317
318impl Default for PrecisionMode {
319 fn default() -> Self {
320 PrecisionMode::Auto
321 }
322}
323
324impl fmt::Display for PrecisionMode {
325 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326 match self {
327 PrecisionMode::Full => write!(f, "Full (f64)"),
328 PrecisionMode::Single => write!(f, "Single (f32)"),
329 PrecisionMode::Half => write!(f, "Half (f16)"),
330 PrecisionMode::Mixed => write!(f, "Mixed (f16/f32)"),
331 PrecisionMode::BFloat16 => write!(f, "BFloat16"),
332 PrecisionMode::TensorFloat32 => write!(f, "TF32"),
333 PrecisionMode::Auto => write!(f, "Auto"),
334 }
335 }
336}
337
/// User-facing tensor-core settings carried inside `GpuAccelerationConfig`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TensorCoreConfig {
    /// Whether tensor-core usage is requested.
    pub enabled: bool,
    /// Requested precision mode.
    pub precision: PrecisionMode,
    /// Tile dimensions as a triple (presumably M, N, K — TODO confirm).
    pub tile_size: (usize, usize, usize),
    /// Whether sparsity support is requested.
    pub use_sparsity: bool,
    /// Sparsity ratio (presumably a fraction in `[0, 1]` — TODO confirm).
    pub sparsity_ratio: f64,
    /// Whether automatic scaling is requested (presumably loss scaling — TODO confirm).
    pub auto_scale: bool,
    /// Loss-scale factor.
    pub loss_scale: f64,
}
356
357impl Default for TensorCoreConfig {
358 fn default() -> Self {
359 Self {
360 enabled: true,
361 precision: PrecisionMode::Auto,
362 tile_size: (16, 16, 16),
363 use_sparsity: false,
364 sparsity_ratio: 0.5,
365 auto_scale: true,
366 loss_scale: 1.0,
367 }
368 }
369}
370
/// Tensor-core capabilities reported for a device by `detect_tensor_core_capabilities`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TensorCoreCapabilities {
    /// Whether tensor-core style hardware is reported as available.
    pub available: bool,
    /// Precision modes reported as supported.
    pub supported_precisions: Vec<PrecisionMode>,
    /// Tile dimension triples reported as supported.
    pub supported_tile_sizes: Vec<(usize, usize, usize)>,
    /// Whether sparsity is reported as supported.
    pub supports_sparsity: bool,
    /// Peak throughput figure, if known (presumably tera-operations per second — TODO confirm).
    pub peak_tops: Option<f64>,
    /// Human-readable architecture label.
    pub architecture: String,
}
387
388impl Default for TensorCoreCapabilities {
389 fn default() -> Self {
390 Self {
391 available: false,
392 supported_precisions: vec![PrecisionMode::Single],
393 supported_tile_sizes: vec![(16, 16, 16)],
394 supports_sparsity: false,
395 peak_tops: None,
396 architecture: "Unknown".to_string(),
397 }
398 }
399}
400
401pub fn detect_tensor_core_capabilities(device: &GpuDevice) -> TensorCoreCapabilities {
403 match device.backend {
404 GpuBackend::Cuda => {
405 TensorCoreCapabilities {
407 available: true,
408 supported_precisions: vec![
409 PrecisionMode::Half,
410 PrecisionMode::Mixed,
411 PrecisionMode::BFloat16,
412 PrecisionMode::TensorFloat32,
413 ],
414 supported_tile_sizes: vec![(16, 16, 16), (32, 8, 16), (8, 32, 16)],
415 supports_sparsity: true, peak_tops: Some(312.0), architecture: "NVIDIA Tensor Cores".to_string(),
418 }
419 }
420 GpuBackend::Rocm => {
421 TensorCoreCapabilities {
423 available: true,
424 supported_precisions: vec![
425 PrecisionMode::Half,
426 PrecisionMode::Mixed,
427 PrecisionMode::BFloat16,
428 ],
429 supported_tile_sizes: vec![(32, 32, 8), (16, 16, 16)],
430 supports_sparsity: false,
431 peak_tops: Some(383.0), architecture: "AMD Matrix Cores".to_string(),
433 }
434 }
435 GpuBackend::Metal => {
436 TensorCoreCapabilities {
438 available: true,
439 supported_precisions: vec![PrecisionMode::Half, PrecisionMode::Single],
440 supported_tile_sizes: vec![(16, 16, 16)],
441 supports_sparsity: false,
442 peak_tops: Some(15.8), architecture: "Apple Neural Engine".to_string(),
444 }
445 }
446 _ => TensorCoreCapabilities::default(),
447 }
448}
449
/// Device-selection policy used by `DeviceSelector`.
///
/// The descriptions below reflect what `DeviceSelector::select_device` does in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AdvancedDeviceSelection {
    /// The first registered device.
    First,
    /// The device with the largest `available_memory`.
    MostMemory,
    /// The device with the largest `compute_units`.
    HighestCompute,
    /// The device whose `device_id` equals the given value.
    Specific(u32),
    /// The device with the highest `get_device_score()`.
    Auto,
    /// The device with the lowest benchmarked `kmeans_time_ms`.
    Fastest,
    /// The device with the highest benchmarked `distance_throughput` per unit of power.
    MostEfficient,
    /// Cycles through the registered devices in order.
    RoundRobin,
    /// The device with the lowest recorded utilization.
    LoadBalanced,
    /// Multi-GPU mode; `select_device` currently returns only the first device.
    MultiGpu {
        /// Maximum number of GPUs to use; `0` makes `select_device` return `None`.
        max_gpus: usize,
    },
}
481
482impl Default for AdvancedDeviceSelection {
483 fn default() -> Self {
484 AdvancedDeviceSelection::Auto
485 }
486}
487
488impl From<AdvancedDeviceSelection> for DeviceSelection {
489 fn from(adv: AdvancedDeviceSelection) -> Self {
490 match adv {
491 AdvancedDeviceSelection::First => DeviceSelection::First,
492 AdvancedDeviceSelection::MostMemory => DeviceSelection::MostMemory,
493 AdvancedDeviceSelection::HighestCompute => DeviceSelection::HighestCompute,
494 AdvancedDeviceSelection::Specific(id) => DeviceSelection::Specific(id),
495 AdvancedDeviceSelection::Auto => DeviceSelection::Auto,
496 AdvancedDeviceSelection::Fastest => DeviceSelection::Fastest,
497 _ => DeviceSelection::Auto, }
499 }
500}
501
/// Chooses among registered GPU devices according to an `AdvancedDeviceSelection` policy.
#[derive(Debug)]
pub struct DeviceSelector {
    /// Registered devices, in registration order.
    devices: Vec<GpuDevice>,
    /// Active selection policy.
    strategy: AdvancedDeviceSelection,
    /// Utilization per device id; values stored through `update_utilization` are clamped
    /// to `[0.0, 1.0]`.
    utilization: HashMap<u32, f64>,
    /// Counter driving the round-robin policy.
    round_robin_idx: usize,
    /// Latest benchmark per device id.
    benchmarks: HashMap<u32, DeviceBenchmark>,
}
516
/// Benchmark figures recorded for one device.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceBenchmark {
    /// Id of the benchmarked device.
    pub device_id: u32,
    /// Distance-computation throughput (units not specified in this file).
    pub distance_throughput: f64,
    /// K-means benchmark time in milliseconds; lower is treated as faster.
    pub kmeans_time_ms: f64,
    /// Memory bandwidth (units not specified in this file).
    pub memory_bandwidth: f64,
    /// Power consumption, if measured (units not specified in this file).
    pub power_consumption: Option<f64>,
    /// Timestamp stored with the benchmark (presumably when it was taken — TODO confirm).
    pub timestamp: std::time::SystemTime,
}
533
534impl DeviceSelector {
535 pub fn new(strategy: AdvancedDeviceSelection) -> Self {
537 Self {
538 devices: Vec::new(),
539 strategy,
540 utilization: HashMap::new(),
541 round_robin_idx: 0,
542 benchmarks: HashMap::new(),
543 }
544 }
545
546 pub fn add_device(&mut self, device: GpuDevice) {
548 self.utilization.insert(device.device_id, 0.0);
549 self.devices.push(device);
550 }
551
552 pub fn select_device(&mut self) -> Option<&GpuDevice> {
554 if self.devices.is_empty() {
555 return None;
556 }
557
558 match &self.strategy {
559 AdvancedDeviceSelection::First => self.devices.first(),
560 AdvancedDeviceSelection::MostMemory => {
561 self.devices.iter().max_by_key(|d| d.available_memory)
562 }
563 AdvancedDeviceSelection::HighestCompute => {
564 self.devices.iter().max_by_key(|d| d.compute_units)
565 }
566 AdvancedDeviceSelection::Specific(id) => {
567 self.devices.iter().find(|d| d.device_id == *id)
568 }
569 AdvancedDeviceSelection::Auto => {
570 self.devices.iter().max_by(|a, b| {
572 a.get_device_score()
573 .partial_cmp(&b.get_device_score())
574 .unwrap_or(std::cmp::Ordering::Equal)
575 })
576 }
577 AdvancedDeviceSelection::Fastest => {
578 if self.benchmarks.is_empty() {
580 self.devices.first()
581 } else {
582 let fastest_id = self
583 .benchmarks
584 .iter()
585 .min_by(|a, b| {
586 a.1.kmeans_time_ms
587 .partial_cmp(&b.1.kmeans_time_ms)
588 .unwrap_or(std::cmp::Ordering::Equal)
589 })
590 .map(|(id, _)| *id);
591
592 fastest_id.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
593 }
594 }
595 AdvancedDeviceSelection::MostEfficient => {
596 if self.benchmarks.is_empty() {
598 self.devices.first()
599 } else {
600 let most_efficient_id = self
601 .benchmarks
602 .iter()
603 .filter_map(|(id, bench)| {
604 bench.power_consumption.map(|power| {
605 let efficiency = bench.distance_throughput / power;
606 (*id, efficiency)
607 })
608 })
609 .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
610 .map(|(id, _)| id);
611
612 most_efficient_id.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
613 }
614 }
615 AdvancedDeviceSelection::RoundRobin => {
616 let idx = self.round_robin_idx % self.devices.len();
617 self.round_robin_idx += 1;
618 self.devices.get(idx)
619 }
620 AdvancedDeviceSelection::LoadBalanced => {
621 let least_utilized = self
623 .utilization
624 .iter()
625 .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
626 .map(|(id, _)| *id);
627
628 least_utilized.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
629 }
630 AdvancedDeviceSelection::MultiGpu { max_gpus } => {
631 self.devices.iter().take(*max_gpus).next()
633 }
634 }
635 }
636
637 pub fn update_utilization(&mut self, device_id: u32, utilization: f64) {
639 self.utilization
640 .insert(device_id, utilization.clamp(0.0, 1.0));
641 }
642
643 pub fn add_benchmark(&mut self, benchmark: DeviceBenchmark) {
645 self.benchmarks.insert(benchmark.device_id, benchmark);
646 }
647
648 pub fn devices(&self) -> &[GpuDevice] {
650 &self.devices
651 }
652
653 pub fn strategy(&self) -> &AdvancedDeviceSelection {
655 &self.strategy
656 }
657
658 pub fn set_strategy(&mut self, strategy: AdvancedDeviceSelection) {
660 self.strategy = strategy;
661 }
662}
663
/// Top-level configuration for GPU-accelerated clustering.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuAccelerationConfig {
    /// Master switch; when `false` no GPU context is created and the CPU path is used.
    pub enabled: bool,
    /// Backend to use; `GpuBackend::CpuFallback` also disables GPU context creation.
    pub backend: GpuBackend,
    /// Device-selection policy.
    pub device_selection: AdvancedDeviceSelection,
    /// Memory-management strategy handed to the memory manager.
    pub memory_strategy: AdvancedMemoryStrategy,
    /// Tensor-core settings.
    pub tensor_cores: TensorCoreConfig,
    /// When `true`, a failed GPU context creation falls back to the CPU instead of erroring.
    pub auto_fallback: bool,
    /// Minimum `n_samples * n_features` for the GPU path to be considered.
    pub min_problem_size: usize,
    /// Tile size setting (not read by the code in this file).
    pub tile_size: usize,
    /// Asynchronous-execution setting (not read by the code in this file).
    pub async_execution: bool,
    /// Number-of-streams setting (not read by the code in this file).
    pub num_streams: usize,
    /// Profiling setting (not read by the code in this file).
    pub enable_profiling: bool,
    /// Kernel optimisation switches.
    pub kernel_optimizations: KernelOptimizations,
}
696
697impl Default for GpuAccelerationConfig {
698 fn default() -> Self {
699 Self {
700 enabled: true,
701 backend: GpuBackend::CpuFallback,
702 device_selection: AdvancedDeviceSelection::Auto,
703 memory_strategy: AdvancedMemoryStrategy::Adaptive,
704 tensor_cores: TensorCoreConfig::default(),
705 auto_fallback: true,
706 min_problem_size: 1000,
707 tile_size: 256,
708 async_execution: true,
709 num_streams: 4,
710 enable_profiling: false,
711 kernel_optimizations: KernelOptimizations::default(),
712 }
713 }
714}
715
716impl GpuAccelerationConfig {
717 pub fn cuda() -> Self {
719 Self {
720 backend: GpuBackend::Cuda,
721 tensor_cores: TensorCoreConfig {
722 enabled: true,
723 precision: PrecisionMode::Mixed,
724 ..Default::default()
725 },
726 ..Default::default()
727 }
728 }
729
730 pub fn opencl() -> Self {
732 Self {
733 backend: GpuBackend::OpenCl,
734 tensor_cores: TensorCoreConfig {
735 enabled: false,
736 precision: PrecisionMode::Single,
737 ..Default::default()
738 },
739 ..Default::default()
740 }
741 }
742
743 pub fn rocm() -> Self {
745 Self {
746 backend: GpuBackend::Rocm,
747 tensor_cores: TensorCoreConfig {
748 enabled: true,
749 precision: PrecisionMode::Mixed,
750 ..Default::default()
751 },
752 ..Default::default()
753 }
754 }
755
756 pub fn metal() -> Self {
758 Self {
759 backend: GpuBackend::Metal,
760 tensor_cores: TensorCoreConfig {
761 enabled: true,
762 precision: PrecisionMode::Half,
763 ..Default::default()
764 },
765 ..Default::default()
766 }
767 }
768
769 pub fn cpu() -> Self {
771 Self {
772 enabled: false,
773 backend: GpuBackend::CpuFallback,
774 ..Default::default()
775 }
776 }
777
778 pub fn to_basic_config(&self) -> GpuConfig {
780 GpuConfig {
781 preferred_backend: self.backend,
782 device_selection: self.device_selection.clone().into(),
783 auto_fallback: self.auto_fallback,
784 memory_pool_size: match self.memory_strategy {
785 AdvancedMemoryStrategy::Pool { pool_size } => Some(pool_size),
786 _ => None,
787 },
788 optimize_memory: true,
789 backend_options: HashMap::new(),
790 }
791 }
792}
793
/// Switches for kernel-level optimisations.
///
/// NOTE(review): none of these fields is read by the code visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KernelOptimizations {
    /// Loop-unrolling switch.
    pub loop_unrolling: bool,
    /// Shared-memory tiling switch.
    pub shared_memory_tiling: bool,
    /// Register-blocking switch.
    pub register_blocking: bool,
    /// Vectorized-loads switch.
    pub vectorized_loads: bool,
    /// Texture-memory switch.
    pub texture_memory: bool,
    /// Constant-memory switch.
    pub constant_memory: bool,
    /// Occupancy level (valid range not specified in this file; the default is 2).
    pub occupancy_level: u8,
}
812
813impl Default for KernelOptimizations {
814 fn default() -> Self {
815 Self {
816 loop_unrolling: true,
817 shared_memory_tiling: true,
818 register_blocking: true,
819 vectorized_loads: true,
820 texture_memory: false,
821 constant_memory: true,
822 occupancy_level: 2,
823 }
824 }
825}
826
/// K-means clustering with optional GPU acceleration and a CPU fallback.
#[derive(Debug)]
pub struct GpuKMeans<F: Float> {
    /// Configuration the instance was created with.
    config: GpuAccelerationConfig,
    /// GPU context; `None` when GPU usage is disabled or context creation fell back to CPU.
    context: Option<GpuContext>,
    /// Memory manager sized from the context's device memory (1 GiB when there is no context).
    memory_manager: AdvancedGpuMemoryManager,
    /// Device selector built from the configured selection policy.
    device_selector: DeviceSelector,
    /// Tensor-core capabilities detected while creating the context.
    tensor_caps: TensorCoreCapabilities,
    /// Whether a GPU context was created successfully.
    gpu_available: bool,
    /// Profiling records.
    /// NOTE(review): nothing in this file appends to this list.
    profiling_data: Vec<ProfilingRecord>,
    /// Ties the element type `F` to the struct without storing a value of it.
    _phantom: std::marker::PhantomData<F>,
}
851
/// One profiling measurement.
#[derive(Debug, Clone)]
pub struct ProfilingRecord {
    /// Name of the profiled operation.
    pub operation: String,
    /// Duration in microseconds.
    pub duration_us: u64,
    /// Amount of memory transferred (presumably bytes — TODO confirm).
    pub memory_transferred: usize,
    /// Number of compute operations.
    pub compute_ops: usize,
    /// Instant associated with the record.
    pub timestamp: Instant,
}
866
/// Outcome of a `GpuKMeans::fit` run.
#[derive(Debug, Clone)]
pub struct GpuKMeansResult<F: Float> {
    /// Cluster centroids, one row per cluster (`k` rows, `n_features` columns).
    pub centroids: Array2<F>,
    /// Cluster index assigned to each sample.
    pub labels: Array1<usize>,
    /// Sum of squared distances from each sample to its assigned centroid.
    pub inertia: F,
    /// Number of iterations that were executed.
    pub n_iterations: usize,
    /// Whether a convergence criterion was met before `max_iter` was exhausted.
    pub converged: bool,
    /// Timing and resource metrics of the run.
    pub metrics: KMeansMetrics,
}
883
/// Timing and resource metrics of a K-means run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KMeansMetrics {
    /// Wall-clock time of the whole fit, in milliseconds.
    pub total_time_ms: f64,
    /// Time attributed to distance computation, in milliseconds.
    pub distance_time_ms: f64,
    /// Time attributed to centroid updates, in milliseconds.
    pub centroid_update_time_ms: f64,
    /// Time attributed to label assignment, in milliseconds.
    pub label_assignment_time_ms: f64,
    /// Time attributed to data transfers, in milliseconds.
    pub transfer_time_ms: f64,
    /// Whether the GPU code path was taken.
    pub used_gpu: bool,
    /// Name of the backend that ran the fit (`"CPU"` on the CPU path).
    pub backend: String,
    /// Memory in use as reported by the memory manager (0 on the CPU path).
    pub memory_used: usize,
    /// Samples processed per second of total time.
    pub throughput: f64,
}
906
907impl<F: Float + FromPrimitive + Send + Sync + 'static> GpuKMeans<F> {
908 pub fn new(config: GpuAccelerationConfig) -> Result<Self> {
910 let device_selector = DeviceSelector::new(config.device_selection.clone());
911
912 let (context, gpu_available, tensor_caps) = Self::try_create_context(&config)?;
914
915 let available_memory = context
916 .as_ref()
917 .map(|ctx| ctx.device.available_memory)
918 .unwrap_or(1024 * 1024 * 1024); let memory_manager =
921 AdvancedGpuMemoryManager::new(config.memory_strategy, available_memory);
922
923 Ok(Self {
924 config,
925 context,
926 memory_manager,
927 device_selector,
928 tensor_caps,
929 gpu_available,
930 profiling_data: Vec::new(),
931 _phantom: std::marker::PhantomData,
932 })
933 }
934
935 fn try_create_context(
937 config: &GpuAccelerationConfig,
938 ) -> Result<(Option<GpuContext>, bool, TensorCoreCapabilities)> {
939 if !config.enabled || config.backend == GpuBackend::CpuFallback {
940 return Ok((None, false, TensorCoreCapabilities::default()));
941 }
942
943 let device = GpuDevice::new(
945 0,
946 format!("{} Device", config.backend),
947 8_000_000_000,
948 6_000_000_000,
949 "1.0".to_string(),
950 1024,
951 config.backend,
952 true,
953 );
954
955 let tensor_caps = detect_tensor_core_capabilities(&device);
956 let basic_config = config.to_basic_config();
957
958 match GpuContext::new(device.clone(), basic_config) {
959 Ok(ctx) => Ok((Some(ctx), true, tensor_caps)),
960 Err(_) if config.auto_fallback => Ok((None, false, TensorCoreCapabilities::default())),
961 Err(e) => Err(e),
962 }
963 }
964
965 pub fn fit(
967 &mut self,
968 data: ArrayView2<F>,
969 k: usize,
970 max_iter: usize,
971 tol: F,
972 ) -> Result<GpuKMeansResult<F>> {
973 let start_time = Instant::now();
974 let n_samples = data.nrows();
975 let n_features = data.ncols();
976
977 if k == 0 || k > n_samples {
979 return Err(ClusteringError::InvalidInput(format!(
980 "k must be between 1 and n_samples ({}), got {}",
981 n_samples, k
982 )));
983 }
984
985 let use_gpu = self.should_use_gpu(n_samples, n_features);
987
988 if use_gpu && self.gpu_available {
989 self.fit_gpu(data, k, max_iter, tol, start_time)
990 } else {
991 self.fit_cpu(data, k, max_iter, tol, start_time)
992 }
993 }
994
995 fn should_use_gpu(&self, n_samples: usize, n_features: usize) -> bool {
997 let problem_size = n_samples * n_features;
998 problem_size >= self.config.min_problem_size && self.config.enabled
999 }
1000
1001 fn fit_gpu(
1003 &mut self,
1004 data: ArrayView2<F>,
1005 k: usize,
1006 max_iter: usize,
1007 tol: F,
1008 start_time: Instant,
1009 ) -> Result<GpuKMeansResult<F>> {
1010 let n_samples = data.nrows();
1011 let n_features = data.ncols();
1012
1013 let mut centroids = self.initialize_centroids_gpu(data, k)?;
1015 let mut labels = Array1::zeros(n_samples);
1016 let mut inertia = F::infinity();
1017 let mut converged = false;
1018 let mut n_iterations = 0;
1019
1020 let mut distance_time = Duration::ZERO;
1021 let mut centroid_time = Duration::ZERO;
1022 let mut label_time = Duration::ZERO;
1023
1024 for iter in 0..max_iter {
1026 n_iterations = iter + 1;
1027
1028 let label_start = Instant::now();
1030 let (new_labels, distances) = self.compute_labels_gpu(data, centroids.view())?;
1031 labels = new_labels;
1032 label_time += label_start.elapsed();
1033
1034 let centroid_start = Instant::now();
1036 let new_centroids = self.compute_centroids_gpu(data, &labels, k)?;
1037 centroid_time += centroid_start.elapsed();
1038
1039 let new_inertia = self.compute_inertia(&distances);
1041 let centroid_shift =
1042 self.compute_centroid_shift(centroids.view(), new_centroids.view());
1043
1044 centroids = new_centroids;
1045
1046 if centroid_shift <= tol
1047 || (inertia - new_inertia).abs() < tol * F::from(0.01).unwrap_or(tol)
1048 {
1049 converged = true;
1050 inertia = new_inertia;
1051 break;
1052 }
1053
1054 inertia = new_inertia;
1055 }
1056
1057 let total_time = start_time.elapsed();
1058
1059 let metrics = KMeansMetrics {
1060 total_time_ms: total_time.as_secs_f64() * 1000.0,
1061 distance_time_ms: distance_time.as_secs_f64() * 1000.0,
1062 centroid_update_time_ms: centroid_time.as_secs_f64() * 1000.0,
1063 label_assignment_time_ms: label_time.as_secs_f64() * 1000.0,
1064 transfer_time_ms: 0.0, used_gpu: true,
1066 backend: format!("{}", self.config.backend),
1067 memory_used: self.memory_manager.get_stats().current_bytes_in_use,
1068 throughput: n_samples as f64 / total_time.as_secs_f64(),
1069 };
1070
1071 Ok(GpuKMeansResult {
1072 centroids,
1073 labels,
1074 inertia,
1075 n_iterations,
1076 converged,
1077 metrics,
1078 })
1079 }
1080
1081 fn fit_cpu(
1083 &self,
1084 data: ArrayView2<F>,
1085 k: usize,
1086 max_iter: usize,
1087 tol: F,
1088 start_time: Instant,
1089 ) -> Result<GpuKMeansResult<F>> {
1090 let n_samples = data.nrows();
1091 let n_features = data.ncols();
1092
1093 let mut centroids = self.initialize_centroids_cpu(data, k)?;
1095 let mut labels = Array1::zeros(n_samples);
1096 let mut inertia = F::infinity();
1097 let mut converged = false;
1098 let mut n_iterations = 0;
1099
1100 for iter in 0..max_iter {
1102 n_iterations = iter + 1;
1103
1104 let (new_labels, distances) = self.assign_labels_cpu(data, centroids.view())?;
1106 labels = new_labels;
1107
1108 let new_centroids = self.update_centroids_cpu(data, &labels, k, n_features)?;
1110
1111 let new_inertia = self.compute_inertia(&distances);
1113 let centroid_shift =
1114 self.compute_centroid_shift(centroids.view(), new_centroids.view());
1115
1116 centroids = new_centroids;
1117
1118 if centroid_shift <= tol {
1119 converged = true;
1120 inertia = new_inertia;
1121 break;
1122 }
1123
1124 inertia = new_inertia;
1125 }
1126
1127 let total_time = start_time.elapsed();
1128
1129 let metrics = KMeansMetrics {
1130 total_time_ms: total_time.as_secs_f64() * 1000.0,
1131 distance_time_ms: 0.0,
1132 centroid_update_time_ms: 0.0,
1133 label_assignment_time_ms: 0.0,
1134 transfer_time_ms: 0.0,
1135 used_gpu: false,
1136 backend: "CPU".to_string(),
1137 memory_used: 0,
1138 throughput: n_samples as f64 / total_time.as_secs_f64(),
1139 };
1140
1141 Ok(GpuKMeansResult {
1142 centroids,
1143 labels,
1144 inertia,
1145 n_iterations,
1146 converged,
1147 metrics,
1148 })
1149 }
1150
1151 fn initialize_centroids_gpu(&self, data: ArrayView2<F>, k: usize) -> Result<Array2<F>> {
1153 self.initialize_centroids_cpu(data, k)
1155 }
1156
1157 fn initialize_centroids_cpu(&self, data: ArrayView2<F>, k: usize) -> Result<Array2<F>> {
1159 let n_samples = data.nrows();
1160 let n_features = data.ncols();
1161 let mut centroids = Array2::zeros((k, n_features));
1162 let mut rng = scirs2_core::random::rng();
1163
1164 let first_idx = scirs2_core::random::RngExt::random_range(&mut rng, 0..n_samples);
1166 for j in 0..n_features {
1167 centroids[[0, j]] = data[[first_idx, j]];
1168 }
1169
1170 if k == 1 {
1171 return Ok(centroids);
1172 }
1173
1174 let mut min_distances = Array1::from_elem(n_samples, F::infinity());
1176
1177 for i in 1..k {
1178 for sample_idx in 0..n_samples {
1180 let dist =
1181 self.euclidean_distance_squared(data.row(sample_idx), centroids.row(i - 1));
1182 if dist < min_distances[sample_idx] {
1183 min_distances[sample_idx] = dist;
1184 }
1185 }
1186
1187 let sum_distances: F = min_distances.iter().copied().fold(F::zero(), |a, b| a + b);
1189 if sum_distances <= F::zero() {
1190 let idx = scirs2_core::random::RngExt::random_range(&mut rng, 0..n_samples);
1192 for j in 0..n_features {
1193 centroids[[i, j]] = data[[idx, j]];
1194 }
1195 continue;
1196 }
1197
1198 let threshold = F::from(scirs2_core::random::RngExt::random_range(
1200 &mut rng,
1201 0.0..1.0,
1202 ))
1203 .unwrap_or(F::zero())
1204 * sum_distances;
1205 let mut cumsum = F::zero();
1206 let mut next_idx = 0;
1207
1208 for (idx, &dist) in min_distances.iter().enumerate() {
1209 cumsum = cumsum + dist;
1210 if cumsum >= threshold {
1211 next_idx = idx;
1212 break;
1213 }
1214 }
1215
1216 for j in 0..n_features {
1217 centroids[[i, j]] = data[[next_idx, j]];
1218 }
1219 }
1220
1221 Ok(centroids)
1222 }
1223
1224 fn compute_labels_gpu(
1226 &self,
1227 data: ArrayView2<F>,
1228 centroids: ArrayView2<F>,
1229 ) -> Result<(Array1<usize>, Array1<F>)> {
1230 self.assign_labels_cpu(data, centroids)
1233 }
1234
1235 fn compute_centroids_gpu(
1237 &self,
1238 data: ArrayView2<F>,
1239 labels: &Array1<usize>,
1240 k: usize,
1241 ) -> Result<Array2<F>> {
1242 let n_features = data.ncols();
1243 self.update_centroids_cpu(data, labels, k, n_features)
1244 }
1245
1246 fn assign_labels_cpu(
1248 &self,
1249 data: ArrayView2<F>,
1250 centroids: ArrayView2<F>,
1251 ) -> Result<(Array1<usize>, Array1<F>)> {
1252 let n_samples = data.nrows();
1253 let n_centroids = centroids.nrows();
1254 let mut labels = Array1::zeros(n_samples);
1255 let mut distances = Array1::zeros(n_samples);
1256
1257 for i in 0..n_samples {
1258 let mut min_dist = F::infinity();
1259 let mut min_label = 0;
1260
1261 for j in 0..n_centroids {
1262 let dist = self.euclidean_distance_squared(data.row(i), centroids.row(j));
1263 if dist < min_dist {
1264 min_dist = dist;
1265 min_label = j;
1266 }
1267 }
1268
1269 labels[i] = min_label;
1270 distances[i] = min_dist;
1271 }
1272
1273 Ok((labels, distances))
1274 }
1275
1276 fn update_centroids_cpu(
1278 &self,
1279 data: ArrayView2<F>,
1280 labels: &Array1<usize>,
1281 k: usize,
1282 n_features: usize,
1283 ) -> Result<Array2<F>> {
1284 let mut centroids = Array2::zeros((k, n_features));
1285 let mut counts = vec![0usize; k];
1286
1287 for (i, &label) in labels.iter().enumerate() {
1289 if label < k {
1290 for j in 0..n_features {
1291 centroids[[label, j]] = centroids[[label, j]] + data[[i, j]];
1292 }
1293 counts[label] += 1;
1294 }
1295 }
1296
1297 for i in 0..k {
1299 if counts[i] > 0 {
1300 let count = F::from(counts[i]).unwrap_or(F::one());
1301 for j in 0..n_features {
1302 centroids[[i, j]] = centroids[[i, j]] / count;
1303 }
1304 }
1305 }
1306
1307 Ok(centroids)
1308 }
1309
1310 fn euclidean_distance_squared(
1312 &self,
1313 a: scirs2_core::ndarray::ArrayView1<F>,
1314 b: scirs2_core::ndarray::ArrayView1<F>,
1315 ) -> F {
1316 a.iter()
1317 .zip(b.iter())
1318 .map(|(&x, &y)| {
1319 let diff = x - y;
1320 diff * diff
1321 })
1322 .fold(F::zero(), |acc, x| acc + x)
1323 }
1324
1325 fn compute_inertia(&self, distances: &Array1<F>) -> F {
1327 distances.iter().copied().fold(F::zero(), |a, b| a + b)
1328 }
1329
1330 fn compute_centroid_shift(&self, old: ArrayView2<F>, new: ArrayView2<F>) -> F {
1332 let mut max_shift = F::zero();
1333 for i in 0..old.nrows() {
1334 let shift = self
1335 .euclidean_distance_squared(old.row(i), new.row(i))
1336 .sqrt();
1337 if shift > max_shift {
1338 max_shift = shift;
1339 }
1340 }
1341 max_shift
1342 }
1343
1344 pub fn config(&self) -> &GpuAccelerationConfig {
1346 &self.config
1347 }
1348
1349 pub fn is_gpu_available(&self) -> bool {
1351 self.gpu_available
1352 }
1353
1354 pub fn tensor_core_capabilities(&self) -> &TensorCoreCapabilities {
1356 &self.tensor_caps
1357 }
1358
1359 pub fn memory_stats(&self) -> &MemoryUsageStats {
1361 self.memory_manager.get_stats()
1362 }
1363
1364 pub fn profiling_data(&self) -> &[ProfilingRecord] {
1366 &self.profiling_data
1367 }
1368}
1369
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// One gibibyte, used as a memory budget in the manager tests.
    const GIB: usize = 1024 * 1024 * 1024;

    /// Builds a CUDA test device with the given name and version string.
    fn cuda_test_device(name: &str, version: &str) -> GpuDevice {
        GpuDevice::new(
            0,
            name.to_string(),
            8_000_000_000,
            6_000_000_000,
            version.to_string(),
            1024,
            GpuBackend::Cuda,
            true,
        )
    }

    /// Builds a CPU-only K-means instance.
    fn cpu_kmeans() -> GpuKMeans<f64> {
        GpuKMeans::<f64>::new(GpuAccelerationConfig::cpu()).expect("Failed to create GpuKMeans")
    }

    #[test]
    fn test_advanced_memory_strategy_display() {
        let conservative = AdvancedMemoryStrategy::Conservative;
        assert_eq!(conservative.to_string(), "Conservative");

        let streaming = AdvancedMemoryStrategy::Streaming {
            chunk_size: 1024 * 1024,
        };
        assert_eq!(streaming.to_string(), "Streaming(1MB)");
    }

    #[test]
    fn test_advanced_memory_manager_creation() {
        let manager = AdvancedGpuMemoryManager::new(AdvancedMemoryStrategy::Adaptive, 4 * GIB);
        assert_eq!(manager.strategy(), AdvancedMemoryStrategy::Adaptive);
    }

    #[test]
    fn test_advanced_memory_allocation() {
        let mut manager = AdvancedGpuMemoryManager::new(AdvancedMemoryStrategy::Conservative, GIB);

        let block = manager.allocate(1024);
        assert!(block.is_ok());

        let stats = manager.get_stats();
        assert_eq!(stats.total_allocations, 1);
        assert_eq!(stats.successful_allocations, 1);
    }

    #[test]
    fn test_precision_mode_display() {
        assert_eq!(PrecisionMode::Mixed.to_string(), "Mixed (f16/f32)");
        assert_eq!(PrecisionMode::TensorFloat32.to_string(), "TF32");
    }

    #[test]
    fn test_tensor_core_config_default() {
        let defaults = TensorCoreConfig::default();
        assert!(defaults.enabled);
        assert_eq!(defaults.precision, PrecisionMode::Auto);
        assert!(defaults.auto_scale);
    }

    #[test]
    fn test_device_selector_creation() {
        let selector = DeviceSelector::new(AdvancedDeviceSelection::Auto);
        assert!(selector.devices().is_empty());
    }

    #[test]
    fn test_device_selector_add_device() {
        let mut selector = DeviceSelector::new(AdvancedDeviceSelection::MostMemory);

        selector.add_device(cuda_test_device("Test GPU", "1.0"));
        assert_eq!(selector.devices().len(), 1);
    }

    #[test]
    fn test_gpu_acceleration_config_default() {
        let defaults = GpuAccelerationConfig::default();
        assert!(defaults.enabled);
        assert!(defaults.auto_fallback);
    }

    #[test]
    fn test_gpu_acceleration_config_cuda() {
        let cuda = GpuAccelerationConfig::cuda();
        assert_eq!(cuda.backend, GpuBackend::Cuda);
        assert!(cuda.tensor_cores.enabled);
    }

    #[test]
    fn test_gpu_kmeans_creation() {
        let created = GpuKMeans::<f64>::new(GpuAccelerationConfig::cpu());
        assert!(created.is_ok());
    }

    #[test]
    fn test_gpu_kmeans_fit_cpu_fallback() {
        let mut kmeans = cpu_kmeans();

        // Two loose groups of three points each.
        let samples = vec![1.0, 2.0, 1.2, 1.8, 0.8, 1.9, 4.0, 5.0, 4.2, 4.8, 3.9, 5.1];
        let data = Array2::from_shape_vec((6, 2), samples).expect("Failed to create test data");

        let outcome = kmeans.fit(data.view(), 2, 100, 1e-4);
        assert!(outcome.is_ok());

        let fitted = outcome.expect("Failed to fit");
        assert_eq!(fitted.centroids.nrows(), 2);
        assert_eq!(fitted.labels.len(), 6);
        assert!(!fitted.metrics.used_gpu);
    }

    #[test]
    fn test_gpu_kmeans_convergence() {
        let mut kmeans = cpu_kmeans();

        // Two tight, well-separated groups of four points each.
        let samples = vec![
            0.0, 0.0, 0.1, 0.1, 0.0, 0.1, 0.1, 0.0, 10.0, 10.0, 10.1, 10.1, 10.0, 10.1, 10.1,
            10.0,
        ];
        let data = Array2::from_shape_vec((8, 2), samples).expect("Failed to create test data");

        let outcome = kmeans.fit(data.view(), 2, 100, 1e-6);
        assert!(outcome.is_ok());

        let fitted = outcome.expect("Failed to fit");
        assert!(fitted.converged);
        assert!(fitted.n_iterations < 50);
    }

    #[test]
    fn test_memory_usage_stats() {
        let mut manager = AdvancedGpuMemoryManager::new(AdvancedMemoryStrategy::Aggressive, GIB);

        // Results are deliberately ignored; only the counters matter here.
        (0..5).for_each(|_| {
            let _ = manager.allocate(1024);
        });

        let stats = manager.get_stats();
        assert_eq!(stats.total_allocations, 5);
        assert!(stats.efficiency > 0.0);
    }

    #[test]
    fn test_kernel_optimizations_default() {
        let defaults = KernelOptimizations::default();
        assert!(defaults.loop_unrolling);
        assert!(defaults.shared_memory_tiling);
        assert_eq!(defaults.occupancy_level, 2);
    }

    #[test]
    fn test_detect_tensor_core_capabilities() {
        let cuda_device = cuda_test_device("CUDA Device", "8.0");

        let caps = detect_tensor_core_capabilities(&cuda_device);
        assert!(caps.available);
        assert!(!caps.supported_precisions.is_empty());
    }

    #[test]
    fn test_profiling_record_creation() {
        let record = ProfilingRecord {
            operation: "distance_compute".to_string(),
            duration_us: 1000,
            memory_transferred: 1024 * 1024,
            compute_ops: 1000000,
            timestamp: Instant::now(),
        };

        assert_eq!(record.operation, "distance_compute");
        assert_eq!(record.duration_us, 1000);
    }
}