Skip to main content

scirs2_cluster/gpu/
acceleration.rs

1//! Advanced GPU Acceleration Module for v0.2.0
2//!
3//! This module provides comprehensive GPU acceleration for clustering algorithms with:
4//! - Multiple backend support (CUDA, OpenCL, ROCm, Metal, OneAPI)
5//! - Advanced memory management strategies
6//! - Tensor core and mixed precision support
7//! - Automatic CPU fallback
8//! - GPU-accelerated K-means clustering
9
10use crate::error::{ClusteringError, Result};
11use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis};
12use scirs2_core::numeric::{Float, FromPrimitive};
13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::fmt;
16use std::time::{Duration, Instant};
17
18use super::core::{DeviceSelection, GpuBackend, GpuConfig, GpuContext, GpuDevice};
19use super::memory::{GpuMemoryBlock, GpuMemoryManager, MemoryStats, MemoryStrategy};
20
21// ============================================================================
22// Advanced Memory Management
23// ============================================================================
24
25/// Advanced memory management strategy for GPU operations
26#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
27pub enum AdvancedMemoryStrategy {
28    /// Conservative: Minimize GPU memory usage, more host-device transfers
29    Conservative,
30    /// Aggressive: Maximize GPU memory usage for speed
31    Aggressive,
32    /// Adaptive: Dynamically adjust based on available memory and workload
33    Adaptive,
34    /// Streaming: Process data in chunks for datasets larger than GPU memory
35    Streaming {
36        /// Chunk size in bytes
37        chunk_size: usize,
38    },
39    /// Unified: Use unified memory where available (CUDA managed memory)
40    Unified,
41    /// Pool: Use memory pool for fast allocations/deallocations
42    Pool {
43        /// Pool size in bytes
44        pool_size: usize,
45    },
46}
47
48impl Default for AdvancedMemoryStrategy {
49    fn default() -> Self {
50        AdvancedMemoryStrategy::Adaptive
51    }
52}
53
54impl fmt::Display for AdvancedMemoryStrategy {
55    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
56        match self {
57            AdvancedMemoryStrategy::Conservative => write!(f, "Conservative"),
58            AdvancedMemoryStrategy::Aggressive => write!(f, "Aggressive"),
59            AdvancedMemoryStrategy::Adaptive => write!(f, "Adaptive"),
60            AdvancedMemoryStrategy::Streaming { chunk_size } => {
61                write!(f, "Streaming({}MB)", chunk_size / (1024 * 1024))
62            }
63            AdvancedMemoryStrategy::Unified => write!(f, "Unified"),
64            AdvancedMemoryStrategy::Pool { pool_size } => {
65                write!(f, "Pool({}MB)", pool_size / (1024 * 1024))
66            }
67        }
68    }
69}
70
71/// Advanced GPU memory manager with multiple strategies
72#[derive(Debug)]
73pub struct AdvancedGpuMemoryManager {
74    /// Base memory manager
75    base_manager: GpuMemoryManager,
76    /// Current memory strategy
77    strategy: AdvancedMemoryStrategy,
78    /// Available GPU memory in bytes
79    available_memory: usize,
80    /// Memory allocation history for adaptive strategy
81    allocation_history: Vec<AllocationRecord>,
82    /// Memory pressure threshold (0.0 to 1.0)
83    pressure_threshold: f64,
84    /// Enable memory defragmentation
85    enable_defrag: bool,
86    /// Memory usage statistics
87    usage_stats: MemoryUsageStats,
88}
89
/// One entry in the allocation log used for adaptive memory management.
#[derive(Debug, Clone)]
pub struct AllocationRecord {
    /// Number of bytes that were requested.
    pub size: usize,
    /// Moment at which the record was created.
    pub timestamp: Instant,
    /// How long the allocation was in use, if known.
    pub duration: Option<Duration>,
    /// `true` when the allocation succeeded.
    pub success: bool,
}
102
103/// Memory usage statistics
104#[derive(Debug, Clone, Default, Serialize, Deserialize)]
105pub struct MemoryUsageStats {
106    /// Total allocations
107    pub total_allocations: usize,
108    /// Successful allocations
109    pub successful_allocations: usize,
110    /// Failed allocations (out of memory)
111    pub failed_allocations: usize,
112    /// Total bytes allocated over time
113    pub total_bytes_allocated: usize,
114    /// Current bytes in use
115    pub current_bytes_in_use: usize,
116    /// Peak bytes in use
117    pub peak_bytes_in_use: usize,
118    /// Average allocation size
119    pub avg_allocation_size: f64,
120    /// Memory efficiency (successful/total)
121    pub efficiency: f64,
122}
123
124impl AdvancedGpuMemoryManager {
125    /// Create a new advanced memory manager
126    pub fn new(strategy: AdvancedMemoryStrategy, available_memory: usize) -> Self {
127        let alignment = 256; // 256-byte alignment for GPU
128        let max_pool_size = match strategy {
129            AdvancedMemoryStrategy::Pool { pool_size } => pool_size / (1024 * 1024),
130            _ => 100, // Default pool size
131        };
132
133        Self {
134            base_manager: GpuMemoryManager::new(alignment, max_pool_size),
135            strategy,
136            available_memory,
137            allocation_history: Vec::new(),
138            pressure_threshold: 0.85,
139            enable_defrag: true,
140            usage_stats: MemoryUsageStats::default(),
141        }
142    }
143
144    /// Allocate memory with strategy-aware logic
145    pub fn allocate(&mut self, size: usize) -> Result<GpuMemoryBlock> {
146        self.usage_stats.total_allocations += 1;
147
148        // Check memory pressure
149        let memory_pressure = self.calculate_memory_pressure();
150
151        // Handle based on strategy
152        let result = match self.strategy {
153            AdvancedMemoryStrategy::Conservative => self.allocate_conservative(size),
154            AdvancedMemoryStrategy::Aggressive => self.allocate_aggressive(size),
155            AdvancedMemoryStrategy::Adaptive => self.allocate_adaptive(size, memory_pressure),
156            AdvancedMemoryStrategy::Streaming { chunk_size } => {
157                self.allocate_streaming(size, chunk_size)
158            }
159            AdvancedMemoryStrategy::Unified => self.allocate_unified(size),
160            AdvancedMemoryStrategy::Pool { .. } => self.base_manager.allocate(size),
161        };
162
163        // Record allocation
164        let success = result.is_ok();
165        self.allocation_history.push(AllocationRecord {
166            size,
167            timestamp: Instant::now(),
168            duration: None,
169            success,
170        });
171
172        if success {
173            self.usage_stats.successful_allocations += 1;
174            self.usage_stats.total_bytes_allocated += size;
175            self.usage_stats.current_bytes_in_use += size;
176            self.usage_stats.peak_bytes_in_use = self
177                .usage_stats
178                .peak_bytes_in_use
179                .max(self.usage_stats.current_bytes_in_use);
180        } else {
181            self.usage_stats.failed_allocations += 1;
182        }
183
184        self.update_efficiency();
185        result
186    }
187
188    /// Deallocate memory
189    pub fn deallocate(&mut self, block: GpuMemoryBlock) -> Result<()> {
190        let size = block.size;
191        self.base_manager.deallocate(block)?;
192        self.usage_stats.current_bytes_in_use =
193            self.usage_stats.current_bytes_in_use.saturating_sub(size);
194        Ok(())
195    }
196
197    /// Conservative allocation strategy
198    fn allocate_conservative(&mut self, size: usize) -> Result<GpuMemoryBlock> {
199        // Check if we have enough memory before allocating
200        if self.usage_stats.current_bytes_in_use + size > self.available_memory {
201            // Try to free unused memory first
202            self.compact_memory()?;
203        }
204        self.base_manager.allocate(size)
205    }
206
207    /// Aggressive allocation strategy
208    fn allocate_aggressive(&mut self, size: usize) -> Result<GpuMemoryBlock> {
209        // Allocate without checking, rely on GPU driver
210        self.base_manager.allocate(size)
211    }
212
213    /// Adaptive allocation strategy
214    fn allocate_adaptive(&mut self, size: usize, memory_pressure: f64) -> Result<GpuMemoryBlock> {
215        if memory_pressure > self.pressure_threshold {
216            // High pressure: use conservative approach
217            self.allocate_conservative(size)
218        } else {
219            // Low pressure: use aggressive approach
220            self.allocate_aggressive(size)
221        }
222    }
223
224    /// Streaming allocation for large datasets
225    fn allocate_streaming(&mut self, size: usize, chunk_size: usize) -> Result<GpuMemoryBlock> {
226        let actual_size = size.min(chunk_size);
227        self.base_manager.allocate(actual_size)
228    }
229
230    /// Unified memory allocation
231    fn allocate_unified(&mut self, size: usize) -> Result<GpuMemoryBlock> {
232        // In real implementation, would use CUDA managed memory
233        self.base_manager.allocate(size)
234    }
235
236    /// Calculate current memory pressure (0.0 to 1.0)
237    fn calculate_memory_pressure(&self) -> f64 {
238        if self.available_memory == 0 {
239            return 1.0;
240        }
241        self.usage_stats.current_bytes_in_use as f64 / self.available_memory as f64
242    }
243
244    /// Compact memory by freeing unused allocations
245    fn compact_memory(&mut self) -> Result<()> {
246        self.base_manager.clear_pools()
247    }
248
249    /// Update efficiency statistics
250    fn update_efficiency(&mut self) {
251        if self.usage_stats.total_allocations > 0 {
252            self.usage_stats.efficiency = self.usage_stats.successful_allocations as f64
253                / self.usage_stats.total_allocations as f64;
254            self.usage_stats.avg_allocation_size = self.usage_stats.total_bytes_allocated as f64
255                / self.usage_stats.successful_allocations.max(1) as f64;
256        }
257    }
258
259    /// Get memory statistics
260    pub fn get_stats(&self) -> &MemoryUsageStats {
261        &self.usage_stats
262    }
263
264    /// Get current memory strategy
265    pub fn strategy(&self) -> AdvancedMemoryStrategy {
266        self.strategy
267    }
268
269    /// Set memory strategy
270    pub fn set_strategy(&mut self, strategy: AdvancedMemoryStrategy) {
271        self.strategy = strategy;
272    }
273
274    /// Get memory pressure threshold
275    pub fn pressure_threshold(&self) -> f64 {
276        self.pressure_threshold
277    }
278
279    /// Set memory pressure threshold
280    pub fn set_pressure_threshold(&mut self, threshold: f64) {
281        self.pressure_threshold = threshold.clamp(0.0, 1.0);
282    }
283
284    /// Check if defragmentation is enabled
285    pub fn is_defrag_enabled(&self) -> bool {
286        self.enable_defrag
287    }
288
289    /// Enable or disable defragmentation
290    pub fn set_defrag_enabled(&mut self, enabled: bool) {
291        self.enable_defrag = enabled;
292    }
293}
294
295// ============================================================================
296// Tensor Core and Mixed Precision Support
297// ============================================================================
298
299/// Precision mode for GPU computations
300#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
301pub enum PrecisionMode {
302    /// Full precision (f64)
303    Full,
304    /// Single precision (f32)
305    Single,
306    /// Half precision (f16)
307    Half,
308    /// Mixed precision (f16 compute, f32 accumulator)
309    Mixed,
310    /// Brain floating point (bf16)
311    BFloat16,
312    /// Tensor float 32 (TF32) for NVIDIA Ampere+
313    TensorFloat32,
314    /// Automatic selection based on hardware
315    Auto,
316}
317
318impl Default for PrecisionMode {
319    fn default() -> Self {
320        PrecisionMode::Auto
321    }
322}
323
324impl fmt::Display for PrecisionMode {
325    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326        match self {
327            PrecisionMode::Full => write!(f, "Full (f64)"),
328            PrecisionMode::Single => write!(f, "Single (f32)"),
329            PrecisionMode::Half => write!(f, "Half (f16)"),
330            PrecisionMode::Mixed => write!(f, "Mixed (f16/f32)"),
331            PrecisionMode::BFloat16 => write!(f, "BFloat16"),
332            PrecisionMode::TensorFloat32 => write!(f, "TF32"),
333            PrecisionMode::Auto => write!(f, "Auto"),
334        }
335    }
336}
337
338/// Tensor core configuration
339#[derive(Debug, Clone, Serialize, Deserialize)]
340pub struct TensorCoreConfig {
341    /// Enable tensor cores if available
342    pub enabled: bool,
343    /// Precision mode
344    pub precision: PrecisionMode,
345    /// Tile size for tensor core operations (M, N, K)
346    pub tile_size: (usize, usize, usize),
347    /// Use structured sparsity if available (NVIDIA Ampere+)
348    pub use_sparsity: bool,
349    /// Sparsity ratio (e.g., 0.5 for 2:4 sparsity)
350    pub sparsity_ratio: f64,
351    /// Enable automatic precision scaling
352    pub auto_scale: bool,
353    /// Loss scaling factor for mixed precision training
354    pub loss_scale: f64,
355}
356
357impl Default for TensorCoreConfig {
358    fn default() -> Self {
359        Self {
360            enabled: true,
361            precision: PrecisionMode::Auto,
362            tile_size: (16, 16, 16),
363            use_sparsity: false,
364            sparsity_ratio: 0.5,
365            auto_scale: true,
366            loss_scale: 1.0,
367        }
368    }
369}
370
371/// Tensor core capabilities detection
372#[derive(Debug, Clone, Serialize, Deserialize)]
373pub struct TensorCoreCapabilities {
374    /// Tensor cores available
375    pub available: bool,
376    /// Supported precision modes
377    pub supported_precisions: Vec<PrecisionMode>,
378    /// Supported tile sizes
379    pub supported_tile_sizes: Vec<(usize, usize, usize)>,
380    /// Supports structured sparsity
381    pub supports_sparsity: bool,
382    /// Peak TOPS (Tera Operations Per Second)
383    pub peak_tops: Option<f64>,
384    /// Architecture name
385    pub architecture: String,
386}
387
388impl Default for TensorCoreCapabilities {
389    fn default() -> Self {
390        Self {
391            available: false,
392            supported_precisions: vec![PrecisionMode::Single],
393            supported_tile_sizes: vec![(16, 16, 16)],
394            supports_sparsity: false,
395            peak_tops: None,
396            architecture: "Unknown".to_string(),
397        }
398    }
399}
400
401/// Detect tensor core capabilities for a GPU device
402pub fn detect_tensor_core_capabilities(device: &GpuDevice) -> TensorCoreCapabilities {
403    match device.backend {
404        GpuBackend::Cuda => {
405            // NVIDIA Tensor Core detection
406            TensorCoreCapabilities {
407                available: true,
408                supported_precisions: vec![
409                    PrecisionMode::Half,
410                    PrecisionMode::Mixed,
411                    PrecisionMode::BFloat16,
412                    PrecisionMode::TensorFloat32,
413                ],
414                supported_tile_sizes: vec![(16, 16, 16), (32, 8, 16), (8, 32, 16)],
415                supports_sparsity: true, // Ampere+ supports 2:4 sparsity
416                peak_tops: Some(312.0),  // Example for A100
417                architecture: "NVIDIA Tensor Cores".to_string(),
418            }
419        }
420        GpuBackend::Rocm => {
421            // AMD Matrix Cores
422            TensorCoreCapabilities {
423                available: true,
424                supported_precisions: vec![
425                    PrecisionMode::Half,
426                    PrecisionMode::Mixed,
427                    PrecisionMode::BFloat16,
428                ],
429                supported_tile_sizes: vec![(32, 32, 8), (16, 16, 16)],
430                supports_sparsity: false,
431                peak_tops: Some(383.0), // Example for MI250X
432                architecture: "AMD Matrix Cores".to_string(),
433            }
434        }
435        GpuBackend::Metal => {
436            // Apple Neural Engine / GPU
437            TensorCoreCapabilities {
438                available: true,
439                supported_precisions: vec![PrecisionMode::Half, PrecisionMode::Single],
440                supported_tile_sizes: vec![(16, 16, 16)],
441                supports_sparsity: false,
442                peak_tops: Some(15.8), // Example for M1
443                architecture: "Apple Neural Engine".to_string(),
444            }
445        }
446        _ => TensorCoreCapabilities::default(),
447    }
448}
449
450// ============================================================================
451// Device Selection Strategies
452// ============================================================================
453
454/// Advanced device selection strategy
455#[derive(Debug, Clone, Serialize, Deserialize)]
456pub enum AdvancedDeviceSelection {
457    /// Use the first available device
458    First,
459    /// Use the device with most available memory
460    MostMemory,
461    /// Use the device with highest compute capability
462    HighestCompute,
463    /// Use a specific device by ID
464    Specific(u32),
465    /// Automatic selection based on workload
466    Auto,
467    /// Use fastest device for current workload (benchmarked)
468    Fastest,
469    /// Use device with best power efficiency
470    MostEfficient,
471    /// Round-robin across available devices
472    RoundRobin,
473    /// Load-balanced across devices based on utilization
474    LoadBalanced,
475    /// Multi-GPU: use all available GPUs
476    MultiGpu {
477        /// Maximum number of GPUs to use
478        max_gpus: usize,
479    },
480}
481
482impl Default for AdvancedDeviceSelection {
483    fn default() -> Self {
484        AdvancedDeviceSelection::Auto
485    }
486}
487
488impl From<AdvancedDeviceSelection> for DeviceSelection {
489    fn from(adv: AdvancedDeviceSelection) -> Self {
490        match adv {
491            AdvancedDeviceSelection::First => DeviceSelection::First,
492            AdvancedDeviceSelection::MostMemory => DeviceSelection::MostMemory,
493            AdvancedDeviceSelection::HighestCompute => DeviceSelection::HighestCompute,
494            AdvancedDeviceSelection::Specific(id) => DeviceSelection::Specific(id),
495            AdvancedDeviceSelection::Auto => DeviceSelection::Auto,
496            AdvancedDeviceSelection::Fastest => DeviceSelection::Fastest,
497            _ => DeviceSelection::Auto, // Default for advanced strategies
498        }
499    }
500}
501
502/// Device selector for multi-GPU operations
503#[derive(Debug)]
504pub struct DeviceSelector {
505    /// Available devices
506    devices: Vec<GpuDevice>,
507    /// Selection strategy
508    strategy: AdvancedDeviceSelection,
509    /// Device utilization tracking
510    utilization: HashMap<u32, f64>,
511    /// Round-robin counter
512    round_robin_idx: usize,
513    /// Device benchmark results
514    benchmarks: HashMap<u32, DeviceBenchmark>,
515}
516
517/// Device benchmark result
518#[derive(Debug, Clone, Serialize, Deserialize)]
519pub struct DeviceBenchmark {
520    /// Device ID
521    pub device_id: u32,
522    /// Distance computation throughput (GFLOPS)
523    pub distance_throughput: f64,
524    /// K-means iteration time (ms)
525    pub kmeans_time_ms: f64,
526    /// Memory bandwidth (GB/s)
527    pub memory_bandwidth: f64,
528    /// Power consumption (W)
529    pub power_consumption: Option<f64>,
530    /// Benchmark timestamp
531    pub timestamp: std::time::SystemTime,
532}
533
534impl DeviceSelector {
535    /// Create a new device selector
536    pub fn new(strategy: AdvancedDeviceSelection) -> Self {
537        Self {
538            devices: Vec::new(),
539            strategy,
540            utilization: HashMap::new(),
541            round_robin_idx: 0,
542            benchmarks: HashMap::new(),
543        }
544    }
545
546    /// Add a device to the selector
547    pub fn add_device(&mut self, device: GpuDevice) {
548        self.utilization.insert(device.device_id, 0.0);
549        self.devices.push(device);
550    }
551
552    /// Select the best device based on current strategy
553    pub fn select_device(&mut self) -> Option<&GpuDevice> {
554        if self.devices.is_empty() {
555            return None;
556        }
557
558        match &self.strategy {
559            AdvancedDeviceSelection::First => self.devices.first(),
560            AdvancedDeviceSelection::MostMemory => {
561                self.devices.iter().max_by_key(|d| d.available_memory)
562            }
563            AdvancedDeviceSelection::HighestCompute => {
564                self.devices.iter().max_by_key(|d| d.compute_units)
565            }
566            AdvancedDeviceSelection::Specific(id) => {
567                self.devices.iter().find(|d| d.device_id == *id)
568            }
569            AdvancedDeviceSelection::Auto => {
570                // Score-based selection
571                self.devices.iter().max_by(|a, b| {
572                    a.get_device_score()
573                        .partial_cmp(&b.get_device_score())
574                        .unwrap_or(std::cmp::Ordering::Equal)
575                })
576            }
577            AdvancedDeviceSelection::Fastest => {
578                // Use benchmark results if available
579                if self.benchmarks.is_empty() {
580                    self.devices.first()
581                } else {
582                    let fastest_id = self
583                        .benchmarks
584                        .iter()
585                        .min_by(|a, b| {
586                            a.1.kmeans_time_ms
587                                .partial_cmp(&b.1.kmeans_time_ms)
588                                .unwrap_or(std::cmp::Ordering::Equal)
589                        })
590                        .map(|(id, _)| *id);
591
592                    fastest_id.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
593                }
594            }
595            AdvancedDeviceSelection::MostEfficient => {
596                // Use power efficiency if available
597                if self.benchmarks.is_empty() {
598                    self.devices.first()
599                } else {
600                    let most_efficient_id = self
601                        .benchmarks
602                        .iter()
603                        .filter_map(|(id, bench)| {
604                            bench.power_consumption.map(|power| {
605                                let efficiency = bench.distance_throughput / power;
606                                (*id, efficiency)
607                            })
608                        })
609                        .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
610                        .map(|(id, _)| id);
611
612                    most_efficient_id.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
613                }
614            }
615            AdvancedDeviceSelection::RoundRobin => {
616                let idx = self.round_robin_idx % self.devices.len();
617                self.round_robin_idx += 1;
618                self.devices.get(idx)
619            }
620            AdvancedDeviceSelection::LoadBalanced => {
621                // Select least utilized device
622                let least_utilized = self
623                    .utilization
624                    .iter()
625                    .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
626                    .map(|(id, _)| *id);
627
628                least_utilized.and_then(|id| self.devices.iter().find(|d| d.device_id == id))
629            }
630            AdvancedDeviceSelection::MultiGpu { max_gpus } => {
631                // For multi-GPU, return the first device (caller handles multi-GPU logic)
632                self.devices.iter().take(*max_gpus).next()
633            }
634        }
635    }
636
637    /// Update device utilization
638    pub fn update_utilization(&mut self, device_id: u32, utilization: f64) {
639        self.utilization
640            .insert(device_id, utilization.clamp(0.0, 1.0));
641    }
642
643    /// Add benchmark result
644    pub fn add_benchmark(&mut self, benchmark: DeviceBenchmark) {
645        self.benchmarks.insert(benchmark.device_id, benchmark);
646    }
647
648    /// Get all devices
649    pub fn devices(&self) -> &[GpuDevice] {
650        &self.devices
651    }
652
653    /// Get current strategy
654    pub fn strategy(&self) -> &AdvancedDeviceSelection {
655        &self.strategy
656    }
657
658    /// Set selection strategy
659    pub fn set_strategy(&mut self, strategy: AdvancedDeviceSelection) {
660        self.strategy = strategy;
661    }
662}
663
664// ============================================================================
665// GPU Acceleration Configuration
666// ============================================================================
667
668/// Comprehensive GPU acceleration configuration
669#[derive(Debug, Clone, Serialize, Deserialize)]
670pub struct GpuAccelerationConfig {
671    /// Enable GPU acceleration
672    pub enabled: bool,
673    /// Preferred backend
674    pub backend: GpuBackend,
675    /// Device selection strategy
676    pub device_selection: AdvancedDeviceSelection,
677    /// Memory management strategy
678    pub memory_strategy: AdvancedMemoryStrategy,
679    /// Tensor core configuration
680    pub tensor_cores: TensorCoreConfig,
681    /// Enable automatic CPU fallback
682    pub auto_fallback: bool,
683    /// Minimum problem size for GPU acceleration
684    pub min_problem_size: usize,
685    /// Tile size for blocked algorithms
686    pub tile_size: usize,
687    /// Enable asynchronous execution
688    pub async_execution: bool,
689    /// Number of CUDA streams / OpenCL queues
690    pub num_streams: usize,
691    /// Enable profiling
692    pub enable_profiling: bool,
693    /// Custom kernel optimizations
694    pub kernel_optimizations: KernelOptimizations,
695}
696
697impl Default for GpuAccelerationConfig {
698    fn default() -> Self {
699        Self {
700            enabled: true,
701            backend: GpuBackend::CpuFallback,
702            device_selection: AdvancedDeviceSelection::Auto,
703            memory_strategy: AdvancedMemoryStrategy::Adaptive,
704            tensor_cores: TensorCoreConfig::default(),
705            auto_fallback: true,
706            min_problem_size: 1000,
707            tile_size: 256,
708            async_execution: true,
709            num_streams: 4,
710            enable_profiling: false,
711            kernel_optimizations: KernelOptimizations::default(),
712        }
713    }
714}
715
716impl GpuAccelerationConfig {
717    /// Create CUDA configuration
718    pub fn cuda() -> Self {
719        Self {
720            backend: GpuBackend::Cuda,
721            tensor_cores: TensorCoreConfig {
722                enabled: true,
723                precision: PrecisionMode::Mixed,
724                ..Default::default()
725            },
726            ..Default::default()
727        }
728    }
729
730    /// Create OpenCL configuration
731    pub fn opencl() -> Self {
732        Self {
733            backend: GpuBackend::OpenCl,
734            tensor_cores: TensorCoreConfig {
735                enabled: false,
736                precision: PrecisionMode::Single,
737                ..Default::default()
738            },
739            ..Default::default()
740        }
741    }
742
743    /// Create ROCm configuration
744    pub fn rocm() -> Self {
745        Self {
746            backend: GpuBackend::Rocm,
747            tensor_cores: TensorCoreConfig {
748                enabled: true,
749                precision: PrecisionMode::Mixed,
750                ..Default::default()
751            },
752            ..Default::default()
753        }
754    }
755
756    /// Create Metal configuration
757    pub fn metal() -> Self {
758        Self {
759            backend: GpuBackend::Metal,
760            tensor_cores: TensorCoreConfig {
761                enabled: true,
762                precision: PrecisionMode::Half,
763                ..Default::default()
764            },
765            ..Default::default()
766        }
767    }
768
769    /// Create CPU fallback configuration
770    pub fn cpu() -> Self {
771        Self {
772            enabled: false,
773            backend: GpuBackend::CpuFallback,
774            ..Default::default()
775        }
776    }
777
778    /// Convert to basic GpuConfig
779    pub fn to_basic_config(&self) -> GpuConfig {
780        GpuConfig {
781            preferred_backend: self.backend,
782            device_selection: self.device_selection.clone().into(),
783            auto_fallback: self.auto_fallback,
784            memory_pool_size: match self.memory_strategy {
785                AdvancedMemoryStrategy::Pool { pool_size } => Some(pool_size),
786                _ => None,
787            },
788            optimize_memory: true,
789            backend_options: HashMap::new(),
790        }
791    }
792}
793
794/// Kernel optimization settings
795#[derive(Debug, Clone, Serialize, Deserialize)]
796pub struct KernelOptimizations {
797    /// Use loop unrolling
798    pub loop_unrolling: bool,
799    /// Use shared memory tiling
800    pub shared_memory_tiling: bool,
801    /// Use register blocking
802    pub register_blocking: bool,
803    /// Use vectorized loads (e.g., float4)
804    pub vectorized_loads: bool,
805    /// Use texture memory for read-only data
806    pub texture_memory: bool,
807    /// Use constant memory for frequently accessed constants
808    pub constant_memory: bool,
809    /// Occupancy optimization level (0-3)
810    pub occupancy_level: u8,
811}
812
813impl Default for KernelOptimizations {
814    fn default() -> Self {
815        Self {
816            loop_unrolling: true,
817            shared_memory_tiling: true,
818            register_blocking: true,
819            vectorized_loads: true,
820            texture_memory: false,
821            constant_memory: true,
822            occupancy_level: 2,
823        }
824    }
825}
826
827// ============================================================================
828// GPU Accelerated K-Means
829// ============================================================================
830
831/// GPU-accelerated K-means clustering
832#[derive(Debug)]
833pub struct GpuKMeans<F: Float> {
834    /// Configuration
835    config: GpuAccelerationConfig,
836    /// GPU context (if available)
837    context: Option<GpuContext>,
838    /// Memory manager
839    memory_manager: AdvancedGpuMemoryManager,
840    /// Device selector
841    device_selector: DeviceSelector,
842    /// Tensor core capabilities
843    tensor_caps: TensorCoreCapabilities,
844    /// Is GPU actually available
845    gpu_available: bool,
846    /// Profiling data
847    profiling_data: Vec<ProfilingRecord>,
848    /// Phantom data for type
849    _phantom: std::marker::PhantomData<F>,
850}
851
/// One profiling sample for a GPU operation.
#[derive(Debug, Clone)]
pub struct ProfilingRecord {
    /// Name of the operation.
    pub operation: String,
    /// How long it took, in microseconds.
    pub duration_us: u64,
    /// Number of bytes transferred.
    pub memory_transferred: usize,
    /// Number of compute operations performed.
    pub compute_ops: usize,
    /// When the sample was taken.
    pub timestamp: Instant,
}
866
/// K-means result from GPU computation
///
/// Returned by `GpuKMeans::fit` for both the GPU path and the CPU fallback;
/// `metrics.used_gpu` records which path produced it.
#[derive(Debug, Clone)]
pub struct GpuKMeansResult<F: Float> {
    /// Final centroids, one row per cluster
    pub centroids: Array2<F>,
    /// Cluster assignments for each point (row index into `centroids`)
    pub labels: Array1<usize>,
    /// Inertia (sum of squared distances to centroids)
    pub inertia: F,
    /// Number of iterations
    pub n_iterations: usize,
    /// Whether converged (a convergence test passed before `max_iter` was exhausted)
    pub converged: bool,
    /// Computation metrics
    pub metrics: KMeansMetrics,
}
883
/// K-means computation metrics
///
/// Timing and resource figures attached to a `GpuKMeansResult`. All times
/// are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KMeansMetrics {
    /// Total computation time (ms)
    pub total_time_ms: f64,
    /// Time for distance computation (ms)
    pub distance_time_ms: f64,
    /// Time for centroid update (ms)
    pub centroid_update_time_ms: f64,
    /// Time for label assignment (ms)
    pub label_assignment_time_ms: f64,
    /// Data transfer time (ms)
    pub transfer_time_ms: f64,
    /// Used GPU acceleration
    pub used_gpu: bool,
    /// Backend used (the CPU fallback path reports `"CPU"`)
    pub backend: String,
    /// Memory used (bytes)
    pub memory_used: usize,
    /// Throughput (samples/second): sample count divided by total elapsed seconds
    pub throughput: f64,
}
906
907impl<F: Float + FromPrimitive + Send + Sync + 'static> GpuKMeans<F> {
908    /// Create new GPU-accelerated K-means
909    pub fn new(config: GpuAccelerationConfig) -> Result<Self> {
910        let device_selector = DeviceSelector::new(config.device_selection.clone());
911
912        // Try to create GPU context
913        let (context, gpu_available, tensor_caps) = Self::try_create_context(&config)?;
914
915        let available_memory = context
916            .as_ref()
917            .map(|ctx| ctx.device.available_memory)
918            .unwrap_or(1024 * 1024 * 1024); // 1GB default
919
920        let memory_manager =
921            AdvancedGpuMemoryManager::new(config.memory_strategy, available_memory);
922
923        Ok(Self {
924            config,
925            context,
926            memory_manager,
927            device_selector,
928            tensor_caps,
929            gpu_available,
930            profiling_data: Vec::new(),
931            _phantom: std::marker::PhantomData,
932        })
933    }
934
935    /// Try to create GPU context
936    fn try_create_context(
937        config: &GpuAccelerationConfig,
938    ) -> Result<(Option<GpuContext>, bool, TensorCoreCapabilities)> {
939        if !config.enabled || config.backend == GpuBackend::CpuFallback {
940            return Ok((None, false, TensorCoreCapabilities::default()));
941        }
942
943        // Create device and context
944        let device = GpuDevice::new(
945            0,
946            format!("{} Device", config.backend),
947            8_000_000_000,
948            6_000_000_000,
949            "1.0".to_string(),
950            1024,
951            config.backend,
952            true,
953        );
954
955        let tensor_caps = detect_tensor_core_capabilities(&device);
956        let basic_config = config.to_basic_config();
957
958        match GpuContext::new(device.clone(), basic_config) {
959            Ok(ctx) => Ok((Some(ctx), true, tensor_caps)),
960            Err(_) if config.auto_fallback => Ok((None, false, TensorCoreCapabilities::default())),
961            Err(e) => Err(e),
962        }
963    }
964
965    /// Fit K-means to data
966    pub fn fit(
967        &mut self,
968        data: ArrayView2<F>,
969        k: usize,
970        max_iter: usize,
971        tol: F,
972    ) -> Result<GpuKMeansResult<F>> {
973        let start_time = Instant::now();
974        let n_samples = data.nrows();
975        let n_features = data.ncols();
976
977        // Validate inputs
978        if k == 0 || k > n_samples {
979            return Err(ClusteringError::InvalidInput(format!(
980                "k must be between 1 and n_samples ({}), got {}",
981                n_samples, k
982            )));
983        }
984
985        // Decide whether to use GPU
986        let use_gpu = self.should_use_gpu(n_samples, n_features);
987
988        if use_gpu && self.gpu_available {
989            self.fit_gpu(data, k, max_iter, tol, start_time)
990        } else {
991            self.fit_cpu(data, k, max_iter, tol, start_time)
992        }
993    }
994
995    /// Decide whether to use GPU based on problem size
996    fn should_use_gpu(&self, n_samples: usize, n_features: usize) -> bool {
997        let problem_size = n_samples * n_features;
998        problem_size >= self.config.min_problem_size && self.config.enabled
999    }
1000
1001    /// GPU implementation of K-means
1002    fn fit_gpu(
1003        &mut self,
1004        data: ArrayView2<F>,
1005        k: usize,
1006        max_iter: usize,
1007        tol: F,
1008        start_time: Instant,
1009    ) -> Result<GpuKMeansResult<F>> {
1010        let n_samples = data.nrows();
1011        let n_features = data.ncols();
1012
1013        // Initialize centroids using K-means++
1014        let mut centroids = self.initialize_centroids_gpu(data, k)?;
1015        let mut labels = Array1::zeros(n_samples);
1016        let mut inertia = F::infinity();
1017        let mut converged = false;
1018        let mut n_iterations = 0;
1019
1020        let mut distance_time = Duration::ZERO;
1021        let mut centroid_time = Duration::ZERO;
1022        let mut label_time = Duration::ZERO;
1023
1024        // Main K-means loop
1025        for iter in 0..max_iter {
1026            n_iterations = iter + 1;
1027
1028            // Step 1: Compute distances and assign labels
1029            let label_start = Instant::now();
1030            let (new_labels, distances) = self.compute_labels_gpu(data, centroids.view())?;
1031            labels = new_labels;
1032            label_time += label_start.elapsed();
1033
1034            // Step 2: Compute new centroids
1035            let centroid_start = Instant::now();
1036            let new_centroids = self.compute_centroids_gpu(data, &labels, k)?;
1037            centroid_time += centroid_start.elapsed();
1038
1039            // Step 3: Check convergence
1040            let new_inertia = self.compute_inertia(&distances);
1041            let centroid_shift =
1042                self.compute_centroid_shift(centroids.view(), new_centroids.view());
1043
1044            centroids = new_centroids;
1045
1046            if centroid_shift <= tol
1047                || (inertia - new_inertia).abs() < tol * F::from(0.01).unwrap_or(tol)
1048            {
1049                converged = true;
1050                inertia = new_inertia;
1051                break;
1052            }
1053
1054            inertia = new_inertia;
1055        }
1056
1057        let total_time = start_time.elapsed();
1058
1059        let metrics = KMeansMetrics {
1060            total_time_ms: total_time.as_secs_f64() * 1000.0,
1061            distance_time_ms: distance_time.as_secs_f64() * 1000.0,
1062            centroid_update_time_ms: centroid_time.as_secs_f64() * 1000.0,
1063            label_assignment_time_ms: label_time.as_secs_f64() * 1000.0,
1064            transfer_time_ms: 0.0, // Would be populated with actual transfer times
1065            used_gpu: true,
1066            backend: format!("{}", self.config.backend),
1067            memory_used: self.memory_manager.get_stats().current_bytes_in_use,
1068            throughput: n_samples as f64 / total_time.as_secs_f64(),
1069        };
1070
1071        Ok(GpuKMeansResult {
1072            centroids,
1073            labels,
1074            inertia,
1075            n_iterations,
1076            converged,
1077            metrics,
1078        })
1079    }
1080
1081    /// CPU fallback implementation of K-means
1082    fn fit_cpu(
1083        &self,
1084        data: ArrayView2<F>,
1085        k: usize,
1086        max_iter: usize,
1087        tol: F,
1088        start_time: Instant,
1089    ) -> Result<GpuKMeansResult<F>> {
1090        let n_samples = data.nrows();
1091        let n_features = data.ncols();
1092
1093        // Initialize centroids using K-means++
1094        let mut centroids = self.initialize_centroids_cpu(data, k)?;
1095        let mut labels = Array1::zeros(n_samples);
1096        let mut inertia = F::infinity();
1097        let mut converged = false;
1098        let mut n_iterations = 0;
1099
1100        // Main K-means loop
1101        for iter in 0..max_iter {
1102            n_iterations = iter + 1;
1103
1104            // Step 1: Assign labels
1105            let (new_labels, distances) = self.assign_labels_cpu(data, centroids.view())?;
1106            labels = new_labels;
1107
1108            // Step 2: Update centroids
1109            let new_centroids = self.update_centroids_cpu(data, &labels, k, n_features)?;
1110
1111            // Step 3: Check convergence
1112            let new_inertia = self.compute_inertia(&distances);
1113            let centroid_shift =
1114                self.compute_centroid_shift(centroids.view(), new_centroids.view());
1115
1116            centroids = new_centroids;
1117
1118            if centroid_shift <= tol {
1119                converged = true;
1120                inertia = new_inertia;
1121                break;
1122            }
1123
1124            inertia = new_inertia;
1125        }
1126
1127        let total_time = start_time.elapsed();
1128
1129        let metrics = KMeansMetrics {
1130            total_time_ms: total_time.as_secs_f64() * 1000.0,
1131            distance_time_ms: 0.0,
1132            centroid_update_time_ms: 0.0,
1133            label_assignment_time_ms: 0.0,
1134            transfer_time_ms: 0.0,
1135            used_gpu: false,
1136            backend: "CPU".to_string(),
1137            memory_used: 0,
1138            throughput: n_samples as f64 / total_time.as_secs_f64(),
1139        };
1140
1141        Ok(GpuKMeansResult {
1142            centroids,
1143            labels,
1144            inertia,
1145            n_iterations,
1146            converged,
1147            metrics,
1148        })
1149    }
1150
    /// Initialize centroids using K-means++ on GPU
    ///
    /// Currently a thin wrapper: it forwards `data` and `k` unchanged to
    /// `initialize_centroids_cpu` and returns its result, so no GPU work is
    /// performed here yet.
    fn initialize_centroids_gpu(&self, data: ArrayView2<F>, k: usize) -> Result<Array2<F>> {
        // For now, use CPU initialization (GPU K-means++ is complex)
        self.initialize_centroids_cpu(data, k)
    }
1156
1157    /// Initialize centroids using K-means++ on CPU
1158    fn initialize_centroids_cpu(&self, data: ArrayView2<F>, k: usize) -> Result<Array2<F>> {
1159        let n_samples = data.nrows();
1160        let n_features = data.ncols();
1161        let mut centroids = Array2::zeros((k, n_features));
1162        let mut rng = scirs2_core::random::rng();
1163
1164        // Choose first centroid randomly
1165        let first_idx = scirs2_core::random::RngExt::random_range(&mut rng, 0..n_samples);
1166        for j in 0..n_features {
1167            centroids[[0, j]] = data[[first_idx, j]];
1168        }
1169
1170        if k == 1 {
1171            return Ok(centroids);
1172        }
1173
1174        // Choose remaining centroids with K-means++
1175        let mut min_distances = Array1::from_elem(n_samples, F::infinity());
1176
1177        for i in 1..k {
1178            // Update minimum distances
1179            for sample_idx in 0..n_samples {
1180                let dist =
1181                    self.euclidean_distance_squared(data.row(sample_idx), centroids.row(i - 1));
1182                if dist < min_distances[sample_idx] {
1183                    min_distances[sample_idx] = dist;
1184                }
1185            }
1186
1187            // Compute probability distribution
1188            let sum_distances: F = min_distances.iter().copied().fold(F::zero(), |a, b| a + b);
1189            if sum_distances <= F::zero() {
1190                // All points are at centroids, pick random
1191                let idx = scirs2_core::random::RngExt::random_range(&mut rng, 0..n_samples);
1192                for j in 0..n_features {
1193                    centroids[[i, j]] = data[[idx, j]];
1194                }
1195                continue;
1196            }
1197
1198            // Sample next centroid
1199            let threshold = F::from(scirs2_core::random::RngExt::random_range(
1200                &mut rng,
1201                0.0..1.0,
1202            ))
1203            .unwrap_or(F::zero())
1204                * sum_distances;
1205            let mut cumsum = F::zero();
1206            let mut next_idx = 0;
1207
1208            for (idx, &dist) in min_distances.iter().enumerate() {
1209                cumsum = cumsum + dist;
1210                if cumsum >= threshold {
1211                    next_idx = idx;
1212                    break;
1213                }
1214            }
1215
1216            for j in 0..n_features {
1217                centroids[[i, j]] = data[[next_idx, j]];
1218            }
1219        }
1220
1221        Ok(centroids)
1222    }
1223
    /// Compute labels using GPU acceleration
    ///
    /// Returns `(labels, distances)` exactly as produced by
    /// `assign_labels_cpu`, to which this method currently forwards its
    /// arguments unchanged; no GPU kernel is launched here yet.
    fn compute_labels_gpu(
        &self,
        data: ArrayView2<F>,
        centroids: ArrayView2<F>,
    ) -> Result<(Array1<usize>, Array1<F>)> {
        // GPU-accelerated label assignment
        // For now, use CPU implementation with the structure for GPU
        self.assign_labels_cpu(data, centroids)
    }
1234
1235    /// Compute centroids using GPU acceleration
1236    fn compute_centroids_gpu(
1237        &self,
1238        data: ArrayView2<F>,
1239        labels: &Array1<usize>,
1240        k: usize,
1241    ) -> Result<Array2<F>> {
1242        let n_features = data.ncols();
1243        self.update_centroids_cpu(data, labels, k, n_features)
1244    }
1245
1246    /// Assign labels to each point (CPU implementation)
1247    fn assign_labels_cpu(
1248        &self,
1249        data: ArrayView2<F>,
1250        centroids: ArrayView2<F>,
1251    ) -> Result<(Array1<usize>, Array1<F>)> {
1252        let n_samples = data.nrows();
1253        let n_centroids = centroids.nrows();
1254        let mut labels = Array1::zeros(n_samples);
1255        let mut distances = Array1::zeros(n_samples);
1256
1257        for i in 0..n_samples {
1258            let mut min_dist = F::infinity();
1259            let mut min_label = 0;
1260
1261            for j in 0..n_centroids {
1262                let dist = self.euclidean_distance_squared(data.row(i), centroids.row(j));
1263                if dist < min_dist {
1264                    min_dist = dist;
1265                    min_label = j;
1266                }
1267            }
1268
1269            labels[i] = min_label;
1270            distances[i] = min_dist;
1271        }
1272
1273        Ok((labels, distances))
1274    }
1275
1276    /// Update centroids (CPU implementation)
1277    fn update_centroids_cpu(
1278        &self,
1279        data: ArrayView2<F>,
1280        labels: &Array1<usize>,
1281        k: usize,
1282        n_features: usize,
1283    ) -> Result<Array2<F>> {
1284        let mut centroids = Array2::zeros((k, n_features));
1285        let mut counts = vec![0usize; k];
1286
1287        // Sum points in each cluster
1288        for (i, &label) in labels.iter().enumerate() {
1289            if label < k {
1290                for j in 0..n_features {
1291                    centroids[[label, j]] = centroids[[label, j]] + data[[i, j]];
1292                }
1293                counts[label] += 1;
1294            }
1295        }
1296
1297        // Divide by counts
1298        for i in 0..k {
1299            if counts[i] > 0 {
1300                let count = F::from(counts[i]).unwrap_or(F::one());
1301                for j in 0..n_features {
1302                    centroids[[i, j]] = centroids[[i, j]] / count;
1303                }
1304            }
1305        }
1306
1307        Ok(centroids)
1308    }
1309
1310    /// Compute squared Euclidean distance
1311    fn euclidean_distance_squared(
1312        &self,
1313        a: scirs2_core::ndarray::ArrayView1<F>,
1314        b: scirs2_core::ndarray::ArrayView1<F>,
1315    ) -> F {
1316        a.iter()
1317            .zip(b.iter())
1318            .map(|(&x, &y)| {
1319                let diff = x - y;
1320                diff * diff
1321            })
1322            .fold(F::zero(), |acc, x| acc + x)
1323    }
1324
1325    /// Compute inertia from distances
1326    fn compute_inertia(&self, distances: &Array1<F>) -> F {
1327        distances.iter().copied().fold(F::zero(), |a, b| a + b)
1328    }
1329
1330    /// Compute centroid shift
1331    fn compute_centroid_shift(&self, old: ArrayView2<F>, new: ArrayView2<F>) -> F {
1332        let mut max_shift = F::zero();
1333        for i in 0..old.nrows() {
1334            let shift = self
1335                .euclidean_distance_squared(old.row(i), new.row(i))
1336                .sqrt();
1337            if shift > max_shift {
1338                max_shift = shift;
1339            }
1340        }
1341        max_shift
1342    }
1343
    /// Get configuration
    ///
    /// Returns a shared reference to the acceleration configuration held by
    /// this instance.
    pub fn config(&self) -> &GpuAccelerationConfig {
        &self.config
    }
1348
    /// Check if GPU is available
    ///
    /// Returns the stored availability flag; it does not query any device.
    pub fn is_gpu_available(&self) -> bool {
        self.gpu_available
    }
1353
    /// Get tensor core capabilities
    ///
    /// Returns a shared reference to the capabilities stored on this
    /// instance.
    pub fn tensor_core_capabilities(&self) -> &TensorCoreCapabilities {
        &self.tensor_caps
    }
1358
    /// Get memory statistics
    ///
    /// Returns the statistics reported by the internal memory manager.
    pub fn memory_stats(&self) -> &MemoryUsageStats {
        self.memory_manager.get_stats()
    }
1363
    /// Get profiling data
    ///
    /// Returns the stored profiling records as a slice.
    pub fn profiling_data(&self) -> &[ProfilingRecord] {
        &self.profiling_data
    }
1368}
1369
1370// ============================================================================
1371// Tests
1372// ============================================================================
1373
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// `Display` output of a unit variant and of the `Streaming` variant.
    #[test]
    fn test_advanced_memory_strategy_display() {
        let conservative = AdvancedMemoryStrategy::Conservative;
        assert_eq!(conservative.to_string(), "Conservative");

        let streaming = AdvancedMemoryStrategy::Streaming {
            chunk_size: 1024 * 1024,
        };
        assert_eq!(streaming.to_string(), "Streaming(1MB)");
    }

    /// The manager reports back the strategy it was built with.
    #[test]
    fn test_advanced_memory_manager_creation() {
        let strategy = AdvancedMemoryStrategy::Adaptive;
        let four_gb = 4 * 1024 * 1024 * 1024;
        let manager = AdvancedGpuMemoryManager::new(strategy, four_gb);
        assert_eq!(manager.strategy(), AdvancedMemoryStrategy::Adaptive);
    }

    /// A single small allocation succeeds and is counted.
    #[test]
    fn test_advanced_memory_allocation() {
        let one_gb = 1024 * 1024 * 1024;
        let mut manager =
            AdvancedGpuMemoryManager::new(AdvancedMemoryStrategy::Conservative, one_gb);

        let allocation = manager.allocate(1024);
        assert!(allocation.is_ok());

        let stats = manager.get_stats();
        assert_eq!(stats.total_allocations, 1);
        assert_eq!(stats.successful_allocations, 1);
    }

    /// `Display` output of two precision modes.
    #[test]
    fn test_precision_mode_display() {
        let mixed = PrecisionMode::Mixed.to_string();
        let tf32 = PrecisionMode::TensorFloat32.to_string();
        assert_eq!(mixed, "Mixed (f16/f32)");
        assert_eq!(tf32, "TF32");
    }

    /// Default tensor core configuration values.
    #[test]
    fn test_tensor_core_config_default() {
        let defaults = TensorCoreConfig::default();
        assert!(defaults.enabled);
        assert_eq!(defaults.precision, PrecisionMode::Auto);
        assert!(defaults.auto_scale);
    }

    /// A fresh selector knows about no devices.
    #[test]
    fn test_device_selector_creation() {
        let selector = DeviceSelector::new(AdvancedDeviceSelection::Auto);
        assert!(selector.devices().is_empty());
    }

    /// Adding a device makes it visible through `devices()`.
    #[test]
    fn test_device_selector_add_device() {
        let mut selector = DeviceSelector::new(AdvancedDeviceSelection::MostMemory);

        let test_gpu = GpuDevice::new(
            0,
            "Test GPU".to_string(),
            8_000_000_000,
            6_000_000_000,
            "1.0".to_string(),
            1024,
            GpuBackend::Cuda,
            true,
        );
        selector.add_device(test_gpu);

        assert_eq!(selector.devices().len(), 1);
    }

    /// Acceleration and automatic fallback are on by default.
    #[test]
    fn test_gpu_acceleration_config_default() {
        let defaults = GpuAccelerationConfig::default();
        assert!(defaults.enabled);
        assert!(defaults.auto_fallback);
    }

    /// The CUDA preset selects the CUDA backend with tensor cores enabled.
    #[test]
    fn test_gpu_acceleration_config_cuda() {
        let cuda = GpuAccelerationConfig::cuda();
        assert_eq!(cuda.backend, GpuBackend::Cuda);
        assert!(cuda.tensor_cores.enabled);
    }

    /// Construction with the CPU preset succeeds.
    #[test]
    fn test_gpu_kmeans_creation() {
        let created = GpuKMeans::<f64>::new(GpuAccelerationConfig::cpu());
        assert!(created.is_ok());
    }

    /// Fitting with the CPU preset yields a CPU-computed result of the
    /// expected shape.
    #[test]
    fn test_gpu_kmeans_fit_cpu_fallback() {
        let mut kmeans = GpuKMeans::<f64>::new(GpuAccelerationConfig::cpu())
            .expect("Failed to create GpuKMeans");

        // Create test data with two clear clusters
        let samples = vec![1.0, 2.0, 1.2, 1.8, 0.8, 1.9, 4.0, 5.0, 4.2, 4.8, 3.9, 5.1];
        let data = Array2::from_shape_vec((6, 2), samples).expect("Failed to create test data");

        let outcome = kmeans.fit(data.view(), 2, 100, 1e-4);
        assert!(outcome.is_ok());

        let fitted = outcome.expect("Failed to fit");
        assert_eq!(fitted.centroids.nrows(), 2);
        assert_eq!(fitted.labels.len(), 6);
        assert!(!fitted.metrics.used_gpu);
    }

    /// Two well-separated blobs converge well within the iteration budget.
    #[test]
    fn test_gpu_kmeans_convergence() {
        let mut kmeans = GpuKMeans::<f64>::new(GpuAccelerationConfig::cpu())
            .expect("Failed to create GpuKMeans");

        // Well-separated clusters should converge quickly
        let samples = vec![
            0.0, 0.0, 0.1, 0.1, 0.0, 0.1, 0.1, 0.0, 10.0, 10.0, 10.1, 10.1, 10.0, 10.1, 10.1,
            10.0,
        ];
        let data = Array2::from_shape_vec((8, 2), samples).expect("Failed to create test data");

        let outcome = kmeans.fit(data.view(), 2, 100, 1e-6);
        assert!(outcome.is_ok());

        let fitted = outcome.expect("Failed to fit");
        assert!(fitted.converged);
        assert!(fitted.n_iterations < 50);
    }

    /// Allocation counters and efficiency after several allocations.
    #[test]
    fn test_memory_usage_stats() {
        let one_gb = 1024 * 1024 * 1024;
        let mut manager =
            AdvancedGpuMemoryManager::new(AdvancedMemoryStrategy::Aggressive, one_gb);

        // Multiple allocations; individual results are deliberately ignored.
        (0..5).for_each(|_| {
            let _ = manager.allocate(1024);
        });

        let stats = manager.get_stats();
        assert_eq!(stats.total_allocations, 5);
        assert!(stats.efficiency > 0.0);
    }

    /// Spot-check of the default kernel optimization flags.
    #[test]
    fn test_kernel_optimizations_default() {
        let defaults = KernelOptimizations::default();
        assert!(defaults.loop_unrolling);
        assert!(defaults.shared_memory_tiling);
        assert_eq!(defaults.occupancy_level, 2);
    }

    /// A CUDA device description reports tensor core support.
    #[test]
    fn test_detect_tensor_core_capabilities() {
        let device = GpuDevice::new(
            0,
            "CUDA Device".to_string(),
            8_000_000_000,
            6_000_000_000,
            "8.0".to_string(),
            1024,
            GpuBackend::Cuda,
            true,
        );

        let detected = detect_tensor_core_capabilities(&device);
        assert!(detected.available);
        assert!(!detected.supported_precisions.is_empty());
    }

    /// A profiling record keeps the values it was constructed with.
    #[test]
    fn test_profiling_record_creation() {
        let entry = ProfilingRecord {
            operation: "distance_compute".to_string(),
            duration_us: 1000,
            memory_transferred: 1024 * 1024,
            compute_ops: 1000000,
            timestamp: Instant::now(),
        };

        assert_eq!(entry.operation, "distance_compute");
        assert_eq!(entry.duration_us, 1000);
    }
}