Skip to main content

sklears_simd/
gpu_memory.rs

1//! GPU memory management utilities
2//!
3//! This module provides advanced GPU memory management including:
4//! - Memory pools for efficient allocation/deallocation
5//! - Unified memory management for CUDA
6//! - Multi-GPU memory distribution
7//! - Memory bandwidth optimization
8
9use crate::gpu::{GpuBackend, GpuDevice};
10use crate::traits::SimdError;
11
12#[cfg(feature = "no-std")]
13use alloc::collections::BTreeMap as HashMap;
14#[cfg(feature = "no-std")]
15use alloc::sync::Arc;
16#[cfg(not(feature = "no-std"))]
17use std::collections::HashMap;
18#[cfg(not(feature = "no-std"))]
19use std::sync::Arc;
20
21#[cfg(feature = "no-std")]
22use alloc::{format, string::ToString};
23
24#[cfg(feature = "no-std")]
25use alloc::vec::Vec;
26#[cfg(not(feature = "no-std"))]
27use std::vec::Vec;
28
29#[cfg(feature = "no-std")]
30use spin::Mutex;
31#[cfg(not(feature = "no-std"))]
32use std::sync::Mutex;
33
34#[cfg(feature = "no-std")]
35use core::slice;
36#[cfg(not(feature = "no-std"))]
37use std::slice;
38
39/// Memory pool for GPU allocations
40pub struct GpuMemoryPool {
41    device: GpuDevice,
42    free_blocks: HashMap<usize, Vec<GpuMemoryBlock>>,
43    allocated_blocks: HashMap<usize, GpuMemoryBlock>,
44    total_allocated: usize,
45    peak_usage: usize,
46    allocation_count: usize,
47}
48
49/// GPU memory block descriptor
50#[derive(Debug, Clone)]
51pub struct GpuMemoryBlock {
52    ptr: *mut u8,
53    size: usize,
54    #[allow(dead_code)] // Stored for device-specific deallocation routing
55    device_id: u32,
56    #[allow(dead_code)] // Used to select correct backend free path when backends are enabled
57    backend: GpuBackend,
58    #[allow(dead_code)] // Distinguishes unified vs device memory for proper teardown
59    is_unified: bool,
60}
61
62unsafe impl Send for GpuMemoryBlock {}
63unsafe impl Sync for GpuMemoryBlock {}
64
/// How a pool should obtain memory when no pooled block can be reused.
///
/// The type is `Copy` and comparable with `==`, so a configured strategy can
/// be stored, passed by value and checked against a specific variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocationStrategy {
    /// Simple allocation without pooling
    Simple,
    /// Pool-based allocation with size classes
    Pooled,
    /// Unified memory allocation (CUDA only)
    Unified,
    /// Pinned host memory for fast transfers
    Pinned,
}
77
/// Tunable switches for host/device transfer behaviour.
///
/// NOTE(review): nothing in this module reads these fields yet; they are only
/// stored by `MultiGpuMemoryManager::set_bandwidth_config`.
#[derive(Debug, Clone)]
pub struct BandwidthConfig {
    /// Request asynchronous transfers (defaults to `true`).
    pub use_async_transfers: bool,
    /// Prefer pinned memory (defaults to `true`).
    pub prefer_pinned_memory: bool,
    /// Request transfer coalescing (defaults to `true`).
    pub coalesce_transfers: bool,
    /// Upper bound on concurrently used streams (defaults to `4`).
    pub max_concurrent_streams: u32,
}

impl Default for BandwidthConfig {
    /// All optimisations switched on, with four concurrent streams.
    fn default() -> Self {
        let enabled = true;
        BandwidthConfig {
            max_concurrent_streams: 4,
            use_async_transfers: enabled,
            prefer_pinned_memory: enabled,
            coalesce_transfers: enabled,
        }
    }
}
97
98impl GpuMemoryPool {
99    /// Create a new memory pool for the specified device
100    pub fn new(device: GpuDevice) -> Self {
101        Self {
102            device,
103            free_blocks: HashMap::new(),
104            allocated_blocks: HashMap::new(),
105            total_allocated: 0,
106            peak_usage: 0,
107            allocation_count: 0,
108        }
109    }
110
111    /// Allocate memory from the pool
112    pub fn allocate(
113        &mut self,
114        size: usize,
115        strategy: AllocationStrategy,
116    ) -> Result<GpuMemoryBlock, SimdError> {
117        self.allocation_count += 1;
118
119        // Try to find a suitable free block
120        if let Some(block) = self.find_free_block(size) {
121            self.allocated_blocks
122                .insert(block.ptr as usize, block.clone());
123            return Ok(block);
124        }
125
126        // Allocate new block
127        let block = self.allocate_new_block(size, strategy)?;
128        self.allocated_blocks
129            .insert(block.ptr as usize, block.clone());
130        self.total_allocated += size;
131        self.peak_usage = self.peak_usage.max(self.total_allocated);
132
133        Ok(block)
134    }
135
136    /// Deallocate memory and return to pool
137    pub fn deallocate(&mut self, ptr: *mut u8) -> Result<(), SimdError> {
138        if let Some(block) = self.allocated_blocks.remove(&(ptr as usize)) {
139            self.total_allocated -= block.size;
140
141            // Add to free blocks for reuse
142            let size_class = self.get_size_class(block.size);
143            self.free_blocks.entry(size_class).or_default().push(block);
144
145            Ok(())
146        } else {
147            Err(SimdError::InvalidParameter {
148                name: "ptr".to_string(),
149                value: "Invalid pointer for deallocation".to_string(),
150            })
151        }
152    }
153
154    /// Get memory usage statistics
155    pub fn get_stats(&self) -> MemoryStats {
156        MemoryStats {
157            total_allocated: self.total_allocated,
158            peak_usage: self.peak_usage,
159            allocation_count: self.allocation_count,
160            free_blocks_count: self.free_blocks.values().map(|v| v.len()).sum(),
161            device_memory_mb: self.device.memory_mb,
162        }
163    }
164
165    /// Clear all free blocks to reclaim memory
166    pub fn trim(&mut self) {
167        self.free_blocks.clear();
168    }
169
170    fn find_free_block(&mut self, size: usize) -> Option<GpuMemoryBlock> {
171        let size_class = self.get_size_class(size);
172
173        // Try exact size class first
174        if let Some(blocks) = self.free_blocks.get_mut(&size_class) {
175            if let Some(block) = blocks.pop() {
176                return Some(block);
177            }
178        }
179
180        // Try larger size classes
181        for (&class_size, blocks) in self.free_blocks.iter_mut() {
182            if class_size >= size_class && !blocks.is_empty() {
183                return blocks.pop();
184            }
185        }
186
187        None
188    }
189
190    fn allocate_new_block(
191        &self,
192        size: usize,
193        strategy: AllocationStrategy,
194    ) -> Result<GpuMemoryBlock, SimdError> {
195        match strategy {
196            AllocationStrategy::Simple => self.allocate_simple(size),
197            AllocationStrategy::Pooled => self.allocate_pooled(size),
198            AllocationStrategy::Unified => self.allocate_unified(size),
199            AllocationStrategy::Pinned => self.allocate_pinned(size),
200        }
201    }
202
203    fn allocate_simple(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
204        match self.device.backend {
205            GpuBackend::Cuda => {
206                // CUDA disabled for macOS compatibility
207                let _ = size;
208                Err(SimdError::UnsupportedOperation(
209                    "CUDA not available".to_string(),
210                ))
211            }
212            GpuBackend::OpenCL => {
213                // OpenCL disabled for macOS compatibility
214                let _ = size;
215                Err(SimdError::UnsupportedOperation(
216                    "OpenCL not available".to_string(),
217                ))
218            }
219            _ => Err(SimdError::UnsupportedOperation(
220                "Backend not supported".to_string(),
221            )),
222        }
223    }
224
225    fn allocate_pooled(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
226        // Allocate larger blocks for pooling efficiency
227        let pool_size = (size * 2).max(1024 * 1024); // At least 1MB
228        self.allocate_simple(pool_size)
229    }
230
231    fn allocate_unified(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
232        if self.device.backend != GpuBackend::Cuda {
233            return Err(SimdError::UnsupportedOperation(
234                "Unified memory only available with CUDA".to_string(),
235            ));
236        }
237
238        // CUDA disabled for macOS compatibility
239        let _ = size;
240        Err(SimdError::UnsupportedOperation(
241            "CUDA not available".to_string(),
242        ))
243    }
244
245    fn allocate_pinned(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
246        match self.device.backend {
247            GpuBackend::Cuda => {
248                // CUDA disabled for macOS compatibility
249                let _ = size;
250                Err(SimdError::UnsupportedOperation(
251                    "CUDA not available".to_string(),
252                ))
253            }
254            _ => Err(SimdError::UnsupportedOperation(
255                "Pinned memory only available with CUDA".to_string(),
256            )),
257        }
258    }
259
260    fn get_size_class(&self, size: usize) -> usize {
261        // Round up to nearest power of 2 for size classes
262        if size == 0 {
263            return 1;
264        }
265        1 << (64 - size.leading_zeros())
266    }
267}
268
/// Point-in-time usage counters for one memory pool.
#[derive(Debug, Clone)]
pub struct MemoryStats {
    /// Bytes currently allocated.
    pub total_allocated: usize,
    /// Highest number of bytes allocated at any one time.
    pub peak_usage: usize,
    /// Number of allocation calls recorded.
    pub allocation_count: usize,
    /// Number of pooled blocks waiting for reuse.
    pub free_blocks_count: usize,
    /// Device memory capacity in MiB.
    pub device_memory_mb: u64,
}

impl MemoryStats {
    /// Allocated bytes as a percentage of the device capacity.
    ///
    /// Returns `0.0` when the capacity is reported as zero, so callers never
    /// see a division by zero.
    pub fn utilization_percent(&self) -> f64 {
        const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

        if self.device_memory_mb == 0 {
            return 0.0;
        }
        // Scale in floating point: multiplying the MiB count by 2^20 as `u64`
        // overflows for very large capacities, while the f64 product is exact
        // up to the rounding the final conversion would apply anyway.
        let capacity_bytes = self.device_memory_mb as f64 * BYTES_PER_MB;
        (self.total_allocated as f64 / capacity_bytes) * 100.0
    }

    /// Whether utilization is strictly above `threshold` (in percent).
    pub fn is_high_usage(&self, threshold: f64) -> bool {
        self.utilization_percent() > threshold
    }
}
293
294/// Multi-GPU memory manager
295pub struct MultiGpuMemoryManager {
296    pools: HashMap<u32, Arc<Mutex<GpuMemoryPool>>>,
297    allocation_strategy: AllocationStrategy,
298    bandwidth_config: BandwidthConfig,
299}
300
301impl MultiGpuMemoryManager {
302    pub fn new() -> Self {
303        Self {
304            pools: HashMap::new(),
305            allocation_strategy: AllocationStrategy::Pooled,
306            bandwidth_config: BandwidthConfig::default(),
307        }
308    }
309
310    /// Add a device to the manager
311    pub fn add_device(&mut self, device: GpuDevice) {
312        let pool = Arc::new(Mutex::new(GpuMemoryPool::new(device.clone())));
313        self.pools.insert(device.id, pool);
314    }
315
316    /// Allocate memory on the specified device
317    pub fn allocate_on_device(
318        &self,
319        device_id: u32,
320        size: usize,
321    ) -> Result<GpuMemoryBlock, SimdError> {
322        if let Some(pool) = self.pools.get(&device_id) {
323            #[cfg(not(feature = "no-std"))]
324            let mut pool = pool.lock().map_err(|_| {
325                SimdError::ExternalLibraryError("Failed to lock memory pool".to_string())
326            })?;
327            #[cfg(feature = "no-std")]
328            let mut pool = pool.lock();
329            pool.allocate(size, self.allocation_strategy)
330        } else {
331            Err(SimdError::InvalidParameter {
332                name: "device_id".to_string(),
333                value: format!("Device {} not found", device_id),
334            })
335        }
336    }
337
338    /// Allocate memory on the best available device
339    pub fn allocate_on_best_device(&self, size: usize) -> Result<(u32, GpuMemoryBlock), SimdError> {
340        let best_device = self.find_best_device_for_allocation(size)?;
341        let block = self.allocate_on_device(best_device, size)?;
342        Ok((best_device, block))
343    }
344
345    /// Deallocate memory on the specified device
346    pub fn deallocate_on_device(&self, device_id: u32, ptr: *mut u8) -> Result<(), SimdError> {
347        if let Some(pool) = self.pools.get(&device_id) {
348            #[cfg(not(feature = "no-std"))]
349            let mut pool = pool.lock().map_err(|_| {
350                SimdError::ExternalLibraryError("Failed to lock memory pool".to_string())
351            })?;
352            #[cfg(feature = "no-std")]
353            let mut pool = pool.lock();
354            pool.deallocate(ptr)
355        } else {
356            Err(SimdError::InvalidParameter {
357                name: "device_id".to_string(),
358                value: format!("Device {} not found", device_id),
359            })
360        }
361    }
362
363    /// Get memory statistics for all devices
364    pub fn get_all_stats(&self) -> HashMap<u32, MemoryStats> {
365        let mut stats = HashMap::new();
366        for (&device_id, pool) in &self.pools {
367            #[cfg(not(feature = "no-std"))]
368            {
369                if let Ok(pool) = pool.lock() {
370                    stats.insert(device_id, pool.get_stats());
371                }
372            }
373            #[cfg(feature = "no-std")]
374            {
375                let pool = pool.lock();
376                stats.insert(device_id, pool.get_stats());
377            }
378        }
379        stats
380    }
381
382    /// Find the best device for allocation based on available memory
383    fn find_best_device_for_allocation(&self, size: usize) -> Result<u32, SimdError> {
384        let mut best_device = None;
385        let mut min_usage = f64::INFINITY;
386
387        for (&device_id, pool) in &self.pools {
388            #[cfg(not(feature = "no-std"))]
389            let pool_result = pool.lock();
390            #[cfg(feature = "no-std")]
391            let pool_result: Result<_, ()> = Ok(pool.lock());
392
393            if let Ok(pool) = pool_result {
394                let stats = pool.get_stats();
395                let usage = stats.utilization_percent();
396
397                // Check if device has enough memory
398                let available_mb =
399                    stats.device_memory_mb - (stats.total_allocated / (1024 * 1024)) as u64;
400                let required_mb = (size / (1024 * 1024)) as u64 + 1;
401
402                if available_mb >= required_mb && usage < min_usage {
403                    min_usage = usage;
404                    best_device = Some(device_id);
405                }
406            }
407        }
408
409        best_device.ok_or_else(|| {
410            SimdError::ExternalLibraryError("No suitable device found for allocation".to_string())
411        })
412    }
413
414    /// Configure allocation strategy
415    pub fn set_allocation_strategy(&mut self, strategy: AllocationStrategy) {
416        self.allocation_strategy = strategy;
417    }
418
419    /// Configure bandwidth optimization
420    pub fn set_bandwidth_config(&mut self, config: BandwidthConfig) {
421        self.bandwidth_config = config;
422    }
423}
424
425impl Default for MultiGpuMemoryManager {
426    fn default() -> Self {
427        Self::new()
428    }
429}
430
431/// Unified memory buffer for CUDA
432#[derive(Debug)]
433pub struct UnifiedMemoryBuffer<T> {
434    ptr: *mut T,
435    size: usize,
436    #[allow(dead_code)] // Stored for cudarc deallocation routing when cuda feature is enabled
437    device_id: u32,
438}
439
440impl<T> UnifiedMemoryBuffer<T> {
441    /// Create a new unified memory buffer
442    pub fn new(size: usize, device_id: u32) -> Result<Self, SimdError> {
443        // CUDA disabled for macOS compatibility
444        let _ = (size, device_id);
445        Err(SimdError::UnsupportedOperation(
446            "CUDA unified memory not available".to_string(),
447        ))
448    }
449
450    /// Get mutable slice to the data
451    pub fn as_mut_slice(&mut self) -> &mut [T] {
452        unsafe { slice::from_raw_parts_mut(self.ptr, self.size) }
453    }
454
455    /// Get immutable slice to the data
456    pub fn as_slice(&self) -> &[T] {
457        unsafe { slice::from_raw_parts(self.ptr, self.size) }
458    }
459
460    /// Prefetch to GPU
461    pub fn prefetch_to_gpu(&self) -> Result<(), SimdError> {
462        // CUDA disabled for macOS compatibility
463        Err(SimdError::UnsupportedOperation(
464            "CUDA not available".to_string(),
465        ))
466    }
467
468    /// Prefetch to CPU
469    pub fn prefetch_to_cpu(&self) -> Result<(), SimdError> {
470        // CUDA disabled for macOS compatibility
471        Err(SimdError::UnsupportedOperation(
472            "CUDA not available".to_string(),
473        ))
474    }
475}
476
477impl<T> Drop for UnifiedMemoryBuffer<T> {
478    fn drop(&mut self) {
479        // CUDA disabled for macOS compatibility - no cleanup needed
480    }
481}
482
483unsafe impl<T: Send> Send for UnifiedMemoryBuffer<T> {}
484unsafe impl<T: Sync> Sync for UnifiedMemoryBuffer<T> {}
485
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;
    use crate::gpu::GpuBackend;

    // Inert under this module's own cfg (which requires `no-std` to be off);
    // retained unchanged.
    #[cfg(feature = "no-std")]
    use alloc::{
        string::{String, ToString},
        vec,
        vec::Vec,
    };

    /// Build a CUDA device with 80 compute units and f64/f16 support.
    fn cuda_device(id: u32, name: &str, memory_mb: u64) -> GpuDevice {
        GpuDevice {
            id,
            name: name.to_string(),
            backend: GpuBackend::Cuda,
            compute_units: 80,
            memory_mb,
            supports_f64: true,
            supports_f16: true,
        }
    }

    /// A freshly created pool reports no allocations.
    #[test]
    fn test_memory_pool_creation() {
        let pool = GpuMemoryPool::new(cuda_device(0, "Test Device", 8192));
        let stats = pool.get_stats();

        assert_eq!(stats.total_allocated, 0);
        assert_eq!(stats.allocation_count, 0);
    }

    /// Size classes are the next power of two strictly above the size.
    #[test]
    fn test_size_class_calculation() {
        let pool = GpuMemoryPool::new(cuda_device(0, "Test Device", 8192));

        let expectations = [(0usize, 1usize), (1, 2), (1000, 1024), (1024, 2048)];
        for &(size, class) in expectations.iter() {
            assert_eq!(pool.get_size_class(size), class);
        }
    }

    /// 1 MiB allocated on a 1 GiB device is roughly 0.1% utilisation.
    #[test]
    fn test_memory_stats() {
        let one_mb = 1024 * 1024;
        let stats = MemoryStats {
            total_allocated: one_mb,
            peak_usage: 2 * one_mb,
            allocation_count: 5,
            free_blocks_count: 2,
            device_memory_mb: 1024, // 1GB
        };

        assert!((stats.utilization_percent() - 0.09765625).abs() < 0.001); // ~0.1%
        assert!(!stats.is_high_usage(50.0));
        assert!(stats.is_high_usage(0.05));
    }

    /// Every registered device shows up in the aggregated statistics.
    #[test]
    fn test_multi_gpu_manager() {
        let mut manager = MultiGpuMemoryManager::new();

        let first = cuda_device(0, "Device 0", 8192);
        let second = GpuDevice {
            compute_units: 40,
            ..cuda_device(1, "Device 1", 4096)
        };
        manager.add_device(first);
        manager.add_device(second);

        let stats = manager.get_all_stats();
        assert_eq!(stats.len(), 2);
        for id in 0..2u32 {
            assert!(stats.contains_key(&id));
        }
    }

    /// The default bandwidth configuration enables everything with 4 streams.
    #[test]
    fn test_bandwidth_config() {
        let BandwidthConfig {
            use_async_transfers,
            prefer_pinned_memory,
            coalesce_transfers,
            max_concurrent_streams,
        } = BandwidthConfig::default();

        assert!(use_async_transfers);
        assert!(prefer_pinned_memory);
        assert!(coalesce_transfers);
        assert_eq!(max_concurrent_streams, 4);
    }

    /// Every strategy variant can be constructed and matched exhaustively.
    #[test]
    fn test_allocation_strategies() {
        let strategies = vec![
            AllocationStrategy::Simple,
            AllocationStrategy::Pooled,
            AllocationStrategy::Unified,
            AllocationStrategy::Pinned,
        ];

        for strategy in strategies {
            let _ordinal = match strategy {
                AllocationStrategy::Simple => 0,
                AllocationStrategy::Pooled => 1,
                AllocationStrategy::Unified => 2,
                AllocationStrategy::Pinned => 3,
            };
        }
    }
}