use crate::gpu::{GpuBackend, GpuDevice};
use crate::traits::SimdError;
#[cfg(feature = "no-std")]
use alloc::collections::BTreeMap as HashMap;
#[cfg(feature = "no-std")]
use alloc::sync::Arc;
#[cfg(not(feature = "no-std"))]
use std::collections::HashMap;
#[cfg(not(feature = "no-std"))]
use std::sync::Arc;
#[cfg(feature = "no-std")]
use alloc::{format, string::ToString};
#[cfg(feature = "no-std")]
use alloc::vec::Vec;
#[cfg(not(feature = "no-std"))]
use std::vec::Vec;
#[cfg(feature = "no-std")]
use spin::Mutex;
#[cfg(not(feature = "no-std"))]
use std::sync::Mutex;
#[cfg(feature = "no-std")]
use core::slice;
#[cfg(not(feature = "no-std"))]
use std::slice;
pub struct GpuMemoryPool {
device: GpuDevice,
free_blocks: HashMap<usize, Vec<GpuMemoryBlock>>,
allocated_blocks: HashMap<usize, GpuMemoryBlock>,
total_allocated: usize,
peak_usage: usize,
allocation_count: usize,
}
#[derive(Debug, Clone)]
pub struct GpuMemoryBlock {
ptr: *mut u8,
size: usize,
#[allow(dead_code)] device_id: u32,
#[allow(dead_code)] backend: GpuBackend,
#[allow(dead_code)] is_unified: bool,
}
unsafe impl Send for GpuMemoryBlock {}
unsafe impl Sync for GpuMemoryBlock {}
#[derive(Debug, Clone, Copy)]
pub enum AllocationStrategy {
Simple,
Pooled,
Unified,
Pinned,
}
#[derive(Debug, Clone)]
pub struct BandwidthConfig {
pub use_async_transfers: bool,
pub prefer_pinned_memory: bool,
pub coalesce_transfers: bool,
pub max_concurrent_streams: u32,
}
impl Default for BandwidthConfig {
fn default() -> Self {
Self {
use_async_transfers: true,
prefer_pinned_memory: true,
coalesce_transfers: true,
max_concurrent_streams: 4,
}
}
}
impl GpuMemoryPool {
pub fn new(device: GpuDevice) -> Self {
Self {
device,
free_blocks: HashMap::new(),
allocated_blocks: HashMap::new(),
total_allocated: 0,
peak_usage: 0,
allocation_count: 0,
}
}
pub fn allocate(
&mut self,
size: usize,
strategy: AllocationStrategy,
) -> Result<GpuMemoryBlock, SimdError> {
self.allocation_count += 1;
if let Some(block) = self.find_free_block(size) {
self.allocated_blocks
.insert(block.ptr as usize, block.clone());
return Ok(block);
}
let block = self.allocate_new_block(size, strategy)?;
self.allocated_blocks
.insert(block.ptr as usize, block.clone());
self.total_allocated += size;
self.peak_usage = self.peak_usage.max(self.total_allocated);
Ok(block)
}
pub fn deallocate(&mut self, ptr: *mut u8) -> Result<(), SimdError> {
if let Some(block) = self.allocated_blocks.remove(&(ptr as usize)) {
self.total_allocated -= block.size;
let size_class = self.get_size_class(block.size);
self.free_blocks.entry(size_class).or_default().push(block);
Ok(())
} else {
Err(SimdError::InvalidParameter {
name: "ptr".to_string(),
value: "Invalid pointer for deallocation".to_string(),
})
}
}
pub fn get_stats(&self) -> MemoryStats {
MemoryStats {
total_allocated: self.total_allocated,
peak_usage: self.peak_usage,
allocation_count: self.allocation_count,
free_blocks_count: self.free_blocks.values().map(|v| v.len()).sum(),
device_memory_mb: self.device.memory_mb,
}
}
pub fn trim(&mut self) {
self.free_blocks.clear();
}
fn find_free_block(&mut self, size: usize) -> Option<GpuMemoryBlock> {
let size_class = self.get_size_class(size);
if let Some(blocks) = self.free_blocks.get_mut(&size_class) {
if let Some(block) = blocks.pop() {
return Some(block);
}
}
for (&class_size, blocks) in self.free_blocks.iter_mut() {
if class_size >= size_class && !blocks.is_empty() {
return blocks.pop();
}
}
None
}
fn allocate_new_block(
&self,
size: usize,
strategy: AllocationStrategy,
) -> Result<GpuMemoryBlock, SimdError> {
match strategy {
AllocationStrategy::Simple => self.allocate_simple(size),
AllocationStrategy::Pooled => self.allocate_pooled(size),
AllocationStrategy::Unified => self.allocate_unified(size),
AllocationStrategy::Pinned => self.allocate_pinned(size),
}
}
fn allocate_simple(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
match self.device.backend {
GpuBackend::Cuda => {
let _ = size;
Err(SimdError::UnsupportedOperation(
"CUDA not available".to_string(),
))
}
GpuBackend::OpenCL => {
let _ = size;
Err(SimdError::UnsupportedOperation(
"OpenCL not available".to_string(),
))
}
_ => Err(SimdError::UnsupportedOperation(
"Backend not supported".to_string(),
)),
}
}
fn allocate_pooled(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
let pool_size = (size * 2).max(1024 * 1024); self.allocate_simple(pool_size)
}
fn allocate_unified(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
if self.device.backend != GpuBackend::Cuda {
return Err(SimdError::UnsupportedOperation(
"Unified memory only available with CUDA".to_string(),
));
}
let _ = size;
Err(SimdError::UnsupportedOperation(
"CUDA not available".to_string(),
))
}
fn allocate_pinned(&self, size: usize) -> Result<GpuMemoryBlock, SimdError> {
match self.device.backend {
GpuBackend::Cuda => {
let _ = size;
Err(SimdError::UnsupportedOperation(
"CUDA not available".to_string(),
))
}
_ => Err(SimdError::UnsupportedOperation(
"Pinned memory only available with CUDA".to_string(),
)),
}
}
fn get_size_class(&self, size: usize) -> usize {
if size == 0 {
return 1;
}
1 << (64 - size.leading_zeros())
}
}
#[derive(Debug, Clone)]
pub struct MemoryStats {
pub total_allocated: usize,
pub peak_usage: usize,
pub allocation_count: usize,
pub free_blocks_count: usize,
pub device_memory_mb: u64,
}
impl MemoryStats {
pub fn utilization_percent(&self) -> f64 {
if self.device_memory_mb == 0 {
return 0.0;
}
(self.total_allocated as f64 / (self.device_memory_mb * 1024 * 1024) as f64) * 100.0
}
pub fn is_high_usage(&self, threshold: f64) -> bool {
self.utilization_percent() > threshold
}
}
pub struct MultiGpuMemoryManager {
pools: HashMap<u32, Arc<Mutex<GpuMemoryPool>>>,
allocation_strategy: AllocationStrategy,
bandwidth_config: BandwidthConfig,
}
impl MultiGpuMemoryManager {
pub fn new() -> Self {
Self {
pools: HashMap::new(),
allocation_strategy: AllocationStrategy::Pooled,
bandwidth_config: BandwidthConfig::default(),
}
}
pub fn add_device(&mut self, device: GpuDevice) {
let pool = Arc::new(Mutex::new(GpuMemoryPool::new(device.clone())));
self.pools.insert(device.id, pool);
}
pub fn allocate_on_device(
&self,
device_id: u32,
size: usize,
) -> Result<GpuMemoryBlock, SimdError> {
if let Some(pool) = self.pools.get(&device_id) {
#[cfg(not(feature = "no-std"))]
let mut pool = pool.lock().map_err(|_| {
SimdError::ExternalLibraryError("Failed to lock memory pool".to_string())
})?;
#[cfg(feature = "no-std")]
let mut pool = pool.lock();
pool.allocate(size, self.allocation_strategy)
} else {
Err(SimdError::InvalidParameter {
name: "device_id".to_string(),
value: format!("Device {} not found", device_id),
})
}
}
pub fn allocate_on_best_device(&self, size: usize) -> Result<(u32, GpuMemoryBlock), SimdError> {
let best_device = self.find_best_device_for_allocation(size)?;
let block = self.allocate_on_device(best_device, size)?;
Ok((best_device, block))
}
pub fn deallocate_on_device(&self, device_id: u32, ptr: *mut u8) -> Result<(), SimdError> {
if let Some(pool) = self.pools.get(&device_id) {
#[cfg(not(feature = "no-std"))]
let mut pool = pool.lock().map_err(|_| {
SimdError::ExternalLibraryError("Failed to lock memory pool".to_string())
})?;
#[cfg(feature = "no-std")]
let mut pool = pool.lock();
pool.deallocate(ptr)
} else {
Err(SimdError::InvalidParameter {
name: "device_id".to_string(),
value: format!("Device {} not found", device_id),
})
}
}
pub fn get_all_stats(&self) -> HashMap<u32, MemoryStats> {
let mut stats = HashMap::new();
for (&device_id, pool) in &self.pools {
#[cfg(not(feature = "no-std"))]
{
if let Ok(pool) = pool.lock() {
stats.insert(device_id, pool.get_stats());
}
}
#[cfg(feature = "no-std")]
{
let pool = pool.lock();
stats.insert(device_id, pool.get_stats());
}
}
stats
}
fn find_best_device_for_allocation(&self, size: usize) -> Result<u32, SimdError> {
let mut best_device = None;
let mut min_usage = f64::INFINITY;
for (&device_id, pool) in &self.pools {
#[cfg(not(feature = "no-std"))]
let pool_result = pool.lock();
#[cfg(feature = "no-std")]
let pool_result: Result<_, ()> = Ok(pool.lock());
if let Ok(pool) = pool_result {
let stats = pool.get_stats();
let usage = stats.utilization_percent();
let available_mb =
stats.device_memory_mb - (stats.total_allocated / (1024 * 1024)) as u64;
let required_mb = (size / (1024 * 1024)) as u64 + 1;
if available_mb >= required_mb && usage < min_usage {
min_usage = usage;
best_device = Some(device_id);
}
}
}
best_device.ok_or_else(|| {
SimdError::ExternalLibraryError("No suitable device found for allocation".to_string())
})
}
pub fn set_allocation_strategy(&mut self, strategy: AllocationStrategy) {
self.allocation_strategy = strategy;
}
pub fn set_bandwidth_config(&mut self, config: BandwidthConfig) {
self.bandwidth_config = config;
}
}
impl Default for MultiGpuMemoryManager {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
pub struct UnifiedMemoryBuffer<T> {
ptr: *mut T,
size: usize,
#[allow(dead_code)] device_id: u32,
}
impl<T> UnifiedMemoryBuffer<T> {
pub fn new(size: usize, device_id: u32) -> Result<Self, SimdError> {
let _ = (size, device_id);
Err(SimdError::UnsupportedOperation(
"CUDA unified memory not available".to_string(),
))
}
pub fn as_mut_slice(&mut self) -> &mut [T] {
unsafe { slice::from_raw_parts_mut(self.ptr, self.size) }
}
pub fn as_slice(&self) -> &[T] {
unsafe { slice::from_raw_parts(self.ptr, self.size) }
}
pub fn prefetch_to_gpu(&self) -> Result<(), SimdError> {
Err(SimdError::UnsupportedOperation(
"CUDA not available".to_string(),
))
}
pub fn prefetch_to_cpu(&self) -> Result<(), SimdError> {
Err(SimdError::UnsupportedOperation(
"CUDA not available".to_string(),
))
}
}
impl<T> Drop for UnifiedMemoryBuffer<T> {
fn drop(&mut self) {
}
}
unsafe impl<T: Send> Send for UnifiedMemoryBuffer<T> {}
unsafe impl<T: Sync> Sync for UnifiedMemoryBuffer<T> {}
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
use super::*;
use crate::gpu::GpuBackend;
#[cfg(feature = "no-std")]
use alloc::{
string::{String, ToString},
vec,
vec::Vec,
};
#[test]
fn test_memory_pool_creation() {
let device = GpuDevice {
id: 0,
name: "Test Device".to_string(),
backend: GpuBackend::Cuda,
compute_units: 80,
memory_mb: 8192,
supports_f64: true,
supports_f16: true,
};
let pool = GpuMemoryPool::new(device);
let stats = pool.get_stats();
assert_eq!(stats.total_allocated, 0);
assert_eq!(stats.allocation_count, 0);
}
#[test]
fn test_size_class_calculation() {
let device = GpuDevice {
id: 0,
name: "Test Device".to_string(),
backend: GpuBackend::Cuda,
compute_units: 80,
memory_mb: 8192,
supports_f64: true,
supports_f16: true,
};
let pool = GpuMemoryPool::new(device);
assert_eq!(pool.get_size_class(0), 1);
assert_eq!(pool.get_size_class(1), 2);
assert_eq!(pool.get_size_class(1000), 1024);
assert_eq!(pool.get_size_class(1024), 2048);
}
#[test]
fn test_memory_stats() {
let stats = MemoryStats {
total_allocated: 1024 * 1024, peak_usage: 2 * 1024 * 1024, allocation_count: 5,
free_blocks_count: 2,
device_memory_mb: 1024, };
assert!((stats.utilization_percent() - 0.09765625).abs() < 0.001); assert!(!stats.is_high_usage(50.0));
assert!(stats.is_high_usage(0.05));
}
#[test]
fn test_multi_gpu_manager() {
let mut manager = MultiGpuMemoryManager::new();
let device1 = GpuDevice {
id: 0,
name: "Device 0".to_string(),
backend: GpuBackend::Cuda,
compute_units: 80,
memory_mb: 8192,
supports_f64: true,
supports_f16: true,
};
let device2 = GpuDevice {
id: 1,
name: "Device 1".to_string(),
backend: GpuBackend::Cuda,
compute_units: 40,
memory_mb: 4096,
supports_f64: true,
supports_f16: true,
};
manager.add_device(device1);
manager.add_device(device2);
let stats = manager.get_all_stats();
assert_eq!(stats.len(), 2);
assert!(stats.contains_key(&0));
assert!(stats.contains_key(&1));
}
#[test]
fn test_bandwidth_config() {
let config = BandwidthConfig::default();
assert!(config.use_async_transfers);
assert!(config.prefer_pinned_memory);
assert!(config.coalesce_transfers);
assert_eq!(config.max_concurrent_streams, 4);
}
#[test]
fn test_allocation_strategies() {
let strategies = vec![
AllocationStrategy::Simple,
AllocationStrategy::Pooled,
AllocationStrategy::Unified,
AllocationStrategy::Pinned,
];
for strategy in strategies {
let _ = match strategy {
AllocationStrategy::Simple => 0,
AllocationStrategy::Pooled => 1,
AllocationStrategy::Unified => 2,
AllocationStrategy::Pinned => 3,
};
}
}
}