use crate::error::Result;
use crate::prelude::{Complex64, SimulatorError};
#[cfg(feature = "advanced_math")]
use std::collections::HashMap;
/// Address of a device-side allocation, represented as a plain `usize`.
///
/// `GpuMemoryPool` uses values of this type as keys for its map of
/// allocated blocks.
#[cfg(feature = "advanced_math")]
pub type CudaDevicePointer = usize;
/// Kind of memory that backs a GPU buffer.
///
/// `GpuMemory` matches on this value to pick its allocation and copy path.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GpuMemoryType {
    /// Memory addressed through a device pointer only.
    Device,
    /// Memory addressed through a host pointer only.
    Host,
    /// Memory for which both a device pointer and a host pointer are recorded.
    Unified,
    /// Host-pointer memory obtained through the pinned allocation path.
    Pinned,
}
/// Handle to a single buffer of one [`GpuMemoryType`].
///
/// Which of the two pointer fields is populated depends on `memory_type`.
#[cfg(feature = "advanced_math")]
pub struct GpuMemory {
    /// Size in bytes recorded by the last successful allocation; `0` before any.
    allocated: usize,
    /// Device-side address; set for `Device` and `Unified` buffers.
    device_ptr: Option<CudaDevicePointer>,
    /// Host-side address; set for `Host`, `Pinned` and `Unified` buffers.
    host_ptr: Option<*mut std::ffi::c_void>,
    /// Memory kind fixed when the handle is constructed.
    memory_type: GpuMemoryType,
    /// Alignment in bytes; the constructors in this file initialise it to 256.
    alignment: usize,
}
/// Book-keeping for a pool of reusable device memory blocks.
#[cfg(feature = "advanced_math")]
pub struct GpuMemoryPool {
    /// Blocks currently handed out, keyed by their device pointer.
    pub allocated_blocks: HashMap<usize, GpuMemoryBlock>,
    /// Blocks that were returned to the pool and may be handed out again.
    pub free_blocks: Vec<GpuMemoryBlock>,
    /// Running byte count maintained by `allocate` and `deallocate`.
    pub total_allocated: usize,
    /// Largest value `total_allocated` has reached so far.
    pub peak_usage: usize,
}
/// Descriptor of one block managed by [`GpuMemoryPool`].
#[cfg(feature = "advanced_math")]
#[derive(Clone)]
pub struct GpuMemoryBlock {
    /// Device address of the block; also its key in the pool's map.
    pub ptr: CudaDevicePointer,
    /// Size of the block in bytes.
    pub size: usize,
    /// Alignment of the block in bytes.
    pub alignment: usize,
    /// `true` while the block is handed out, `false` while it sits in the free list.
    pub in_use: bool,
}
#[cfg(feature = "advanced_math")]
impl GpuMemory {
    /// Alignment, in bytes, used for host allocations and simulated device addresses.
    const ALIGNMENT: usize = 256;

    /// Creates an unallocated handle for [`GpuMemoryType::Device`] memory.
    pub fn new() -> Self {
        Self::new_with_type(GpuMemoryType::Device)
    }

    /// Creates an unallocated handle for the given kind of memory.
    pub fn new_with_type(memory_type: GpuMemoryType) -> Self {
        Self {
            allocated: 0,
            device_ptr: None,
            host_ptr: None,
            memory_type,
            alignment: Self::ALIGNMENT,
        }
    }

    /// Allocates `size` bytes of this handle's memory type.
    ///
    /// Any buffer the handle already owns is released once the new allocation
    /// has succeeded, so calling this repeatedly does not leak.
    ///
    /// # Errors
    ///
    /// Returns an error when `size` is zero or the allocation fails. In that
    /// case the previously owned buffer (if any) is left untouched.
    pub fn allocate_pool(&mut self, size: usize) -> Result<()> {
        // Allocate first so that a failure leaves the current buffer intact.
        let (device_ptr, host_ptr) = match self.memory_type {
            GpuMemoryType::Device => (Some(Self::cuda_malloc(size)?), None),
            GpuMemoryType::Host => (None, Some(Self::cuda_malloc_host(size)?)),
            GpuMemoryType::Unified => {
                // Managed memory is visible under one address from both sides.
                let ptr = Self::cuda_malloc_managed(size)?;
                (Some(ptr as CudaDevicePointer), Some(ptr))
            }
            GpuMemoryType::Pinned => (None, Some(Self::cuda_host_alloc(size)?)),
        };

        self.release();
        self.device_ptr = device_ptr;
        self.host_ptr = host_ptr;
        self.allocated = size;
        Ok(())
    }

    /// Allocates a new buffer of the same memory type as `self` and fills it with `data`.
    ///
    /// # Errors
    ///
    /// Returns an error when `data` is empty (zero-byte allocations are
    /// rejected) or when the allocation or copy fails.
    pub fn allocate_and_copy(&mut self, data: &[Complex64]) -> Result<GpuMemory> {
        let size = std::mem::size_of_val(data);
        let mut gpu_memory = GpuMemory::new_with_type(self.memory_type);
        gpu_memory.allocate_pool(size)?;
        gpu_memory.copy_from_host(data)?;
        Ok(gpu_memory)
    }

    /// Copies `data` into this buffer.
    ///
    /// # Errors
    ///
    /// Returns `SimulatorError::InvalidInput` when `data` is larger than the
    /// allocated buffer, and forwards any error from the transfer itself.
    pub fn copy_from_host(&mut self, data: &[Complex64]) -> Result<()> {
        let size = std::mem::size_of_val(data);
        self.ensure_fits(size)?;

        match self.memory_type {
            GpuMemoryType::Device => {
                if let Some(device_ptr) = self.device_ptr {
                    Self::cuda_memcpy_h2d(
                        device_ptr,
                        data.as_ptr() as *const std::ffi::c_void,
                        size,
                    )?;
                }
            }
            GpuMemoryType::Host | GpuMemoryType::Pinned => {
                if let Some(host_ptr) = self.host_ptr {
                    // SAFETY: `host_ptr` refers to a live allocation of
                    // `self.allocated` bytes owned by this handle, `size` was
                    // checked against that length, and `data` is a valid
                    // slice of exactly `size` bytes.
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            data.as_ptr() as *const u8,
                            host_ptr as *mut u8,
                            size,
                        );
                    }
                }
            }
            GpuMemoryType::Unified => {
                if let Some(host_ptr) = self.host_ptr {
                    // SAFETY: same invariants as the host/pinned branch.
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            data.as_ptr() as *const u8,
                            host_ptr as *mut u8,
                            size,
                        );
                    }
                    Self::cuda_mem_prefetch_async(self.device_ptr.unwrap_or(0), size, 0)?;
                }
            }
        }
        Ok(())
    }

    /// Returns the buffer address as a raw pointer, or null when nothing is allocated.
    ///
    /// `Device` buffers report their device address; every other kind reports
    /// the host address.
    pub fn as_ptr(&self) -> *const std::ffi::c_void {
        match self.memory_type {
            GpuMemoryType::Device => self
                .device_ptr
                .map_or(std::ptr::null(), |p| p as *const std::ffi::c_void),
            GpuMemoryType::Host | GpuMemoryType::Unified | GpuMemoryType::Pinned => self
                .host_ptr
                .map_or(std::ptr::null(), |p| p as *const std::ffi::c_void),
        }
    }

    /// Returns the device address, if this handle has one.
    pub fn as_device_ptr(&self) -> Option<CudaDevicePointer> {
        self.device_ptr
    }

    /// Copies the buffer contents into `data`.
    ///
    /// # Errors
    ///
    /// Returns `SimulatorError::InvalidInput` when `data` is larger than the
    /// allocated buffer, and forwards any error from the transfer itself.
    pub fn copy_to_host(&self, data: &mut [Complex64]) -> Result<()> {
        let size = std::mem::size_of_val(data);
        self.ensure_fits(size)?;

        match self.memory_type {
            GpuMemoryType::Device => {
                if let Some(device_ptr) = self.device_ptr {
                    Self::cuda_memcpy_d2h(
                        data.as_mut_ptr() as *mut std::ffi::c_void,
                        device_ptr,
                        size,
                    )?;
                }
            }
            GpuMemoryType::Host | GpuMemoryType::Pinned => {
                if let Some(host_ptr) = self.host_ptr {
                    // SAFETY: `host_ptr` refers to a live allocation of
                    // `self.allocated` bytes owned by this handle, `size` was
                    // checked against that length, and `data` is an exclusive
                    // slice of exactly `size` bytes.
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            host_ptr as *const u8,
                            data.as_mut_ptr() as *mut u8,
                            size,
                        );
                    }
                }
            }
            GpuMemoryType::Unified => {
                if let Some(host_ptr) = self.host_ptr {
                    // Synchronise before reading the host view of the buffer.
                    Self::cuda_device_synchronize()?;
                    // SAFETY: same invariants as the host/pinned branch.
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            host_ptr as *const u8,
                            data.as_mut_ptr() as *mut u8,
                            size,
                        );
                    }
                }
            }
        }
        Ok(())
    }

    /// Returns the size in bytes of the current allocation (`0` if none).
    pub fn get_size(&self) -> usize {
        self.allocated
    }

    /// Returns the memory kind this handle was created with.
    pub fn get_memory_type(&self) -> GpuMemoryType {
        self.memory_type
    }

    /// Rejects transfers that would run past the end of the allocation.
    fn ensure_fits(&self, size: usize) -> Result<()> {
        if size > self.allocated {
            return Err(SimulatorError::InvalidInput(format!(
                "Transfer of {} bytes exceeds allocated size of {} bytes",
                size, self.allocated
            )));
        }
        Ok(())
    }

    /// Frees whatever this handle owns and resets it to the unallocated state.
    fn release(&mut self) {
        let size = self.allocated;
        let device_ptr = self.device_ptr.take();
        let host_ptr = self.host_ptr.take();
        self.allocated = 0;

        // Errors are ignored on purpose: this also runs from `Drop`, where
        // they cannot be reported.
        match self.memory_type {
            GpuMemoryType::Device => {
                if let Some(ptr) = device_ptr {
                    let _ = Self::cuda_free(ptr);
                }
            }
            GpuMemoryType::Host | GpuMemoryType::Pinned => {
                if let Some(ptr) = host_ptr {
                    let _ = Self::cuda_free_host(ptr, size);
                }
            }
            GpuMemoryType::Unified => {
                // Both pointers alias one allocation, so free it exactly once.
                if let Some(ptr) = host_ptr {
                    let _ = Self::cuda_free_managed(ptr, size);
                }
            }
        }
    }

    /// Simulated device allocation: hands out a fresh, aligned address.
    ///
    /// No memory is reserved. Addresses come from a process-wide counter so
    /// that live allocations never share a pointer; callers that key maps by
    /// device pointer depend on that.
    fn cuda_malloc(size: usize) -> Result<CudaDevicePointer> {
        static NEXT_DEVICE_ADDRESS: std::sync::atomic::AtomicUsize =
            std::sync::atomic::AtomicUsize::new(GpuMemory::ALIGNMENT);

        if size == 0 {
            return Err(SimulatorError::InvalidInput(
                "Cannot allocate zero bytes".to_string(),
            ));
        }
        // Round the reserved span up so that the next address stays aligned.
        let span = size
            .checked_add(Self::ALIGNMENT - 1)
            .map(|padded| padded & !(Self::ALIGNMENT - 1))
            .ok_or_else(|| {
                SimulatorError::ResourceExhausted(
                    "Requested device allocation is too large".to_string(),
                )
            })?;
        Ok(NEXT_DEVICE_ADDRESS.fetch_add(span, std::sync::atomic::Ordering::Relaxed))
    }

    /// Allocates `size` bytes of host memory with [`Self::ALIGNMENT`] alignment.
    fn cuda_malloc_host(size: usize) -> Result<*mut std::ffi::c_void> {
        // The global allocator must not be asked for a zero-sized block.
        if size == 0 {
            return Err(SimulatorError::InvalidInput(
                "Cannot allocate zero bytes".to_string(),
            ));
        }
        let layout = std::alloc::Layout::from_size_align(size, Self::ALIGNMENT)
            .map_err(|e| SimulatorError::InvalidInput(format!("Invalid memory layout: {}", e)))?;
        // SAFETY: `layout` has a non-zero size (checked above).
        let ptr = unsafe { std::alloc::alloc(layout) };
        if ptr.is_null() {
            Err(SimulatorError::ResourceExhausted(
                "Failed to allocate host memory".to_string(),
            ))
        } else {
            Ok(ptr as *mut std::ffi::c_void)
        }
    }

    /// Simulated managed allocation; backed by a host allocation.
    fn cuda_malloc_managed(size: usize) -> Result<*mut std::ffi::c_void> {
        Self::cuda_malloc_host(size)
    }

    /// Simulated pinned allocation; backed by a host allocation.
    fn cuda_host_alloc(size: usize) -> Result<*mut std::ffi::c_void> {
        Self::cuda_malloc_host(size)
    }

    /// Host-to-device copy stub; performs no work.
    fn cuda_memcpy_h2d(
        _dst: CudaDevicePointer,
        _src: *const std::ffi::c_void,
        _size: usize,
    ) -> Result<()> {
        Ok(())
    }

    /// Device-to-host copy stub; performs no work.
    fn cuda_memcpy_d2h(
        _dst: *mut std::ffi::c_void,
        _src: CudaDevicePointer,
        _size: usize,
    ) -> Result<()> {
        Ok(())
    }

    /// Prefetch stub; performs no work.
    fn cuda_mem_prefetch_async(_ptr: CudaDevicePointer, _size: usize, _device: i32) -> Result<()> {
        Ok(())
    }

    /// Device synchronisation stub; performs no work.
    fn cuda_device_synchronize() -> Result<()> {
        Ok(())
    }

    /// Device free stub; performs no work.
    fn cuda_free(_ptr: CudaDevicePointer) -> Result<()> {
        Ok(())
    }

    /// Frees host memory obtained from [`Self::cuda_malloc_host`].
    ///
    /// `size` must be the size that was passed at allocation time, because the
    /// allocator requires the layout used for deallocation to match the one
    /// used for allocation.
    fn cuda_free_host(ptr: *mut std::ffi::c_void, size: usize) -> Result<()> {
        if ptr.is_null() || size == 0 {
            return Ok(());
        }
        let layout = std::alloc::Layout::from_size_align(size, Self::ALIGNMENT)
            .map_err(|e| SimulatorError::InvalidInput(format!("Invalid memory layout: {}", e)))?;
        // SAFETY: `ptr` came from `std::alloc::alloc` with this exact layout
        // and has not been freed yet; `release` clears the stored pointer
        // before calling here.
        unsafe {
            std::alloc::dealloc(ptr as *mut u8, layout);
        }
        Ok(())
    }

    /// Frees memory obtained from [`Self::cuda_malloc_managed`].
    fn cuda_free_managed(ptr: *mut std::ffi::c_void, size: usize) -> Result<()> {
        Self::cuda_free_host(ptr, size)
    }
}

#[cfg(feature = "advanced_math")]
impl Drop for GpuMemory {
    /// Releases the buffer owned by this handle, whatever its memory type.
    fn drop(&mut self) {
        self.release();
    }
}
#[cfg(feature = "advanced_math")]
impl GpuMemoryPool {
    /// Creates an empty pool with zeroed usage counters.
    pub fn new() -> Self {
        Self {
            allocated_blocks: HashMap::new(),
            free_blocks: Vec::new(),
            total_allocated: 0,
            peak_usage: 0,
        }
    }

    /// Hands out a block of at least `size` bytes.
    ///
    /// The first free block that is large enough is reused; otherwise a new
    /// block is obtained from `GpuMemory::cuda_malloc`. The returned block's
    /// `size` may therefore exceed the requested size.
    ///
    /// Live blocks are tracked in a map keyed by device pointer, so this
    /// relies on the allocator returning a distinct pointer for every live
    /// allocation.
    ///
    /// # Errors
    ///
    /// Returns `SimulatorError::InvalidInput` for a zero-byte request, and
    /// forwards any allocator error.
    pub fn allocate(&mut self, size: usize) -> Result<GpuMemoryBlock> {
        // Reject zero up front so the outcome does not depend on whether the
        // free list happens to contain a block.
        if size == 0 {
            return Err(SimulatorError::InvalidInput(
                "Cannot allocate zero bytes".to_string(),
            ));
        }

        let reusable = self
            .free_blocks
            .iter()
            .position(|block| block.size >= size && !block.in_use);

        let block = match reusable {
            Some(index) => {
                let mut block = self.free_blocks.remove(index);
                block.in_use = true;
                block
            }
            None => GpuMemoryBlock {
                ptr: GpuMemory::cuda_malloc(size)?,
                size,
                alignment: 256,
                in_use: true,
            },
        };

        self.allocated_blocks.insert(block.ptr, block.clone());
        // Count reused blocks too: `deallocate` subtracts `block.size` for
        // every block it returns to the free list, so the addition here has
        // to mirror it or the counter underflows.
        self.total_allocated += block.size;
        self.peak_usage = self.peak_usage.max(self.total_allocated);
        Ok(block)
    }

    /// Returns the block at `ptr` to the free list.
    ///
    /// # Errors
    ///
    /// Returns `SimulatorError::InvalidInput` when `ptr` does not belong to a
    /// block currently handed out by this pool.
    pub fn deallocate(&mut self, ptr: CudaDevicePointer) -> Result<()> {
        let mut block = self.allocated_blocks.remove(&ptr).ok_or_else(|| {
            SimulatorError::InvalidInput("Attempting to free unknown pointer".to_string())
        })?;
        block.in_use = false;
        // The counters are public fields, so guard against external edits
        // that would otherwise make this subtraction underflow.
        self.total_allocated = self.total_allocated.saturating_sub(block.size);
        self.free_blocks.push(block);
        Ok(())
    }

    /// Returns the number of bytes currently handed out.
    pub fn get_total_allocated(&self) -> usize {
        self.total_allocated
    }

    /// Returns the highest value the handed-out byte count has reached.
    pub fn get_peak_usage(&self) -> usize {
        self.peak_usage
    }
}
#[cfg(feature = "advanced_math")]
impl GpuMemoryBlock {
    /// Describes `size` bytes at `ptr` as a block that is not in use.
    ///
    /// The alignment is set to 256 bytes.
    pub fn new(ptr: CudaDevicePointer, size: usize) -> Self {
        let alignment = 256;
        let in_use = false;
        GpuMemoryBlock {
            ptr,
            size,
            alignment,
            in_use,
        }
    }
}