use crate::buffer::CudaBuffer;
use crate::device::GpuDevice;
use crate::error::{GpuError, GpuResult};
#[cfg(feature = "cuda")]
/// Copies a host slice into freshly allocated device memory.
///
/// The returned buffer is unpooled (`pool_fn: None`), so its device
/// allocation is released outright when the buffer is dropped rather than
/// being returned to the allocation pool.
///
/// # Errors
/// Propagates any driver error raised while allocating or copying.
pub fn cpu_to_gpu<T>(data: &[T], device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr,
{
    let n = data.len();
    let device_slice = device.stream().clone_htod(data)?;
    let buffer = CudaBuffer {
        data: Some(device_slice),
        len: n,
        alloc_len: n,
        device_ordinal: device.ordinal(),
        pool_fn: None,
    };
    Ok(buffer)
}
#[cfg(feature = "cuda")]
/// Downloads a device buffer back into a host `Vec`.
///
/// The full allocation is copied and then truncated to the buffer's logical
/// length, since pooled buffers may own more elements (`alloc_len`) than
/// they logically hold (`len`).
///
/// # Errors
/// Returns `GpuError::DeviceMismatch` when `buffer` lives on a different
/// device ordinal than `device`; otherwise propagates driver errors.
pub fn gpu_to_cpu<T>(buffer: &CudaBuffer<T>, device: &GpuDevice) -> GpuResult<Vec<T>>
where
    T: cudarc::driver::DeviceRepr,
{
    let (expected, got) = (buffer.device_ordinal(), device.ordinal());
    if expected != got {
        return Err(GpuError::DeviceMismatch { expected, got });
    }
    let mut host = device.stream().clone_dtoh(buffer.inner())?;
    // Drop any rounded-up tail so callers only see the logical elements.
    host.truncate(buffer.len());
    Ok(host)
}
#[cfg(feature = "cuda")]
/// Allocates a zero-filled `f32` device buffer, reusing the per-device
/// allocation pool when a suitably sized cached slice is available.
///
/// The requested length is rounded up via `pool::round_len`; the returned
/// buffer reports the logical `len` but owns `rounded` elements and is
/// handed back to the pool on drop (`new_pooled`).
///
/// # Errors
/// Propagates driver errors from allocation or the zeroing memset.
pub fn alloc_zeros_f32(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
    use cudarc::driver::CudaSlice;
    let rounded = crate::pool::round_len(len);
    let ordinal = device.ordinal();
    // Prefer a cached allocation; it must be re-zeroed because its previous
    // owner may have written to it. The `4` matches size_of::<f32>() —
    // presumably the pool's element-size key.
    let slice = match crate::pool::pool_take::<CudaSlice<f32>>(ordinal, rounded, 4) {
        Some(mut reused) => {
            device.stream().memset_zeros(&mut reused)?;
            reused
        }
        None => device.stream().alloc_zeros::<f32>(rounded)?,
    };
    Ok(CudaBuffer::<f32>::new_pooled(slice, len, rounded, ordinal))
}
#[cfg(feature = "cuda")]
/// Allocates a zero-filled `f64` device buffer, reusing the per-device
/// allocation pool when a suitably sized cached slice is available.
///
/// Mirrors `alloc_zeros_f32`: the length is rounded up via
/// `pool::round_len`, and the returned buffer (logical `len`, owned
/// `rounded` elements) is handed back to the pool on drop.
///
/// # Errors
/// Propagates driver errors from allocation or the zeroing memset.
pub fn alloc_zeros_f64(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
    use cudarc::driver::CudaSlice;
    let rounded = crate::pool::round_len(len);
    let ordinal = device.ordinal();
    // Reused pool slices carry stale data and must be re-zeroed. The `8`
    // matches size_of::<f64>() — presumably the pool's element-size key.
    let slice = match crate::pool::pool_take::<CudaSlice<f64>>(ordinal, rounded, 8) {
        Some(mut reused) => {
            device.stream().memset_zeros(&mut reused)?;
            reused
        }
        None => device.stream().alloc_zeros::<f64>(rounded)?,
    };
    Ok(CudaBuffer::<f64>::new_pooled(slice, len, rounded, ordinal))
}
#[cfg(feature = "cuda")]
/// Allocates a zero-initialized device buffer of exactly `len` elements.
///
/// Unlike the `f32`/`f64` variants, this generic path does not use the
/// allocation pool: every call performs a fresh device allocation, and the
/// buffer frees it directly on drop (`pool_fn: None`).
///
/// # Errors
/// Propagates any driver error raised during allocation.
pub fn alloc_zeros<T>(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + cudarc::driver::ValidAsZeroBits,
{
    let zeroed = device.stream().alloc_zeros::<T>(len)?;
    let buffer = CudaBuffer {
        data: Some(zeroed),
        len,
        alloc_len: len,
        device_ordinal: device.ordinal(),
        pool_fn: None,
    };
    Ok(buffer)
}
#[cfg(feature = "cuda")]
/// Copies host data to the GPU through a page-locked (pinned) staging
/// buffer, which typically enables faster DMA transfers than pageable
/// host memory.
///
/// The returned buffer is unpooled (`pool_fn: None`).
///
/// # Errors
/// Propagates driver errors from the pinned allocation, the staging copy,
/// the device copy, or stream synchronization.
pub fn cpu_to_gpu_pinned<T>(data: &[T], device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + cudarc::driver::ValidAsZeroBits + Copy,
{
    let ctx = device.context();
    let stream = device.stream();
    // SAFETY: the pinned allocation is fully initialized by the
    // `copy_from_slice` below before it is ever read.
    let mut pinned = unsafe { ctx.alloc_pinned::<T>(data.len())? };
    pinned.as_mut_slice()?.copy_from_slice(data);
    let slice = stream.clone_htod(&pinned)?;
    // BUGFIX: host-to-device copies from pinned memory are issued
    // asynchronously on the stream; freeing the staging buffer before the
    // copy completes would be a host-side use-after-free. Synchronize
    // before dropping `pinned` (a no-op cost-wise if the copy already
    // completed synchronously).
    stream.synchronize()?;
    drop(pinned);
    Ok(CudaBuffer {
        data: Some(slice),
        len: data.len(),
        alloc_len: data.len(),
        device_ordinal: device.ordinal(),
        pool_fn: None,
    })
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn cpu_to_gpu_pinned<T>(_data: &[T], _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn cpu_to_gpu<T>(_data: &[T], _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn gpu_to_cpu<T>(_buffer: &CudaBuffer<T>, _device: &GpuDevice) -> GpuResult<Vec<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn alloc_zeros<T>(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn alloc_zeros_f32(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(not(feature = "cuda"))]
/// Fallback when the `cuda` feature is disabled: always reports that no
/// CUDA support was compiled in.
pub fn alloc_zeros_f64(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}
#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
    //! Smoke tests for host<->device transfers and pooled allocation.
    //!
    //! NOTE(review): these require a physical CUDA device at ordinal 0 and
    //! share global pool state (`crate::pool`), so the drop/assert ordering
    //! inside each test is load-bearing.
    use super::*;

    // Upload then download an f32 slice; values must survive unchanged.
    #[test]
    fn round_trip_f32() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let host: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let gpu_buf = cpu_to_gpu(&host, &device).expect("cpu_to_gpu");
        assert_eq!(gpu_buf.len(), 5);
        assert_eq!(gpu_buf.device_ordinal(), 0);
        let back = gpu_to_cpu(&gpu_buf, &device).expect("gpu_to_cpu");
        assert_eq!(back, host);
    }

    // Same round trip for f64, including negative, zero, and extreme
    // (f64::MAX) values to catch truncation or precision bugs.
    #[test]
    #[allow(clippy::approx_constant)] fn round_trip_f64() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let host: Vec<f64> = vec![1.0, -2.5, 3.14, 0.0, f64::MAX];
        let gpu_buf = cpu_to_gpu(&host, &device).expect("cpu_to_gpu");
        assert_eq!(gpu_buf.len(), 5);
        let back = gpu_to_cpu(&gpu_buf, &device).expect("gpu_to_cpu");
        assert_eq!(back, host);
    }

    // Pooled allocation: buffer is pool-backed (pool_fn set) and reads
    // back as all zeros.
    #[test]
    fn alloc_zeros_f32_basic() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let buf = alloc_zeros_f32(1024, &device).expect("alloc_zeros_f32");
        assert_eq!(buf.len(), 1024);
        assert!(buf.pool_fn.is_some());
        let host = gpu_to_cpu(&buf, &device).expect("gpu_to_cpu");
        assert!(host.iter().all(|&x| x == 0.0));
    }

    // Dropping a pooled buffer must cache its allocation, and a same-size
    // re-allocation must come back zeroed even though the pool slice was
    // previously used.
    #[test]
    fn pool_reuse_f32() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let buf = alloc_zeros_f32(512, &device).expect("alloc 1");
        assert!(buf.pool_fn.is_some());
        drop(buf);
        // The drop above should have returned the allocation to the pool.
        assert!(crate::pool::cached_bytes(0) > 0);
        let buf2 = alloc_zeros_f32(512, &device).expect("alloc 2");
        assert!(buf2.pool_fn.is_some());
        let host = gpu_to_cpu(&buf2, &device).expect("gpu_to_cpu");
        assert!(
            host.iter().all(|&x| x == 0.0),
            "pooled buffer must be zeroed"
        );
    }

    // empty_cache must release every cached allocation for the device.
    #[test]
    fn empty_cache_clears_pool() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let buf = alloc_zeros_f32(256, &device).expect("alloc");
        drop(buf);
        assert!(crate::pool::cached_bytes(0) > 0);
        crate::pool::empty_cache(0);
        assert_eq!(crate::pool::cached_bytes(0), 0);
    }

    // Generic (non-pooled) zero allocation also reads back as zeros.
    #[test]
    fn alloc_zeros_generic() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let buf = alloc_zeros::<f32>(1024, &device).expect("alloc_zeros");
        assert_eq!(buf.len(), 1024);
        let host = gpu_to_cpu(&buf, &device).expect("gpu_to_cpu");
        assert!(host.iter().all(|&x| x == 0.0));
    }

    // Zero-length transfers must succeed and round-trip to an empty Vec.
    #[test]
    fn empty_transfer() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let host: Vec<f32> = vec![];
        let gpu_buf = cpu_to_gpu(&host, &device).expect("cpu_to_gpu");
        assert_eq!(gpu_buf.len(), 0);
        assert!(gpu_buf.is_empty());
        let back = gpu_to_cpu(&gpu_buf, &device).expect("gpu_to_cpu");
        assert!(back.is_empty());
    }

    // A 1M-element transfer exercises the bulk-copy path end to end.
    #[test]
    fn large_transfer() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let n = 1_000_000;
        let host: Vec<f32> = (0..n).map(|i| i as f32).collect();
        let gpu_buf = cpu_to_gpu(&host, &device).expect("cpu_to_gpu");
        assert_eq!(gpu_buf.len(), n);
        let back = gpu_to_cpu(&gpu_buf, &device).expect("gpu_to_cpu");
        assert_eq!(back, host);
    }

    // Forging a wrong ordinal on the buffer must make gpu_to_cpu fail
    // with DeviceMismatch carrying both ordinals.
    #[test]
    fn device_mismatch_rejected() {
        let device = GpuDevice::new(0).expect("CUDA device 0");
        let host: Vec<f32> = vec![1.0];
        let mut buf = cpu_to_gpu(&host, &device).expect("cpu_to_gpu");
        buf.device_ordinal = 99;
        let err = gpu_to_cpu(&buf, &device).unwrap_err();
        match err {
            GpuError::DeviceMismatch {
                expected: 99,
                got: 0,
            } => {}
            other => panic!("unexpected error: {other}"),
        }
    }
}