use super::*;
use super::TransferError;
use crate::block_manager::block::{BlockDataProvider, BlockDataProviderMut};
use anyhow::Result;
use cudarc::driver::CudaStream;
use cudarc::driver::result as cuda_result;
use cudarc::driver::sys::{CUevent_flags, CUresult, cuMemcpyHtoDAsync_v2};
use dynamo_runtime::config::environment_names::cuda as env_cuda;
use std::ops::Range;
use std::sync::Mutex;
use std::sync::OnceLock;
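// Cached driver handles for the vectorized copy kernel. `CUmodule` and
// `CUfunction` are raw pointers (not `Send`/`Sync`), so they are stored as
// `usize` to make these statics shareable and cast back on use.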
static COPY_KERNEL_MODULE: Mutex<Option<usize>> = Mutex::new(None);
static COPY_KERNEL_FUNCTION: Mutex<Option<usize>> = Mutex::new(None);
type CudaMemcpyFnPtr = unsafe fn(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError>;
fn cuda_memcpy_fn_ptr(strategy: &TransferStrategy) -> Result<CudaMemcpyFnPtr, TransferError> {
match strategy {
TransferStrategy::CudaAsyncH2D => Ok(cuda_memcpy_h2d),
TransferStrategy::CudaAsyncD2H => Ok(cuda_memcpy_d2h),
TransferStrategy::CudaAsyncD2D => Ok(cuda_memcpy_d2d),
_ => Err(TransferError::ExecutionError(
"Unsupported copy strategy for CUDA memcpy async".into(),
)),
}
}
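/// Gathers the raw device addresses of every (block, layer, outer) layer view
/// on both sides of the transfer, in matching order, producing the flat
/// source/destination address arrays consumed by the copy kernel.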
fn collect_kv_addresses<Source, Destination>(
sources: &[Source],
destinations: &[Destination],
num_layers: usize,
num_outer_dims: usize,
) -> Result<(Vec<u64>, Vec<u64>), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
    if sources.is_empty() || sources.len() != destinations.len() {
        // The `zip` below would silently truncate on a length mismatch.
        return Err(TransferError::ExecutionError(format!(
            "Expected matching, non-empty block lists (got {} sources, {} destinations)",
            sources.len(),
            destinations.len()
        )));
    }
let total_address_pairs = sources.len() * num_layers * num_outer_dims;
let mut src_addresses = Vec::with_capacity(total_address_pairs);
let mut dst_addresses = Vec::with_capacity(total_address_pairs);
let src_block_data: Vec<_> = sources.iter().map(|block| block.block_data()).collect();
let dst_block_data: Vec<_> = destinations
.iter()
.map(|block| block.block_data())
.collect();
for (src_data, dst_data) in src_block_data.iter().zip(dst_block_data.iter()) {
for layer_idx in 0..num_layers {
for outer_idx in 0..num_outer_dims {
let src_view = src_data.layer_view(layer_idx, outer_idx)?;
let dst_view = dst_data.layer_view(layer_idx, outer_idx)?;
unsafe {
src_addresses.push(src_view.as_ptr() as u64);
dst_addresses.push(dst_view.as_ptr() as u64);
}
}
}
}
Ok((src_addresses, dst_addresses))
}
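/// Launches the vectorized copy kernel over `address_count` address pairs.
///
/// The grid is capped at 1024 blocks of 256 threads; this assumes the kernel
/// walks the pairs with a grid-stride loop, so counts above the cap are still
/// covered.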
unsafe fn launch_copy_kernel_direct(
src_pinned_ptr: u64,
dst_pinned_ptr: u64,
address_count: usize,
layer_size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
let kernel = get_copy_kernel()?;
    tracing::debug!(
        "launching vectorized copy kernel: {} pairs, src=0x{:x}, dst=0x{:x}",
        address_count,
        src_pinned_ptr,
        dst_pinned_ptr
    );
let threads_per_block = 256u32;
let max_blocks = 1024u32;
let blocks_needed = std::cmp::min(max_blocks, address_count as u32);
let grid_dim = (blocks_needed, 1, 1);
let block_dim = (threads_per_block, 1, 1);
let src_ptr_param = src_pinned_ptr;
let dst_ptr_param = dst_pinned_ptr;
let size_param = layer_size;
let num_pairs_param = address_count as i32;
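    // `cuLaunchKernel` takes an array of pointers, one per kernel parameter; the
    // locals above keep each argument value alive across the launch call.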
let params = [
&src_ptr_param as *const _ as *mut std::ffi::c_void,
&dst_ptr_param as *const _ as *mut std::ffi::c_void,
&size_param as *const _ as *mut std::ffi::c_void,
&num_pairs_param as *const _ as *mut std::ffi::c_void,
];
let result = unsafe {
cudarc::driver::sys::cuLaunchKernel(
kernel,
grid_dim.0,
grid_dim.1,
grid_dim.2,
block_dim.0,
block_dim.1,
block_dim.2,
            0, // sharedMemBytes
            stream.cu_stream(),
            params.as_ptr() as *mut *mut std::ffi::c_void,
            std::ptr::null_mut(), // extra (must be null when params are passed)
        )
};
if result != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::error!(
"Kernel launch failed: {:?} - kernel params: {} pairs, layer_size={}, src=0x{:x}, dst=0x{:x}",
result,
address_count,
layer_size,
src_pinned_ptr,
dst_pinned_ptr
);
return Err(TransferError::ExecutionError(format!(
"CUDA kernel launch failed: {:?} (address_count={}, layer_size={})",
result, address_count, layer_size
)));
}
Ok(())
}
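/// Block geometry probed from the first block seen and cached process-wide,
/// which assumes every block handled by this module shares a single layout.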
#[derive(Clone, Copy, Debug)]
struct CachedBlockDimensions {
num_layers: usize,
num_outer_dims: usize,
layer_size: usize,
}
static BLOCK_DIMENSIONS_CACHE: OnceLock<CachedBlockDimensions> = OnceLock::new();
fn get_cached_block_dimensions<T: BlockDataProvider>(
    block: &T,
) -> Result<CachedBlockDimensions, TransferError> {
    // Compute before seeding the cache so layout errors propagate instead of
    // panicking inside `get_or_init`; racing initializers are benign because
    // all blocks are assumed to share one layout.
    if let Some(dims) = BLOCK_DIMENSIONS_CACHE.get() {
        return Ok(*dims);
    }
    let dims = calculate_block_dimensions_from_layout(block)?;
    Ok(*BLOCK_DIMENSIONS_CACHE.get_or_init(|| dims))
}
fn calculate_block_dimensions_from_layout<T: BlockDataProvider>(
block: &T,
) -> Result<CachedBlockDimensions, TransferError> {
let block_data = block.block_data();
let num_layers = block_data.num_layers();
let num_outer_dims = block_data.num_outer_dims();
    // Propagate layout errors rather than caching a zero layer size (a no-op kernel).
    let layer_size = block_data.layer_view(0, 0)?.size();
Ok(CachedBlockDimensions {
num_layers,
num_outer_dims,
layer_size,
})
}
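/// Copies a batch of KV blocks with a single custom kernel launch instead of one
/// `cuMemcpyAsync` per layer view.
///
/// Flow: gather per-layer device addresses on the host, stage both address
/// arrays in stream-ordered pool allocations, launch the kernel over all pairs,
/// free the staging buffers (stream-ordered), and finally wait on the H2D event
/// so the host-side address vectors may be dropped.
///
/// A minimal usage sketch (`src_blocks`, `dst_blocks`, `stream`, and `ctx` are
/// illustrative bindings, not defined in this module):
///
/// ```ignore
/// copy_blocks_with_customized_kernel(&src_blocks, &mut dst_blocks, &stream, &ctx)?;
/// stream.synchronize()?;
/// ```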
pub fn copy_blocks_with_customized_kernel<'a, Source, Destination>(
sources: &'a [Source],
destinations: &'a mut [Destination],
stream: &CudaStream,
ctx: &crate::block_manager::block::transfer::TransferContext,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
    let _context_guard = stream.context().bind_to_thread();
    // Avoid an out-of-bounds panic on an empty source slice before probing dims.
    let first_source = sources.first().ok_or_else(|| {
        TransferError::ExecutionError("No source blocks provided".to_string())
    })?;
    let dims = get_cached_block_dimensions(first_source)?;
    let (src_addresses, dst_addresses) =
        collect_kv_addresses(sources, destinations, dims.num_layers, dims.num_outer_dims)?;
tracing::debug!(
"Using vectorized_copy for {} blocks [{}L×{}O×{}B], {} address pairs",
sources.len(),
dims.num_layers,
dims.num_outer_dims,
dims.layer_size,
src_addresses.len()
);
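    // Stage both address arrays in device memory from the context's CUDA pool;
    // `alloc_async`/`free_async` are stream-ordered, so ordering against the
    // copies and the kernel launch below comes from the stream itself.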
let size = src_addresses.len() * std::mem::size_of::<u64>();
let pool = ctx.cuda_mem_pool().ok_or_else(|| {
TransferError::ExecutionError(
"TransferContext was not instantiated with a CudaPool; please report this error"
.to_string(),
)
})?;
let src_buffer = pool.alloc_async(size, stream).map_err(|e| {
TransferError::ExecutionError(format!("CUDA pool allocation failed: {}", e))
})?;
let dst_buffer = pool.alloc_async(size, stream).map_err(|e| {
TransferError::ExecutionError(format!("CUDA pool allocation failed: {}", e))
})?;
let result_src = unsafe {
cuMemcpyHtoDAsync_v2(
src_buffer,
src_addresses.as_ptr() as *const std::ffi::c_void,
size,
stream.cu_stream(),
)
};
    if result_src != CUresult::CUDA_SUCCESS {
        // Nothing else is enqueued yet; just release the allocations (best
        // effort) before bailing out.
        let _ = pool.free_async(src_buffer, stream);
        let _ = pool.free_async(dst_buffer, stream);
        return Err(TransferError::ExecutionError(format!(
            "H2D memcpy for src buffer failed: {:?}",
            result_src
        )));
    }
let result_dst = unsafe {
cuMemcpyHtoDAsync_v2(
dst_buffer,
dst_addresses.as_ptr() as *const std::ffi::c_void,
size,
stream.cu_stream(),
)
};
    if result_dst != CUresult::CUDA_SUCCESS {
        // The src staging copy may still be in flight; drain the stream before
        // `src_addresses` drops, then release the allocations (best effort).
        let _ = stream.synchronize();
        let _ = pool.free_async(src_buffer, stream);
        let _ = pool.free_async(dst_buffer, stream);
        return Err(TransferError::ExecutionError(format!(
            "H2D memcpy for dst buffer failed: {:?}",
            result_dst
        )));
    }
    // Records completion of both staging copies; synchronized before returning.
    let h2d_event = stream
        .record_event(Some(CUevent_flags::CU_EVENT_BLOCKING_SYNC))
        .map_err(|e| TransferError::ExecutionError(format!("Failed to record H2D event: {}", e)))?;
    let launch_result = unsafe {
        launch_copy_kernel_direct(
            src_buffer,
            dst_buffer,
            src_addresses.len(),
            dims.layer_size,
            stream,
        )
    };
    // Free the staging buffers regardless of launch outcome; the frees are
    // stream-ordered behind the kernel, so a successful launch still sees
    // valid memory.
    pool.free_async(src_buffer, stream)
        .map_err(|e| TransferError::ExecutionError(format!("Failed to free src buffer: {}", e)))?;
    pool.free_async(dst_buffer, stream)
        .map_err(|e| TransferError::ExecutionError(format!("Failed to free dst buffer: {}", e)))?;
    // Wait for the staging copies before `src_addresses`/`dst_addresses` drop;
    // the device reads them asynchronously.
    h2d_event
        .synchronize()
        .map_err(|e| TransferError::ExecutionError(format!("Failed to sync H2D event: {}", e)))?;
    launch_result?;
    Ok(())
}
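/// Copies a single block with plain async CUDA memcpys: one copy when both
/// sides are fully contiguous, otherwise one copy per (layer, outer) view via
/// `copy_layers`.
///
/// A minimal sketch, assuming `src` is a pinned-host block and `dst` a device
/// block (illustrative bindings, not defined here):
///
/// ```ignore
/// copy_block(&src, &mut dst, &stream, TransferStrategy::CudaAsyncH2D)?;
/// stream.synchronize()?;
/// ```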
pub fn copy_block<'a, Source, Destination>(
sources: &'a Source,
destinations: &'a mut Destination,
stream: &CudaStream,
strategy: TransferStrategy,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
let src_data = sources.block_data();
let dst_data = destinations.block_data_mut();
let memcpy_fn = cuda_memcpy_fn_ptr(&strategy)?;
#[cfg(debug_assertions)]
{
let expected_strategy =
expected_strategy::<Source::StorageType, Destination::StorageType>();
assert_eq!(strategy, expected_strategy);
}
if src_data.is_fully_contiguous() && dst_data.is_fully_contiguous() {
let src_view = src_data.block_view()?;
let mut dst_view = dst_data.block_view_mut()?;
debug_assert_eq!(src_view.size(), dst_view.size());
unsafe {
memcpy_fn(
src_view.as_ptr(),
dst_view.as_mut_ptr(),
src_view.size(),
stream,
)?;
}
} else {
assert_eq!(src_data.num_layers(), dst_data.num_layers());
copy_layers(
0..src_data.num_layers(),
sources,
destinations,
stream,
strategy,
)?;
}
Ok(())
}
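/// Copies a sub-range of layers between two blocks, one async memcpy per
/// (layer, outer) view; `copy_block` delegates here for non-contiguous layouts.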
pub fn copy_layers<'a, Source, Destination>(
layer_range: Range<usize>,
sources: &'a Source,
destinations: &'a mut Destination,
stream: &CudaStream,
strategy: TransferStrategy,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
let src_data = sources.block_data();
let dst_data = destinations.block_data_mut();
let memcpy_fn = cuda_memcpy_fn_ptr(&strategy)?;
#[cfg(debug_assertions)]
{
let expected_strategy =
expected_strategy::<Source::StorageType, Destination::StorageType>();
assert_eq!(strategy, expected_strategy);
}
for layer_idx in layer_range {
for outer_idx in 0..src_data.num_outer_dims() {
let src_view = src_data.layer_view(layer_idx, outer_idx)?;
let mut dst_view = dst_data.layer_view_mut(layer_idx, outer_idx)?;
debug_assert_eq!(src_view.size(), dst_view.size());
unsafe {
memcpy_fn(
src_view.as_ptr(),
dst_view.as_mut_ptr(),
src_view.size(),
stream,
)?;
}
}
}
Ok(())
}
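/// Derives the transfer strategy implied by a pair of storage types; used by
/// the debug assertions in `copy_block` and `copy_layers` to catch callers
/// passing a mismatched strategy.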
#[allow(dead_code)]
fn expected_strategy<Source: Storage, Dest: Storage>() -> TransferStrategy {
match (
std::any::TypeId::of::<Source>(),
std::any::TypeId::of::<Dest>(),
) {
(src, dst)
if src == std::any::TypeId::of::<PinnedStorage>()
&& dst == std::any::TypeId::of::<DeviceStorage>() =>
{
TransferStrategy::CudaAsyncH2D
}
(src, dst)
if src == std::any::TypeId::of::<DeviceStorage>()
&& dst == std::any::TypeId::of::<PinnedStorage>() =>
{
TransferStrategy::CudaAsyncD2H
}
(src, dst)
if src == std::any::TypeId::of::<DeviceStorage>()
&& dst == std::any::TypeId::of::<DeviceStorage>() =>
{
TransferStrategy::CudaAsyncD2D
}
_ => TransferStrategy::Invalid,
}
}
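/// Thin wrapper over `cuMemcpyHtoDAsync`: `src_ptr` must point to at least
/// `size` bytes of host memory (ideally pinned) and `dst_ptr` to device memory.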
#[inline(always)]
unsafe fn cuda_memcpy_h2d(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source host pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination device pointer is null");
unsafe {
let src_slice = std::slice::from_raw_parts(src_ptr, size);
cuda_result::memcpy_htod_async(dst_ptr as u64, src_slice, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA H2D memcpy failed: {}", e)))?
};
Ok(())
}
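/// Thin wrapper over `cuMemcpyDtoHAsync`; pointer roles are the reverse of
/// `cuda_memcpy_h2d`.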
#[inline(always)]
unsafe fn cuda_memcpy_d2h(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source device pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination host pointer is null");
unsafe {
let dst_slice = std::slice::from_raw_parts_mut(dst_ptr, size);
cuda_result::memcpy_dtoh_async(dst_slice, src_ptr as u64, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA D2H memcpy failed: {}", e)))?;
}
Ok(())
}
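/// Thin wrapper over `cuMemcpyDtoDAsync`; both pointers must reference device
/// memory, and the regions must not overlap (checked in debug builds).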
#[inline(always)]
unsafe fn cuda_memcpy_d2d(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source device pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination device pointer is null");
debug_assert!(
(src_ptr as usize + size <= dst_ptr as usize)
|| (dst_ptr as usize + size <= src_ptr as usize),
"Source and destination device memory regions must not overlap for D2D copy"
);
unsafe {
cuda_result::memcpy_dtod_async(dst_ptr as u64, src_ptr as u64, size, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA D2D memcpy failed: {}", e)))?
};
Ok(())
}
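/// Loads the copy-kernel module on first use and caches it for the process
/// lifetime, preferring the FATBIN embedded at build time over runtime paths.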
fn get_copy_kernel_module() -> Result<cudarc::driver::sys::CUmodule, TransferError> {
let mut module_guard = COPY_KERNEL_MODULE.lock().unwrap();
if let Some(module_ptr) = *module_guard {
return Ok(module_ptr as cudarc::driver::sys::CUmodule);
}
let module = match load_embedded_fatbin() {
Ok(module) => {
tracing::debug!("Successfully loaded embedded FATBIN module");
module
}
Err(embedded_err) => {
tracing::debug!("Embedded FATBIN loading failed: {:?}", embedded_err);
match load_runtime_fatbin() {
Ok(module) => {
tracing::debug!("Successfully loaded runtime FATBIN module");
module
}
Err(runtime_err) => {
tracing::error!(" Both FATBIN loading methods failed:");
tracing::error!(" Embedded error: {:?}", embedded_err);
tracing::error!(" Runtime error: {:?}", runtime_err);
return Err(TransferError::ExecutionError(
"No vectorized_copy FATBIN found (tried embedded and runtime paths)"
.to_string(),
));
}
}
}
};
let module_ptr = module as usize;
*module_guard = Some(module_ptr);
Ok(module as cudarc::driver::sys::CUmodule)
}
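/// Resolves the copy kernel from the cached module on first use. Note that the
/// exported symbol uses the British spelling `vectorised_copy`, unlike the
/// `vectorized_copy.fatbin` file name.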
fn get_copy_kernel() -> Result<cudarc::driver::sys::CUfunction, TransferError> {
let mut func_guard = COPY_KERNEL_FUNCTION.lock().unwrap();
if let Some(func_ptr) = *func_guard {
return Ok(func_ptr as cudarc::driver::sys::CUfunction);
}
let module = get_copy_kernel_module()?;
let func = unsafe {
let mut func = std::ptr::null_mut();
let func_name = std::ffi::CString::new("vectorised_copy").unwrap();
let result =
cudarc::driver::sys::cuModuleGetFunction(&mut func, module, func_name.as_ptr());
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
func
} else {
return Err(TransferError::ExecutionError(format!(
"Failed to get kernel function: {:?}",
result
)));
}
};
let func_ptr = func as usize;
*func_guard = Some(func_ptr);
Ok(func as cudarc::driver::sys::CUfunction)
}
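// `have_vec_copy_fatbin` is presumably emitted by the build script (via
// `cargo:rustc-cfg`) when the kernel was compiled into `OUT_DIR`; without it,
// the embedded loader is a stub that defers to the runtime loaders below.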
#[cfg(have_vec_copy_fatbin)]
fn load_embedded_fatbin() -> Result<cudarc::driver::sys::CUmodule, cudarc::driver::DriverError> {
const FATBIN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/vectorized_copy.fatbin"));
tracing::debug!("Loading embedded FATBIN ({} bytes)", FATBIN.len());
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
FATBIN.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!("Embedded FATBIN module loaded successfully: {:p}", module);
return Ok(module);
} else {
tracing::error!(
"Embedded FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
#[cfg(not(have_vec_copy_fatbin))]
fn load_embedded_fatbin() -> Result<cudarc::driver::sys::CUmodule, cudarc::driver::DriverError> {
tracing::debug!("No embedded FATBIN available (not compiled with have_vec_copy_fatbin)");
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
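/// Runtime fallbacks: first a FATBIN path supplied via the
/// `env_cuda::DYN_FATBIN_PATH` environment variable, then a source-tree
/// relative path useful during development.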
fn load_runtime_fatbin() -> Result<cudarc::driver::sys::CUmodule, cudarc::driver::DriverError> {
if let Ok(runtime_path) = std::env::var(env_cuda::DYN_FATBIN_PATH)
&& let Ok(fatbin_data) = std::fs::read(&runtime_path)
{
tracing::debug!("Loading FATBIN from runtime env var: {}", runtime_path);
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
fatbin_data.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!("Runtime FATBIN module loaded successfully: {:p}", module);
return Ok(module);
} else {
tracing::error!(
"Runtime FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
}
let runtime_paths = ["./src/block_manager/block/transfer/kernels/vectorized_copy.fatbin"];
for path in &runtime_paths {
if let Ok(fatbin_data) = std::fs::read(path) {
tracing::debug!("Loading FATBIN from runtime path: {}", path);
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
fatbin_data.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!(
"Runtime path FATBIN module loaded successfully: {:p}",
module
);
return Ok(module);
} else {
tracing::error!(
"Runtime path FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
} else {
tracing::debug!("Could not read FATBIN file: {}", path);
}
}
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
#[cfg(all(test, feature = "testing-cuda"))]
mod tests {
use super::*;
use crate::block_manager::storage::{
DeviceAllocator, PinnedAllocator, StorageAllocator, StorageMemset,
};
#[test]
fn test_memset_and_transfer() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let mut host = pinned_allocator.allocate(1024).unwrap();
let mut device = device_allocator.allocate(1024).unwrap();
StorageMemset::memset(&mut host, 42, 0, 1024).unwrap();
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 42));
}
unsafe {
cuda_memcpy_h2d(host.as_ptr(), device.as_mut_ptr(), 1024, stream.as_ref()).unwrap();
}
stream.synchronize().unwrap();
StorageMemset::memset(&mut host, 0, 0, 1024).unwrap();
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 0));
}
unsafe {
cuda_memcpy_d2h(device.as_ptr(), host.as_mut_ptr(), 1024, stream.as_ref()).unwrap();
}
stream.synchronize().unwrap();
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 42));
}
}
mod layout_transfer_tests {
use super::*;
use crate::block_manager::layout::{
FullyContiguous, GenericBlockLayout, LayerSeparate, LayoutConfig,
};
const TEST_NUM_BLOCKS: usize = 4;
const TEST_NUM_LAYERS: usize = 3;
const TEST_OUTER_DIM: usize = 2;
const TEST_PAGE_SIZE: usize = 8;
const TEST_INNER_DIM: usize = 16;
const TEST_DTYPE_WIDTH_BYTES: usize = 2;
fn create_test_config() -> LayoutConfig {
LayoutConfig {
num_blocks: TEST_NUM_BLOCKS,
num_layers: TEST_NUM_LAYERS,
outer_dim: TEST_OUTER_DIM,
page_size: TEST_PAGE_SIZE,
inner_dim: TEST_INNER_DIM,
                alignment: 256,
                dtype_width_bytes: TEST_DTYPE_WIDTH_BYTES,
}
}
#[test]
fn test_h2d_fc_host_to_ls_device() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
let host_layout = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
let device_layout = LayerSeparate::allocate(config, &device_allocator, true).unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
assert_eq!(
host_region.size(),
device_region.size(),
"Region size mismatch at ({}, {}, {})",
block_idx,
layer_idx,
outer_idx
);
let pattern =
((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(pattern);
}
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let expected_pattern =
((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
let mut verify_buffer =
pinned_allocator.allocate(host_region.size()).unwrap();
unsafe {
cuda_memcpy_d2h(
device_region.addr() as *const u8,
verify_buffer.as_mut_ptr(),
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
unsafe {
let verify_slice = std::slice::from_raw_parts(
verify_buffer.as_ptr(),
host_region.size(),
);
assert!(
verify_slice.iter().all(|&x| x == expected_pattern),
"Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
expected_pattern,
&verify_slice[0..std::cmp::min(8, verify_slice.len())]
);
}
}
}
}
}
#[test]
fn test_d2h_ls_device_to_fc_host() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
let device_layout =
LayerSeparate::allocate(config.clone(), &device_allocator, false).unwrap();
let host_layout = FullyContiguous::allocate(config, &pinned_allocator).unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x80;
let mut temp_buffer =
pinned_allocator.allocate(device_region.size()).unwrap();
unsafe {
let temp_slice = std::slice::from_raw_parts_mut(
temp_buffer.as_mut_ptr(),
device_region.size(),
);
temp_slice.fill(pattern);
}
unsafe {
cuda_memcpy_h2d(
temp_buffer.as_ptr(),
device_region.addr() as *mut u8,
device_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(0);
}
}
}
}
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_d2h(
device_region.addr() as *const u8,
host_region.addr() as *mut u8,
device_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let expected_pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x80;
unsafe {
let host_slice = std::slice::from_raw_parts(
host_region.addr() as *const u8,
host_region.size(),
);
assert!(
host_slice.iter().all(|&x| x == expected_pattern),
"Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
expected_pattern,
&host_slice[0..std::cmp::min(8, host_slice.len())]
);
}
}
}
}
}
#[test]
fn test_bidirectional_layout_transfers() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
let host_fc = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
let device_ls_outer =
LayerSeparate::allocate(config.clone(), &device_allocator, true).unwrap();
let device_ls_block =
LayerSeparate::allocate(config, &device_allocator, false).unwrap();
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let original_pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x40;
let host_region = host_fc
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(original_pattern);
}
let device_outer_region = device_ls_outer
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_outer_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
let device_block_region = device_ls_block
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_d2d(
device_outer_region.addr() as *const u8,
device_block_region.addr() as *mut u8,
device_outer_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(0);
}
unsafe {
cuda_memcpy_d2h(
device_block_region.addr() as *const u8,
host_region.addr() as *mut u8,
device_block_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts(
host_region.addr() as *const u8,
host_region.size(),
);
assert!(
host_slice.iter().all(|&x| x == original_pattern),
"Round-trip pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
original_pattern,
&host_slice[0..std::cmp::min(8, host_slice.len())]
);
}
}
}
}
}
#[test]
fn test_layout_transfer_alignment_performance() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
for alignment in [1, 64, 256, 512] {
let config = LayoutConfig {
num_blocks: 2,
num_layers: 2,
outer_dim: 1,
page_size: 1024,
inner_dim: 256,
alignment,
dtype_width_bytes: 4,
};
let host_layout =
FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
let device_layout = FullyContiguous::allocate(config, &device_allocator).unwrap();
let start = std::time::Instant::now();
for block_idx in 0..2 {
for layer_idx in 0..2 {
let host_region =
host_layout.memory_region(block_idx, layer_idx, 0).unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, 0)
.unwrap();
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
stream.synchronize().unwrap();
let duration = start.elapsed();
let region = host_layout.memory_region(0, 0, 0).unwrap();
if alignment > 1 {
assert_eq!(
region.addr() % alignment,
0,
"Memory not aligned to {} bytes",
alignment
);
}
println!("Transfer with alignment {} took {:?}", alignment, duration);
}
}
}
}