#[cfg(feature = "cuda")]
mod fkr_011_tests {
use std::ffi::c_void;
use trueno_gpu::driver::{CudaContext, CudaModule, CudaStream, GpuBuffer, LaunchConfig};
use trueno_gpu::ptx::{
PtxArithmetic, PtxComparison, PtxControl, PtxKernel, PtxMemory, PtxModule, PtxReg, PtxType,
};
/// Reports whether a CUDA context can be created on device 0.
///
/// Each test calls this first and self-skips on machines without a
/// usable GPU instead of failing the suite.
fn cuda_available() -> bool {
    match CudaContext::new(0) {
        Ok(_) => true,
        Err(_) => false,
    }
}
#[test]
fn fkr_011a_direct_shared_addressing() {
    // Self-skip on hosts without a GPU.
    if !cuda_available() {
        eprintln!("FKR-011a SKIPPED: No CUDA device available");
        return;
    }
    // Kernel: each of 32 lanes stores its lane id into statically declared
    // .shared memory at `lane * 4`, synchronizes, loads it back, and writes
    // the value to output[lane] in global memory.
    let kernel = PtxKernel::new("direct_shared_test")
        .param(PtxType::U64, "output")
        .shared_memory(4096)
        .build(|ctx| {
            let thread_id = ctx.special_reg(PtxReg::TidX);
            let lane_mask = ctx.mov_u32_imm(31);
            let lane_id = ctx.and_u32(thread_id, lane_mask);
            // Byte offset into shared memory: lane * sizeof(u32).
            let smem_off = ctx.mul_u32(lane_id, 4);
            ctx.st_shared_u32(smem_off, lane_id);
            ctx.bar_sync(0);
            let loaded = ctx.ld_shared_u32(smem_off);
            // Global destination: output + lane * 4.
            let out_base = ctx.load_param_u64("output");
            let byte_off = ctx.mul_u32(lane_id, 4);
            let byte_off_wide = ctx.cvt_u64_u32(byte_off);
            let dst_addr = ctx.add_u64(out_base, byte_off_wide);
            ctx.st_global_u32(dst_addr, loaded);
            ctx.ret();
        });
    let ptx = PtxModule::new()
        .version(8, 0)
        .target("sm_89")
        .address_size(64)
        .add_kernel(kernel)
        .emit();
    println!("=== Direct .shared PTX ===\n{}", ptx);
    // Host side: one warp, 32-element output buffer.
    let context = CudaContext::new(0).expect("CUDA context");
    let stream = CudaStream::new(&context).expect("CUDA stream");
    let out_buf: GpuBuffer<u32> = GpuBuffer::new(&context, 32).unwrap();
    let mut module = CudaModule::from_ptx(&context, &ptx).expect("PTX compilation");
    // shared_mem is 0 here because the kernel declares its shared memory
    // statically via .shared_memory(4096) in the PTX itself.
    let config = LaunchConfig {
        grid: (1, 1, 1),
        block: (32, 1, 1),
        shared_mem: 0,
    };
    let mut args: [*mut c_void; 1] = [out_buf.as_kernel_arg()];
    unsafe {
        stream
            .launch_kernel(&mut module, "direct_shared_test", &config, &mut args)
            .expect("Kernel launch");
    }
    stream.synchronize().expect("Sync");
    let mut host_out = vec![0u32; 32];
    out_buf.copy_to_host(&mut host_out).unwrap();
    println!("Output: {:?}", host_out);
    // Each lane must have round-tripped its own lane id through .shared.
    for i in 0..host_out.len() {
        assert_eq!(host_out[i], i as u32, "Lane {} should have value {}", i, i);
    }
    println!("FKR-011a: Direct .shared addressing PASSED!");
}
#[test]
fn fkr_011b_multi_warp_direct_shared() {
    // Self-skip on hosts without a GPU.
    if !cuda_available() {
        eprintln!("FKR-011b SKIPPED: No CUDA device available");
        return;
    }
    // Per-warp shared-memory slab size in bytes; three warps are launched.
    const WARP_SMEM_SIZE: u32 = 12544;
    // Kernel: each thread stores its global tid into its warp's private
    // slab at `warp * WARP_SMEM_SIZE + lane * 4`, synchronizes, loads it
    // back, and writes the value to output[tid].
    let kernel = PtxKernel::new("multi_warp_shared_test")
        .param(PtxType::U64, "output")
        .shared_memory((WARP_SMEM_SIZE * 3) as usize)
        .build(|ctx| {
            let thread_id = ctx.special_reg(PtxReg::TidX);
            // warp = tid >> 5, lane = tid & 31.
            let warp_idx = ctx.shr_u32_imm(thread_id, 5);
            let lane_mask = ctx.mov_u32_imm(31);
            let lane_id = ctx.and_u32(thread_id, lane_mask);
            let slab_base = ctx.mul_u32(warp_idx, WARP_SMEM_SIZE);
            let lane_off = ctx.mul_u32(lane_id, 4);
            let smem_off = ctx.add_u32_reg(slab_base, lane_off);
            ctx.st_shared_u32(smem_off, thread_id);
            ctx.bar_sync(0);
            let loaded = ctx.ld_shared_u32(smem_off);
            // Global destination: output + tid * 4.
            let out_base = ctx.load_param_u64("output");
            let byte_off = ctx.mul_u32(thread_id, 4);
            let byte_off_wide = ctx.cvt_u64_u32(byte_off);
            let dst_addr = ctx.add_u64(out_base, byte_off_wide);
            ctx.st_global_u32(dst_addr, loaded);
            ctx.ret();
        });
    let ptx = PtxModule::new()
        .version(8, 0)
        .target("sm_89")
        .address_size(64)
        .add_kernel(kernel)
        .emit();
    println!("=== Multi-warp .shared PTX ===\n{}", ptx);
    // Host side: 3 warps (96 threads), 96-element output buffer.
    let context = CudaContext::new(0).expect("CUDA context");
    let stream = CudaStream::new(&context).expect("CUDA stream");
    let out_buf: GpuBuffer<u32> = GpuBuffer::new(&context, 96).unwrap();
    let mut module = CudaModule::from_ptx(&context, &ptx).expect("PTX compilation");
    // shared_mem is 0: the allocation is declared statically in the PTX.
    let config = LaunchConfig {
        grid: (1, 1, 1),
        block: (96, 1, 1),
        shared_mem: 0,
    };
    let mut args: [*mut c_void; 1] = [out_buf.as_kernel_arg()];
    unsafe {
        stream
            .launch_kernel(&mut module, "multi_warp_shared_test", &config, &mut args)
            .expect("Kernel launch");
    }
    stream.synchronize().expect("Sync");
    let mut host_out = vec![0u32; 96];
    out_buf.copy_to_host(&mut host_out).unwrap();
    println!("Output (first 32): {:?}", &host_out[0..32]);
    println!("Output (32-64): {:?}", &host_out[32..64]);
    println!("Output (64-96): {:?}", &host_out[64..96]);
    // Every thread must read back exactly its own tid.
    for i in 0..host_out.len() {
        assert_eq!(host_out[i], i as u32, "Thread {} should have value {}", i, i);
    }
    println!("FKR-011b: Multi-warp direct .shared addressing PASSED!");
}
#[test]
fn fkr_011c_lz4_state_pattern() {
    // Self-skip on hosts without a GPU.
    if !cuda_available() {
        eprintln!("FKR-011c SKIPPED: No CUDA device available");
        return;
    }
    // Per-warp shared-memory slab size and the byte offset of the per-warp
    // state block within a slab (hash table + window + literals + pad).
    const WARP_SMEM_SIZE: u32 = 12544;
    const STATE_OFFSET: u32 = 4096 + 8192 + 128 + 4;
    // Kernel: the block leader (tid == 0) writes three sentinel words into
    // its warp's state area, ALL threads synchronize, then the leader reads
    // the words back and publishes them (plus the computed state offset) to
    // global memory.
    //
    // BUG FIX: the original emitted bar.sync inside the leader-only branch,
    // so only 1 of the 96 launched threads reached the barrier. A block-wide
    // barrier must be executed by every thread of the block (divergent
    // bar.sync / __syncthreads is undefined behavior). The barrier is now
    // hoisted to uniform control flow: non-leaders jump to "L_sync", all
    // threads sync, then non-leaders jump past the leader-only readback.
    let kernel = PtxKernel::new("lz4_state_pattern_test")
        .param(PtxType::U64, "output")
        .shared_memory((WARP_SMEM_SIZE * 3) as usize)
        .build(|ctx| {
            let tid = ctx.special_reg(PtxReg::TidX);
            let zero = ctx.mov_u32_imm(0);
            let is_leader = ctx.setp_eq_u32(tid, zero);
            // Non-leaders skip the stores but still reach the barrier.
            ctx.branch_if_not(is_leader, "L_sync");
            // Leader only: locate this warp's state area and store sentinels.
            let warp_id = ctx.shr_u32_imm(tid, 5);
            let warp_offset = ctx.mul_u32(warp_id, WARP_SMEM_SIZE);
            let state_offset = ctx.add_u32(warp_offset, STATE_OFFSET);
            let test_val1 = ctx.mov_u32_imm(0xDEADBEEF);
            ctx.st_shared_u32(state_offset, test_val1);
            let off4 = ctx.add_u32(state_offset, 4);
            let test_val2 = ctx.mov_u32_imm(0xCAFEBABE);
            ctx.st_shared_u32(off4, test_val2);
            let off8 = ctx.add_u32(state_offset, 8);
            let test_val3 = ctx.mov_u32_imm(0x12345678);
            ctx.st_shared_u32(off8, test_val3);
            // Uniform control flow: every thread in the block executes the
            // barrier. Registers computed above are only consumed on the
            // leader's path after the barrier, so they stay well-defined.
            ctx.label("L_sync");
            ctx.bar_sync(0);
            // Leader only: read back and publish to global memory.
            ctx.branch_if_not(is_leader, "L_skip");
            let read1 = ctx.ld_shared_u32(state_offset);
            let read2 = ctx.ld_shared_u32(off4);
            let read3 = ctx.ld_shared_u32(off8);
            let out_ptr = ctx.load_param_u64("output");
            ctx.st_global_u32(out_ptr, read1);
            let out_off4 = ctx.mov_u64_imm(4);
            let out1 = ctx.add_u64(out_ptr, out_off4);
            ctx.st_global_u32(out1, read2);
            let out_off8 = ctx.mov_u64_imm(8);
            let out2 = ctx.add_u64(out_ptr, out_off8);
            ctx.st_global_u32(out2, read3);
            let out_off12 = ctx.mov_u64_imm(12);
            let out3 = ctx.add_u64(out_ptr, out_off12);
            // Also publish the computed state offset for host-side checking.
            ctx.st_global_u32(out3, state_offset);
            ctx.label("L_skip");
            ctx.ret();
        });
    let ptx = PtxModule::new()
        .version(8, 0)
        .target("sm_89")
        .address_size(64)
        .add_kernel(kernel)
        .emit();
    println!("=== LZ4 State Pattern PTX ===\n{}", ptx);
    let cuda_ctx = CudaContext::new(0).expect("CUDA context");
    let stream = CudaStream::new(&cuda_ctx).expect("CUDA stream");
    // Four result words: the three sentinels plus the state offset.
    let output_buf: GpuBuffer<u32> = GpuBuffer::new(&cuda_ctx, 4).unwrap();
    let mut module = CudaModule::from_ptx(&cuda_ctx, &ptx).expect("PTX compilation");
    // shared_mem is 0: the allocation is declared statically in the PTX.
    let config = LaunchConfig {
        grid: (1, 1, 1),
        block: (96, 1, 1),
        shared_mem: 0,
    };
    let mut args: [*mut c_void; 1] = [output_buf.as_kernel_arg()];
    unsafe {
        stream
            .launch_kernel(&mut module, "lz4_state_pattern_test", &config, &mut args)
            .expect("Kernel launch");
    }
    stream.synchronize().expect("Sync");
    let mut output = vec![0u32; 4];
    output_buf.copy_to_host(&mut output).unwrap();
    println!("=== LZ4 State Pattern Results ===");
    println!(
        "state[0] (in_pos): 0x{:08X} (expected 0xDEADBEEF)",
        output[0]
    );
    println!(
        "state[1] (out_pos): 0x{:08X} (expected 0xCAFEBABE)",
        output[1]
    );
    println!(
        "state[2] (anchor): 0x{:08X} (expected 0x12345678)",
        output[2]
    );
    println!(
        "state_offset: {} (expected {})",
        output[3], STATE_OFFSET
    );
    assert_eq!(output[0], 0xDEADBEEF, "in_pos should be 0xDEADBEEF");
    assert_eq!(output[1], 0xCAFEBABE, "out_pos should be 0xCAFEBABE");
    assert_eq!(output[2], 0x12345678, "anchor should be 0x12345678");
    assert_eq!(
        output[3], STATE_OFFSET,
        "state_offset should be {}",
        STATE_OFFSET
    );
    println!("FKR-011c: LZ4 state pattern PASSED!");
}
}
#[cfg(not(feature = "cuda"))]
mod fkr_011_tests {
    /// Placeholder that keeps the FKR-011 suite visible in test output
    /// when the crate is built without the `cuda` feature.
    #[test]
    fn fkr_011_skip_no_cuda() {
        println!("FKR-011: Skipped - CUDA feature not enabled");
    }
}