1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
impl CudaExecutor {
    /// PAR-023: RMSNorm using a raw device pointer for gamma.
    ///
    /// Wraps `gamma_ptr` / `gamma_len` in a temporary, non-owning `GpuBuffer`
    /// view and forwards to `rmsnorm_gpu`, returning the buffer that call
    /// produces. The view is never dropped, so the memory behind `gamma_ptr`
    /// is left untouched on both the success and the error path.
    ///
    /// Only the null case is validated here. The caller must make sure that
    /// `gamma_ptr` refers to a live device allocation holding at least
    /// `gamma_len` `f32` values for the duration of the call.
    ///
    /// # Errors
    ///
    /// Returns `GpuError::InvalidLaunchConfig` when `gamma_ptr` is `0`, and
    /// propagates any error reported by `rmsnorm_gpu`.
    pub(crate) fn rmsnorm_gpu_ptr(
        &mut self,
        input: &GpuBuffer<f32>,
        gamma_ptr: u64, // CUdeviceptr
        gamma_len: usize,
        hidden_dim: u32,
        epsilon: f32,
    ) -> Result<GpuBuffer<f32>, GpuError> {
        if gamma_ptr == 0 {
            return Err(GpuError::InvalidLaunchConfig(
                "null gamma pointer in rmsnorm_gpu_ptr".to_string(),
            ));
        }
        // SAFETY: `gamma_ptr` is non-null (checked above). The caller is
        // responsible for it pointing to valid GPU memory of `gamma_len`
        // elements (per the original note: owned by rmsnorm_cache). The
        // wrapper is only used within this call and is never dropped.
        let borrowed = unsafe { GpuBuffer::from_raw_parts(gamma_ptr, gamma_len) };
        // ManuallyDrop (rather than a trailing `mem::forget`) guarantees that
        // Drop never frees the borrowed memory, even when the `?` below
        // returns early with an error.
        let gamma = std::mem::ManuallyDrop::new(borrowed);
        let result = self.rmsnorm_gpu(input, &gamma, hidden_dim, epsilon)?;
        Ok(result)
    }

    /// PAR-044: RMSNorm using a raw device pointer for gamma, writing into an
    /// existing output buffer.
    ///
    /// Wraps `gamma_ptr` / `gamma_len` in a temporary, non-owning `GpuBuffer`
    /// view and forwards to `rmsnorm_into` with the supplied `output`. The
    /// view is never dropped, so the memory behind `gamma_ptr` is left
    /// untouched on both the success and the error path.
    ///
    /// Only the null case is validated here. The caller must make sure that
    /// `gamma_ptr` refers to a live device allocation holding at least
    /// `gamma_len` `f32` values for the duration of the call.
    ///
    /// # Errors
    ///
    /// Returns `GpuError::InvalidLaunchConfig` when `gamma_ptr` is `0`, and
    /// propagates any error reported by `rmsnorm_into`.
    pub(crate) fn rmsnorm_ptr_into(
        &mut self,
        input: &GpuBuffer<f32>,
        gamma_ptr: u64,
        gamma_len: usize,
        output: &GpuBuffer<f32>,
        hidden_dim: u32,
        epsilon: f32,
    ) -> Result<(), GpuError> {
        if gamma_ptr == 0 {
            return Err(GpuError::InvalidLaunchConfig(
                "null gamma pointer in rmsnorm_ptr_into".to_string(),
            ));
        }
        // SAFETY: `gamma_ptr` is non-null (checked above). The caller is
        // responsible for it pointing to valid GPU memory of `gamma_len`
        // elements. The wrapper is only used within this call and is never
        // dropped.
        let borrowed = unsafe { GpuBuffer::from_raw_parts(gamma_ptr, gamma_len) };
        // ManuallyDrop keeps Drop from freeing the borrowed memory on every
        // exit path, including the early return taken by `?` on failure.
        let gamma = std::mem::ManuallyDrop::new(borrowed);
        self.rmsnorm_into(input, &gamma, output, hidden_dim, epsilon)?;
        Ok(())
    }
}
include!("batched_forward.rs");
include!("par-121.rs");