entrenar/autograd/cuda_forward/
elementwise.rs1#![allow(unsafe_code)]
2#![allow(trivial_casts)]
3#![allow(clippy::borrow_as_ptr)]
4#![allow(clippy::ref_as_ptr)]
5
6#[cfg(feature = "cuda")]
7use trueno_gpu::driver::{CudaStream, GpuBuffer, LaunchConfig};
8#[cfg(feature = "cuda")]
9use trueno_gpu::kernels::{
10 BatchedToInterleavedKernel, BatchedTransposeKernel, ElementwiseMulKernel,
11 InterleavedToBatchedKernel, Kernel, ResidualAddKernel, ScaleKernel,
12};
13
14use crate::autograd::cuda_tensor::{CudaTensorError, Result};
15
16#[cfg(feature = "cuda")]
17use super::cache::FORWARD_KERNEL_CACHE;
18
#[cfg(feature = "cuda")]
/// Launches the `residual_add` kernel over `n` elements on `stream`.
///
/// The kernel receives, in order, the device pointers of `a`, `b` and `output`, followed by
/// `n`. It presumably computes `output[i] = a[i] + b[i]`; confirm against the kernel source.
/// No explicit stream synchronization is performed here.
///
/// A call with `n == 0` is a no-op that returns `Ok(())` without launching anything.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the cache lock cannot be acquired or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn residual_add_forward(
    a: &GpuBuffer<f32>,
    b: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    n: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;
    // NOTE(review): the module is cached under a shape-independent key although the kernel
    // is constructed from `n`. This assumes the emitted PTX does not bake `n` in (it is
    // also passed as a launch argument) — confirm.
    const CACHE_KEY: &str = "residual_add_forward";

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to compute, so succeed without launching.
    if n == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // Only emit PTX when the module has not been compiled yet.
    let module = match cache.get_cached(CACHE_KEY) {
        Some(m) => m,
        None => {
            let kernel = ResidualAddKernel::new(n);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(CACHE_KEY, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (n.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 4] = [
        &a_ptr as *const _ as *mut _,
        &b_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &n as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (a, b, output, n) and that all three buffers hold at least `n` elements; neither is
    // checked here.
    let launched = unsafe { stream.launch_kernel(module, "residual_add", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Residual add forward launch failed: {e:?}"))
    })?;

    Ok(())
}
74
#[cfg(feature = "cuda")]
/// Launches the `residual_add` kernel with `dst` as both an input and the output.
///
/// The kernel receives, in order, the device pointer of `dst`, the device pointer of `src`,
/// the device pointer of `dst` again, and `n`, so it reads and writes the same buffer.
/// Presumably this accumulates `src` into `dst` element-wise; confirm against the kernel
/// source. No explicit stream synchronization is performed here.
///
/// A call with `n == 0` is a no-op that returns `Ok(())` without launching anything.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the cache lock cannot be acquired or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn inplace_add_gpu(
    dst: &mut GpuBuffer<f32>,
    src: &GpuBuffer<f32>,
    n: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;
    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from `n`; assumes the emitted PTX does not depend on `n` — confirm.
    const CACHE_KEY: &str = "inplace_add";

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to add, so succeed without launching.
    if n == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // Only emit PTX when the module has not been compiled yet.
    let module = match cache.get_cached(CACHE_KEY) {
        Some(m) => m,
        None => {
            let kernel = ResidualAddKernel::new(n);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(CACHE_KEY, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (n.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let dst_ptr = dst.as_ptr();
    let src_ptr = src.as_ptr();

    // `dst_ptr` appears twice on purpose: first as an input, then as the output.
    let mut args: [*mut std::ffi::c_void; 4] = [
        &dst_ptr as *const _ as *mut _,
        &src_ptr as *const _ as *mut _,
        &dst_ptr as *const _ as *mut _,
        &n as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel tolerates its first input
    // aliasing its output and that both buffers hold at least `n` elements; neither is
    // checked here.
    let launched = unsafe { stream.launch_kernel(module, "residual_add", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("In-place add launch failed: {e:?}"))
    })?;

    Ok(())
}
130
#[cfg(feature = "cuda")]
/// Launches the `elementwise_mul` kernel over `n` elements on `stream`.
///
/// The kernel receives, in order, the device pointers of `a`, `b` and `output`, followed by
/// `n`. It presumably computes `output[i] = a[i] * b[i]`; confirm against the kernel source.
/// No explicit stream synchronization is performed here.
///
/// A call with `n == 0` is a no-op that returns `Ok(())` without launching anything.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the cache lock cannot be acquired or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn elementwise_mul_forward(
    a: &GpuBuffer<f32>,
    b: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    n: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;
    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from `n`; assumes the emitted PTX does not depend on `n` — confirm.
    const CACHE_KEY: &str = "elementwise_mul_forward";

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to compute, so succeed without launching.
    if n == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // Only emit PTX when the module has not been compiled yet.
    let module = match cache.get_cached(CACHE_KEY) {
        Some(m) => m,
        None => {
            let kernel = ElementwiseMulKernel::new(n);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(CACHE_KEY, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (n.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 4] = [
        &a_ptr as *const _ as *mut _,
        &b_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &n as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (a, b, output, n) and that all three buffers hold at least `n` elements; neither is
    // checked here.
    let launched =
        unsafe { stream.launch_kernel(module, "elementwise_mul", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Elementwise mul forward launch failed: {e:?}"))
    })?;

    Ok(())
}
186
#[cfg(feature = "cuda")]
/// Launches the `scale` kernel over `n` elements on `stream`.
///
/// The kernel receives, in order, the device pointers of `input` and `output`, the `f32`
/// factor `scale`, and `n`. It presumably computes `output[i] = input[i] * scale`; confirm
/// against the kernel source. No explicit stream synchronization is performed here.
///
/// A call with `n == 0` is a no-op that returns `Ok(())` without launching anything.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the cache lock cannot be acquired or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn scale_forward(
    input: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    scale: f32,
    n: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;
    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from `n`; assumes the emitted PTX does not depend on `n` — confirm.
    const CACHE_KEY: &str = "scale_forward";

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to scale, so succeed without launching.
    if n == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // Only emit PTX when the module has not been compiled yet.
    let module = match cache.get_cached(CACHE_KEY) {
        Some(m) => m,
        None => {
            let kernel = ScaleKernel::new(n);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(CACHE_KEY, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (n.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let input_ptr = input.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local or parameter holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 4] = [
        &input_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &scale as *const _ as *mut _,
        &n as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (input, output, scale, n) and that both buffers hold at least `n` elements; neither
    // is checked here.
    let launched = unsafe { stream.launch_kernel(module, "scale", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Scale forward launch failed: {e:?}"))
    })?;

    Ok(())
}
241
#[cfg(feature = "cuda")]
/// Launches the `interleaved_to_batched` kernel over `seq_len * n_heads * head_dim` elements.
///
/// The kernel receives, in order, the device pointers of `input` and `output`, then
/// `seq_len`, `n_heads`, `head_dim` and the total element count. Presumably it rearranges a
/// per-position interleaved head layout into a per-head batched layout; confirm against the
/// kernel source. No explicit stream synchronization is performed here.
///
/// If any dimension is zero the call is a no-op that returns `Ok(())`.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the element count does not fit in `u32`, the cache
///   lock cannot be acquired, or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn interleaved_to_batched_forward(
    input: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    seq_len: u32,
    n_heads: u32,
    head_dim: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Unchecked `u32` multiplication panics in debug builds and wraps in release builds; a
    // wrapped count would size the grid too small and hand the kernel a wrong total.
    let total = seq_len
        .checked_mul(n_heads)
        .and_then(|partial| partial.checked_mul(head_dim))
        .ok_or_else(|| {
            CudaTensorError::KernelError(format!(
                "Interleaved-to-batched element count overflows u32: \
                 {seq_len} x {n_heads} x {head_dim}"
            ))
        })?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to move, so succeed without launching.
    if total == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from the dimensions; assumes the emitted PTX does not depend on them — confirm.
    let key = "interleaved_to_batched";
    let module = match cache.get_cached(key) {
        Some(m) => m,
        None => {
            let kernel = InterleavedToBatchedKernel::new(seq_len, n_heads, head_dim);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(key, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (total.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let input_ptr = input.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local or parameter holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 6] = [
        &input_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &seq_len as *const _ as *mut _,
        &n_heads as *const _ as *mut _,
        &head_dim as *const _ as *mut _,
        &total as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (input, output, seq_len, n_heads, head_dim, total) and that both buffers hold at
    // least `total` elements; neither is checked here.
    let launched =
        unsafe { stream.launch_kernel(module, "interleaved_to_batched", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Interleaved-to-batched launch failed: {e:?}"))
    })?;

    Ok(())
}
308
#[cfg(feature = "cuda")]
/// Launches the `batched_transpose` kernel for `batch` matrices of `rows * cols` elements.
///
/// The grid covers `rows * cols` elements along x and uses `batch` directly as its
/// z-dimension. The kernel receives, in order, the device pointers of `input` and `output`,
/// then `batch`, `rows`, `cols` and `rows * cols`. Presumably each `rows x cols` matrix is
/// written transposed; confirm against the kernel source. No explicit stream
/// synchronization is performed here.
///
/// If `batch`, `rows` or `cols` is zero the call is a no-op that returns `Ok(())`.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if `rows * cols` does not fit in `u32`, the cache lock
///   cannot be acquired, or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn batched_transpose_forward(
    input: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    batch: u32,
    rows: u32,
    cols: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Unchecked `u32` multiplication panics in debug builds and wraps in release builds; a
    // wrapped count would size the grid too small and hand the kernel a wrong total.
    let total_per_batch = rows.checked_mul(cols).ok_or_else(|| {
        CudaTensorError::KernelError(format!(
            "Batched transpose matrix size overflows u32: {rows} x {cols}"
        ))
    })?;

    // A zero in either grid dimension is rejected by CUDA as an invalid launch
    // configuration. There is nothing to transpose, so succeed without launching.
    if batch == 0 || total_per_batch == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from the dimensions; assumes the emitted PTX does not depend on them — confirm.
    let key = "batched_transpose";
    let module = match cache.get_cached(key) {
        Some(m) => m,
        None => {
            let kernel = BatchedTransposeKernel::new(batch, rows, cols);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(key, &ptx)?
        }
    };

    let config = LaunchConfig {
        // x covers the elements of one matrix; z selects the matrix within the batch.
        grid: (total_per_batch.div_ceil(THREADS_PER_BLOCK), 1, batch),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let input_ptr = input.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local or parameter holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 6] = [
        &input_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &batch as *const _ as *mut _,
        &rows as *const _ as *mut _,
        &cols as *const _ as *mut _,
        &total_per_batch as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (input, output, batch, rows, cols, rows * cols) and that both buffers hold at least
    // `batch * rows * cols` elements; neither is checked here.
    let launched =
        unsafe { stream.launch_kernel(module, "batched_transpose", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Batched transpose launch failed: {e:?}"))
    })?;

    Ok(())
}
375
#[cfg(feature = "cuda")]
/// Launches the `batched_to_interleaved` kernel over `seq_len * n_heads * head_dim` elements.
///
/// The kernel receives, in order, the device pointers of `input` and `output`, then
/// `seq_len`, `n_heads`, `head_dim` and the total element count. Presumably it rearranges a
/// per-head batched layout back into a per-position interleaved layout; confirm against the
/// kernel source. No explicit stream synchronization is performed here.
///
/// If any dimension is zero the call is a no-op that returns `Ok(())`.
///
/// # Errors
///
/// * [`CudaTensorError::DeviceNotInitialized`] if `FORWARD_KERNEL_CACHE` has not been set.
/// * [`CudaTensorError::KernelError`] if the element count does not fit in `u32`, the cache
///   lock cannot be acquired, or the launch fails.
/// * Any error returned by the cache while compiling the kernel module.
pub fn batched_to_interleaved_forward(
    input: &GpuBuffer<f32>,
    output: &mut GpuBuffer<f32>,
    seq_len: u32,
    n_heads: u32,
    head_dim: u32,
    stream: &CudaStream,
) -> Result<()> {
    const THREADS_PER_BLOCK: u32 = 256;

    let cache = FORWARD_KERNEL_CACHE.get().ok_or(CudaTensorError::DeviceNotInitialized)?;

    // Unchecked `u32` multiplication panics in debug builds and wraps in release builds; a
    // wrapped count would size the grid too small and hand the kernel a wrong total.
    let total = seq_len
        .checked_mul(n_heads)
        .and_then(|partial| partial.checked_mul(head_dim))
        .ok_or_else(|| {
            CudaTensorError::KernelError(format!(
                "Batched-to-interleaved element count overflows u32: \
                 {seq_len} x {n_heads} x {head_dim}"
            ))
        })?;

    // Zero elements would request a grid of zero blocks, which CUDA rejects as an invalid
    // launch configuration. There is nothing to move, so succeed without launching.
    if total == 0 {
        return Ok(());
    }

    let mut cache = cache.lock().map_err(|_err| {
        CudaTensorError::KernelError("Failed to acquire kernel cache lock".to_string())
    })?;

    // NOTE(review): cached under a shape-independent key although the kernel is constructed
    // from the dimensions; assumes the emitted PTX does not depend on them — confirm.
    let key = "batched_to_interleaved";
    let module = match cache.get_cached(key) {
        Some(m) => m,
        None => {
            let kernel = BatchedToInterleavedKernel::new(seq_len, n_heads, head_dim);
            let ptx = kernel.emit_ptx_for_target(cache.sm_target());
            cache.get_or_compile(key, &ptx)?
        }
    };

    let config = LaunchConfig {
        grid: (total.div_ceil(THREADS_PER_BLOCK), 1, 1),
        block: (THREADS_PER_BLOCK, 1, 1),
        shared_mem: 0,
    };

    let input_ptr = input.as_ptr();
    let output_ptr = output.as_ptr();

    // Each entry points at the local or parameter holding one kernel argument.
    let mut args: [*mut std::ffi::c_void; 6] = [
        &input_ptr as *const _ as *mut _,
        &output_ptr as *const _ as *mut _,
        &seq_len as *const _ as *mut _,
        &n_heads as *const _ as *mut _,
        &head_dim as *const _ as *mut _,
        &total as *const _ as *mut _,
    ];

    // SAFETY: every entry of `args` points at a local or parameter that is alive for the
    // duration of this call. NOTE(review): assumes the kernel signature is
    // (input, output, seq_len, n_heads, head_dim, total) and that both buffers hold at
    // least `total` elements; neither is checked here.
    let launched =
        unsafe { stream.launch_kernel(module, "batched_to_interleaved", &config, &mut args) };
    launched.map_err(|e| {
        CudaTensorError::KernelError(format!("Batched-to-interleaved launch failed: {e:?}"))
    })?;

    Ok(())
}
440
#[cfg(feature = "cuda")]
/// Replicates each of the `num_kv_heads` source heads `heads_per_kv` times into `dst`.
///
/// Source head `kv_h` is read from offset `kv_h * elems_per_head` in `src`. Its copies are
/// written to consecutive slots of `dst`, slot `kv_h * heads_per_kv + rep` starting at
/// offset `slot * elems_per_head`. Every copy moves `elems_per_head` elements and is issued
/// through `copy_from_buffer_at_async` on `stream`.
///
/// # Errors
///
/// Returns [`CudaTensorError::TransferFailed`] as soon as one copy reports an error; copies
/// issued before that point are not undone.
pub fn expand_kv_heads(
    src: &GpuBuffer<f32>,
    dst: &mut GpuBuffer<f32>,
    num_kv_heads: usize,
    heads_per_kv: usize,
    elems_per_head: usize,
    stream: &CudaStream,
) -> Result<()> {
    for kv_head in 0..num_kv_heads {
        let read_at = kv_head * elems_per_head;
        // First destination slot that belongs to this source head.
        let base_slot = kv_head * heads_per_kv;

        (0..heads_per_kv).try_for_each(|replica| -> Result<()> {
            let write_at = (base_slot + replica) * elems_per_head;

            // SAFETY: NOTE(review): the contract of `copy_from_buffer_at_async` is not
            // visible here. Presumably both ranges must lie inside their buffers and the
            // buffers must stay alive until the stream has executed the copy — confirm.
            let copied = unsafe {
                dst.copy_from_buffer_at_async(src, write_at, read_at, elems_per_head, stream)
            };
            copied.map_err(|e| {
                CudaTensorError::TransferFailed(format!(
                    "GQA head expansion D2D copy failed: {e}"
                ))
            })?;
            Ok(())
        })?;
    }

    Ok(())
}