1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
//! # CudaGraph - v8_shared_bytes_group Methods
//!
//! This module contains method implementations for `CudaGraph`.
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
use cudarc::driver::CudaSlice;
use std::sync::Arc;
use super::types::CudaGraphError;
use super::cudagraph_type::CudaGraph;
impl CudaGraph {
    /// Shared-memory byte count the V8 kernel needs for a given `k`.
    ///
    /// Delegates to `cuda_kernels::v8_shared_mem_bytes` with a budget of
    /// 49_152 bytes (48 * 1024, the 48 KB default shared-memory limit).
    /// Per that helper's contract (its body is not visible in this module),
    /// `None` signals that `k` does not fit within the budget.
    #[inline]
    pub(crate) fn v8_shared_bytes(k: usize) -> Option<u32> {
        super::super::cuda_kernels::v8_shared_mem_bytes(k, 49_152)
    }

    /// Public auto-dispatch GEMV.
    ///
    /// Forwards to `launch_gemv_v8` when [`Self::v8_shared_bytes`] yields a
    /// shared-memory size for `k`, and to `launch_gemv_v9` otherwise.
    ///
    /// # Errors
    /// Returns whatever error the selected launcher reports.
    ///
    /// # Safety
    /// All slices must be valid device pointers on `self.stream`.
    pub unsafe fn launch_gemv_pub(
        &self,
        d_weight: &Arc<CudaSlice<u8>>,
        d_input: &CudaSlice<f32>,
        d_output: &mut CudaSlice<f32>,
        n_rows: u32,
        k: u32,
    ) -> Result<(), CudaGraphError> {
        if let Some(shared_bytes) = Self::v8_shared_bytes(k as usize) {
            // `k` fits in the shared-memory budget: take the V8 path.
            self.launch_gemv_v8(d_weight, d_input, d_output, n_rows, k, shared_bytes)
        } else {
            // No shared-memory size available for this `k`: fall back to V9.
            self.launch_gemv_v9(d_weight, d_input, d_output, n_rows, k)
        }
    }

    /// Public auto-dispatch GEMV with a fused, in-place residual add.
    ///
    /// Intended result: `d_inout[row] = dot(weight[row], d_input) + d_inout[row]`
    /// for every row. Forwards to `launch_gemv_v8_residual` when
    /// [`Self::v8_shared_bytes`] yields a size for `k` (48 KB budget), and to
    /// `launch_gemv_v9_residual` otherwise.
    ///
    /// # Errors
    /// Returns whatever error the selected launcher reports.
    ///
    /// # Safety
    /// All slices must be valid device pointers on `self.stream`.
    /// `d_inout` serves as both the residual source and the output destination.
    /// The fused kernels are documented as writing each output row exactly once
    /// from a single warp, so the device-side read-before-write is expected to
    /// be race-free even though both arguments refer to the same buffer.
    pub unsafe fn launch_gemv_residual_pub(
        &self,
        d_weight: &Arc<CudaSlice<u8>>,
        d_input: &CudaSlice<f32>,
        d_inout: &mut CudaSlice<f32>,
        n_rows: u32,
        k: u32,
    ) -> Result<(), CudaGraphError> {
        // The residual launchers take the residual input and the output as two
        // separate parameters, so a second, read-only view of `d_inout` is
        // manufactured through a raw pointer.
        // NOTE(review): this shared reference coexists with the `&mut d_inout`
        // handed to the same call. Whether that satisfies Rust's reference
        // aliasing rules depends on how the callee touches the two host-side
        // handles, which is not visible here. Confirm (e.g. under Miri).
        let residual_ptr = d_inout as *const CudaSlice<f32>;
        let d_residual = &*residual_ptr;

        // Borrow the slice inside the `Arc` once; both launchers take it by
        // plain reference.
        let weight: &CudaSlice<u8> = d_weight;

        if let Some(shared_bytes) = Self::v8_shared_bytes(k as usize) {
            self.launch_gemv_v8_residual(
                weight,
                d_input,
                d_residual,
                d_inout,
                n_rows,
                k,
                shared_bytes,
            )
        } else {
            self.launch_gemv_v9_residual(weight, d_input, d_residual, d_inout, n_rows, k)
        }
    }
}