oxibonsai-kernels 0.1.4

1-bit Q1_0_g128 compute kernels (dequant, GEMV, GEMM) for OxiBonsai
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
//! Process-wide state, types, init, and resource-acquisition helpers for the
//! CUDA prefill path.
//!
//! Holds the singleton `CudaPrefillState` (compiled modules, cached prefill
//! buffers, KV cache, single-token scratch, cached logits buffer) plus the
//! typed containers [`CudaPrefillBuffers`] and [`CudaPrefillModules`].
//!
//! All public types in this file are re-exported by [`super`] so external
//! callers continue to see the same `super::cuda_prefill::*` API surface.

use cudarc::driver::{CudaFunction, CudaSlice};
use std::sync::{Arc, Mutex, OnceLock};

use crate::gpu_backend::cuda_full_layer::{CudaFullLayerBuffers, CudaKvCache};
use crate::gpu_backend::cuda_graph::{compile_or_load_ptx, CudaGraph, CudaGraphError};
use crate::gpu_backend::cuda_prefill_kernels::CUDA_PREFILL_KERNELS_SRC;

/// Type alias for the per-layer weight tuple used during prefill.
/// Fields (in order): attn_norm, fused_qkv, q_norm, k_norm, attn_proj, ffn_norm, gate_up, down.
///
/// The four norm entries are `f32` device slices; the four projection entries
/// are raw `u8` device slices (presumably packed quantized weight blocks —
/// TODO confirm against the weight upload path). Every entry is an `Arc`, so
/// cloning the tuple only bumps reference counts.
pub(super) type LayerWeightArcs = (
    Arc<CudaSlice<f32>>, // attn_norm
    Arc<CudaSlice<u8>>,  // fused_qkv
    Arc<CudaSlice<f32>>, // q_norm
    Arc<CudaSlice<f32>>, // k_norm
    Arc<CudaSlice<u8>>,  // attn_proj
    Arc<CudaSlice<f32>>, // ffn_norm
    Arc<CudaSlice<u8>>,  // gate_up (fused)
    Arc<CudaSlice<u8>>,  // down
);

// =============================================================================
// Pre-allocated prefill GPU buffers
// =============================================================================

/// Pre-allocated GPU activation buffers for prefill (batch) processing.
///
/// All batched buffers use column-major layout: `buf[col * dim + element]`.
///
/// Every buffer is sized for `capacity` columns; `actual_batch_size` records
/// how many of those columns the current call actually uses.
pub struct CudaPrefillBuffers {
    /// Batched hidden states: `[capacity * hidden_size]` f32 (column-major).
    pub d_input: CudaSlice<f32>,
    /// Batched RMSNorm output: `[capacity * hidden_size]` f32 (column-major).
    pub d_normed: CudaSlice<f32>,
    /// Batched QKV GEMM output: `[capacity * (nq+2*nkv)*head_dim]` f32.
    pub d_qkv: CudaSlice<f32>,
    /// Batched attention output: `[capacity * nq*head_dim]` f32 (column-major).
    pub d_attn_out: CudaSlice<f32>,
    /// Batched gate+up GEMM output: `[2 * capacity * intermediate_size]` f32
    /// (gate and up halves are stored back to back, hence the factor of 2).
    /// Layout: `[gate: bs*inter | up: bs*inter]` for `batched_swiglu`.
    pub d_gate_up: CudaSlice<f32>,
    /// Batched SwiGLU output: `[capacity * intermediate_size]` f32 (column-major).
    pub d_swiglu: CudaSlice<f32>,
    /// Allocated capacity (max batch_size for which buffers are valid).
    pub capacity: usize,
    /// Currently-active batch size (≤ capacity), set before each encode call.
    pub actual_batch_size: usize,
    /// Per-column element count of `d_input` / `d_normed`.
    pub hidden_size: usize,
    /// Per-column element count of `d_swiglu` (and of each half of `d_gate_up`).
    pub intermediate_size: usize,
    /// Query-side multiplier in the QKV / attention-output sizes
    /// (presumably the number of query heads — TODO confirm).
    pub nq: usize,
    /// Key/value-side multiplier in the QKV size
    /// (presumably the number of KV heads — TODO confirm).
    pub nkv: usize,
    /// Per-head element count used in the QKV / attention-output sizes.
    pub head_dim: usize,
    /// Maximum sequence length these buffers were requested for. It does not
    /// size any buffer in this struct; it is only compared in `matches`.
    pub max_seq: usize,
}

// SAFETY: CudaSlice<f32> is Send in cudarc.
// NOTE(review): the justification above only covers `Send`. The `Sync` impl
// below additionally requires that sharing `&CudaPrefillBuffers` across threads
// is sound — confirm against cudarc's guarantees for `CudaSlice`. If cudarc
// already marks `CudaSlice<f32>` as `Send + Sync`, both impls are redundant.
unsafe impl Send for CudaPrefillBuffers {}
unsafe impl Sync for CudaPrefillBuffers {}

impl CudaPrefillBuffers {
    /// Report whether this allocation can be reused for the requested shape.
    ///
    /// The batch dimension is checked against `capacity` with `<=`, so an
    /// allocation made for a larger batch also serves smaller ones. Every
    /// other dimension has to be identical to the value recorded at
    /// allocation time, since those values fix the buffer layouts.
    #[allow(clippy::too_many_arguments)]
    pub fn matches(
        &self,
        batch_size: usize,
        hidden_size: usize,
        intermediate_size: usize,
        nq: usize,
        nkv: usize,
        head_dim: usize,
        max_seq: usize,
    ) -> bool {
        // Capacity-based: smaller batches reuse larger allocations.
        let fits_capacity = batch_size <= self.capacity;

        // Layout-defining dimensions are compared as one tuple.
        let recorded_shape = (
            self.hidden_size,
            self.intermediate_size,
            self.nq,
            self.nkv,
            self.head_dim,
            self.max_seq,
        );
        let requested_shape = (hidden_size, intermediate_size, nq, nkv, head_dim, max_seq);

        fits_capacity && recorded_shape == requested_shape
    }
}

// =============================================================================
// Compiled prefill CUDA modules
// =============================================================================

/// Compiled CUDA function handles for the 8 prefill kernels (5 Q1 + 3 TQ2).
///
/// Populated by `init_prefill_modules`, which loads every handle by kernel
/// name from the single prefill PTX module.
pub struct CudaPrefillModules {
    /// Handle for kernel `gemm_q1_g128_v7`.
    pub gemm_v7: CudaFunction,
    /// Handle for kernel `gemm_q1_g128_v7_residual`.
    pub gemm_v7_residual: CudaFunction,
    /// Handle for kernel `fused_gate_up_swiglu_gemm_q1`.
    pub fused_gate_up_swiglu_gemm: CudaFunction,
    /// Handle for kernel `batched_swiglu`.
    pub batched_swiglu: CudaFunction,
    /// Handle for kernel `batched_rmsnorm_v2`.
    pub batched_rmsnorm: CudaFunction,
    /// TQ2 batch GEMM — accumulates into output with `+=`.
    /// Loaded from kernel `gemm_tq2_g128_v7`.
    pub gemm_tq2_v7: CudaFunction,
    /// TQ2 batch GEMM + fused in-place residual add.
    /// Loaded from kernel `gemm_tq2_g128_v7_residual`.
    pub gemm_tq2_v7_residual: CudaFunction,
    /// TQ2 fused gate+up+SwiGLU batch GEMM.
    /// Loaded from kernel `fused_gate_up_swiglu_gemm_tq2`.
    pub fused_gate_up_swiglu_gemm_tq2: CudaFunction,
}

// SAFETY: CudaFunction is Send in cudarc.
// NOTE(review): the justification above only covers `Send`. The `Sync` impl
// below also needs `&CudaFunction` to be safely shareable across threads —
// confirm against cudarc. If cudarc already marks `CudaFunction` as
// `Send + Sync`, both impls are redundant.
unsafe impl Send for CudaPrefillModules {}
unsafe impl Sync for CudaPrefillModules {}

// =============================================================================
// Process-wide singleton state for the prefill path
// =============================================================================

/// Process-wide cache of everything the prefill path creates lazily.
///
/// Each slot sits behind its own `Mutex` and starts out as `None`; the
/// `init_*` / `acquire_*` helpers in this file fill or replace a slot while
/// holding that slot's lock.
struct CudaPrefillState {
    /// Compiled kernel handles; handed out as `Arc` clones once loaded.
    prefill_modules: Mutex<Option<Arc<CudaPrefillModules>>>,
    /// Batched activation buffers; replaced when a request no longer fits.
    prefill_buffers: Mutex<Option<CudaPrefillBuffers>>,
    /// Shared KV cache (same singleton as the decode path).
    // NOTE(review): this slot lives inside the prefill-only state object;
    // how it is shared with the decode path is not visible here — confirm.
    kv_cache: Mutex<Option<CudaKvCache>>,
    /// Reuse the single-token full-layer buffers for per-token attention.
    full_layer_buffers: Mutex<Option<CudaFullLayerBuffers>>,
    /// Cached logits buffer: (buffer, out_features_count).
    prefill_logits: Mutex<Option<(CudaSlice<f32>, usize)>>,
}

// SAFETY (NOTE(review): unverified — this block had no justification):
// every field of `CudaPrefillState` is a `Mutex<Option<_>>`, so all access to
// the contained GPU resources is serialized through a lock. Soundness still
// depends on each payload type (`CudaPrefillModules`, `CudaPrefillBuffers`,
// `CudaKvCache`, `CudaFullLayerBuffers`, `CudaSlice<f32>`) being safe to move
// between threads — confirm for the types defined outside this file.
unsafe impl Send for CudaPrefillState {}
unsafe impl Sync for CudaPrefillState {}

/// Lazily-initialised singleton backing every helper in this file.
static PREFILL_STATE: OnceLock<CudaPrefillState> = OnceLock::new();

/// Return the process-wide prefill state, creating it on first use with
/// every slot empty.
fn prefill_state() -> &'static CudaPrefillState {
    // Initial state: nothing compiled and nothing allocated yet.
    fn empty_state() -> CudaPrefillState {
        CudaPrefillState {
            prefill_modules: Mutex::new(None),
            prefill_buffers: Mutex::new(None),
            kv_cache: Mutex::new(None),
            full_layer_buffers: Mutex::new(None),
            prefill_logits: Mutex::new(None),
        }
    }

    PREFILL_STATE.get_or_init(empty_state)
}

// =============================================================================
// Module init
// =============================================================================

/// Load and cache the eight CUDA prefill kernels (5 Q1 + 3 TQ2).
///
/// Idempotent: once the handles are cached, later calls return a clone of the
/// cached `Arc` without touching the driver. The module slot's lock is held
/// for the whole call, so concurrent callers wait instead of loading twice.
///
/// # Errors
///
/// * `CudaGraphError::LockPoisoned` if the module slot's mutex is poisoned.
/// * Whatever `compile_or_load_ptx` reports for the prefill kernel source.
/// * `CudaGraphError::DriverError` if loading the module or any kernel
///   function fails.
pub fn init_prefill_modules(graph: &CudaGraph) -> Result<Arc<CudaPrefillModules>, CudaGraphError> {
    let mut slot = prefill_state()
        .prefill_modules
        .lock()
        .map_err(|_| CudaGraphError::LockPoisoned)?;

    // Fast path: already loaded by an earlier call.
    if let Some(cached) = slot.as_ref() {
        return Ok(Arc::clone(cached));
    }

    let ptx = compile_or_load_ptx(CUDA_PREFILL_KERNELS_SRC, "prefill_kernels")?;
    let module = graph
        .context_arc()
        .load_module(ptx)
        .map_err(|e| CudaGraphError::DriverError(format!("load_module prefill: {e}")))?;

    // Resolve one kernel entry point by name, tagging failures with that name.
    let kernel = |name: &str| -> Result<CudaFunction, CudaGraphError> {
        module
            .load_function(name)
            .map_err(|e| CudaGraphError::DriverError(format!("load_function({name}): {e}")))
    };

    let compiled = Arc::new(CudaPrefillModules {
        // Q1 kernels.
        gemm_v7: kernel("gemm_q1_g128_v7")?,
        gemm_v7_residual: kernel("gemm_q1_g128_v7_residual")?,
        fused_gate_up_swiglu_gemm: kernel("fused_gate_up_swiglu_gemm_q1")?,
        batched_swiglu: kernel("batched_swiglu")?,
        batched_rmsnorm: kernel("batched_rmsnorm_v2")?,
        // TQ2 kernels.
        gemm_tq2_v7: kernel("gemm_tq2_g128_v7")?,
        gemm_tq2_v7_residual: kernel("gemm_tq2_g128_v7_residual")?,
        fused_gate_up_swiglu_gemm_tq2: kernel("fused_gate_up_swiglu_gemm_tq2")?,
    });

    *slot = Some(Arc::clone(&compiled));
    Ok(compiled)
}

// =============================================================================
// Buffer / cache acquisition helpers
// =============================================================================

/// Round `n` up to the next power of two (minimum 1).
///
/// `0` maps to `1`; a value that is already a power of two is returned
/// unchanged.
///
/// If the next power of two does not fit in `usize` (i.e. `n` is greater
/// than the largest representable power of two), `n` itself is returned so
/// the result is never smaller than the request. The previous hand-rolled
/// doubling loop wrapped its accumulator to zero in that case and never
/// terminated.
fn next_pow2_capacity(n: usize) -> usize {
    // `checked_next_power_of_two` already yields `Some(1)` for `n == 0`.
    n.checked_next_power_of_two().unwrap_or(n)
}

/// Lock the prefill activation buffers, allocating or replacing them if the
/// cached set cannot serve the requested shape.
///
/// When the cached buffers pass `CudaPrefillBuffers::matches`, only
/// `actual_batch_size` is updated. Otherwise a fresh zero-initialised set is
/// allocated with its capacity rounded up to a power of two, so later calls
/// with somewhat larger batches can reuse it.
///
/// The returned guard keeps the buffer slot locked; on success it always
/// holds `Some`.
///
/// # Errors
///
/// * `CudaGraphError::LockPoisoned` if the buffer slot's mutex is poisoned.
/// * `CudaGraphError::DriverError` if any device allocation fails; in that
///   case the previously cached buffers (if any) are left in place.
#[allow(clippy::too_many_arguments)]
pub(super) fn acquire_prefill_buffers(
    graph: &CudaGraph,
    batch_size: usize,
    hidden_size: usize,
    intermediate_size: usize,
    nq: usize,
    nkv: usize,
    head_dim: usize,
    max_seq: usize,
) -> Result<std::sync::MutexGuard<'static, Option<CudaPrefillBuffers>>, CudaGraphError> {
    let mut slot = prefill_state()
        .prefill_buffers
        .lock()
        .map_err(|_| CudaGraphError::LockPoisoned)?;

    match slot.as_mut() {
        // Cached allocation is large enough and has the same layout:
        // just record the batch size for this call.
        Some(existing)
            if existing.matches(
                batch_size,
                hidden_size,
                intermediate_size,
                nq,
                nkv,
                head_dim,
                max_seq,
            ) =>
        {
            existing.actual_batch_size = batch_size;
        }
        // Nothing cached, or the cached shape is unusable: build a new set.
        _ => {
            let capacity = next_pow2_capacity(batch_size);
            let zeroed = |len: usize| -> Result<CudaSlice<f32>, CudaGraphError> {
                graph
                    .stream_arc()
                    .alloc_zeros::<f32>(len)
                    .map_err(|e| CudaGraphError::DriverError(format!("alloc_zeros pb({len}): {e}")))
            };

            let qkv_total = (nq + 2 * nkv) * head_dim;

            // The old buffers are only replaced once every allocation succeeded.
            *slot = Some(CudaPrefillBuffers {
                d_input: zeroed(capacity * hidden_size)?,
                d_normed: zeroed(capacity * hidden_size)?,
                d_qkv: zeroed(capacity * qkv_total)?,
                d_attn_out: zeroed(capacity * nq * head_dim)?,
                // Gate and up halves live back to back, hence the factor of 2.
                d_gate_up: zeroed(2 * capacity * intermediate_size)?,
                d_swiglu: zeroed(capacity * intermediate_size)?,
                capacity,
                actual_batch_size: batch_size,
                hidden_size,
                intermediate_size,
                nq,
                nkv,
                head_dim,
                max_seq,
            });
        }
    }

    Ok(slot)
}

/// Lock the GPU KV cache, allocating or replacing it if the cached one does
/// not match the requested dimensions.
///
/// A new cache consists of two zero-initialised `u16` device buffers (K and
/// V), each holding `n_layers * n_kv * max_seq * head_dim` elements.
///
/// The returned guard keeps the cache slot locked; on success it always
/// holds `Some`.
///
/// # Errors
///
/// * `CudaGraphError::LockPoisoned` if the cache slot's mutex is poisoned.
/// * `CudaGraphError::DriverError` if either device allocation fails; the
///   previously cached value (if any) is left in place.
pub(super) fn acquire_prefill_kv_cache(
    graph: &CudaGraph,
    n_layers: usize,
    n_kv: usize,
    max_seq: usize,
    head_dim: usize,
) -> Result<std::sync::MutexGuard<'static, Option<CudaKvCache>>, CudaGraphError> {
    let mut slot = prefill_state()
        .kv_cache
        .lock()
        .map_err(|_| CudaGraphError::LockPoisoned)?;

    // Reuse the cached allocation when its dimensions are identical.
    let reusable = matches!(
        slot.as_ref(),
        Some(cache) if cache.matches(n_layers, n_kv, max_seq, head_dim)
    );
    if reusable {
        return Ok(slot);
    }

    let total = n_layers * n_kv * max_seq * head_dim;
    // K and V have the same element count; only the error label differs.
    let zeroed = || graph.stream_arc().alloc_zeros::<u16>(total);

    let k_cache =
        zeroed().map_err(|e| CudaGraphError::DriverError(format!("alloc kv k: {e}")))?;
    let v_cache =
        zeroed().map_err(|e| CudaGraphError::DriverError(format!("alloc kv v: {e}")))?;

    *slot = Some(CudaKvCache {
        k_cache,
        v_cache,
        n_layers,
        n_kv,
        max_seq,
        head_dim,
    });

    Ok(slot)
}

/// Lock the single-token full-layer buffers used for per-token attention,
/// allocating or replacing them if the cached set has different dimensions.
///
/// All buffers are zero-initialised. `d_pos_seqlen` is a two-element `u32`
/// buffer; every other buffer holds `f32`.
///
/// The returned guard keeps the buffer slot locked; on success it always
/// holds `Some`.
///
/// # Errors
///
/// * `CudaGraphError::LockPoisoned` if the buffer slot's mutex is poisoned.
/// * `CudaGraphError::DriverError` if any device allocation fails; the
///   previously cached buffers (if any) are left in place.
pub(super) fn acquire_single_token_buffers(
    graph: &CudaGraph,
    hidden_size: usize,
    nq: usize,
    nkv: usize,
    head_dim: usize,
    max_seq: usize,
    intermediate_size: usize,
) -> Result<std::sync::MutexGuard<'static, Option<CudaFullLayerBuffers>>, CudaGraphError> {
    let mut slot = prefill_state()
        .full_layer_buffers
        .lock()
        .map_err(|_| CudaGraphError::LockPoisoned)?;

    // Reuse the cached buffers when every dimension is unchanged.
    let reusable = matches!(
        slot.as_ref(),
        Some(cached) if cached.matches(hidden_size, nq, nkv, head_dim, max_seq, intermediate_size)
    );
    if reusable {
        return Ok(slot);
    }

    let zeroed_f32 = |len: usize| -> Result<CudaSlice<f32>, CudaGraphError> {
        graph
            .stream_arc()
            .alloc_zeros::<f32>(len)
            .map_err(|e| CudaGraphError::DriverError(format!("alloc st({len}): {e}")))
    };
    let zeroed_u32 = |len: usize| -> Result<CudaSlice<u32>, CudaGraphError> {
        graph
            .stream_arc()
            .alloc_zeros::<u32>(len)
            .map_err(|e| CudaGraphError::DriverError(format!("alloc u32({len}): {e}")))
    };

    // Q occupies nq heads, K and V occupy nkv heads each.
    let qkv_total = nq * head_dim + 2 * nkv * head_dim;
    // The cos/sin tables hold one entry per pair of head elements.
    let half_dim = head_dim / 2;

    *slot = Some(CudaFullLayerBuffers {
        d_hidden: zeroed_f32(hidden_size)?,
        d_normed: zeroed_f32(hidden_size)?,
        d_qkv: zeroed_f32(qkv_total)?,
        d_q_rope: zeroed_f32(nq * head_dim)?,
        d_k_rope: zeroed_f32(nkv * head_dim)?,
        d_cos: zeroed_f32(half_dim)?,
        d_sin: zeroed_f32(half_dim)?,
        d_scores: zeroed_f32(nq * max_seq)?,
        d_attn_out: zeroed_f32(nq * head_dim)?,
        d_gate_up: zeroed_f32(2 * intermediate_size)?,
        d_swiglu: zeroed_f32(intermediate_size)?,
        d_pos_seqlen: zeroed_u32(2)?,
        hidden_size,
        nq,
        nkv,
        head_dim,
        max_seq,
        intermediate_size,
    });

    Ok(slot)
}

/// Guard type returned by `acquire_prefill_logits`: a lock on the cached
/// logits slot, which holds `(buffer, element_count)` once allocated.
pub(super) type PrefillLogitsGuard =
    std::sync::MutexGuard<'static, Option<(CudaSlice<f32>, usize)>>;

/// Lock the LM-head logits buffer, allocating or replacing it unless the
/// cached buffer already has exactly `n` elements.
///
/// A new buffer is zero-initialised and stored together with its length.
/// The returned guard keeps the logits slot locked; on success it always
/// holds `Some`.
///
/// # Errors
///
/// * `CudaGraphError::LockPoisoned` if the logits slot's mutex is poisoned.
/// * `CudaGraphError::DriverError` if the device allocation fails; the
///   previously cached buffer (if any) is left in place.
pub(super) fn acquire_prefill_logits(
    graph: &CudaGraph,
    n: usize,
) -> Result<PrefillLogitsGuard, CudaGraphError> {
    let mut slot = prefill_state()
        .prefill_logits
        .lock()
        .map_err(|_| CudaGraphError::LockPoisoned)?;

    // `None` when nothing is cached, otherwise the cached element count.
    let cached_len = slot.as_ref().map(|(_, len)| *len);

    if cached_len != Some(n) {
        let fresh = graph
            .stream_arc()
            .alloc_zeros::<f32>(n)
            .map_err(|e| CudaGraphError::DriverError(format!("alloc logits buf({n}): {e}")))?;
        *slot = Some((fresh, n));
    }

    Ok(slot)
}