rlx-cuda 0.2.6

NVIDIA CUDA backend — cuBLAS for matmul + NVRTC-compiled kernels for everything else, via the pure-Rust `cudarc` crate.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
// RLX — versatile ML compiler + runtime.
// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

// HIP-CPU dispatch entry points for the rlx-cuda validation path.
// Compiled only when `cargo build --features hip-cpu-validate`.
//
// HIP-CPU executes "GPU" kernels on CPU threads via std::thread. The
// kernel sources in `src/kernels/*.cu` use plain CUDA syntax —
// `__global__`, `blockIdx`, `threadIdx`, `__shared__` — which HIP-CPU
// recognizes when `__HIP_CPU_RT__` is defined.
//
// Each `launch_<kernel>` wraps `hipLaunchKernelGGL` so the Rust side
// can call into a stable C ABI. Coverage: this file defines 35 launch
// wrappers over the 34 included .cu sources (scatter_add contributes
// two wrappers, one for its zero phase and one for its accumulate phase).

#include <cstdio>

#include <hip/hip_runtime.h>

#include "binary.cu"
#include "fused_binary_unary.cu"
#include "unary.cu"
#include "copy.cu"
#include "matmul.cu"
#include "compare.cu"
#include "where_select.cu"
#include "reduce.cu"
#include "softmax.cu"
#include "layernorm.cu"
#include "fused_residual_ln.cu"
#include "gather.cu"
#include "narrow.cu"
#include "concat.cu"
#include "transpose.cu"
#include "expand.cu"
#include "attention.cu"
#include "argmax.cu"
#include "rope.cu"
#include "cumsum.cu"
#include "topk.cu"
#include "grouped_matmul.cu"
#include "scatter_add.cu"
#include "dequant_matmul.cu"
#include "sample.cu"
#include "selective_scan.cu"
#include "pool1d.cu"
#include "pool2d.cu"
#include "pool3d.cu"
#include "conv1d.cu"
#include "conv2d.cu"
#include "conv3d.cu"
#include "elementwise_region.cu"
#include "batch_elementwise_region.cu"

// Launches `kfunc` on a (gx, gy, gz) grid of (bx, by, bz)-sized blocks, passing
// 0 for both the dynamic shared-memory size and the stream, then synchronizes
// the device so results are ready by the time the calling wrapper returns.
//
// The wrappers expose a `void` C ABI and therefore cannot hand an error code
// back to the caller. A failed launch or synchronize is reported on stderr
// (tagged with the kernel name) instead of being dropped silently; execution
// continues so the caller's behaviour is otherwise unchanged. The synchronize
// is always issued, even when the launch itself reported an error.
#define LAUNCH(kfunc, gx, gy, gz, bx, by, bz, ...)                          \
    do {                                                                    \
        hipLaunchKernelGGL(kfunc, dim3((gx), (gy), (gz)),                   \
                                  dim3((bx), (by), (bz)), 0, 0,             \
                           __VA_ARGS__);                                    \
        const hipError_t rlx_launch_err_ = hipGetLastError();               \
        const hipError_t rlx_sync_err_ = hipDeviceSynchronize();            \
        if (rlx_launch_err_ != hipSuccess || rlx_sync_err_ != hipSuccess) { \
            std::fprintf(stderr,                                            \
                         "rlx-cuda hip-cpu: kernel `%s` failed: %s\n",      \
                         #kfunc,                                            \
                         hipGetErrorString(rlx_launch_err_ != hipSuccess    \
                                               ? rlx_launch_err_            \
                                               : rlx_sync_err_));           \
        }                                                                   \
    } while (0)

extern "C" {

// ── Element-wise (1-D dispatch, block_x = 256) ─────────────────────

// Runs the `binary` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n`, `ao`, `bo`, `co` and `op` are handed to the kernel unchanged, in
// that order. Their meaning is defined by the kernel (presumably element
// count, operand/output offsets into `a`, and an opcode — confirm in binary.cu).
void launch_binary(float* a,
                   unsigned int n,
                   unsigned int ao, unsigned int bo, unsigned int co,
                   unsigned int op,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(binary,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, ao, bo, co, op);
}

// Runs the `unary` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n`, `io`, `oo` and `op` are forwarded unchanged, in that order
// (presumably element count, input/output offsets into `a`, and an opcode —
// confirm in unary.cu).
void launch_unary(float* a,
                  unsigned int n,
                  unsigned int io, unsigned int oo,
                  unsigned int op,
                  unsigned int gx, unsigned int bx) {
    LAUNCH(unary,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, io, oo, op);
}

// Runs the `copy` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n`, `io` and `oo` are forwarded unchanged, in that order (presumably
// element count plus source/destination offsets into `a` — confirm in copy.cu).
void launch_copy(float* a,
                 unsigned int n,
                 unsigned int io, unsigned int oo,
                 unsigned int gx, unsigned int bx) {
    LAUNCH(copy,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, io, oo);
}

// Runs the `compare` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n`, `ao`, `bo`, `co` and `op` are forwarded unchanged, in that order
// (presumably element count, operand/output offsets into `a`, and the
// comparison selector — confirm in compare.cu).
void launch_compare(float* a,
                    unsigned int n,
                    unsigned int ao, unsigned int bo, unsigned int co,
                    unsigned int op,
                    unsigned int gx, unsigned int bx) {
    LAUNCH(compare,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, ao, bo, co, op);
}

// Runs the `where_select` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// `a`, `n`, `cond_o`, `xo`, `yo` and `oo` are forwarded unchanged, in that
// order (presumably element count and the condition / x / y / output offsets
// into `a` — confirm in where_select.cu).
void launch_where_select(float* a,
                         unsigned int n,
                         unsigned int cond_o,
                         unsigned int xo, unsigned int yo,
                         unsigned int oo,
                         unsigned int gx, unsigned int bx) {
    LAUNCH(where_select,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, cond_o, xo, yo, oo);
}

// ── MatMul + DequantMatMul + GroupedMatmul (2-D dispatch) ──────────

// Runs the `matmul` kernel on a (gx, gy, gz) grid of (bx, by, 1) blocks and
// returns once the device has been synchronized.
// Every argument other than the launch geometry is forwarded unchanged, in
// declaration order. Presumably m/k/n are the GEMM dimensions, ao/bo/co the
// operand/output offsets into `a`, abs_/bbs/cbs the per-batch strides, and
// has_bias/bias_off/act_id describe an optional fused epilogue — confirm in
// matmul.cu.
void launch_matmul(float* a,
                   unsigned int m, unsigned int k, unsigned int n,
                   unsigned int ao, unsigned int bo, unsigned int co,
                   unsigned int batch,
                   unsigned int abs_, unsigned int bbs, unsigned int cbs,
                   unsigned int has_bias, unsigned int bias_off,
                   unsigned int act_id,
                   unsigned int gx, unsigned int gy, unsigned int gz,
                   unsigned int bx, unsigned int by) {
    LAUNCH(matmul,
           /*grid=*/gx, gy, gz,
           /*block=*/bx, by, 1,
           a,
           m, k, n,
           ao, bo, co,
           batch,
           abs_, bbs, cbs,
           has_bias, bias_off, act_id);
}

// Runs the `grouped_matmul` kernel on a (gx, gy, 1) grid of (bx, by, 1)
// blocks and returns once the device has been synchronized.
// `a`, m/k/n, `num_experts` and the io/wo/idx_o/oo values are forwarded
// unchanged, in that order (presumably input, weight, index and output
// offsets into `a` — confirm in grouped_matmul.cu).
void launch_grouped_matmul(float* a,
                           unsigned int m, unsigned int k, unsigned int n,
                           unsigned int num_experts,
                           unsigned int io, unsigned int wo,
                           unsigned int idx_o, unsigned int oo,
                           unsigned int gx, unsigned int gy,
                           unsigned int bx, unsigned int by) {
    LAUNCH(grouped_matmul,
           /*grid=*/gx, gy, 1,
           /*block=*/bx, by, 1,
           a,
           m, k, n,
           num_experts,
           io, wo, idx_o, oo);
}

// Runs the `dequant_matmul` kernel on a (gx, gy, 1) grid of (bx, by, 1)
// blocks and returns once the device has been synchronized.
// `a`, m/k/n, `block_size`, `scheme_id` and the xo/wo/sco/zo/oo values are
// forwarded unchanged, in that order (presumably activation, weight, scale,
// zero-point and output offsets into `a` — confirm in dequant_matmul.cu).
void launch_dequant_matmul(float* a,
                           unsigned int m, unsigned int k, unsigned int n,
                           unsigned int block_size, unsigned int scheme_id,
                           unsigned int xo, unsigned int wo,
                           unsigned int sco, unsigned int zo,
                           unsigned int oo,
                           unsigned int gx, unsigned int gy,
                           unsigned int bx, unsigned int by) {
    LAUNCH(dequant_matmul,
           /*grid=*/gx, gy, 1,
           /*block=*/bx, by, 1,
           a,
           m, k, n,
           block_size, scheme_id,
           xo, wo, sco, zo, oo);
}

// ── Reductions (1-D over outer rows) ───────────────────────────────

// Runs the `reduce` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `outer`, `inner`, `io`, `oo` and `op` are forwarded unchanged, in that
// order (presumably row count, row length, input/output offsets into `a`,
// and the reduction selector — confirm in reduce.cu).
void launch_reduce(float* a,
                   unsigned int outer, unsigned int inner,
                   unsigned int io, unsigned int oo,
                   unsigned int op,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(reduce,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner, io, oo, op);
}

// Runs the `softmax` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `outer`, `inner`, `io` and `oo` are forwarded unchanged, in that order
// (presumably row count, row length and input/output offsets into `a` —
// confirm in softmax.cu).
void launch_softmax(float* a,
                    unsigned int outer, unsigned int inner,
                    unsigned int io, unsigned int oo,
                    unsigned int gx, unsigned int bx) {
    LAUNCH(softmax,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner, io, oo);
}

// Runs the normalization kernel on a 1-D launch of `gx` blocks x `bx` threads
// and returns once the device has been synchronized.
// Note that the kernel symbol is `rlx_norm`, not `layernorm`.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably `go`/`beta_o` are gamma/beta offsets, `eps_bits` is the bit
// pattern of a float epsilon, and `op` selects the norm variant — confirm in
// layernorm.cu).
void launch_layernorm(float* a,
                      unsigned int outer, unsigned int inner,
                      unsigned int io, unsigned int oo,
                      unsigned int go, unsigned int beta_o,
                      unsigned int eps_bits,
                      unsigned int op,
                      unsigned int gx, unsigned int bx) {
    LAUNCH(rlx_norm,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner,
           io, oo,
           go, beta_o,
           eps_bits, op);
}

// Runs the `fused_residual_ln` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably io/ro/bias_o/go/beta_o/oo are input, residual, bias, gamma,
// beta and output offsets into `a`, `eps_bits` is a float epsilon bit
// pattern and `has_bias` a flag — confirm in fused_residual_ln.cu).
void launch_fused_residual_ln(float* a,
                              unsigned int outer, unsigned int inner,
                              unsigned int io, unsigned int ro,
                              unsigned int bias_o,
                              unsigned int go, unsigned int beta_o,
                              unsigned int oo,
                              unsigned int eps_bits,
                              unsigned int has_bias,
                              unsigned int gx, unsigned int bx) {
    LAUNCH(fused_residual_ln,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner,
           io, ro,
           bias_o, go, beta_o, oo,
           eps_bits, has_bias);
}

// Runs the `cumsum` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `outer`, `inner`, `io`, `oo` and `exclusive` are forwarded unchanged,
// in that order (`exclusive` presumably toggles an exclusive scan — confirm
// in cumsum.cu).
void launch_cumsum(float* a,
                   unsigned int outer, unsigned int inner,
                   unsigned int io, unsigned int oo,
                   unsigned int exclusive,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(cumsum,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner, io, oo, exclusive);
}

// Runs the `argmax` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `outer`, `inner`, `io` and `oo` are forwarded unchanged, in that order
// (presumably row count, row length and input/output offsets into `a` —
// confirm in argmax.cu).
void launch_argmax(float* a,
                   unsigned int outer, unsigned int inner,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(argmax,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner, io, oo);
}

// Runs the `topk` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `outer`, `inner`, `k`, `io` and `oo` are forwarded unchanged, in that
// order (presumably `k` is the number of entries kept per row — confirm in
// topk.cu).
void launch_topk(float* a,
                 unsigned int outer, unsigned int inner,
                 unsigned int k,
                 unsigned int io, unsigned int oo,
                 unsigned int gx, unsigned int bx) {
    LAUNCH(topk,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner, k, io, oo);
}

// ── Shape ops ───────────────────────────────────────────────────────

// Runs the `gather` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n_out`, `n_idx`, `dim`, `vocab`, `io`, `idx_o` and `oo` are forwarded
// unchanged, in that order (presumably output element count, index count,
// row width, table height, and table/index/output offsets into `a` —
// confirm in gather.cu).
void launch_gather(float* a,
                   unsigned int n_out, unsigned int n_idx,
                   unsigned int dim, unsigned int vocab,
                   unsigned int io, unsigned int idx_o, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(gather,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n_out, n_idx,
           dim, vocab,
           io, idx_o, oo);
}

// Runs the `narrow` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `total`, `outer`, `inner`, `axis_in`, `axis_out`, `start`, `io` and
// `oo` are forwarded unchanged, in that order (presumably the slice is taken
// along one axis beginning at `start` — confirm in narrow.cu).
void launch_narrow(float* a,
                   unsigned int total,
                   unsigned int outer, unsigned int inner,
                   unsigned int axis_in, unsigned int axis_out,
                   unsigned int start,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(narrow,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, total, outer, inner,
           axis_in, axis_out, start,
           io, oo);
}

// Runs the `concat` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// Takes the same argument list as launch_narrow; `a`, `total`, `outer`,
// `inner`, `axis_in`, `axis_out`, `start`, `io` and `oo` are forwarded
// unchanged, in that order (presumably one call writes a single input into
// the output at axis position `start` — confirm in concat.cu).
void launch_concat(float* a,
                   unsigned int total,
                   unsigned int outer, unsigned int inner,
                   unsigned int axis_in, unsigned int axis_out,
                   unsigned int start,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(concat,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, total, outer, inner,
           axis_in, axis_out, start,
           io, oo);
}

// Runs the `transpose` kernel on a 1-D launch of `gx` blocks x `bx` threads
// and returns once the device has been synchronized.
// `a`, `rank`, `out_total`, `io`, `oo` and the `meta` pointer are forwarded
// unchanged, in that order. `meta` is passed as a pointer, so it must stay
// valid until this call returns (its layout is defined by the kernel —
// presumably per-axis shape/stride data; confirm in transpose.cu).
void launch_transpose(float* a,
                      unsigned int rank, unsigned int out_total,
                      unsigned int io, unsigned int oo,
                      const unsigned int* meta,
                      unsigned int gx, unsigned int bx) {
    LAUNCH(transpose,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, rank, out_total, io, oo, meta);
}

// Runs the `expand` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `rank`, `out_total`, `io`, `oo` and the `meta` pointer are forwarded
// unchanged, in that order. `meta` is passed as a pointer, so it must stay
// valid until this call returns (its layout is defined by the kernel —
// confirm in expand.cu).
void launch_expand(float* a,
                   unsigned int rank, unsigned int out_total,
                   unsigned int io, unsigned int oo,
                   const unsigned int* meta,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(expand,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, rank, out_total, io, oo, meta);
}

// ── Attention + Rope ───────────────────────────────────────────────

// Runs the `attention` kernel on a 1-D launch of `gx` blocks x `bx` threads
// and returns once the device has been synchronized.
// All 29 non-geometry arguments are forwarded unchanged, in declaration
// order: problem sizes, the q/k/v/output offsets, mask description, scale and
// window, then the stride groups for the sequence, mask, q, k, v and output
// tensors. Presumably `scale_bits` is the bit pattern of a float scale and
// `mask_kind`/`window` select the masking mode — confirm in attention.cu.
void launch_attention(float* a,
                      unsigned int batch, unsigned int heads,
                      unsigned int seq_q, unsigned int seq_k,
                      unsigned int head_dim,
                      unsigned int qo, unsigned int ko,
                      unsigned int vo, unsigned int oo,
                      unsigned int mask_o, unsigned int mask_kind,
                      unsigned int scale_bits, unsigned int window,
                      unsigned int seq_q_stride, unsigned int seq_k_stride,
                      unsigned int mask_batch_stride,
                      unsigned int mask_head_stride,
                      unsigned int q_batch_stride, unsigned int q_head_stride,
                      unsigned int q_seq_stride,
                      unsigned int k_batch_stride, unsigned int k_head_stride,
                      unsigned int k_seq_stride,
                      unsigned int v_batch_stride, unsigned int v_head_stride,
                      unsigned int v_seq_stride,
                      unsigned int o_batch_stride, unsigned int o_head_stride,
                      unsigned int o_seq_stride,
                      unsigned int gx, unsigned int bx) {
    LAUNCH(attention,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a,
           batch, heads, seq_q, seq_k, head_dim,
           qo, ko, vo, oo,
           mask_o, mask_kind, scale_bits, window,
           seq_q_stride, seq_k_stride,
           mask_batch_stride, mask_head_stride,
           q_batch_stride, q_head_stride, q_seq_stride,
           k_batch_stride, k_head_stride, k_seq_stride,
           v_batch_stride, v_head_stride, v_seq_stride,
           o_batch_stride, o_head_stride, o_seq_stride);
}

// Runs the `rope` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// `a`, `n_total`, `seq`, `head_dim`, `half`, `io`, `co`, `so`, `oo` and
// `last_dim` are forwarded unchanged, in that order (presumably `co`/`so`
// are offsets of cos/sin tables inside `a` — confirm in rope.cu).
void launch_rope(float* a,
                 unsigned int n_total, unsigned int seq,
                 unsigned int head_dim, unsigned int half,
                 unsigned int io, unsigned int co,
                 unsigned int so, unsigned int oo,
                 unsigned int last_dim,
                 unsigned int gx, unsigned int bx) {
    LAUNCH(rope,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n_total, seq, head_dim, half,
           io, co, so, oo,
           last_dim);
}

// ── ScatterAdd (two phases) ────────────────────────────────────────

// Runs the `scatter_add_zero` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// `a`, `oo` and `total` are forwarded unchanged, in that order (presumably
// this phase clears `total` output elements starting at offset `oo` before
// the accumulate phase runs — confirm in scatter_add.cu).
void launch_scatter_add_zero(float* a,
                             unsigned int oo, unsigned int total,
                             unsigned int gx, unsigned int bx) {
    LAUNCH(scatter_add_zero,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, oo, total);
}

// Runs the `scatter_add_acc` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// `a`, `oo`, `upd_o`, `idx_o`, `n_upd`, `trailing` and `out_dim` are
// forwarded unchanged, in that order (presumably output/update/index offsets
// into `a` followed by the update count and shape info — confirm in
// scatter_add.cu).
void launch_scatter_add_acc(float* a,
                            unsigned int oo,
                            unsigned int upd_o, unsigned int idx_o,
                            unsigned int n_upd,
                            unsigned int trailing, unsigned int out_dim,
                            unsigned int gx, unsigned int bx) {
    LAUNCH(scatter_add_acc,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, oo, upd_o, idx_o,
           n_upd, trailing, out_dim);
}

// ── Sample + SelectiveScan ─────────────────────────────────────────

// Runs the `sample` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably `top_p_bits`/`temp_bits` are float bit patterns and
// `seed_lo`/`seed_hi` the two halves of a 64-bit RNG seed — confirm in
// sample.cu).
void launch_sample(float* a,
                   unsigned int outer, unsigned int inner,
                   unsigned int io, unsigned int oo,
                   unsigned int top_k,
                   unsigned int top_p_bits, unsigned int temp_bits,
                   unsigned int seed_lo, unsigned int seed_hi,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(sample,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, outer, inner,
           io, oo,
           top_k, top_p_bits, temp_bits,
           seed_lo, seed_hi);
}

// Runs the `selective_scan` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// `a`, `batch`, `seq`, `hidden`, `state_size` and the xo/dt_o/ao/bo/co/oo
// values are forwarded unchanged, in that order (presumably offsets of the
// x, dt, A, B, C and output tensors inside `a` — confirm in
// selective_scan.cu).
void launch_selective_scan(float* a,
                           unsigned int batch, unsigned int seq,
                           unsigned int hidden, unsigned int state_size,
                           unsigned int xo, unsigned int dt_o,
                           unsigned int ao, unsigned int bo,
                           unsigned int co, unsigned int oo,
                           unsigned int gx, unsigned int bx) {
    LAUNCH(selective_scan,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, batch, seq, hidden, state_size,
           xo, dt_o, ao, bo, co, oo);
}

// ── Pool / Conv (1D, 2D, 3D) ───────────────────────────────────────

// Runs the `pool1d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably n/c/l are the input shape, `l_out` the output length,
// kl/sl/pl the kernel size, stride and padding, and `op` the pooling mode —
// confirm in pool1d.cu).
void launch_pool1d(float* a,
                   unsigned int n, unsigned int c, unsigned int l,
                   unsigned int l_out,
                   unsigned int kl, unsigned int sl, unsigned int pl,
                   unsigned int op,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(pool1d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c, l,
           l_out,
           kl, sl, pl,
           op,
           io, oo);
}

// Runs the `pool2d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably input shape, output extents, then per-axis kernel size,
// stride and padding, the pooling mode `op`, and input/output offsets —
// confirm in pool2d.cu).
void launch_pool2d(float* a,
                   unsigned int n, unsigned int c,
                   unsigned int h, unsigned int w,
                   unsigned int h_out, unsigned int w_out,
                   unsigned int kh, unsigned int kw,
                   unsigned int sh, unsigned int sw,
                   unsigned int ph, unsigned int pw,
                   unsigned int op,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(pool2d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c, h, w,
           h_out, w_out,
           kh, kw,
           sh, sw,
           ph, pw,
           op,
           io, oo);
}

// Runs the `pool3d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably input shape, output extents, then per-axis kernel size,
// stride and padding, the pooling mode `op`, and input/output offsets —
// confirm in pool3d.cu).
void launch_pool3d(float* a,
                   unsigned int n, unsigned int c,
                   unsigned int d, unsigned int h, unsigned int w,
                   unsigned int d_out, unsigned int h_out, unsigned int w_out,
                   unsigned int kd, unsigned int kh, unsigned int kw,
                   unsigned int sd, unsigned int sh, unsigned int sw,
                   unsigned int pd, unsigned int ph, unsigned int pw,
                   unsigned int op,
                   unsigned int io, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(pool3d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c, d, h, w,
           d_out, h_out, w_out,
           kd, kh, kw,
           sd, sh, sw,
           pd, ph, pw,
           op,
           io, oo);
}

// Runs the `conv1d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably kl/sl/pl/dl are kernel size, stride, padding and dilation, and
// io/wo/oo the input, weight and output offsets into `a` — confirm in
// conv1d.cu).
void launch_conv1d(float* a,
                   unsigned int n,
                   unsigned int c_in, unsigned int c_out,
                   unsigned int l, unsigned int l_out,
                   unsigned int kl, unsigned int sl,
                   unsigned int pl, unsigned int dl,
                   unsigned int groups,
                   unsigned int io, unsigned int wo, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(conv1d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c_in, c_out,
           l, l_out,
           kl, sl, pl, dl,
           groups,
           io, wo, oo);
}

// Runs the `conv2d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably input shape, output extents, then per-axis kernel size,
// stride, padding and dilation, the group count, and input/weight/output
// offsets into `a` — confirm in conv2d.cu).
void launch_conv2d(float* a,
                   unsigned int n,
                   unsigned int c_in, unsigned int c_out,
                   unsigned int h, unsigned int w,
                   unsigned int h_out, unsigned int w_out,
                   unsigned int kh, unsigned int kw,
                   unsigned int sh, unsigned int sw,
                   unsigned int ph, unsigned int pw,
                   unsigned int dh, unsigned int dw,
                   unsigned int groups,
                   unsigned int io, unsigned int wo, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(conv2d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c_in, c_out,
           h, w,
           h_out, w_out,
           kh, kw,
           sh, sw,
           ph, pw,
           dh, dw,
           groups,
           io, wo, oo);
}

// Runs the `conv3d` kernel on a 1-D launch of `gx` blocks x `bx` threads and
// returns once the device has been synchronized.
// All non-geometry arguments are forwarded unchanged, in declaration order
// (presumably input shape, output extents, then per-axis kernel size,
// stride, padding and dilation, the group count, and input/weight/output
// offsets into `a` — confirm in conv3d.cu).
void launch_conv3d(float* a,
                   unsigned int n,
                   unsigned int c_in, unsigned int c_out,
                   unsigned int d, unsigned int h, unsigned int w,
                   unsigned int d_out, unsigned int h_out, unsigned int w_out,
                   unsigned int kd, unsigned int kh, unsigned int kw,
                   unsigned int sd, unsigned int sh, unsigned int sw,
                   unsigned int pd, unsigned int ph, unsigned int pw,
                   unsigned int dd, unsigned int dh, unsigned int dw,
                   unsigned int groups,
                   unsigned int io, unsigned int wo, unsigned int oo,
                   unsigned int gx, unsigned int bx) {
    LAUNCH(conv3d,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, c_in, c_out,
           d, h, w,
           d_out, h_out, w_out,
           kd, kh, kw,
           sd, sh, sw,
           pd, ph, pw,
           dd, dh, dw,
           groups,
           io, wo, oo);
}

// Runs the `fused_binary_unary` kernel on a 1-D launch of `gx` blocks x `bx`
// threads and returns once the device has been synchronized.
// `a`, `n`, `ao`, `bo`, `oo`, `bin_op` and `un_op` are forwarded unchanged,
// in that order (presumably a binary op followed by a unary op in one pass —
// confirm in fused_binary_unary.cu).
void launch_fused_binary_unary(float* a,
                               unsigned int n,
                               unsigned int ao, unsigned int bo,
                               unsigned int oo,
                               unsigned int bin_op, unsigned int un_op,
                               unsigned int gx, unsigned int bx) {
    LAUNCH(fused_binary_unary,
           /*grid=*/gx, 1, 1,
           /*block=*/bx, 1, 1,
           a, n, ao, bo, oo, bin_op, un_op);
}

void launch_elementwise_region(float* a, unsigned int len,
                               unsigned int num_inputs, unsigned int num_steps,
                               unsigned int dst_off, const unsigned int* meta,
                               unsigned int scalar_input_mask,
                               const unsigned int* input_modulus,
                               unsigned int gx, unsigned int bx) {
    // Pack the modulus into the same struct the .cu kernel expects.
    InputModulus mod_struct;
    for (int i = 0; i < 16; ++i) mod_struct.v[i] = input_modulus[i];
    LAUNCH(elementwise_region, gx,1,1, bx,1,1,
        a, len, num_inputs, num_steps, dst_off, meta,
        scalar_input_mask, mod_struct);
}

void launch_batch_elementwise_region(float* a, unsigned int slice_len,
                                     unsigned int num_batch,
                                     unsigned int num_steps,
                                     unsigned int base_dst_off,
                                     unsigned int slice_elems,
                                     const unsigned int* batch_input_offs,
                                     const unsigned int* meta,
                                     unsigned int scalar_input_mask,
                                     const unsigned int* input_modulus,
                                     unsigned int gx, unsigned int bx) {
    InputModulus mod_struct;
    for (int i = 0; i < 16; ++i) mod_struct.v[i] = input_modulus[i];
    LAUNCH(batch_elementwise_region, gx, 1, num_batch, bx, 1, 1,
           a, slice_len, num_batch, num_steps, base_dst_off, slice_elems,
           batch_input_offs, meta, scalar_input_mask, mod_struct);
}

}  // extern "C"