realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors

/// CUDA kernel types supported by realizar
#[derive(Debug, Clone)]
#[allow(missing_docs)]
pub enum KernelType {
    /// Naive GEMM (simple, for reference)
    GemmNaive {
        m: u32,
        n: u32,
        k: u32,
    },
    /// Tiled GEMM with shared memory
    GemmTiled {
        m: u32,
        n: u32,
        k: u32,
        tile_size: u32,
    },
    /// Tensor Core GEMM (fp16)
    GemmTensorCore {
        m: u32,
        n: u32,
        k: u32,
    },
    /// GEMV (General Matrix-Vector Multiply) - optimized for M=1 (single token generation)
    Gemv {
        k: u32,
        n: u32,
    },
    /// Coalesced GEMV - high-bandwidth M=1 kernel with memory coalescing
    CoalescedGemv {
        k: u32,
        n: u32,
    },
    /// Numerically stable softmax
    Softmax {
        dim: u32,
    },
    /// Layer normalization
    LayerNorm {
        hidden_size: u32,
        epsilon: f32,
        affine: bool,
    },
    /// FlashAttention-style attention (single head)
    Attention {
        seq_len: u32,
        head_dim: u32,
        causal: bool,
    },
    /// Multi-head attention with parallel head processing (PARITY-043)
    MultiHeadAttention {
        seq_len: u32,
        head_dim: u32,
        n_heads: u32,
        causal: bool,
    },
    /// Tensor Core FlashAttention (FP16 WMMA) - REALIZAR-PARITY-001.3
    AttentionTensorCore {
        seq_len: u32,
        head_dim: u32,
        n_heads: u32,
        causal: bool,
    },
    /// Q4_K quantized GEMM (fused dequantization) - simplified format
    QuantizedGemm {
        m: u32,
        n: u32,
        k: u32,
    },
    /// Q4_K quantized GEMM (fused dequantization) - GGML super-block format (PARITY-041)
    QuantizedGemmGgml {
        m: u32,
        n: u32,
        k: u32,
    },
    /// GH-182: Tiled Q4_K GEMM — weight reuse across tile_m rows for prefill
    QuantizedGemmGgmlTiled {
        m: u32,
        n: u32,
        k: u32,
        tile_m: u32,
    },
    /// Q5_K quantized GEMM (fused dequantization) - GGML super-block format (PARITY-116)
    Q5KQuantizedGemm {
        m: u32,
        n: u32,
        k: u32,
    },
    /// Q6_K quantized GEMM (fused dequantization) - GGML super-block format (PARITY-117)
    Q6KQuantizedGemm {
        m: u32,
        n: u32,
        k: u32,
    },
    /// Optimized GEMM with register blocking (IMP-900a)
    GemmOptimized {
        m: u32,
        n: u32,
        k: u32,
        tile_size: u32,
        reg_block: u32,
    },
    /// Fused GEMM + bias + activation (IMP-900b)
    GemmBiasActivation {
        m: u32,
        n: u32,
        k: u32,
        activation: u32,
    },
    /// Element-wise bias + activation epilogue (IMP-1000)
    BiasActivation {
        n: u32,
        bias_size: u32,
        activation: u32,
    },
    /// FP16 Tensor Core GEMM with WMMA intrinsics (IMP-1000a)
    GemmFp16TensorCore {
        m: u32,
        n: u32,
        k: u32,
    },
    /// Fused Q4_K × Q8_0 dot product kernel (PARITY-073)
    FusedQ4Q8Dot {
        n: u32,
    },
    /// Q4_K quantized GEMV (fused dequantization) - PAR-003
    Q4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-041: Tiled Q4_K GEMV with shared memory input caching
    TiledQ4KGemv {
        k: u32,
        n: u32,
        outputs_per_block: u32,
    },
    /// PAR-056: Chunked Tiled Q4_K GEMV for large K dimensions
    ChunkedTiledQ4KGemv {
        k: u32,
        n: u32,
        outputs_per_block: u32,
    },
    /// PAR-062: Coalesced Q4_K GEMV with bandwidth-optimized memory access
    CoalescedQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-132: Wide Q4_K GEMV with 256 threads (8 warps) per output
    WideQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-069: Vectorized Q4_K GEMV with coalesced u32 weight loads
    VectorizedQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-082-V2: Multi-warp Vectorized Q4_K GEMV
    MwvQ4KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
    /// PAR-082-V4: Multi-warp DP4A Q4_K GEMV with Q8_1-quantized activations
    MwvDp4aQ4KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
    /// GH-176: Half-warp DP4A Q4_K GEMV (16 threads per super-block, 1.77x fewer thread instructions)
    HwDp4aQ4KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
    /// PAR-063: DP4A-based Q4_K GEMV with 4x instruction reduction
    Dp4aQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-063-V2: DP4A SIMD Q4_K GEMV with true integer accumulation
    Dp4aSIMDQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-063-V4: Q8 Quantization kernel for activations
    Q8Quantize {
        n: u32,
    },
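    // Note (assumption, not specified in this file): Q8 activation quantization in the
    // GGML family typically works per block, with scale = max(|x|) / 127 over the block
    // and each element rounded as round(x / scale) into int8; the Q8_1 layout consumed
    // by the DP4A kernels above additionally stores a per-block sum alongside the scale.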
    /// PAR-063-V5: Q4K × Q8 dot product kernel using integer arithmetic
    Q4KQ8Dot {
        k: u32,
        n: u32,
    },
    /// PAR-063-V6: Packed DP4A Q4K×Q8 kernel with true dp4a.u32.s32 instruction
    PackedDp4aQ4KQ8 {
        k: u32,
        n: u32,
    },
    /// PAR-063-V3: True DP4A Q4K GEMV with proper nibble expansion
    TrueDp4aQ4KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-094: Tensor Core Q4K GEMM for batched speculative decode
    TensorCoreQ4KGemm {
        m: u32,
        k: u32,
        n: u32,
    },
    /// PMAT-045: Multi-Warp Tensor Core Q4K GEMM (4 warps, 32×32 tiles)
    MultiWarpTensorCoreQ4KGemm {
        m: u32,
        k: u32,
        n: u32,
    },
    /// PMAT-091: Coalesced WMMA Q4K GEMM with interleaved weights
    InterleavedWmmaQ4KGemm {
        m: u32,
        k: u32,
        n: u32,
    },
    /// PMAT-054B: W4A16 WMMA Q4K GEMM with pre-computed FP16 scales
    W4a16WmmaQ4KGemm {
        m: u32,
        k: u32,
        n: u32,
    },
    /// PMAT-066: DP4A Q4K×Q8 GEMM — dequant-free prefill via int8 dot product
    Dp4aQ4KGemm {
        m: u32,
        n: u32,
        k: u32,
    },
    /// PAR-108: Batched Q4_K GEMV for 2x Ollama via shared dequantization
    BatchedQ4KGemv {
        m: u32,
        k: u32,
        n: u32,
    },
    /// PAR-129: Multi-warp batched Q4_K GEMV for M=16/32
    MultiWarpBatchedQ4KGemv {
        k: u32,
        n: u32,
        warps: u32,
    },
    /// GH-141: Batched HW DP4A Q4_K GEMV for M=2..8
    /// Reads Q4_K weights once and M Q8_1 activations per super-block; 4.7x lower bandwidth than cuBLAS SGEMM.
    BatchedHwDp4aQ4KGemv {
        k: u32,
        n: u32,
        m: u32,
        num_warps: u32,
    },
    /// PMAT-293: Fused FP32-input Q4K GEMV (no Q8 pre-quantize)
    /// Reads FP32 activations directly, dequants Q4K to FP32, FMA accumulate.
    /// Eliminates Q8 quantize launch — 1 kernel instead of 2 per projection.
    FusedFp32Q4KGemv {
        k: u32,
        n: u32,
        m: u32,
        num_warps: u32,
    },
    /// PMAT-295: Inline Q8 DP4A Q4K GEMV (fuses Q8 quantize into DP4A)
    /// Reads FP32 input, quantizes to INT8 in-register per-thread, then DP4A.
    /// Saves 1 launch per projection (no separate Q8 quantize kernel).
    InlineQ8Dp4aQ4KGemv {
        k: u32,
        n: u32,
        m: u32,
        num_warps: u32,
    },
    /// Q5_K quantized GEMV (fused dequantization) - PAR-003
    Q5KGemv {
        k: u32,
        n: u32,
    },
    /// Q6_K quantized GEMV (fused dequantization) - PAR-003
    Q6KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-066: Coalesced Q6_K GEMV with vectorized scale loading
    CoalescedQ6KGemv {
        k: u32,
        n: u32,
    },
    /// PAR-130: Batched Q6_K GEMV for M>1 batch processing
    BatchedQ6KGemv {
        k: u32,
        n: u32,
        m: u32,
    },
    /// GH-118: Multi-warp Q6_K GEMV for Orin decode throughput (Design by Contract)
    ///
    /// Contracts:
    /// - Precondition: k % 256 == 0 (Q6K super-block alignment)
    /// - Precondition: num_warps in {1,2,3,4,6,8}
    /// - Postcondition: output identical to Q6KGemv (parity contract)
    /// - Invariant: PARITY-114 barrier safety (all threads reach bar.sync)
    MwvQ6KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
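    // A minimal sketch of the guard implied by the contracts above (helper name is
    // hypothetical; the real validation is not part of this enum):
    //
    //     fn mwv_q6k_preconditions(k: u32, num_warps: u32) -> bool {
    //         k % 256 == 0 && matches!(num_warps, 1 | 2 | 3 | 4 | 6 | 8)
    //     }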
    /// DP4A Q6_K GEMV with vectorized int32 loads and dp4a.u32.s32
    ///
    /// ~4x instruction reduction vs MWV Q6K. Pre-quantizes activations to Q8_1.
    /// Enable with `DP4A_Q6K=1` env var.
    Dp4aQ6KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
    /// PMAT-030: Half-warp DP4A Q6K GEMV — 16 threads per super-block, direct scale loads
    HwDp4aQ6KGemv {
        k: u32,
        n: u32,
        num_warps: u32,
    },
    /// PAR-053: FP16 Q4_K GEMV - 2x bandwidth savings vs FP32
    Fp16Q4KGemv {
        k: u32,
        n: u32,
    },
    /// Q8_0 quantized GEMV (fused dequantization) - PAR-058
    Q8_0Gemv {
        k: u32,
        n: u32,
    },
    /// Q5_0 quantized GEMV (fused dequantization) - PAR-058
    Q5_0Gemv {
        k: u32,
        n: u32,
    },
    /// Q4_0 quantized GEMV (fused dequantization) - PAR-058
    Q4_0Gemv {
        k: u32,
        n: u32,
    },
    /// Q4_1 quantized GEMV (fused dequantization) - PAR-058
    Q4_1Gemv {
        k: u32,
        n: u32,
    },
    /// Incremental attention for M=1 autoregressive decoding (PAR-020 + PAR-021)
    IncrementalAttention {
        max_seq_len: u32,
        head_dim: u32,
        n_heads: u32,
        n_kv_heads: u32,
        indirect: bool,
    },
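    // Illustration (standard grouped-query attention, inferred from the field names):
    // with n_heads = 32 and n_kv_heads = 8, each cached KV head is shared by
    // 32 / 8 = 4 query heads; n_kv_heads == n_heads degenerates to plain MHA.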
    /// PAR-070: Multi-warp incremental attention for decode phase
    MultiWarpAttention {
        max_seq_len: u32,
        head_dim: u32,
        n_heads: u32,
        n_kv_heads: u32,
        num_warps_per_head: u32,
        indirect: bool,
    },
    /// PAR-052: KV Cache Scatter kernel
    KvCacheScatter {
        num_kv_heads: u32,
        head_dim: u32,
        max_len: u32,
    },
    /// PAR-054: KV Cache Scatter with Indirect Position (CUDA Graph Compatible)
    KvCacheScatterIndirect {
        num_kv_heads: u32,
        head_dim: u32,
        max_len: u32,
    },
    /// PAR-023: RMSNorm kernel (Root Mean Square Layer Normalization)
    RmsNorm {
        hidden_size: u32,
        epsilon: f32,
    },
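    // For reference (standard RMSNorm definition; this enum only carries the shape):
    // y_i = x_i / sqrt(mean(x^2) + epsilon) * w_i, with w the learned gain vector.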
    /// PAR-081: Vectorized RMSNorm kernel with 256 threads
    VectorizedRmsNorm {
        hidden_size: u32,
        epsilon: f32,
    },
    /// PAR-112: Batched Vectorized RMSNorm kernel
    BatchedVectorizedRmsNorm {
        hidden_size: u32,
        batch_size: u32,
        epsilon: f32,
    },
    /// PMAT-092: Batched fused residual add + RMSNorm kernel
    BatchedFusedResidualRmsNorm {
        hidden_size: u32,
        batch_size: u32,
        epsilon: f32,
    },
    /// CORRECTNESS-013: High-precision RMSNorm kernel for CPU/GPU bit-exactness
    PreciseRmsNorm {
        hidden_size: u32,
        epsilon: f32,
    },
    /// GH-559: Shared-memory RMSNorm (no warp shuffles) for Blackwell sm_121
    SharedMemRmsNorm {
        hidden_size: u32,
        epsilon: f32,
    },
    /// GH-280: Per-head QK RMSNorm (Qwen3)
    PerHeadRmsNorm {
        head_dim: u32,
        num_heads: u32,
        epsilon: f32,
    },
    /// PAR-114: Batched RoPE kernel
    BatchedRope {
        num_heads: u32,
        head_dim: u32,
        batch_size: u32,
        theta: f32,
    },
    /// PAR-114: Batched Residual Add kernel
    BatchedResidualAdd {
        n: u32,
        batch_size: u32,
    },
    /// PAR-114: Batched SwiGLU kernel
    BatchedSwiglu {
        n: u32,
        batch_size: u32,
    },
    /// PAR-023: Residual Add kernel for async pipeline
    ResidualAdd {
        n: u32,
    },
    /// PAR-023: Fused Residual Add + RMSNorm kernel
    FusedResidualRmsNorm {
        hidden_size: u32,
        epsilon: f32,
    },
    /// PAR-076: Fused RMSNorm + Q4K GEMV kernel
    FusedRmsNormQ4KGemv {
        k: u32,
        n: u32,
        epsilon: f32,
    },

    /// PAR-077: Fused gate + up Q4K GEMV kernel
    FusedGateUpQ4KGemv {
        k: u32,
        n: u32,
    },

    /// PMAT-034: Fused gate + up + SwiGLU HW DP4A Q4K GEMV kernel
    /// Eliminates 2 kernel launches + 4 intermediate buffer passes
    FusedGateUpSwigluHwDp4aQ4KGemv {
        k: u32,
        n: u32,
    },

    /// trueno#237: Fused K+V HW DP4A Q4K GEMV — 2 projections in 1 launch
    FusedKVHwDp4aQ4KGemv {
        k: u32,
        n: u32,
    },

    /// QWEN-009: 3-way fused kernel: RMSNorm → Gate/Up Q4K GEMV → SwiGLU
    FusedRmsNormGateUpSwigluQ4K {
        k: u32,
        n: u32,
        epsilon: f32,
    },

    // =========================================================================
    // PAR-023: Activation and Element-wise Kernels for GPU-Resident Pipeline
    // =========================================================================
    /// SiLU activation: output = x * sigmoid(x)
    Silu {
        n: u32,
    },

    /// GELU activation: output ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
    Gelu {
        n: u32,
    },

    /// Element-wise multiply: output = input1 * input2
    ElementwiseMul {
        n: u32,
    },

    /// Fused SwiGLU: output = silu(gate) * up
    FusedSwiglu {
        n: u32,
    },
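    // Worked example of the fused element-wise op above: for gate = 1.0 and
    // up = 2.0, silu(1.0) = 1.0 * sigmoid(1.0) ≈ 0.731, so the output ≈ 1.462.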

    /// PMAT-PERF-009: Fused Q/K/V projection kernel
    FusedQKV {
        hidden_size: u32,
        kv_dim: u32,
    },

    /// PMAT-PERF-009: Fused Gate+Up FFN kernel with SwiGLU
    FusedGateUp {
        hidden_size: u32,
        intermediate_size: u32,
    },

    /// PAR-060: RoPE (Rotary Position Embedding) kernel
    Rope {
        num_heads: u32,
        head_dim: u32,
        theta: f32,
    },
    /// PAR-054: RoPE with Indirect Position (CUDA Graph Compatible)
    RopeIndirect {
        num_heads: u32,
        head_dim: u32,
        theta: f32,
    },
    /// CORRECTNESS-011: RoPE NEOX style (split halves)
    RopeNeox {
        num_heads: u32,
        head_dim: u32,
        theta: f32,
    },
    /// CORRECTNESS-011: RoPE NEOX Indirect (CUDA Graph compatible)
    RopeNeoxIndirect {
        num_heads: u32,
        head_dim: u32,
        theta: f32,
    },
    /// CORRECTNESS-013: Precise RoPE NEOX Indirect (no .approx trig)
    PreciseRopeNeoxIndirect {
        num_heads: u32,
        head_dim: u32,
        theta: f32,
    },
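    // Background on the RoPE variants above (standard formulation, assumed here):
    // dimension pair i uses angle pos * theta^(-2i / head_dim); the base variant
    // rotates interleaved pairs (x_{2i}, x_{2i+1}), while the NEOX-style kernels
    // rotate (x_i, x_{i + head_dim/2}), i.e. the split-halves layout.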
    /// PAR-062: ArgMax block reduction kernel
    ArgMax {
        length: u32,
    },
    /// PAR-062: ArgMax final reduction kernel
    ArgMaxFinal {
        num_blocks: u32,
    },
    /// QWEN-007: Q8 dequantization kernel for KV cache
    Q8Dequant {
        n: u32,
    },
    /// PMAT-024: Q4K dequantization kernel for cuBLAS GEMM prefill
    Q4KDequant {
        k: u32,
        n: u32,
    },
    /// PMAT-026: Q6K dequantization kernel for cuBLAS GEMM prefill
    Q6KDequant {
        k: u32,
        n: u32,
    },
    /// PMAT-065: Q4K → FP16 direct dequantization (no F32 intermediate)
    Q4KDequantFp16 {
        k: u32,
        n: u32,
    },
}
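
// A minimal usage sketch (hypothetical sizes and call site; the launcher API is
// outside this file): choosing a decode-time GEMV descriptor versus a prefill
// GEMM descriptor for a Q4_K-quantized projection.
//
//     // Decode (M = 1): fused-dequant GEMV over an [n x k] Q4_K weight.
//     let decode = KernelType::Q4KGemv { k: 4096, n: 11008 };
//
//     // Prefill (M > 1): GGML super-block Q4_K GEMM over the same weight.
//     let prefill = KernelType::QuantizedGemmGgml { m: 128, n: 11008, k: 4096 };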