ripvec-core 0.13.22

Semantic code search engine — GPU-accelerated ModernBERT embeddings, tree-sitter chunking, hybrid BM25+vector ranking
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
//! Hardware-agnostic compute driver trait.
//!
//! The [`Driver`] trait exposes low-level compute primitives (GEMM, layer-norm,
//! activations, etc.) that each hardware backend implements. Model architectures
//! are generic over `D: Driver` and compose these primitives into a forward pass.
//!
//! # Design
//!
//! - **Associated type `Tensor`**: each driver defines its own opaque tensor
//!   handle (Metal: buffer+offset, CUDA: device pointer, CPU: ndarray).
//! - **Not object-safe**: architectures use `D: Driver` generics so the compiler
//!   can monomorphize and inline driver calls.
//! - **Send + Sync**: drivers are shared across the pipeline.

#[cfg(any(feature = "cpu", feature = "cpu-accelerate"))]
pub mod cpu;
#[cfg(feature = "cuda")]
pub mod cuda;
#[cfg(feature = "metal")]
pub mod metal;
#[cfg(feature = "mlx")]
pub mod mlx;

use super::Encoding;

/// Hardware-agnostic compute primitives for BERT inference.
///
/// Each method corresponds to one operation in the forward pass. Drivers handle
/// memory allocation, kernel dispatch, and synchronization. Architectures
/// compose these primitives via the [`super::arch::ModelArch`] trait.
pub trait Driver: Send + Sync {
    /// Opaque tensor handle.
    ///
    /// Metal: `MTLBuffer` + byte offset. CUDA: `CUdeviceptr`. CPU: `Array2<f32>`.
    ///
    /// The trait places no bounds on this type and never asks a tensor for its
    /// shape: every primitive receives the dimensions it needs as explicit
    /// `usize` arguments.
    type Tensor;

    /// Short human-readable label for diagnostics (e.g. "Metal", "CUDA", "CPU").
    /// Surfaced via [`super::EmbedBackend::name`].
    ///
    /// The returned string is `'static`, so it does not borrow from the driver.
    fn name(&self) -> &'static str;

    /// Create a new driver instance for a cloned worker thread.
    ///
    /// CPU drivers are zero-size and always succeed. GPU drivers typically
    /// cannot be cloned this way (they share device state) and should leave
    /// the default panic implementation.
    fn new_for_clone() -> crate::Result<Self>
    where
        Self: Sized,
    {
        Err(crate::Error::Other(anyhow::anyhow!(
            "this driver does not support cloning"
        )))
    }

    // --- Batching ---

    /// Begin batched mode: all subsequent operations encode into one dispatch.
    ///
    /// GPU drivers accumulate into a single command buffer; CPU is a no-op.
    /// Call [`Self::end_batch`] to commit. This eliminates per-call overhead.
    ///
    /// # Errors
    ///
    /// The default implementation never fails; overriding backends may
    /// return an error.
    fn begin_batch(&self) -> crate::Result<()> {
        Ok(())
    }

    /// End batched mode: commit all accumulated operations and wait.
    ///
    /// Counterpart of [`Self::begin_batch`].
    ///
    /// # Errors
    ///
    /// The default implementation never fails; overriding backends may
    /// return an error.
    fn end_batch(&self) -> crate::Result<()> {
        Ok(())
    }

    /// Flush the current command buffer and start a new one, preserving pool
    /// state. Use mid-forward-pass to prevent GPU timeouts on deep models.
    ///
    /// # Errors
    ///
    /// The default implementation never fails; overriding backends may
    /// return an error.
    fn flush_batch(&self) -> crate::Result<()> {
        Ok(())
    }

    /// Close and reopen the compute encoder within the same command buffer.
    ///
    /// This segments a long sequence of compute dispatches into multiple
    /// encoders without committing or waiting. Metal processes encoders
    /// back-to-back from the same CB — zero sync overhead.
    ///
    /// Use every few layers to prevent encoder state overflow (>~60 dispatches
    /// per encoder can cause hangs on some Apple Silicon GPUs).
    ///
    /// The default implementation does nothing and cannot fail.
    fn segment_encoder(&self) {
        // No-op for non-Metal backends
    }

    /// Save the current pool cursor position. Call BEFORE a layer's work.
    ///
    /// Pass the returned value to [`Self::restore_pool_cursor`] once the
    /// layer's transient tensors are gone. The default implementation
    /// returns `0`.
    fn save_pool_cursor(&self) -> usize {
        0
    }

    /// Restore the pool cursor to a previously saved position. Call AFTER
    /// a layer's transient tensors have been dropped (out of scope).
    ///
    /// The architecture must ensure only the output tensor (`hidden_states`)
    /// survives — all layer-internal tensors (qkv, scores, context, etc.)
    /// must be dropped before this call so their pool slots can be recycled.
    ///
    /// `_saved` is the value obtained from [`Self::save_pool_cursor`]. The
    /// default implementation ignores it and does nothing.
    fn restore_pool_cursor(&self, _saved: usize) {}

    // --- Allocation ---

    /// Allocate a zero-initialized tensor with `n` float elements on device.
    ///
    /// Used by architectures to create workspace buffers (QKV projections,
    /// attention scores, intermediate activations, etc.).
    ///
    /// [`Self::alloc_zeros_f16`] is the half-precision counterpart.
    ///
    /// # Errors
    ///
    /// Returns an error if device memory allocation fails.
    fn alloc_zeros(&self, n: usize) -> crate::Result<Self::Tensor>;

    /// Clone a tensor, producing an independent copy of the data.
    ///
    /// Used when an operation needs both the original and a mutable output
    /// referencing the same logical data (e.g., in-place layer normalization
    /// where input == output).
    ///
    /// `n` is presumably the number of elements to copy from `tensor` —
    /// TODO confirm against the backend implementations.
    ///
    /// # Errors
    ///
    /// Returns an error if device memory allocation or the copy fails.
    fn clone_tensor(&self, tensor: &Self::Tensor, n: usize) -> crate::Result<Self::Tensor>;

    // --- Batch preparation ---

    /// Prepare a batch of encodings for inference, returning input tensors on device.
    ///
    /// Pads all sequences to `max_seq` and uploads `input_ids`, `attention_mask`,
    /// `token_type_ids`, `position_ids`, and a float attention mask to device memory.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn prepare_batch(
        &self,
        encodings: &[Encoding],
        max_seq: usize,
    ) -> crate::Result<BatchInputs<Self::Tensor>>;

    /// Prepare a batch WITHOUT padding — all tokens concatenated flat.
    ///
    /// Backends that support unpadded execution return `BatchInputs` holding
    /// `total_tokens` real tokens, `cu_seqlens` marking the attention
    /// boundaries, and per-token position IDs. Linear layers (GEMM, LN, GELU)
    /// then process `total_tokens` rows, while attention pads/unpads around
    /// the per-head operations.
    ///
    /// The default implementation does not produce an unpadded batch: it
    /// forwards to [`Self::prepare_batch`], padding to the longest
    /// `input_ids` length rounded up to a multiple of 8 (0 for an empty
    /// slice).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Self::prepare_batch`].
    fn prepare_batch_unpadded(
        &self,
        encodings: &[Encoding],
    ) -> crate::Result<BatchInputs<Self::Tensor>> {
        // Padded fallback; backends with real unpadded support override this.
        let longest = encodings
            .iter()
            .fold(0_usize, |acc, enc| acc.max(enc.input_ids.len()));
        let padded_len = longest.next_multiple_of(8);
        self.prepare_batch(encodings, padded_len)
    }

    /// Scatter flat `[total_tokens, dim]` tensor into padded `[batch, max_seq, dim]`.
    ///
    /// Used before attention: linear layers produce unpadded output, but the
    /// QKV split + batched attention GEMM need aligned `[batch*heads, seq, head_dim]`.
    /// Padding positions are zeroed.
    ///
    /// [`Self::unpad_from_batch`] performs the inverse gather.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn pad_to_batch(
        &self,
        flat: &Self::Tensor,
        padded: &mut Self::Tensor,
        seq_lengths: &[usize],
        max_seq: usize,
        dim: usize,
    ) -> crate::Result<()>;

    /// Gather padded `[batch, max_seq, dim]` back to flat `[total_tokens, dim]`.
    ///
    /// Used after attention: extracts only the real tokens, discarding padding.
    ///
    /// [`Self::pad_to_batch`] performs the inverse scatter.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn unpad_from_batch(
        &self,
        padded: &Self::Tensor,
        flat: &mut Self::Tensor,
        seq_lengths: &[usize],
        max_seq: usize,
        dim: usize,
    ) -> crate::Result<()>;

    // --- Embedding operations ---

    /// Word/position/token-type embedding lookup via gather.
    ///
    /// Reads `seq_len` token IDs from `word_ids`, gathers rows from
    /// `embedding_table`, and writes `[seq_len, hidden]` floats to the result.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn embedding_lookup(
        &self,
        word_ids: &Self::Tensor,
        embedding_table: &Self::Tensor,
        seq_len: usize,
        hidden: usize,
    ) -> crate::Result<Self::Tensor>;

    /// Element-wise add an embedding table lookup into `hidden`.
    ///
    /// Used for position and token-type embeddings:
    /// `hidden[i] += table[ids[i]]` for each token position.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn add_embeddings(
        &self,
        hidden: &mut Self::Tensor,
        table: &Self::Tensor,
        ids: &Self::Tensor,
        seq_len: usize,
        hidden_dim: usize,
    ) -> crate::Result<()>;

    // --- Normalization ---

    /// Layer normalization: `output = (input - mean) / sqrt(var + eps) * weight + bias`.
    ///
    /// [`Self::layer_norm_f16`] is the half-precision counterpart.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn layer_norm(
        &self,
        output: &mut Self::Tensor,
        input: &Self::Tensor,
        weight: &Self::Tensor,
        bias: &Self::Tensor,
        rows: usize,
        cols: usize,
        eps: f32,
    ) -> crate::Result<()>;

    // --- Linear algebra ---

    /// General matrix multiply: `output = A * B` (or `A * B^T` if `transpose_b`).
    ///
    /// Dimensions: A is `[m, k]`, B is `[k, n]` (or `[n, k]` if transposed),
    /// output is `[m, n]`.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn gemm(
        &self,
        a: &Self::Tensor,
        b: &Self::Tensor,
        output: &mut Self::Tensor,
        m: usize,
        n: usize,
        k: usize,
        transpose_b: bool,
    ) -> crate::Result<()>;

    /// Batched GEMM for multi-head attention.
    ///
    /// Performs `batch_count` independent GEMMs with strided access into
    /// contiguous buffers. Used for per-head Q*K^T and attn*V.
    ///
    /// `m`, `n`, `k` and `transpose_b` have the same names as in
    /// [`Self::gemm`]; `stride_a`, `stride_b` and `stride_c` are presumably
    /// the element offsets between consecutive matrices in `a`, `b` and
    /// `output` — TODO confirm the unit against the backends.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn gemm_batched(
        &self,
        a: &Self::Tensor,
        b: &Self::Tensor,
        output: &mut Self::Tensor,
        m: usize,
        n: usize,
        k: usize,
        transpose_b: bool,
        stride_a: usize,
        stride_b: usize,
        stride_c: usize,
        batch_count: usize,
    ) -> crate::Result<()>;

    // --- Attention ---

    /// Fused scale + mask + softmax for attention scores.
    ///
    /// `scores = softmax(scores * scale + mask)` computed per-head. `scores`
    /// is updated in place.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn fused_scale_mask_softmax(
        &self,
        scores: &mut Self::Tensor,
        mask: &Self::Tensor,
        batch: usize,
        num_heads: usize,
        seq_len: usize,
        scale: f32,
    ) -> crate::Result<()>;

    /// Fused scale + mask + sliding window + softmax for attention scores.
    ///
    /// Like [`fused_scale_mask_softmax`](Driver::fused_scale_mask_softmax) but
    /// additionally masks out positions where `|query_pos - key_pos| > window_size / 2`.
    /// Used by `ModernBERT`'s local attention layers.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn fused_scale_mask_softmax_windowed(
        &self,
        scores: &mut Self::Tensor,
        mask: &Self::Tensor,
        batch: usize,
        num_heads: usize,
        seq_len: usize,
        scale: f32,
        window_size: usize,
    ) -> crate::Result<()>;

    /// Build a float attention mask from an integer mask.
    ///
    /// Converts `[batch * seq]` int mask (0/1) to `[batch * seq]` float mask
    /// (0.0 / -10000.0) for use with [`fused_scale_mask_softmax`](Driver::fused_scale_mask_softmax).
    ///
    /// `n` is presumably the total element count (`batch * seq`) — TODO
    /// confirm against the backends.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn build_attn_mask(
        &self,
        output: &mut Self::Tensor,
        int_mask: &Self::Tensor,
        n: usize,
    ) -> crate::Result<()>;

    /// Split a fused QKV projection into separate Q, K, V tensors.
    ///
    /// [`Self::qkv_split_f16`] is the half-precision counterpart.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn qkv_split(
        &self,
        q: &mut Self::Tensor,
        k: &mut Self::Tensor,
        v: &mut Self::Tensor,
        qkv: &Self::Tensor,
        batch: usize,
        seq: usize,
        hidden: usize,
        num_heads: usize,
        head_dim: usize,
    ) -> crate::Result<()>;

    // --- Banded (local/sliding-window) attention ---

    /// Banded Q@K^T: compute attention scores only within a sliding window.
    ///
    /// Output shape: `[batch * num_heads, seq, window]` (NOT `[seq, seq]`).
    /// `scores[h, i, w] = dot(Q[h, i, :], K[h, i - window/2 + w, :])`
    /// where out-of-bounds positions are set to `-inf` (masked in softmax).
    ///
    /// Reduces attention compute from O(seq²) to O(seq × window).
    /// For `seq=512, window=128`: **4× less compute** per local layer.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn banded_qk(
        &self,
        q: &Self::Tensor,
        k: &Self::Tensor,
        scores: &mut Self::Tensor,
        batch_heads: usize,
        seq: usize,
        head_dim: usize,
        window: usize,
        stride_qk: usize,
        stride_scores: usize,
    ) -> crate::Result<()>;

    /// Banded scores@V: weighted sum using banded attention scores.
    ///
    /// Input scores: `[batch * num_heads, seq, window]` (from `banded_qk`).
    /// Output: `[batch * num_heads, seq, head_dim]`.
    /// `output[h, i, d] = sum_w scores[h, i, w] * V[h, i - window/2 + w, d]`
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn banded_sv(
        &self,
        scores: &Self::Tensor,
        v: &Self::Tensor,
        output: &mut Self::Tensor,
        batch_heads: usize,
        seq: usize,
        head_dim: usize,
        window: usize,
        stride_scores: usize,
        stride_v: usize,
        stride_out: usize,
    ) -> crate::Result<()>;

    /// Fused scale + softmax over the window dimension (no padding mask needed).
    ///
    /// Operates on `[batch * num_heads * seq, window]` rows; `scores` is
    /// updated in place.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn banded_softmax(
        &self,
        scores: &mut Self::Tensor,
        total_rows: usize,
        window: usize,
        scale: f32,
    ) -> crate::Result<()>;

    /// Reshape attention output from `[batch, num_heads, seq, head_dim]` to
    /// `[batch * seq, hidden]`.
    ///
    /// [`Self::attn_reshape_f16`] is the half-precision counterpart.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn attn_reshape(
        &self,
        output: &mut Self::Tensor,
        input: &Self::Tensor,
        batch: usize,
        seq: usize,
        num_heads: usize,
        head_dim: usize,
    ) -> crate::Result<()>;

    /// Apply Rotary Position Embedding (`RoPE`) to Q/K tensors.
    ///
    /// Used by `ModernBERT` (not `ClassicBert`, which uses learned position
    /// embeddings). `qk` is rotated in place using the `cos` and `sin` tables.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn apply_rope(
        &self,
        qk: &mut Self::Tensor,
        cos: &Self::Tensor,
        sin: &Self::Tensor,
        num_rows: usize,
        seq_len: usize,
        head_dim: usize,
        num_heads: usize,
    ) -> crate::Result<()>;

    // --- Tensor manipulation ---

    /// Split a `[rows, 2*cols]` matrix into two `[rows, cols]` halves.
    ///
    /// Each row of `input` is `[first_half | second_half]`. The first `cols`
    /// elements go to `first`, the remaining `cols` to `second`.
    /// Used by `ModernBERT` for gated MLP splits.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn split_gate_value(
        &self,
        first: &mut Self::Tensor,
        second: &mut Self::Tensor,
        input: &Self::Tensor,
        rows: usize,
        cols: usize,
    ) -> crate::Result<()>;

    // --- Activations ---

    /// GELU activation (Gaussian Error Linear Unit), applied in-place.
    ///
    /// `n` is presumably the number of elements of `x` to transform — TODO
    /// confirm against the backends.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn gelu(&self, x: &mut Self::Tensor, n: usize) -> crate::Result<()>;

    /// `SwiGLU` gated activation: `output = value * silu(gate)`.
    ///
    /// The gate and value come from splitting the intermediate projection.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn swiglu(
        &self,
        value: &Self::Tensor,
        gate: &Self::Tensor,
        output: &mut Self::Tensor,
        n: usize,
    ) -> crate::Result<()>;

    /// `GeGLU` gated activation: `output = gelu(value) * gate`.
    ///
    /// Used by `ModernBERT`. The value and gate come from splitting the
    /// MLP `Wi` projection output in half.
    ///
    /// [`Self::geglu_f16`] is the half-precision counterpart.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn geglu(
        &self,
        value: &Self::Tensor,
        gate: &Self::Tensor,
        output: &mut Self::Tensor,
        n: usize,
    ) -> crate::Result<()>;

    /// Fused bias + GELU: `x = gelu(x + bias)` row-wise.
    ///
    /// `x` is updated in place.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn fused_bias_gelu(
        &self,
        x: &mut Self::Tensor,
        bias: &Self::Tensor,
        rows: usize,
        cols: usize,
    ) -> crate::Result<()>;

    // --- Fused residual operations ---

    /// Fused bias + residual add: `output = input + bias + residual`.
    ///
    /// Bias is broadcast row-wise (`cols`-wide) across `n / cols` rows.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn fused_bias_residual(
        &self,
        output: &mut Self::Tensor,
        input: &Self::Tensor,
        bias: &Self::Tensor,
        residual: &Self::Tensor,
        n: usize,
        cols: usize,
    ) -> crate::Result<()>;

    /// Fused residual add + layer normalization.
    ///
    /// `output = layer_norm(hidden + residual, weight, bias, eps)`.
    ///
    /// [`Self::fused_residual_layernorm_f16`] is the half-precision
    /// counterpart.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn fused_residual_layernorm(
        &self,
        output: &mut Self::Tensor,
        hidden: &Self::Tensor,
        residual: &Self::Tensor,
        weight: &Self::Tensor,
        bias: &Self::Tensor,
        rows: usize,
        cols: usize,
        eps: f32,
    ) -> crate::Result<()>;

    /// Residual add without bias: `output = hidden + residual`.
    ///
    /// Used by `ModernBERT` which has no bias terms.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn residual_add(
        &self,
        output: &mut Self::Tensor,
        hidden: &Self::Tensor,
        residual: &Self::Tensor,
        n: usize,
    ) -> crate::Result<()>;

    /// Add bias to a matrix row-wise: `x[row] += bias` for each row.
    ///
    /// `x` is updated in place.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn add_bias(
        &self,
        x: &mut Self::Tensor,
        bias: &Self::Tensor,
        rows: usize,
        cols: usize,
    ) -> crate::Result<()>;

    // --- Pooling ---

    /// CLS pooling: extract the first token's hidden state per batch element.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn cls_pool(
        &self,
        output: &mut Self::Tensor,
        hidden: &Self::Tensor,
        batch: usize,
        seq: usize,
        hidden_dim: usize,
    ) -> crate::Result<()>;

    /// Mean pooling: attention-mask-weighted average of hidden states.
    ///
    /// NOTE(review): `mask` is presumably the float `pooling_mask` from
    /// [`BatchInputs`] rather than the additive attention bias — confirm
    /// against the callers.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn mean_pool(
        &self,
        output: &mut Self::Tensor,
        hidden: &Self::Tensor,
        mask: &Self::Tensor,
        batch: usize,
        seq: usize,
        hidden_dim: usize,
    ) -> crate::Result<()>;

    // --- Post-processing ---

    /// L2-normalize each row vector in-place.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn l2_normalize(&self, data: &mut Self::Tensor, rows: usize, cols: usize) -> crate::Result<()>;

    /// Copy tensor data back to host memory as `Vec<Vec<f32>>`.
    ///
    /// Returns one `Vec<f32>` of length `dim` per batch element.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the backend implementation.
    fn to_host(
        &self,
        tensor: &Self::Tensor,
        batch: usize,
        dim: usize,
    ) -> crate::Result<Vec<Vec<f32>>>;

    /// Optional finite-value diagnostic hook for backend tensors.
    ///
    /// Backends should keep this cheap or disabled by default. The CUDA driver
    /// enables full tensor readback only with `RIPVEC_CUDA_DEBUG_TENSORS=1`.
    ///
    /// The default implementation ignores all arguments and does nothing.
    /// See [`Driver::debug_tensors_enabled`] for the matching on/off query.
    ///
    /// # Errors
    ///
    /// The default implementation never fails; overriding backends may
    /// return an error.
    fn debug_tensor(
        &self,
        _label: &str,
        _tensor: &Self::Tensor,
        _rows: usize,
        _cols: usize,
    ) -> crate::Result<()> {
        Ok(())
    }

    /// Whether calls to [`Driver::debug_tensor`] will inspect tensor contents.
    ///
    /// Architecture code uses this to avoid allocating and converting probe
    /// tensors when diagnostics are disabled.
    ///
    /// The default implementation returns `false`, matching the no-op default
    /// of [`Driver::debug_tensor`].
    fn debug_tensors_enabled(&self) -> bool {
        false
    }

    // =======================================================================
    // FP16 operations for full half-precision pipeline
    //
    // These methods mirror the FP32 counterparts but operate on FP16 tensors.
    // Internal reductions (softmax, layer-norm) use FP32 accumulators but
    // all tensor I/O is half precision. Default implementations return an
    // error — only backends with FP16 support override them.
    // =======================================================================

    /// Allocate a zero-initialized FP16 tensor with `n` half-precision elements.
    ///
    /// # Errors
    ///
    /// Returns an error if device memory allocation fails or FP16 is unsupported.
    /// The default implementation always returns `crate::Error::Metal` with the
    /// message "FP16 not supported by this driver".
    fn alloc_zeros_f16(&self, _n: usize) -> crate::Result<Self::Tensor> {
        // NOTE(review): the `Metal` variant is returned here even when the
        // driver is not the Metal backend — confirm this is intended.
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// Convert FP32 tensor to FP16 (element-wise narrowing).
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn f32_to_f16(
        &self,
        _output: &mut Self::Tensor,
        _input: &Self::Tensor,
        _n: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// Convert FP16 tensor back to FP32 (element-wise widening).
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn f16_to_f32(
        &self,
        _output: &mut Self::Tensor,
        _input: &Self::Tensor,
        _n: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// Mixed-precision GEMM: FP16 inputs → FP32 output via native simdgroup ops.
    ///
    /// Parameters mirror [`Self::gemm`], with half-precision `a`/`b` and a
    /// single-precision output.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "gemm_mixed not supported by this driver"; backends that
    /// provide the kernel override it.
    fn gemm_mixed(
        &self,
        _a_f16: &Self::Tensor,
        _b_f16: &Self::Tensor,
        _output_f32: &mut Self::Tensor,
        _m: usize,
        _n: usize,
        _k: usize,
        _transpose_b: bool,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "gemm_mixed not supported by this driver".into(),
        ))
    }

    /// FP16 GEMM: `output = A * B` (or `A * B^T`). All tensors are half.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn gemm_f16(
        &self,
        _a: &Self::Tensor,
        _b: &Self::Tensor,
        _output: &mut Self::Tensor,
        _m: usize,
        _n: usize,
        _k: usize,
        _transpose_b: bool,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 batched GEMM for multi-head attention. All tensors are half.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    #[expect(
        clippy::too_many_arguments,
        reason = "matches FP32 gemm_batched signature"
    )]
    fn gemm_batched_f16(
        &self,
        _a: &Self::Tensor,
        _b: &Self::Tensor,
        _output: &mut Self::Tensor,
        _m: usize,
        _n: usize,
        _k: usize,
        _transpose_b: bool,
        _stride_a: usize,
        _stride_b: usize,
        _stride_c: usize,
        _batch_count: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 layer normalization. Half I/O, FP32 reductions.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn layer_norm_f16(
        &self,
        _output: &mut Self::Tensor,
        _input: &Self::Tensor,
        _weight: &Self::Tensor,
        _bias: &Self::Tensor,
        _rows: usize,
        _cols: usize,
        _eps: f32,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 fused scale + mask + softmax. Half scores, FP32 reductions.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn fused_scale_mask_softmax_f16(
        &self,
        _scores: &mut Self::Tensor,
        _mask: &Self::Tensor,
        _batch: usize,
        _num_heads: usize,
        _seq_len: usize,
        _scale: f32,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 fused scale + mask + sliding window + softmax.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn fused_scale_mask_softmax_windowed_f16(
        &self,
        _scores: &mut Self::Tensor,
        _mask: &Self::Tensor,
        _batch: usize,
        _num_heads: usize,
        _seq_len: usize,
        _scale: f32,
        _window_size: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 QKV split: `[batch*seq, 3*hidden]` into Q, K, V per-head layout.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn qkv_split_f16(
        &self,
        _q: &mut Self::Tensor,
        _k: &mut Self::Tensor,
        _v: &mut Self::Tensor,
        _qkv: &Self::Tensor,
        _batch: usize,
        _seq: usize,
        _hidden: usize,
        _num_heads: usize,
        _head_dim: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 attention output reshape: `[batch*num_heads, seq, head_dim]` to
    /// `[batch*seq, hidden]`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn attn_reshape_f16(
        &self,
        _output: &mut Self::Tensor,
        _input: &Self::Tensor,
        _batch: usize,
        _seq: usize,
        _num_heads: usize,
        _head_dim: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 scatter flat `[total_tokens, dim]` to padded `[batch, max_seq, dim]`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn pad_to_batch_f16(
        &self,
        _flat: &Self::Tensor,
        _padded: &mut Self::Tensor,
        _seq_lengths: &[usize],
        _max_seq: usize,
        _dim: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 gather padded `[batch, max_seq, dim]` back to flat `[total_tokens, dim]`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn unpad_from_batch_f16(
        &self,
        _padded: &Self::Tensor,
        _flat: &mut Self::Tensor,
        _seq_lengths: &[usize],
        _max_seq: usize,
        _dim: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 `RoPE`: apply rotary position embedding. Half Q/K, float cos/sin tables.
    ///
    /// Takes the same parameters as the FP32 [`Self::apply_rope`].
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn rope_encode_f16(
        &self,
        _qk: &mut Self::Tensor,
        _cos: &Self::Tensor,
        _sin: &Self::Tensor,
        _num_rows: usize,
        _seq_len: usize,
        _head_dim: usize,
        _num_heads: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 `GeGLU` gated activation: `output = gelu(value) * gate`. Half I/O.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn geglu_f16(
        &self,
        _value: &Self::Tensor,
        _gate: &Self::Tensor,
        _output: &mut Self::Tensor,
        _n: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 fused residual add + layer normalization.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn fused_residual_layernorm_f16(
        &self,
        _output: &mut Self::Tensor,
        _hidden: &Self::Tensor,
        _residual: &Self::Tensor,
        _weight: &Self::Tensor,
        _bias: &Self::Tensor,
        _rows: usize,
        _cols: usize,
        _eps: f32,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 residual add (no bias): `output = hidden + residual`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn residual_add_f16(
        &self,
        _output: &mut Self::Tensor,
        _hidden: &Self::Tensor,
        _residual: &Self::Tensor,
        _n: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// FP16 split `[rows, 2*cols]` into two `[rows, cols]` halves.
    ///
    /// # Errors
    ///
    /// The default implementation always returns `crate::Error::Metal` with
    /// the message "FP16 not supported by this driver"; backends with FP16
    /// support override it.
    fn split_gate_value_f16(
        &self,
        _first: &mut Self::Tensor,
        _second: &mut Self::Tensor,
        _input: &Self::Tensor,
        _rows: usize,
        _cols: usize,
    ) -> crate::Result<()> {
        Err(crate::Error::Metal(
            "FP16 not supported by this driver".into(),
        ))
    }

    /// Fused split + `GeGLU`: consume `[rows, 2*cols]`, produce `[rows, cols]`.
    ///
    /// A backend kernel can merge
    /// [`split_gate_value_f16`](Driver::split_gate_value_f16) and
    /// [`geglu_f16`](Driver::geglu_f16) into one pass, which removes two
    /// intermediate `[rows, cols]` buffers and halves HBM round-trips.
    ///
    /// The default implementation is unfused: it allocates both halves with
    /// [`alloc_zeros_f16`](Driver::alloc_zeros_f16), splits `input` into
    /// them, and runs `geglu_f16` with the first half as the value and the
    /// second half as the gate.
    ///
    /// # Errors
    ///
    /// Propagates the first error from the allocation, split, or activation
    /// step.
    fn fused_split_geglu_f16(
        &self,
        output: &mut Self::Tensor,
        input: &Self::Tensor,
        rows: usize,
        cols: usize,
    ) -> crate::Result<()> {
        // Unfused fallback: materialize both halves, then activate.
        let half_elems = rows * cols;
        let mut value_half = self.alloc_zeros_f16(half_elems)?;
        let mut gate_half = self.alloc_zeros_f16(half_elems)?;
        self.split_gate_value_f16(&mut value_half, &mut gate_half, input, rows, cols)?;
        self.geglu_f16(&value_half, &gate_half, output, half_elems)
    }

    /// Fused pad + QKV split: flat `[total_tokens, 3*hidden]` → Q, K, V,
    /// each `[batch*heads, max_seq, head_dim]`.
    ///
    /// A backend kernel can skip the padded intermediate buffer entirely.
    /// The default implementation is unfused: it allocates a
    /// `batch * max_seq * 3 * hidden` element buffer, scatters `qkv_flat`
    /// into it with [`pad_to_batch_f16`](Driver::pad_to_batch_f16), and hands
    /// the result to [`qkv_split_f16`](Driver::qkv_split_f16).
    ///
    /// # Errors
    ///
    /// Propagates the first error from the allocation, pad, or split step.
    #[expect(clippy::too_many_arguments, reason = "mirrors pad + qkv_split args")]
    fn fused_pad_qkv_split_f16(
        &self,
        q: &mut Self::Tensor,
        k: &mut Self::Tensor,
        v: &mut Self::Tensor,
        qkv_flat: &Self::Tensor,
        seq_lengths: &[usize],
        max_seq: usize,
        batch: usize,
        hidden: usize,
        num_heads: usize,
        head_dim: usize,
    ) -> crate::Result<()> {
        // Unfused fallback: build the padded QKV buffer, then split per head.
        let padded_rows = batch * max_seq;
        let mut padded_qkv = self.alloc_zeros_f16(padded_rows * 3 * hidden)?;
        self.pad_to_batch_f16(qkv_flat, &mut padded_qkv, seq_lengths, max_seq, 3 * hidden)?;
        self.qkv_split_f16(
            q,
            k,
            v,
            &padded_qkv,
            batch,
            max_seq,
            hidden,
            num_heads,
            head_dim,
        )
    }

    /// Fused `attn_reshape` + unpad: `[batch*heads, max_seq, head_dim]` →
    /// `[total_tokens, hidden]`.
    ///
    /// A backend kernel can skip the padded context intermediate. The default
    /// implementation is unfused: it allocates a `batch * max_seq * hidden`
    /// element buffer (with `hidden = num_heads * head_dim`), fills it via
    /// [`attn_reshape_f16`](Driver::attn_reshape_f16), then gathers the real
    /// tokens into `flat` via
    /// [`unpad_from_batch_f16`](Driver::unpad_from_batch_f16).
    ///
    /// # Errors
    ///
    /// Propagates the first error from the allocation, reshape, or unpad step.
    fn fused_reshape_unpad_f16(
        &self,
        flat: &mut Self::Tensor,
        heads: &Self::Tensor,
        seq_lengths: &[usize],
        max_seq: usize,
        batch: usize,
        num_heads: usize,
        head_dim: usize,
    ) -> crate::Result<()> {
        // Unfused fallback: reshape into a padded buffer, then drop padding.
        let hidden = num_heads * head_dim;
        let padded_rows = batch * max_seq;
        let mut padded_ctx = self.alloc_zeros_f16(padded_rows * hidden)?;
        self.attn_reshape_f16(&mut padded_ctx, heads, batch, max_seq, num_heads, head_dim)?;
        self.unpad_from_batch_f16(&padded_ctx, flat, seq_lengths, max_seq, hidden)
    }
}

/// Batch input tensors on device, produced by [`Driver::prepare_batch`] and
/// [`Driver::prepare_batch_unpadded`].
///
/// `T` is the driver's tensor handle (`Driver::Tensor`).
///
/// Supports both padded and unpadded modes:
/// - **Padded**: all sequences padded to `max_seq`. `cu_seqlens` is `None`.
/// - **Unpadded**: sequences concatenated without padding. `cu_seqlens`
///   contains cumulative lengths `[0, len0, len0+len1, ...]` so attention
///   knows where each sequence starts. Eliminates ALL padding compute.
pub struct BatchInputs<T> {
    /// Token IDs — `[batch * max_seq]` (padded) or `[total_tokens]` (unpadded).
    pub input_ids: T,
    /// Attention mask `[batch * max_seq]` as int32 (0 or 1). Unused in unpadded mode.
    pub attention_mask: T,
    /// Token type IDs — same layout as `input_ids`.
    pub token_type_ids: T,
    /// Position IDs — same layout as `input_ids`.
    pub position_ids: T,
    /// Float attention bias mask `[batch * max_seq]` (0.0 or -1e9) for softmax.
    ///
    /// NOTE(review): `Driver::build_attn_mask` documents -10000.0 as the
    /// masked value — confirm which constant the backends actually write.
    pub float_mask: T,
    /// Float pooling mask `[batch * max_seq]` (1.0 or 0.0) for mean pooling.
    pub pooling_mask: T,
    /// Number of sequences in this batch.
    pub batch: usize,
    /// Maximum sequence length (all sequences padded to this). In unpadded mode,
    /// this is the longest sequence (used for workspace sizing, not padding).
    pub max_seq: usize,
    /// Total actual tokens across all sequences (no padding).
    pub total_tokens: usize,
    /// Per-sequence lengths: `[batch]` — each element is the actual token count.
    pub seq_lengths: Vec<usize>,
    /// Cumulative sequence lengths for unpadded attention: `[batch + 1]`.
    /// `cu_seqlens[i]..cu_seqlens[i+1]` is the token range for sequence `i`.
    /// `None` in padded mode (all sequences padded to `max_seq`).
    pub cu_seqlens: Option<Vec<usize>>,
}