oxicuda-seq 0.4.0

OxiCUDA: Sequence Models & Structured Prediction (HMM/CRF/Kalman/MRF/alignment)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
//! On-device GPU validation for the hand-written PTX kernels in
//! [`crate::ptx_kernels`].
//!
//! Each test JIT-compiles a kernel's PTX for the live device's SM version via
//! `Module::from_ptx`, launches it on the real CUDA device through
//! `oxicuda-launch`, copies the results back, and asserts numerical equivalence
//! to a CPU reference. The launch ABI mirrors the proven `oxicuda-snn` /
//! `oxicuda-ot` harnesses: device buffers are passed as their `CUdeviceptr`
//! (a `.param .u64`), scalars are passed as the matching Rust scalar
//! (`.param .u32` / `.param .f32` / `.param .u64`) in the kernel's declared
//! parameter order.
//!
//! ## Oracle strength tiers (honest accounting)
//!
//! * **Crate cross-check** — `edit_dist_kernel`'s final corner cell is also
//!   compared against [`crate::metrics::edit_distance`] (the crate's own
//!   Levenshtein), on top of a full-table independent host DP.
//! * **Independent host re-derivation** — the remaining kernels each mirror a
//!   step fused into a larger CPU routine, so the oracle is an independent Rust
//!   re-implementation of the kernel's documented arithmetic: HMM forward
//!   log-sum-exp (`forward_pass`), Viterbi max+argmax (`viterbi_step`), the CRF
//!   per-cell emission+transition score (`crf_features`), the beam rank
//!   (`beam_topk`), the Kalman predict mat-vec / `A·P·Aᵀ+Q` (`kalman_predict`),
//!   and the Ising Gibbs single-site sigmoid resample with a bit-exact LCG
//!   (`mrf_gibbs`). These genuinely fail if ptxas miscompiles or the PTX has a
//!   wrong constant / shift / index, because the host code is independent of the
//!   JIT-compiled PTX.
//!
//! ## PTX bugs found and fixed (see `ptx_kernels.rs`)
//!
//! * **`forward_pass_kernel`** — base-2 vs base-e error: the log-sum-exp used
//!   `ex2.approx.f32` for `exp` and `lg2.approx.f32` for `ln` with no base
//!   conversion, computing `max + log2(Σ 2^(x−max))` (~30 % wrong) instead of
//!   the natural `max + ln(Σ e^(x−max))`. Fixed by scaling the `ex2` argument by
//!   `log2(e)` and the `lg2` result by `ln(2)`.
//! * **`mrf_gibbs_kernel`** — same class: `p_up = 1/(1+exp(−2·field))` used
//!   `ex2(−2·field)` (a base-2 sigmoid). Fixed by scaling the exponent by
//!   `log2(e)` so the `ex2` evaluates `exp(−2·field)`.
//!
//! Every test skips (returns early) when no CUDA device is present, so the suite
//! stays green on CPU-only machines.

use std::sync::Arc;

use oxicuda_driver::{Context, Device, Module, Stream};
use oxicuda_launch::{Kernel, LaunchParams};
use oxicuda_memory::DeviceBuffer;

use crate::handle::LcgRng;

// ---------------------------------------------------------------------------
// Shared helpers
// ---------------------------------------------------------------------------

/// A live CUDA context plus the device's SM version, or `None` if unavailable.
struct GpuFixture {
    ctx: Arc<Context>,
    sm: u32,
}

/// Acquire a GPU fixture, or `None` when no driver / device is present.
fn gpu_fixture() -> Option<GpuFixture> {
    oxicuda_driver::init().ok()?;
    if Device::count().ok()? == 0 {
        return None;
    }
    let Ok(dev) = Device::get(0) else {
        return None;
    };
    let (major, minor) = dev.compute_capability().ok()?;
    let sm = (major * 10 + minor) as u32;
    let ctx = Context::new(&dev).ok()?;
    Some(GpuFixture {
        ctx: Arc::new(ctx),
        sm,
    })
}

/// Relative-with-absolute-floor closeness test for FP32 comparisons.
fn close(a: f32, b: f32, rel: f32, abs: f32) -> bool {
    (a - b).abs() <= rel * a.abs().max(b.abs()) + abs
}

/// Worst (relative, absolute) divergence over two equal-length slices.
fn worst_diff(gpu: &[f32], cpu: &[f32]) -> (f32, f32) {
    let mut worst_abs = 0.0_f32;
    let mut worst_rel = 0.0_f32;
    for (&g, &c) in gpu.iter().zip(cpu.iter()) {
        let a = (g - c).abs();
        if a > worst_abs {
            worst_abs = a;
        }
        let denom = g.abs().max(c.abs());
        if denom > 0.0 {
            let r = a / denom;
            if r > worst_rel {
                worst_rel = r;
            }
        }
    }
    (worst_rel, worst_abs)
}

/// JIT-compile `ptx` and look up `entry`, returning a launchable kernel.
///
/// A failure here means ptxas rejected the PTX — a real, invalid-PTX bug.
fn load_kernel(ptx: &str, entry: &str) -> Kernel {
    let module = Module::from_ptx(ptx)
        .unwrap_or_else(|e| panic!("PTX JIT compile failed for `{entry}`: {e}"));
    Kernel::from_module(Arc::new(module), entry)
        .unwrap_or_else(|e| panic!("kernel `{entry}` not found in module: {e}"))
}

/// `ceil(n / block)` as a 1-D grid size.
fn grid_1d(n: u32, block: u32) -> u32 {
    n.div_ceil(block)
}

/// Uniform `f32` in `[0, 1)` from this crate's `LcgRng` (which exposes
/// `next_f64`, not `next_f32`).
fn unit(rng: &mut LcgRng) -> f32 {
    rng.next_f64() as f32
}

// ===========================================================================
// 1. forward_pass  —  INDEPENDENT HOST RE-DERIVATION (HMM forward log-sum-exp)
//                     + the base-2 → base-e fix is validated here.
// ===========================================================================

/// Natural, max-stabilised log-sum-exp matching the kernel's documented math
/// (`max + ln Σ exp(x − max)`).
fn host_logsumexp(vals: &[f32]) -> f32 {
    let mut max_val = f32::NEG_INFINITY;
    for &x in vals {
        if x > max_val {
            max_val = x;
        }
    }
    if !max_val.is_finite() {
        return max_val;
    }
    let mut sum = 0.0_f32;
    for &x in vals {
        sum += (x - max_val).exp();
    }
    max_val + sum.ln()
}

#[test]
fn forward_pass_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    // S states. Moderate-magnitude log-domain values keep every exponent O(1)
    // after max-subtraction, so the `ex2.approx` exponential stays well inside
    // its accurate domain and a libm-`exp` oracle is a few-ulp match — yet a
    // base-2 vs base-e error (~30 %) would blow past the 1e-3 tolerance.
    let s = 12_usize;
    let mut rng = LcgRng::new(0x4D31_0001);
    let alpha_prev: Vec<f32> = (0..s).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();
    // log_a[i*S + j] is the log-transition i -> j.
    let log_a: Vec<f32> = (0..s * s).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();
    let log_b_o: Vec<f32> = (0..s).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();

    // Independent host re-derivation:
    //   alpha_next[j] = logsumexp_i(alpha_prev[i] + log_a[i*S + j]) + log_b_o[j].
    let mut next_host = vec![0.0_f32; s];
    let mut col = vec![0.0_f32; s];
    for j in 0..s {
        for i in 0..s {
            col[i] = alpha_prev[i] + log_a[i * s + j];
        }
        next_host[j] = host_logsumexp(&col) + log_b_o[j];
    }

    let ptx = crate::ptx_kernels::forward_pass_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "forward_pass_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_prev = DeviceBuffer::<f32>::from_host(&alpha_prev).expect("d_prev");
    let d_next = DeviceBuffer::<f32>::from_host(&vec![0.0_f32; s]).expect("d_next");
    let d_log_a = DeviceBuffer::<f32>::from_host(&log_a).expect("d_log_a");
    let d_log_b = DeviceBuffer::<f32>::from_host(&log_b_o).expect("d_log_b");

    let block = s as u32;
    let params = LaunchParams::new(grid_1d(s as u32, block), block);
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_prev.as_device_ptr(),
                d_next.as_device_ptr(),
                d_log_a.as_device_ptr(),
                d_log_b.as_device_ptr(),
                s as u32,
            ),
        )
        .expect("launch forward_pass_kernel");
    stream.synchronize().expect("sync");

    let mut next_gpu = vec![0.0_f32; s];
    d_next.copy_to_host(&mut next_gpu).expect("copy next");

    let (rel, abs) = worst_diff(&next_gpu, &next_host);
    for j in 0..s {
        assert!(
            close(next_gpu[j], next_host[j], 1e-3, 1e-4),
            "forward_pass alpha_next[{j}] mismatch: gpu={} host={} (worst rel={rel:e} abs={abs:e})",
            next_gpu[j],
            next_host[j]
        );
    }
}

// ===========================================================================
// 2. viterbi_step  —  INDEPENDENT HOST RE-DERIVATION (max + argmax)
// ===========================================================================

#[test]
fn viterbi_step_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    let s = 12_usize;
    let mut rng = LcgRng::new(0x71B6_0002);
    // Distinct, well-separated additive values so the argmax is unambiguous (no
    // ties) and the GPU's single-rounding compare matches the host exactly.
    let delta_prev: Vec<f32> = (0..s).map(|_| unit(&mut rng) * 4.0 - 2.0).collect();
    let log_a: Vec<f32> = (0..s * s).map(|_| unit(&mut rng) * 4.0 - 2.0).collect();
    let log_b_o: Vec<f32> = (0..s).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();

    // Host: delta_next[j] = max_i(delta_prev[i] + log_a[i*S+j]) + log_b_o[j];
    //       psi[j] = argmax_i.
    let mut next_host = vec![0.0_f32; s];
    let mut psi_host = vec![-1_i32; s];
    for j in 0..s {
        let mut best = f32::NEG_INFINITY;
        let mut arg = -1_i32;
        for i in 0..s {
            let v = delta_prev[i] + log_a[i * s + j];
            if v > best {
                best = v;
                arg = i as i32;
            }
        }
        next_host[j] = best + log_b_o[j];
        psi_host[j] = arg;
    }

    let ptx = crate::ptx_kernels::viterbi_step_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "viterbi_step_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_prev = DeviceBuffer::<f32>::from_host(&delta_prev).expect("d_prev");
    let d_next = DeviceBuffer::<f32>::from_host(&vec![0.0_f32; s]).expect("d_next");
    let d_log_a = DeviceBuffer::<f32>::from_host(&log_a).expect("d_log_a");
    let d_log_b = DeviceBuffer::<f32>::from_host(&log_b_o).expect("d_log_b");
    let d_psi = DeviceBuffer::<i32>::from_host(&vec![0_i32; s]).expect("d_psi");

    let block = s as u32;
    let params = LaunchParams::new(grid_1d(s as u32, block), block);
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_prev.as_device_ptr(),
                d_next.as_device_ptr(),
                d_log_a.as_device_ptr(),
                d_log_b.as_device_ptr(),
                d_psi.as_device_ptr(),
                s as u32,
            ),
        )
        .expect("launch viterbi_step_kernel");
    stream.synchronize().expect("sync");

    let mut next_gpu = vec![0.0_f32; s];
    let mut psi_gpu = vec![0_i32; s];
    d_next.copy_to_host(&mut next_gpu).expect("copy next");
    d_psi.copy_to_host(&mut psi_gpu).expect("copy psi");

    for j in 0..s {
        assert_eq!(
            psi_gpu[j], psi_host[j],
            "viterbi psi[{j}] (argmax) mismatch: gpu={} host={}",
            psi_gpu[j], psi_host[j]
        );
        assert!(
            close(next_gpu[j], next_host[j], 1e-5, 1e-6),
            "viterbi delta_next[{j}] mismatch: gpu={} host={}",
            next_gpu[j],
            next_host[j]
        );
    }
}

// ===========================================================================
// 3. crf_features  —  INDEPENDENT HOST RE-DERIVATION (emission dot + transition)
// ===========================================================================

#[test]
fn crf_features_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    let n_labels = 6_usize;
    let n_features = 5_usize;
    let t_steps = 3_usize;
    let t = 1_u32; // which time step's features to score

    let mut rng = LcgRng::new(0xC4F0_0003);
    let emit: Vec<f32> = (0..n_labels * n_features)
        .map(|_| unit(&mut rng) * 2.0 - 1.0)
        .collect();
    let trans: Vec<f32> = (0..n_labels * n_labels)
        .map(|_| unit(&mut rng) * 2.0 - 1.0)
        .collect();
    let x_feat: Vec<f32> = (0..t_steps * n_features)
        .map(|_| unit(&mut rng) * 2.0 - 1.0)
        .collect();

    // Host: score[y_prev*L + y_cur] = dot(emit[y_cur,:], x_feat[t,:])
    //                                 + trans[y_prev*L + y_cur].
    let mut score_host = vec![0.0_f32; n_labels * n_labels];
    for y_prev in 0..n_labels {
        for y_cur in 0..n_labels {
            let mut emission = 0.0_f32;
            for k in 0..n_features {
                emission += emit[y_cur * n_features + k] * x_feat[t as usize * n_features + k];
            }
            score_host[y_prev * n_labels + y_cur] = emission + trans[y_prev * n_labels + y_cur];
        }
    }

    let ptx = crate::ptx_kernels::crf_features_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "crf_features_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_emit = DeviceBuffer::<f32>::from_host(&emit).expect("d_emit");
    let d_trans = DeviceBuffer::<f32>::from_host(&trans).expect("d_trans");
    let d_x = DeviceBuffer::<f32>::from_host(&x_feat).expect("d_x");
    let d_score =
        DeviceBuffer::<f32>::from_host(&vec![0.0_f32; n_labels * n_labels]).expect("d_score");

    // Grid = (y_cur, y_prev): ctaid.x -> y_cur, ctaid.y -> y_prev, block = (1,1).
    let params = LaunchParams::new((n_labels as u32, n_labels as u32), (1u32, 1u32));
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_emit.as_device_ptr(),
                d_trans.as_device_ptr(),
                d_x.as_device_ptr(),
                d_score.as_device_ptr(),
                t,
                n_labels as u32,
                n_features as u32,
            ),
        )
        .expect("launch crf_features_kernel");
    stream.synchronize().expect("sync");

    let mut score_gpu = vec![0.0_f32; n_labels * n_labels];
    d_score.copy_to_host(&mut score_gpu).expect("copy score");

    let (rel, abs) = worst_diff(&score_gpu, &score_host);
    for k in 0..score_gpu.len() {
        assert!(
            close(score_gpu[k], score_host[k], 1e-5, 1e-6),
            "crf_features score[{k}] mismatch: gpu={} host={} (worst rel={rel:e} abs={abs:e})",
            score_gpu[k],
            score_host[k]
        );
    }
}

// ===========================================================================
// 4. beam_topk  —  INDEPENDENT HOST RE-DERIVATION (strict-greater rank)
// ===========================================================================

#[test]
fn beam_topk_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    let n = 32_usize;
    let k = 8_u32;

    // Distinct scores (no ties) so the strict-greater rank is unambiguous and
    // the GPU's `setp.gt.f32` compare is bit-exact against the host.
    let scores: Vec<f32> = (0..n).map(|i| (i as f32) * 0.37 - 5.0).collect();
    // Deterministically shuffle to avoid an already-sorted input.
    let mut scores = scores;
    let mut rng = LcgRng::new(0xBEE7_0004);
    for i in (1..n).rev() {
        let j = rng.next_usize(i + 1);
        scores.swap(i, j);
    }

    // Host: rank[i] = #{ j : scores[j] > scores[i] }; survivors keep rank, else -1.
    let mut rank_host = vec![0_i32; n];
    for i in 0..n {
        let mut rank = 0_u32;
        for j in 0..n {
            if scores[j] > scores[i] {
                rank += 1;
            }
        }
        rank_host[i] = if rank < k { rank as i32 } else { -1 };
    }

    let ptx = crate::ptx_kernels::beam_topk_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "beam_topk_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_scores = DeviceBuffer::<f32>::from_host(&scores).expect("d_scores");
    let d_rank = DeviceBuffer::<i32>::from_host(&vec![0_i32; n]).expect("d_rank");

    let block = 64_u32;
    let params = LaunchParams::new(grid_1d(n as u32, block), block);
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_scores.as_device_ptr(),
                d_rank.as_device_ptr(),
                n as u32,
                k,
            ),
        )
        .expect("launch beam_topk_kernel");
    stream.synchronize().expect("sync");

    let mut rank_gpu = vec![0_i32; n];
    d_rank.copy_to_host(&mut rank_gpu).expect("copy rank");

    for i in 0..n {
        assert_eq!(
            rank_gpu[i], rank_host[i],
            "beam_topk rank[{i}] mismatch: gpu={} host={} (score={})",
            rank_gpu[i], rank_host[i], scores[i]
        );
    }
    // Exactly k survivors (rank >= 0) must exist.
    let survivors = rank_gpu.iter().filter(|&&r| r >= 0).count();
    assert_eq!(
        survivors, k as usize,
        "beam_topk: expected {k} survivors, got {survivors}"
    );
}

// ===========================================================================
// 5. edit_dist  —  FULL-TABLE HOST DP + crate::metrics::edit_distance corner
//                  cross-check (anti-diagonal wavefront, one launch per diag).
// ===========================================================================

#[test]
fn edit_dist_matches_host_and_crate() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    // Two integer "strings"; the kernel compares characters with `setp.eq.s32`.
    let a: Vec<i32> = vec![3, 1, 4, 1, 5, 9, 2, 6];
    let b: Vec<i32> = vec![3, 1, 5, 9, 2, 6, 5];
    let n_a = a.len();
    let n_b = b.len();
    let cols = n_b + 1;

    // Host Levenshtein DP table (the kernel fills exactly this).
    let mut dp_host = vec![0_i32; (n_a + 1) * cols];
    for i in 0..=n_a {
        dp_host[i * cols] = i as i32;
    }
    for j in 0..=n_b {
        dp_host[j] = j as i32;
    }
    for i in 1..=n_a {
        for j in 1..=n_b {
            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
            let del = dp_host[(i - 1) * cols + j] + 1;
            let ins = dp_host[i * cols + (j - 1)] + 1;
            let sub = dp_host[(i - 1) * cols + (j - 1)] + cost;
            dp_host[i * cols + j] = del.min(ins).min(sub);
        }
    }

    // Device DP buffer pre-initialised with the boundary row/column; the kernel
    // only fills interior cells (i,j) with 1 <= i <= n_a, 1 <= j <= n_b.
    let mut dp_init = vec![0_i32; (n_a + 1) * cols];
    for i in 0..=n_a {
        dp_init[i * cols] = i as i32;
    }
    for j in 0..=n_b {
        dp_init[j] = j as i32;
    }

    let ptx = crate::ptx_kernels::edit_dist_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "edit_dist_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_dp = DeviceBuffer::<i32>::from_host(&dp_init).expect("d_dp");
    let d_a = DeviceBuffer::<i32>::from_host(&a).expect("d_a");
    let d_b = DeviceBuffer::<i32>::from_host(&b).expect("d_b");

    // Anti-diagonal wavefront: diag = i + j runs 2 ..= n_a + n_b. Each launch
    // fills every interior cell on that diagonal (tid -> i = tid+1).
    let block = n_a as u32;
    for diag in 2..=(n_a + n_b) {
        let params = LaunchParams::new(1u32, block);
        kernel
            .launch(
                &params,
                &stream,
                &(
                    d_dp.as_device_ptr(),
                    d_a.as_device_ptr(),
                    d_b.as_device_ptr(),
                    n_a as u32,
                    n_b as u32,
                    diag as u32,
                ),
            )
            .expect("launch edit_dist_kernel");
        stream.synchronize().expect("sync");
    }

    let mut dp_gpu = vec![0_i32; (n_a + 1) * cols];
    d_dp.copy_to_host(&mut dp_gpu).expect("copy dp");

    for idx in 0..dp_gpu.len() {
        assert_eq!(
            dp_gpu[idx],
            dp_host[idx],
            "edit_dist dp[{}][{}] mismatch: gpu={} host={}",
            idx / cols,
            idx % cols,
            dp_gpu[idx],
            dp_host[idx]
        );
    }

    // Crate cross-check: the final corner is the full edit distance.
    let crate_dist = crate::metrics::edit_distance(&a, &b) as i32;
    assert_eq!(
        dp_gpu[n_a * cols + n_b],
        crate_dist,
        "edit_dist corner vs crate::metrics::edit_distance: gpu={} crate={}",
        dp_gpu[n_a * cols + n_b],
        crate_dist
    );
}

// ===========================================================================
// 6. kalman_predict  —  INDEPENDENT HOST RE-DERIVATION (x_pred = A·x,
//                       P_pred = A·P·Aᵀ + Q)
// ===========================================================================

#[test]
fn kalman_predict_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    let n = 4_usize;
    let mut rng = LcgRng::new(0x4A15_0006);
    let x: Vec<f32> = (0..n).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();
    let a_mat: Vec<f32> = (0..n * n).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();
    let p_mat: Vec<f32> = (0..n * n).map(|_| unit(&mut rng) * 2.0 - 1.0).collect();
    let q_mat: Vec<f32> = (0..n * n).map(|_| unit(&mut rng) * 0.5).collect();

    // Host: x_pred[i] = Σ_k A[i,k]·x[k].
    let mut x_pred_host = vec![0.0_f32; n];
    for i in 0..n {
        let mut s = 0.0_f32;
        for k in 0..n {
            s += a_mat[i * n + k] * x[k];
        }
        x_pred_host[i] = s;
    }
    // Host: P_pred[i,j] = Σ_k A[i,k] (Σ_l P[k,l]·A[j,l]) + Q[i,j].
    let mut p_pred_host = vec![0.0_f32; n * n];
    for i in 0..n {
        for j in 0..n {
            let mut acc = 0.0_f32;
            for k in 0..n {
                let mut inner = 0.0_f32;
                for l in 0..n {
                    inner += p_mat[k * n + l] * a_mat[j * n + l];
                }
                acc += a_mat[i * n + k] * inner;
            }
            p_pred_host[i * n + j] = acc + q_mat[i * n + j];
        }
    }

    let ptx = crate::ptx_kernels::kalman_predict_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "kalman_predict_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_x = DeviceBuffer::<f32>::from_host(&x).expect("d_x");
    let d_x_pred = DeviceBuffer::<f32>::from_host(&vec![0.0_f32; n]).expect("d_x_pred");
    let d_a = DeviceBuffer::<f32>::from_host(&a_mat).expect("d_a");
    let d_p = DeviceBuffer::<f32>::from_host(&p_mat).expect("d_p");
    let d_p_pred = DeviceBuffer::<f32>::from_host(&vec![0.0_f32; n * n]).expect("d_p_pred");
    let d_q = DeviceBuffer::<f32>::from_host(&q_mat).expect("d_q");

    let block = n as u32;
    let params = LaunchParams::new(grid_1d(n as u32, block), block);
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_x.as_device_ptr(),
                d_x_pred.as_device_ptr(),
                d_a.as_device_ptr(),
                d_p.as_device_ptr(),
                d_p_pred.as_device_ptr(),
                d_q.as_device_ptr(),
                n as u32,
            ),
        )
        .expect("launch kalman_predict_kernel");
    stream.synchronize().expect("sync");

    let mut x_pred_gpu = vec![0.0_f32; n];
    let mut p_pred_gpu = vec![0.0_f32; n * n];
    d_x_pred.copy_to_host(&mut x_pred_gpu).expect("copy x_pred");
    d_p_pred.copy_to_host(&mut p_pred_gpu).expect("copy p_pred");

    let (rel_x, abs_x) = worst_diff(&x_pred_gpu, &x_pred_host);
    for i in 0..n {
        assert!(
            close(x_pred_gpu[i], x_pred_host[i], 1e-4, 1e-5),
            "kalman x_pred[{i}] mismatch: gpu={} host={} (worst rel={rel_x:e} abs={abs_x:e})",
            x_pred_gpu[i],
            x_pred_host[i]
        );
    }
    let (rel_p, abs_p) = worst_diff(&p_pred_gpu, &p_pred_host);
    for k in 0..p_pred_gpu.len() {
        assert!(
            close(p_pred_gpu[k], p_pred_host[k], 1e-4, 1e-5),
            "kalman P_pred[{k}] mismatch: gpu={} host={} (worst rel={rel_p:e} abs={abs_p:e})",
            p_pred_gpu[k],
            p_pred_host[k]
        );
    }
}

// ===========================================================================
// 7. mrf_gibbs  —  INDEPENDENT HOST RE-DERIVATION (Ising single-site sigmoid
//                  resample with bit-exact LCG); the base-2 → base-e fix is
//                  validated here.
// ===========================================================================

const LCG_MUL: u64 = 6_364_136_223_846_793_005;
const LCG_ADD: u64 = 1_442_695_040_888_963_407;

/// Re-derive the kernel's inline counter-based uniform draw for cell `idx`.
///
/// `state = (idx ^ seed)·M + A`; `u = ((state >> 32) >> 8) · 2^-24`. The 24-bit
/// integer fits a float mantissa exactly and `2^-24` is an exact power of two,
/// so `u` is bit-identical to the GPU's `cvt.rn.f32.u32` + `mul.f32`.
fn gibbs_uniform(idx: u64, seed: u64) -> f32 {
    let state = (idx ^ seed).wrapping_mul(LCG_MUL).wrapping_add(LCG_ADD);
    let high = (state >> 32) as u32;
    let r = high >> 8;
    (r as f32) * (1.0_f32 / 16_777_216.0_f32)
}

#[test]
fn mrf_gibbs_matches_host() {
    let Some(fx) = gpu_fixture() else {
        return;
    };

    // KEY conditioning: coupling j = 0 decouples every site from its neighbours,
    // so `field = j·Σ(neighbours) + h = h` is a constant independent of the
    // (racy) concurrent spin writes — the kernel's output is deterministic. With
    // h chosen so p_up = sigmoid(2h) ≈ 0.6, the base-2 bug (which would yield
    // p_up ≈ 0.57) flips every cell whose uniform draw lands in the ~0.03 gap,
    // so this test fails loudly if the base conversion is wrong.
    let n_rows = 16_usize;
    let n_cols = 16_usize;
    let h = 0.5_f32 * 1.5_f32.ln(); // sigmoid(2h) = 0.6
    let j_coupling = 0.0_f32;
    // Seed chosen (offline search) so that, over the 256 cells, no uniform draw
    // lands within 1e-3 of p_up = 0.6. The draws form a low-discrepancy set
    // (the cell index spans only the low 8 bits XORed into the LCG input), so
    // the best achievable decision margin is ~2.5e-3 — still ~12× the strictest
    // 1e-3 floor here yet far below the ~0.03 base-2 error gap this test guards.
    let seed = 0x4612_8395_5D0E_D500_u64;

    let p_up = 1.0_f32 / (1.0_f32 + (-2.0_f32 * h).exp());

    // Host reference + decision-margin precondition (so the comparison is an
    // honest bit-exact check, not a knife-edge near u == p_up).
    let mut spins_host = vec![0_i32; n_rows * n_cols];
    let mut min_margin = f32::INFINITY;
    for row in 0..n_rows {
        for c in 0..n_cols {
            let idx = (row * n_cols + c) as u64;
            let u = gibbs_uniform(idx, seed);
            min_margin = min_margin.min((u - p_up).abs());
            spins_host[row * n_cols + c] = if u < p_up { 1 } else { -1 };
        }
    }
    assert!(
        min_margin > 1e-3,
        "test setup: a Gibbs cell sits within {min_margin} of p_up={p_up}; \
         pick another seed so the spin decision is unambiguous"
    );

    // Initial spins are irrelevant (multiplied by j = 0) but must be valid ±1.
    let spins_init = vec![1_i32; n_rows * n_cols];

    let ptx = crate::ptx_kernels::mrf_gibbs_ptx(fx.sm);
    let kernel = load_kernel(&ptx, "mrf_gibbs_kernel");
    let stream = Stream::new(&fx.ctx).expect("stream");

    let d_spins = DeviceBuffer::<i32>::from_host(&spins_init).expect("d_spins");

    // Grid = (1,1), block = (n_cols, n_rows): tid.x -> col, tid.y -> row.
    let params = LaunchParams::new((1u32, 1u32), (n_cols as u32, n_rows as u32));
    kernel
        .launch(
            &params,
            &stream,
            &(
                d_spins.as_device_ptr(),
                h,
                j_coupling,
                n_rows as u32,
                n_cols as u32,
                seed,
            ),
        )
        .expect("launch mrf_gibbs_kernel");
    stream.synchronize().expect("sync");

    let mut spins_gpu = vec![0_i32; n_rows * n_cols];
    d_spins.copy_to_host(&mut spins_gpu).expect("copy spins");

    let mut flips = 0usize;
    for k in 0..spins_gpu.len() {
        assert!(
            spins_gpu[k] == 1 || spins_gpu[k] == -1,
            "mrf_gibbs spin[{k}] = {} not ±1",
            spins_gpu[k]
        );
        if spins_gpu[k] != spins_host[k] {
            flips += 1;
        }
    }
    assert_eq!(
        flips,
        0,
        "mrf_gibbs: {flips}/{} spins differ from the base-e host oracle (p_up={p_up})",
        spins_gpu.len()
    );
}