vyre-libs 0.6.4

vyre Category A library ecosystem - pure-IR compositions over vyre-ops hardware primitives
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
//! Resident-buffer dispatch for [`GpuLiteralSet`] region-presence scans.
//!
//! # Why this exists
//!
//! [`GpuLiteralSet::scan_presence_by_region`](super::literal_set::GpuLiteralSet::scan_presence_by_region)
//! and its async sibling issue every dispatch through `dispatch_borrowed`, which
//! re-encodes and **re-uploads seven immutable tables on every call**: the DFA
//! transition / output-offset / output-record tables, the per-pattern length
//! table, and the three suffix-prefilter masks (end mask, 2-gram mask, 3-gram
//! bloom). For a large detector set the DFA transition table alone is
//! `state_count × 256` u32s — multiple MiB — and it is *identical* across every
//! file of a corpus. A consumer that scans many coalesced batches (a downstream
//! consumer's phase-1 layout) pays that multi-MiB host→device transfer once per batch even
//! though only the haystack and the per-region presence output actually change.
//!
//! [`ResidentPresencePipeline`] uploads those seven tables **once** into
//! backend-resident resources and keeps them resident for the session lifetime.
//! Each [`scan_into`](ResidentPresencePipeline::scan_into) then uploads only the
//! per-file haystack (a ranged upload into the resident haystack buffer) and the
//! small per-scan control values (haystack length, region starts, region base),
//! zeroes the used prefix of the resident presence buffer, dispatches against the
//! resident tables, and reads back the per-region presence bitmap — the per-scan
//! transfer drops from `O(tables + haystack)` to `O(haystack + region rows)`.
//! This is the region-presence counterpart of
//! [`RulePipeline::prepare_resident`](super::mega_scan::RulePipeline::prepare_resident)
//! (the regex/NFA mega-scan path) and of
//! [`GpuLiteralSet::prepare_presence_by_region_dispatch`](super::literal_set::GpuLiteralSet::prepare_presence_by_region_dispatch)
//! (the backend-neutral single-shot prepared payload).
//!
//! The decoded bitmap is byte-identical to
//! [`GpuLiteralSet::scan_presence_by_region`]'s return (bit `p` of region `r`'s
//! row is set iff pattern `p`'s literal occurs in region `r`), so a consumer can
//! swap the borrowed path for a resident session without changing any
//! post-processing — proven by the GPU parity test in the integration suite and
//! the host-orchestration unit test below.
//!
//! # The `max_regions` cap
//!
//! The resident program is built for `max_regions` coalesced files: that count
//! sizes the resident presence buffer (binding 6), the resident region-starts
//! buffer (binding 10), and the kernel's `ceil_log2(max_regions)` region
//! binary-search width. Each scan uploads its real region starts and pads the
//! rest of the region-starts buffer with a `u32::MAX` sentinel, so
//! `buf_len(region_starts)` stays fixed at `max_regions` while no hit maps to a
//! padding row. One resident session therefore serves any batch with
//! `region_count <= max_regions`. A batch that exceeds the cap is rejected
//! **loudly** (it would index past the resident presence buffer); the caller
//! re-dispatches it through the per-batch-sized borrowed
//! [`GpuLiteralSet::scan_presence_by_region`] — never a silent truncation.
//!
//! # Backend support
//!
//! Resident dispatch requires a backend that implements the resident half of the
//! [`VyreBackend`] contract (`allocate_resident`, `upload_resident*`,
//! `dispatch_resident_timed`). The wgpu and CUDA backends do; the CPU reference
//! does not. [`GpuLiteralSet::prepare_resident_presence`] surfaces the backend's
//! `UnsupportedFeature` error **loudly** — the caller must handle it explicitly
//! (fail closed, or a loud/recorded fallback), never degrade silently.

use vyre::{BackendError, VyreBackend};
use vyre_driver::{Resource, TimedDispatchResult};
use vyre_foundation::ir::Program;

use super::dispatch_io;
use super::literal_set::{decode_presence_words_into, GpuLiteralSet};

/// Size of one `u32` in bytes. Used to turn `u32` word counts into byte lengths
/// for resident allocations and uploads.
const U32_BYTES: usize = std::mem::size_of::<u32>();

/// Number of buffer bindings in the region-presence program (see
/// [`super::literal_set::GpuLiteralSet::build_presence_by_region_dispatch`]).
///
/// The resident dispatch binds, in order: haystack, transitions, output offsets,
/// output records, pattern lengths, haystack length, presence, candidate end
/// mask, candidate 2-gram mask, candidate 3-gram bloom, region starts, region
/// base.
const PRESENCE_BY_REGION_BINDINGS: usize = 12;

/// A [`GpuLiteralSet`] with its immutable region-presence tables uploaded into
/// backend-resident resources, ready for repeated low-overhead scans.
///
/// Construct with [`GpuLiteralSet::prepare_resident_presence`]. The session owns
/// twelve resident resources — haystack, the seven immutable tables, the read-write
/// presence buffer, and the three per-scan control buffers (haystack_len,
/// region_starts, region_base) — so the dispatch is ALL-resident (the CUDA backend's
/// resident dispatch rejects a borrowed-resource mix). Call [`free`](Self::free) to
/// release them, or drop the session and let the backend reclaim them when its
/// device context is torn down.
///
/// The session is `Send + Sync`: the resident handles are opaque ids and all
/// mutation happens through the borrowed `backend`, so a single session can be
/// shared across scan threads (each thread supplies its own packing scratch).
///
/// The "binding N" notes on the fields below give the slot each resource occupies
/// in the resident dispatch issued by
/// [`scan_into_timed`](ResidentPresencePipeline::scan_into_timed).
#[derive(Debug)]
pub struct ResidentPresencePipeline {
    /// Region-presence program sized for `max_regions` coalesced files.
    program: Program,
    /// Resident haystack buffer (binding 0), sized to `haystack_capacity` padded
    /// bytes; each scan overwrites its prefix with the packed haystack.
    haystack: Resource,
    /// Resident DFA transition table (binding 1; immutable, uploaded once).
    transitions: Resource,
    /// Resident DFA output-offset table (binding 2; immutable, uploaded once).
    output_offsets: Resource,
    /// Resident DFA output-record table (binding 3; immutable, uploaded once).
    output_records: Resource,
    /// Resident per-pattern length table (binding 4; immutable, uploaded once).
    pattern_lengths: Resource,
    /// Resident per-region presence buffer (binding 6; read-write; the used
    /// prefix is zeroed before every scan).
    presence: Resource,
    /// Resident suffix prefilter end mask (binding 7; immutable, uploaded once).
    candidate_end_mask: Resource,
    /// Resident suffix prefilter 2-gram mask (binding 8; immutable, uploaded once).
    candidate_suffix2_mask: Resource,
    /// Resident suffix prefilter 3-gram bloom (binding 9; immutable, uploaded once).
    candidate_suffix3_bloom: Resource,
    /// Resident haystack-length control buffer (binding 5; 1 u32; re-uploaded per
    /// scan).
    haystack_len_buf: Resource,
    /// Resident region-starts control buffer (binding 10; sized for `max_regions`;
    /// re-uploaded per scan, padded with a `u32::MAX` sentinel so `buf_len` stays
    /// fixed and no hit maps to a padding region — see
    /// [`ResidentPresencePipeline::scan_into`]).
    region_starts_buf: Resource,
    /// Resident shard-base control buffer (binding 11; 1 u32; re-uploaded per scan).
    region_base_buf: Resource,
    /// Padded byte capacity of the resident haystack buffer.
    haystack_capacity: usize,
    /// Largest coalesced-file count this session's presence buffer was sized for.
    max_regions: u32,
    /// Pattern count (bit width of each per-region presence row).
    pattern_count: u32,
    /// Presence bitmap `u32` words per region.
    presence_words: u32,
    /// Program workgroup X extent, for the per-scan byte-scan dispatch geometry.
    workgroup_x: u32,
}

// Compile-time proof that a session can be shared across scan threads. This
// mirrors the `ResidentRulePipeline` / `GpuLiteralSet` contract: `Resource`
// handles are plain ids and `Program` is `Send + Sync`. If a future field breaks
// either auto trait, this constant stops compiling instead of the regression
// surfacing at a distant call site.
const _: () = {
    const fn require_send_sync<T: Send + Sync>() {}
    require_send_sync::<ResidentPresencePipeline>();
};

impl GpuLiteralSet {
    /// Upload this matcher's immutable region-presence tables into backend-resident
    /// resources and return a [`ResidentPresencePipeline`] for repeated scans.
    ///
    /// `haystack_capacity_bytes` is the largest coalesced haystack the session will
    /// scan (e.g. the consumer's batch cap); the resident haystack buffer is
    /// allocated once at that padded size and every scan uploads only its real
    /// bytes. `max_regions` is the largest coalesced-file count the session will
    /// scan; it sizes the resident presence buffer and the kernel's region
    /// binary-search width, and caps decoded regions.
    ///
    /// # Errors
    ///
    /// Returns [`BackendError`] when the backend does not support resident
    /// resources, when `max_regions` is zero, or when allocation / upload of the
    /// resident tables fails. The caller must handle this loudly (fail closed or a
    /// recorded fallback) — never degrade silently.
    pub fn prepare_resident_presence(
        &self,
        backend: &dyn VyreBackend,
        haystack_capacity_bytes: usize,
        max_regions: u32,
    ) -> Result<ResidentPresencePipeline, BackendError> {
        let tables = self.resident_presence_tables(max_regions)?;

        let haystack_capacity = dispatch_io::haystack_padded_u32_byte_len(haystack_capacity_bytes)?;
        let haystack = backend.allocate_resident(haystack_capacity)?;

        // The seven immutable tables: allocate + upload ONCE.
        let transitions = allocate_and_upload(backend, &tables.transitions)?;
        let output_offsets = allocate_and_upload(backend, &tables.output_offsets)?;
        let output_records = allocate_and_upload(backend, &tables.output_records)?;
        let pattern_lengths = allocate_and_upload(backend, &tables.pattern_lengths)?;
        let candidate_end_mask = allocate_and_upload(backend, &tables.candidate_end_mask)?;
        let candidate_suffix2_mask =
            allocate_and_upload(backend, &tables.candidate_suffix2_mask)?;
        let candidate_suffix3_bloom =
            allocate_and_upload(backend, &tables.candidate_suffix3_bloom)?;

        // The read-write presence buffer: sized for the full max_regions capacity,
        // zeroed per scan over the used prefix only.
        let presence_capacity_words = (max_regions as usize)
            .checked_mul(tables.presence_words as usize)
            .ok_or_else(|| {
                BackendError::new(format!(
                    "resident region-presence capacity {max_regions} regions × {} words/region overflows host usize. Fix: lower max_regions or shard the pattern set.",
                    tables.presence_words
                ))
            })?;
        let presence_capacity_bytes = presence_capacity_words
            .checked_mul(U32_BYTES)
            .ok_or_else(|| {
                BackendError::new(
                    "resident region-presence presence-buffer byte capacity overflows host usize. Fix: lower max_regions or shard the pattern set.".to_string(),
                )
            })?;
        let presence = backend.allocate_resident(presence_capacity_bytes)?;

        // The three per-scan control buffers are ALSO resident. The CUDA backend's
        // resident dispatch rejects any borrowed resource (it resolves every binding
        // to a resident handle), so a resident dispatch must be ALL-resident — a
        // borrowed-control mix works on wgpu but fails closed on CUDA, a downstream
        // consumer's backend. haystack_len and region_base are one u32 each; region_starts is
        // sized for the full max_regions cap and padded per scan so its `buf_len`
        // (the kernel's live region count) stays fixed at max_regions.
        let region_starts_capacity_bytes =
            (max_regions as usize).checked_mul(U32_BYTES).ok_or_else(|| {
                BackendError::new(
                    "resident region-presence region-starts byte capacity overflows host usize. Fix: lower max_regions.".to_string(),
                )
            })?;
        let haystack_len_buf = backend.allocate_resident(U32_BYTES)?;
        let region_starts_buf = backend.allocate_resident(region_starts_capacity_bytes)?;
        let region_base_buf = backend.allocate_resident(U32_BYTES)?;

        Ok(ResidentPresencePipeline {
            program: tables.program,
            haystack,
            transitions,
            output_offsets,
            output_records,
            pattern_lengths,
            presence,
            candidate_end_mask,
            candidate_suffix2_mask,
            candidate_suffix3_bloom,
            haystack_len_buf,
            region_starts_buf,
            region_base_buf,
            haystack_capacity,
            max_regions,
            pattern_count: tables.pattern_count,
            presence_words: tables.presence_words,
            workgroup_x: tables.workgroup_x,
        })
    }
}

/// Allocate a resident buffer sized to `bytes` and upload them once.
///
/// If the upload fails, the freshly allocated buffer is released again (best
/// effort) so the failed call does not leak a resident allocation that no caller
/// holds a handle to.
///
/// # Errors
/// Returns the backend's allocation error, or its upload error. When the upload
/// fails, an error from the follow-up free is deliberately dropped so the upload
/// error — the root cause — is the one reported.
fn allocate_and_upload(backend: &dyn VyreBackend, bytes: &[u8]) -> Result<Resource, BackendError> {
    let resource = backend.allocate_resident(bytes.len())?;
    if let Err(upload_error) = backend.upload_resident(&resource, bytes) {
        let _ = backend.free_resident(resource);
        return Err(upload_error);
    }
    Ok(resource)
}

impl ResidentPresencePipeline {
    /// Scan `haystack` against the resident pipeline and decode the per-region
    /// presence bitmap into caller-owned `out`.
    ///
    /// `haystack` is a coalesced batch and `region_starts` holds one ascending
    /// start offset per coalesced file, beginning at 0. The result is equivalent
    /// to [`GpuLiteralSet::scan_presence_by_region`], but the immutable tables are
    /// already resident, so no table is transferred per scan.
    ///
    /// `region_base` is added to every candidate position before the region binary
    /// search; pass `0` for a single-dispatch scan (see
    /// [`GpuLiteralSet::scan_presence_by_region_with_scratch`] for the sharded
    /// meaning). `scratch` is the staging buffer reused for the packed haystack,
    /// the presence reset and the region-starts upload; pass a per-thread `Vec`
    /// that lives as long as the scan loop.
    ///
    /// On return, `out` holds `region_starts.len() × presence_words` packed `u32`
    /// words: bit `p` of region `r`'s row is set iff pattern `p` occurs in region
    /// `r`.
    ///
    /// # Errors
    /// Returns [`BackendError`] when `region_starts` is empty / does not begin at 0,
    /// when `region_count` exceeds the session's `max_regions` cap, when `haystack`
    /// exceeds the session's haystack capacity, or on upload / dispatch / readback
    /// failure. On any error `out` is left cleared (no partial bitmap).
    pub fn scan_into(
        &self,
        backend: &dyn VyreBackend,
        haystack: &[u8],
        region_starts: &[u32],
        region_base: u32,
        out: &mut Vec<u32>,
        scratch: &mut Vec<u8>,
    ) -> Result<(), BackendError> {
        self.scan_into_timed(backend, haystack, region_starts, region_base, out, scratch)
            .map(|_timed| ())
    }

    /// Timed variant of [`scan_into`](Self::scan_into): performs the same scan and
    /// additionally returns the dispatch's [`TimedDispatchResult`], so a consumer
    /// or benchmark can split the per-scan cost between the GPU kernel
    /// (`device_ns`) and host-side staging/readback (`wall_ns - device_ns`).
    ///
    /// The decoded per-region presence bitmap is written to `out` exactly as
    /// [`scan_into`](Self::scan_into) writes it; the returned result's `outputs`
    /// are the same raw presence bytes that were decoded into `out`.
    ///
    /// # Errors
    /// Same as [`scan_into`](Self::scan_into).
    pub fn scan_into_timed(
        &self,
        backend: &dyn VyreBackend,
        haystack: &[u8],
        region_starts: &[u32],
        region_base: u32,
        out: &mut Vec<u32>,
        scratch: &mut Vec<u8>,
    ) -> Result<TimedDispatchResult, BackendError> {
        // Cleared up front so every early return below leaves `out` empty.
        out.clear();

        // ---- Validate the batch shape before touching the device. ----
        let region_count = match u32::try_from(region_starts.len()) {
            Ok(count) => count,
            Err(_) => {
                return Err(BackendError::new(String::from(
                    "resident region-presence: region count exceeds u32 GPU ABI",
                )));
            }
        };
        match region_starts.first() {
            None => {
                return Err(BackendError::new(String::from(
                    "resident region-presence: region_starts must be non-empty. Fix: pass one start offset per coalesced file, beginning with 0.",
                )));
            }
            Some(&first_start) if first_start != 0 => {
                return Err(BackendError::new(String::from(
                    "resident region-presence: region_starts[0] must be 0 (the kernel binary-search lower bound). Fix: the first coalesced file must start at offset 0.",
                )));
            }
            Some(_) => {}
        }
        if region_count > self.max_regions {
            return Err(BackendError::new(format!(
                "resident region-presence batch has {region_count} regions but the session was prepared for at most {}. Fix: raise max_regions in prepare_resident_presence, or dispatch this batch through the per-batch-sized borrowed GpuLiteralSet::scan_presence_by_region (a larger cap would index past the resident presence buffer).",
                self.max_regions
            )));
        }

        let haystack_len = dispatch_io::scan_guard(
            haystack,
            "ResidentPresencePipeline::scan",
            dispatch_io::DEFAULT_MAX_SCAN_BYTES,
        )?;

        // ---- Step 1: stage the haystack. ----
        // Only the real (packed) bytes are uploaded; the kernel bounds its cursor
        // with haystack_len, so whatever an earlier scan left in the tail of the
        // resident buffer is never read.
        dispatch_io::pack_haystack_u32_into(haystack, scratch)?;
        let packed_len = scratch.len();
        if packed_len > self.haystack_capacity {
            return Err(BackendError::new(format!(
                "ResidentPresencePipeline haystack is {} packed byte(s) but the resident buffer holds {}. Fix: raise haystack_capacity_bytes in prepare_resident_presence or shard the haystack.",
                packed_len, self.haystack_capacity
            )));
        }
        backend.upload_resident_at(&self.haystack, 0, scratch)?;

        // ---- Step 2: zero the USED prefix of the presence buffer. ----
        // Binding 6 is OR-accumulated by the kernel, so it must arrive zeroed.
        // Rows past region_count are never decoded, and the `u32::MAX` padding
        // written in step 3 keeps any hit from mapping to them, so clearing the
        // used prefix is enough — the resident analogue of
        // `ResidentRulePipeline`'s 4-byte counter reset. `scratch` can be reused
        // right away because `upload_resident_at` copies its source synchronously
        // (wgpu `Queue::write_buffer` into the staging belt, CUDA H2D memcpy).
        let used_words = match (region_count as usize).checked_mul(self.presence_words as usize) {
            Some(words) => words,
            None => {
                return Err(BackendError::new(String::from(
                    "resident region-presence used-word count overflows host usize. Fix: lower the region count or shard the pattern set.",
                )));
            }
        };
        let reset_len = match used_words.checked_mul(U32_BYTES) {
            Some(len) => len,
            None => {
                return Err(BackendError::new(String::from(
                    "resident region-presence presence-reset byte count overflows host usize. Fix: lower the region count or shard the pattern set.",
                )));
            }
        };
        scratch.clear();
        scratch.resize(reset_len, 0);
        backend.upload_resident_at(&self.presence, 0, scratch)?;

        // ---- Step 3: stage the three per-scan control buffers. ----
        // They MUST be resident, not borrowed: the CUDA resident dispatch resolves
        // every binding to a resident handle and rejects a borrowed mix
        // (`cuda_compiled_persistent_borrowed_resource`), so an all-resident
        // dispatch is the only form portable across wgpu AND CUDA. haystack_len
        // and region_base are one u32 each.
        backend.upload_resident_at(&self.haystack_len_buf, 0, &haystack_len.to_le_bytes())?;
        backend.upload_resident_at(&self.region_base_buf, 0, &region_base.to_le_bytes())?;

        // region_starts lives in a FIXED `max_regions`-sized resident buffer, so
        // its `buf_len` — the kernel's live region count — does not vary with the
        // batch. Real starts occupy [0, region_count); the remaining slots up to
        // max_regions carry `u32::MAX`, a sentinel strictly greater than any
        // candidate position (positions are bounded by the scan size, far below
        // u32::MAX), so the region binary search never maps a hit to a padding
        // row. Those rows stay untouched and are never decoded, making the result
        // for the real regions identical to a `region_count`-length region_starts.
        scratch.clear();
        let padded_region_slots = self.max_regions as usize;
        scratch.reserve(padded_region_slots.saturating_mul(U32_BYTES));
        scratch.extend(region_starts.iter().flat_map(|start| start.to_le_bytes()));
        let padding_slots = padded_region_slots.saturating_sub(region_starts.len());
        scratch.extend(
            std::iter::repeat(u32::MAX.to_le_bytes())
                .take(padding_slots)
                .flatten(),
        );
        backend.upload_resident_at(&self.region_starts_buf, 0, scratch)?;

        // ---- Step 4: bind in program order, every binding resident. ----
        // (The CUDA all-resident requirement; wgpu accepts resident bindings
        // identically.)
        let resources = [
            self.haystack.clone(),                // 0: haystack
            self.transitions.clone(),             // 1: transitions
            self.output_offsets.clone(),          // 2: output_offsets
            self.output_records.clone(),          // 3: output_records
            self.pattern_lengths.clone(),         // 4: pattern_lengths
            self.haystack_len_buf.clone(),        // 5: haystack_len
            self.presence.clone(),                // 6: presence (read_write)
            self.candidate_end_mask.clone(),      // 7: candidate_end_mask
            self.candidate_suffix2_mask.clone(),  // 8: candidate_suffix2_mask
            self.candidate_suffix3_bloom.clone(), // 9: candidate_suffix3_bloom
            self.region_starts_buf.clone(),       // 10: region_starts (padded)
            self.region_base_buf.clone(),         // 11: region_base
        ];
        debug_assert_eq!(resources.len(), PRESENCE_BY_REGION_BINDINGS);

        let config = dispatch_io::byte_scan_dispatch_config(haystack_len, self.workgroup_x);
        let timed = backend.dispatch_resident_timed(&self.program, &resources, &config)?;

        // ---- Step 5: read back and decode. ----
        // The presence buffer is the program's only ReadWrite storage and comes
        // back at output index 0 — the same decode as `scan_presence_by_region`,
        // through the single shared region-presence wire decoder.
        let presence_bytes = dispatch_io::try_output_bytes(
            &timed.outputs,
            0,
            "ResidentPresencePipeline presence buffer",
        )?;
        decode_presence_words_into(presence_bytes, used_words, out);

        // Fail CLOSED on a short readback: returning fewer than the used words
        // would hand back a silently truncated bitmap (regions reported clean
        // that were never scanned — Law 10).
        let returned = out.len();
        if returned != used_words {
            out.clear();
            return Err(BackendError::new(format!(
                "ResidentPresencePipeline presence readback returned {returned} u32 word(s) but the {region_count}-region scan needs {used_words}. Fix: ensure the backend reads back the full binding-6 presence resource."
            )));
        }
        Ok(timed)
    }

    /// Upper bound on the number of coalesced files (regions) one scan may carry;
    /// the session's presence buffer was sized for this many rows.
    #[must_use]
    pub fn max_regions(&self) -> u32 {
        self.max_regions
    }

    /// Number of patterns, i.e. the bit width of each per-region presence row.
    #[must_use]
    pub fn pattern_count(&self) -> u32 {
        self.pattern_count
    }

    /// Number of `u32` words that make up one region's presence row.
    #[must_use]
    pub fn presence_words(&self) -> u32 {
        self.presence_words
    }

    /// Byte capacity (after padding) of the resident haystack buffer.
    #[must_use]
    pub fn haystack_capacity(&self) -> usize {
        self.haystack_capacity
    }

    /// Release every resident resource this session owns, consuming the session.
    ///
    /// Call this before the backend's device context is dropped to reclaim the
    /// resident allocations eagerly; otherwise they are reclaimed when the backend
    /// tears down.
    ///
    /// # Errors
    /// Returns the first [`BackendError`] from freeing a resource; remaining
    /// resources are still attempted.
    pub fn free(self, backend: &dyn VyreBackend) -> Result<(), BackendError> {
        let owned = [
            self.haystack,
            self.transitions,
            self.output_offsets,
            self.output_records,
            self.pattern_lengths,
            self.presence,
            self.candidate_end_mask,
            self.candidate_suffix2_mask,
            self.candidate_suffix3_bloom,
            self.haystack_len_buf,
            self.region_starts_buf,
            self.region_base_buf,
        ];
        // Keep going after a failure so one bad handle cannot strand the rest;
        // only the earliest error is reported.
        let mut outcome = Ok(());
        for resource in owned {
            let freed = backend.free_resident(resource);
            if outcome.is_ok() {
                outcome = freed;
            }
        }
        outcome
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeSet;
    use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
    use std::sync::Mutex;
    use vyre::DispatchConfig as Config;
    use vyre_driver::TimedDispatchResult;

    // Literal patterns compiled by the unit tests. A literal's index in this
    // slice is its pattern id (key = 0 .. api = 7); this order matches the
    // integration corpus.
    const LITERALS: &[&[u8]] = &[
        b"key", b"token", b"secret", b"AKIA", b"ghp_", b"sk_live_", b"password", b"api",
    ];

    /// Mock backend that records resident traffic and returns a canned presence
    /// buffer, so the host orchestration (seven-table-upload-once, per-scan haystack
    /// staging + presence reset + control-buffer uploads, 12-binding dispatch,
    /// decode) is validated without a GPU. Real GPU resident-vs-borrowed parity is
    /// asserted in the integration suite where a live wgpu backend is available.
    /// `VyreBackend` requires `Send + Sync`, so the counters use atomics / `Mutex`.
    struct MockResidentBackend {
        /// Next handle id to hand out; starts at 1 and increments per allocation.
        next_id: AtomicU64,
        /// (handle_id, byte_len) for every allocate_resident call.
        allocations: Mutex<Vec<(u64, usize)>>,
        /// Number of full uploads (immutable table uploads) seen.
        full_uploads: AtomicUsize,
        /// Number of ranged uploads seen. Each scan issues five: haystack stage,
        /// presence reset, haystack_len, region_base, and padded region_starts.
        ranged_uploads: AtomicUsize,
        /// Byte lengths of every ranged upload, in call order (per scan: haystack,
        /// presence reset, haystack_len, region_base, region_starts).
        ranged_upload_lens: Mutex<Vec<usize>>,
        /// Canned presence-buffer bytes returned at output index 0.
        presence_buffer: Vec<u8>,
    }

    impl MockResidentBackend {
        /// Build a mock with empty traffic logs and zeroed counters. Every resident
        /// dispatch will return `presence_buffer` as its single output, and handle
        /// ids are handed out starting at 1.
        fn new(presence_buffer: Vec<u8>) -> Self {
            let first_handle_id = 1;
            Self {
                next_id: AtomicU64::new(first_handle_id),
                allocations: Mutex::default(),
                full_uploads: AtomicUsize::default(),
                ranged_uploads: AtomicUsize::default(),
                ranged_upload_lens: Mutex::default(),
                presence_buffer,
            }
        }
    }

    // Marker impl so the mock can implement `VyreBackend` below.
    // NOTE(review): presumably `Sealed` is a supertrait gate on `VyreBackend` — confirm.
    impl vyre::backend::private::Sealed for MockResidentBackend {}

    impl VyreBackend for MockResidentBackend {
        /// Stable identifier of this mock backend.
        fn id(&self) -> &'static str {
            "mock-resident-presence"
        }

        /// Borrowed dispatch is never expected on the resident path; reaching it is
        /// a test failure.
        fn dispatch(
            &self,
            _program: &Program,
            _inputs: &[Vec<u8>],
            _config: &Config,
        ) -> Result<Vec<Vec<u8>>, BackendError> {
            unreachable!("resident path does not use borrowed dispatch")
        }

        /// Hand out the next handle id and log `(handle, byte_len)`.
        fn allocate_resident(&self, byte_len: usize) -> Result<Resource, BackendError> {
            let handle = self.next_id.fetch_add(1, Ordering::Relaxed);
            let mut allocation_log = self.allocations.lock().expect("mock allocations mutex");
            allocation_log.push((handle, byte_len));
            Ok(Resource::Resident(handle))
        }

        /// Count a full upload; the payload itself is ignored.
        fn upload_resident(&self, _resource: &Resource, _bytes: &[u8]) -> Result<(), BackendError> {
            self.full_uploads.fetch_add(1, Ordering::Relaxed);
            Ok(())
        }

        /// Count a ranged upload and log its byte length; the target resource and
        /// offset are ignored.
        fn upload_resident_at(
            &self,
            _resource: &Resource,
            _dst_offset_bytes: usize,
            bytes: &[u8],
        ) -> Result<(), BackendError> {
            self.ranged_uploads.fetch_add(1, Ordering::Relaxed);
            let mut upload_log = self
                .ranged_upload_lens
                .lock()
                .expect("mock ranged-upload mutex");
            upload_log.push(bytes.len());
            Ok(())
        }

        /// Freeing always succeeds and records nothing.
        fn free_resident(&self, _resource: Resource) -> Result<(), BackendError> {
            Ok(())
        }

        /// Check the dispatch contract the pipeline must uphold, then return the
        /// canned presence buffer as output 0 with zeroed / absent timings.
        fn dispatch_resident_timed(
            &self,
            _program: &Program,
            resources: &[Resource],
            config: &Config,
        ) -> Result<TimedDispatchResult, BackendError> {
            // Contract checks the consumer relies on:
            assert_eq!(
                resources.len(),
                PRESENCE_BY_REGION_BINDINGS,
                "region-presence binds twelve buffers"
            );
            // EVERY binding must be resident — the CUDA resident dispatch rejects a
            // borrowed-resource mix, so no binding (not even the tiny per-scan
            // control buffers) may be Borrowed.
            for (idx, resource) in resources.iter().enumerate() {
                assert!(
                    matches!(resource, Resource::Resident(_)),
                    "binding {idx} must be resident (no borrowed mix in a resident dispatch)"
                );
            }
            assert!(
                config.grid_override.is_some(),
                "resident region-presence scan must supply a byte-scan grid override"
            );

            let canned_output = self.presence_buffer.clone();
            Ok(TimedDispatchResult {
                outputs: vec![canned_output],
                wall_ns: 0,
                device_ns: None,
                enqueue_ns: None,
                wait_ns: None,
            })
        }
    }

    /// Decode one single-word region row into the set of pattern ids whose bit is set.
    ///
    /// Bit `p` of `word` marks pattern id `p` as present. Only ids below
    /// `pattern_count` are reported, so stray high bits in `word` are ignored.
    ///
    /// A row is a single `u32`, so it cannot carry ids at or above `u32::BITS`.
    /// The scanned range is clamped accordingly: a `pattern_count` above 32 is
    /// treated as 32 instead of overflowing the shift (which panics in debug
    /// builds and wraps around in release builds, reporting phantom ids).
    fn present_ids(word: u32, pattern_count: u32) -> BTreeSet<u32> {
        let id_limit = pattern_count.min(u32::BITS);
        (0..id_limit)
            .filter(|&id| (word >> id) & 1 == 1)
            .collect()
    }

    /// End-to-end transfer accounting against the mock: prepare allocates all
    /// twelve buffers and uploads the seven immutable tables once; every later
    /// scan moves only ranged per-scan data and decodes the canned bitmap.
    #[test]
    fn prepare_uploads_tables_once_then_scans_transfer_only_haystack_and_reset() {
        let matcher = GpuLiteralSet::compile(LITERALS);
        let pattern_count = LITERALS.len() as u32;
        assert_eq!(pattern_count, 8);

        // Eight patterns need one presence word per region, so a 4-region cap
        // means a 4-word presence buffer. The mock replies with three planted
        // rows plus a stale fourth word that a 3-region decode must not surface.
        let row0: u32 = 0b1000_1111; // bits 0,1,2,3,7 -> {key,token,secret,AKIA,api}
        let row1: u32 = 0b0111_0000; // bits 4,5,6 -> {ghp_,sk_live_,password}
        let row2: u32 = 0; // {}
        let stale: u32 = 0xDEAD_BEEF;
        let canned: Vec<u8> = [row0, row1, row2, stale]
            .iter()
            .flat_map(|word| word.to_le_bytes())
            .collect();
        let backend = MockResidentBackend::new(canned);
        let full_uploads = || backend.full_uploads.load(Ordering::Relaxed);
        let ranged_uploads = || backend.ranged_uploads.load(Ordering::Relaxed);

        let session = matcher
            .prepare_resident_presence(&backend, 4096, 4)
            .expect("mock backend supports resident allocation");

        // Prepare must have made twelve resident allocations: the haystack, seven
        // immutable tables, the presence bitmap, and the three per-scan controls
        // (haystack_len, region_starts, region_base).
        {
            let allocs = backend.allocations.lock().unwrap();
            assert_eq!(allocs.len(), 12, "haystack + 7 tables + presence + 3 controls");
            // Slot 8 is the presence bitmap: 4 regions × 1 word × U32_BYTES.
            for (slot, expected_len, why) in [
                (8, 4 * U32_BYTES, "presence sized for max_regions"),
                (9, U32_BYTES, "haystack_len control is one u32"),
                (10, 4 * U32_BYTES, "region_starts sized for max_regions"),
                (11, U32_BYTES, "region_base control is one u32"),
            ] {
                assert_eq!(allocs[slot].1, expected_len, "{why}");
            }
        }
        // Only the seven tables are fully uploaded at prepare time; the controls
        // are staged later with ranged uploads, so none have happened yet.
        assert_eq!(
            full_uploads(),
            7,
            "seven immutable tables uploaded once each"
        );
        assert_eq!(ranged_uploads(), 0);

        // Run the same coalesced 3-region batch (starts 0 / 4 / 9) three times.
        let haystack = b"aaa\nbbbb\nccc\n";
        let region_starts = [0u32, 4, 9];
        let mut out = Vec::new();
        let mut scratch = Vec::new();
        for _scan in 0..3 {
            session
                .scan_into(&backend, haystack, &region_starts, 0, &mut out, &mut scratch)
                .expect("resident region-presence scan decodes canned bitmap");
        }

        // The planted rows come back verbatim and the stale fourth word stays
        // invisible to the 3-region decode.
        assert_eq!(out, vec![row0, row1, row2], "3 regions × 1 word, stale tail ignored");
        let expected_ids = [
            BTreeSet::from([0, 1, 2, 3, 7]),
            BTreeSet::from([4, 5, 6]),
            BTreeSet::new(),
        ];
        for (word, expected) in out.iter().zip(expected_ids) {
            assert_eq!(present_ids(*word, pattern_count), expected);
        }

        // Scanning never re-uploads a table. Each scan issues exactly five ranged
        // uploads: haystack stage, presence reset, haystack_len, region_base and
        // region_starts.
        assert_eq!(
            full_uploads(),
            7,
            "immutable tables re-uploaded mid-loop"
        );
        assert_eq!(
            ranged_uploads(),
            15,
            "3 scans × 5 ranged uploads (haystack, presence reset, haystack_len, region_base, region_starts)"
        );

        // Within a scan the upload order is [haystack, reset, haystack_len,
        // region_base, region_starts]. The reset covers only the used prefix
        // (3 regions × 1 word × 4 = 12 bytes), whereas region_starts always goes
        // up padded to the full max_regions width.
        let lens = backend.ranged_upload_lens.lock().unwrap();
        let nth_of_each_scan = |offset: usize| -> Vec<usize> {
            lens.chunks(5)
                .filter_map(|scan| scan.get(offset).copied())
                .collect()
        };
        for (offset, expected_len, why) in [
            (
                1,
                12,
                "each presence reset zeroes only the 3-region used prefix",
            ),
            (2, U32_BYTES, "haystack_len control is one u32 per scan"),
            (3, U32_BYTES, "region_base control is one u32 per scan"),
            (
                4,
                4 * U32_BYTES,
                "region_starts is uploaded padded to the full max_regions width every scan",
            ),
        ] {
            assert_eq!(nth_of_each_scan(offset), vec![expected_len; 3], "{why}");
        }
    }

    /// A batch with more regions than the session was prepared for must fail
    /// up front: the error names the cap, `out` is left empty, and nothing is
    /// staged on the backend.
    #[test]
    fn scan_rejects_region_count_over_the_max_regions_cap() {
        let matcher = GpuLiteralSet::compile(LITERALS);
        // The canned presence bytes are never read: the cap guard trips before
        // any dispatch happens.
        let backend = MockResidentBackend::new(vec![0u8; 4]);
        let session = matcher
            .prepare_resident_presence(&backend, 4096, 2)
            .expect("prepare with a 2-region cap");

        // Three regions against a cap of two.
        let region_starts = [0u32, 2, 4];
        let haystack = b"a\nb\nc\n";
        // Seed `out` with a sentinel so the emptiness check below is meaningful.
        let mut out = vec![999];
        let mut scratch = Vec::new();

        let err = session
            .scan_into(&backend, haystack, &region_starts, 0, &mut out, &mut scratch)
            .expect_err("a batch over the resident region cap must error, not truncate");

        let names_limit = err
            .to_string()
            .contains("session was prepared for at most 2");
        assert!(
            names_limit && out.is_empty(),
            "cap error must name the limit and expose no partial bitmap: {err}"
        );

        // A rejected batch must not have touched the device at all.
        let staged_uploads = backend.ranged_uploads.load(Ordering::Relaxed);
        assert_eq!(
            staged_uploads,
            0,
            "rejected batch must not stage any resident upload"
        );
    }

    /// A haystack bigger than the resident haystack buffer must be refused with
    /// an error that mentions the buffer limit, leaving `out` empty.
    #[test]
    fn scan_rejects_haystack_larger_than_resident_capacity() {
        let matcher = GpuLiteralSet::compile(LITERALS);
        let backend = MockResidentBackend::new(vec![0u8; 4]);
        let session = matcher
            .prepare_resident_presence(&backend, 8, 4)
            .expect("prepare with an 8-byte haystack capacity");

        // 64 bytes of input against the 8-byte capacity prepared above.
        let oversized = [b'a'; 64];
        let region_starts = [0u32];
        let mut out = Vec::new();
        let mut scratch = Vec::new();

        let err = session
            .scan_into(&backend, &oversized, &region_starts, 0, &mut out, &mut scratch)
            .expect_err("64-byte haystack must not fit an 8-byte resident buffer");

        let names_limit = err.to_string().contains("resident buffer holds");
        assert!(
            names_limit && out.is_empty(),
            "capacity error must name the limit and expose no stale bitmap: {err}"
        );
    }

    /// Preparing a session with a zero region cap must fail with an error that
    /// states the `max_regions` requirement.
    #[test]
    fn prepare_rejects_zero_max_regions() {
        let matcher = GpuLiteralSet::compile(LITERALS);
        let backend = MockResidentBackend::new(vec![0u8; 4]);

        let prepared = matcher.prepare_resident_presence(&backend, 4096, 0);
        let err = prepared.expect_err("max_regions = 0 cannot size the presence buffer");

        let explains_cause = err.to_string().contains("max_regions must be >= 1");
        assert!(
            explains_cause,
            "zero-cap error must explain the cause: {err}"
        );
    }
}