Skip to main content

vyre_runtime/megakernel/
rule_catalog.rs

1//! DFA rule catalog packing for batched megakernel dispatch.
2
3use super::staging_reserve::{
4    reserve_hash_map_capacity as reserve_catalog_map, reserve_vec_capacity as reserve_catalog_vec,
5};
6use crate::PipelineError;
7use rustc_hash::FxHashMap;
8
9/// Dense byte alphabet used by the DFA transition table as the INPUT
10/// (`BatchRuleProgram`) representation: every rule still arrives as a dense
11/// `state * 256 + byte` table. The on-device packed table is byte-class
12/// compressed (see [`pack_rule_catalog_into`]); this constant is the source
13/// alphabet width the compressor folds DOWN from.
14pub const ALPHABET_SIZE: u32 = 256;
15const ALPHABET_SIZE_USIZE: usize = 256;
16
17/// Number of `u32` words per rule metadata entry. The kernel reads these in
18/// order: `transition_base`, `accept_base`, `state_count`, `class_map_base`,
19/// `num_classes`. Bump in lockstep with [`RuleMeta`] and the dispatcher's
20/// `dfa_byte_scanner` if the per-rule metadata grows.
21pub const RULE_META_WORDS: usize = 5;
22
23/// One compiled DFA-backed rule program consumed by the batch dispatcher.
24#[derive(Debug, Clone, PartialEq, Eq)]
25pub struct BatchRuleProgram {
26    /// Stable rule-table index.
27    pub rule_idx: u32,
28    /// Dense DFA transition table (`state * 256 + byte -> next_state`).
29    pub transitions: Vec<u32>,
30    /// Dense DFA accept table (`state -> non-zero match marker`).
31    pub accept: Vec<u32>,
32    /// DFA state count.
33    pub state_count: u32,
34}
35
36impl BatchRuleProgram {
37    /// Build one DFA-backed rule program.
38    ///
39    /// # Errors
40    ///
41    /// Returns [`PipelineError::Backend`] when the DFA buffers do not match
42    /// `state_count`.
43    pub fn new(
44        rule_idx: u32,
45        transitions: Vec<u32>,
46        accept: Vec<u32>,
47        state_count: u32,
48    ) -> Result<Self, PipelineError> {
49        validate_rule_shape(rule_idx, &transitions, &accept, state_count)?;
50        Ok(Self {
51            rule_idx,
52            transitions,
53            accept,
54            state_count,
55        })
56    }
57}
58
59/// Packed metadata for one byte-class-compressed DFA rule entry.
60///
61/// The on-device transition table is `transitions[transition_base + state *
62/// num_classes + class_maps[class_map_base + byte]]`: each rule carries a
63/// 256-entry byte→class map (into the shared `class_maps` buffer) and a
64/// compressed `state_count * num_classes` transition block, instead of a dense
65/// `state_count * 256` block. The compression is LOSSLESS — bytes share a class
66/// only when their transition column is identical across every state — so GPU
67/// firings are byte-for-byte identical to the dense table.
68#[repr(C)]
69#[derive(Debug, Clone, Copy, PartialEq, Eq, bytemuck::Pod, bytemuck::Zeroable)]
70pub struct RuleMeta {
71    /// Word offset into the flattened (compressed) transition table.
72    pub transition_base: u32,
73    /// Word offset into the flattened accept table.
74    pub accept_base: u32,
75    /// DFA state count for this rule.
76    pub state_count: u32,
77    /// Word offset into the shared 256-entry-per-rule byte→class map table.
78    pub class_map_base: u32,
79    /// Number of distinct byte classes for this rule (the compressed row width).
80    pub num_classes: u32,
81}
82
83/// One rule rejected from a megakernel batch while other rules still ran.
84#[derive(Debug, Clone, PartialEq, Eq)]
85pub struct BatchRuleRejection {
86    /// Caller-supplied rule index when present.
87    pub rule_idx: Option<u32>,
88    /// Human-readable rejection reason.
89    pub reason: String,
90}
91
92/// Packed rule catalog uploaded to device storage buffers.
93pub struct PackedRuleCatalog {
94    /// Dense per-rule metadata table.
95    pub rule_meta: Vec<RuleMeta>,
96    /// Deduplicated flattened byte-class-COMPRESSED DFA transition storage.
97    /// Indexed `transitions[rule.transition_base + state * rule.num_classes +
98    /// class]` where `class = class_maps[rule.class_map_base + byte]`.
99    pub transitions: Vec<u32>,
100    /// Deduplicated flattened DFA accept storage.
101    pub accept: Vec<u32>,
102    /// Deduplicated flattened 256-entry-per-rule byte→class maps. Indexed
103    /// `class_maps[rule.class_map_base + byte]`.
104    pub class_maps: Vec<u32>,
105    /// Rules rejected during validation or dense-slot assignment.
106    pub rejected_rules: Vec<BatchRuleRejection>,
107}
108
109/// Caller-owned storage for packing rule catalogs without rebuilding host
110/// allocations on every refresh.
111#[derive(Default)]
112pub struct RuleCatalogPackingScratch {
113    /// Dense per-rule metadata table.
114    pub rule_meta: Vec<RuleMeta>,
115    /// Deduplicated flattened byte-class-COMPRESSED DFA transition storage.
116    pub transitions: Vec<u32>,
117    /// Deduplicated flattened DFA accept storage.
118    pub accept: Vec<u32>,
119    /// Deduplicated flattened 256-entry-per-rule byte→class maps.
120    pub class_maps: Vec<u32>,
121    /// Rules rejected during validation or dense-slot assignment.
122    pub rejected_rules: Vec<BatchRuleRejection>,
123    /// fingerprint -> (transition_base, accept_base, state_count,
124    /// class_map_base, num_classes) for storage dedup across identical DFAs.
125    unique_storage: FxHashMap<[u8; 32], UniqueStorageLayout>,
126    occupied: Vec<bool>,
127    addressed: Vec<bool>,
128    /// Reusable 256-entry byte→class scratch built per unique DFA.
129    class_scratch: Vec<u32>,
130}
131
132/// Resident-buffer layout for one deduplicated unique DFA storage block.
133#[derive(Clone, Copy)]
134struct UniqueStorageLayout {
135    transition_base: u32,
136    accept_base: u32,
137    state_count: u32,
138    class_map_base: u32,
139    num_classes: u32,
140}
141
142/// Fingerprints for the valid dense catalog entries.
143#[must_use]
144pub fn accepted_rule_fingerprints(
145    rules: &[BatchRuleProgram],
146) -> (Vec<[u8; 32]>, Vec<BatchRuleRejection>) {
147    let mut fingerprints = Vec::new();
148    let mut occupied = Vec::new();
149    let mut addressed = Vec::new();
150    let rejections =
151        accepted_rule_fingerprints_into(rules, &mut fingerprints, &mut occupied, &mut addressed);
152    (fingerprints, rejections)
153}
154
155/// Fill caller-owned storage with fingerprints for valid dense catalog entries.
156///
157/// The output fingerprint order matches dense rule-table order, not input
158/// order. `fingerprints`, `occupied`, and `addressed` are cleared and reused so
159/// dispatchers can check resident catalog identity without allocating on every
160/// cache-hit dispatch.
161pub fn accepted_rule_fingerprints_into(
162    rules: &[BatchRuleProgram],
163    fingerprints: &mut Vec<[u8; 32]>,
164    occupied: &mut Vec<bool>,
165    addressed: &mut Vec<bool>,
166) -> Vec<BatchRuleRejection> {
167    let mut rejections = Vec::new();
168    accepted_rule_fingerprints_and_rejections_into(
169        rules,
170        fingerprints,
171        occupied,
172        addressed,
173        &mut rejections,
174    );
175    rejections
176}
177
178/// Fill caller-owned storage with fingerprints and rejection details for valid
179/// dense catalog entries.
180///
181/// This is the allocation-stable form used by hot dispatchers. All scratch
182/// vectors are cleared and reused; valid unchanged catalogs perform no host
183/// allocations while checking resident rule-buffer identity.
184pub fn accepted_rule_fingerprints_and_rejections_into(
185    rules: &[BatchRuleProgram],
186    fingerprints: &mut Vec<[u8; 32]>,
187    occupied: &mut Vec<bool>,
188    addressed: &mut Vec<bool>,
189    rejections: &mut Vec<BatchRuleRejection>,
190) {
191    fingerprints.clear();
192    fingerprints.resize(rules.len(), [0; 32]);
193    occupied.clear();
194    occupied.resize(rules.len(), false);
195    addressed.clear();
196    addressed.resize(rules.len(), false);
197    rejections.clear();
198
199    for rule in rules {
200        mark_addressed(addressed, rule.rule_idx);
201        match validate_rule_shape(
202            rule.rule_idx,
203            &rule.transitions,
204            &rule.accept,
205            rule.state_count,
206        ) {
207            Ok(()) => match claim_dense_index(occupied, rule.rule_idx, rules.len()) {
208                Ok(index) => fingerprints[index] = rule_fingerprint(rule),
209                Err(rejection) => rejections.push(rejection),
210            },
211            Err(error) => rejections.push(BatchRuleRejection {
212                rule_idx: Some(rule.rule_idx),
213                reason: error.to_string(),
214            }),
215        }
216    }
217
218    extend_missing_rejections(occupied, addressed, rejections);
219    let mut write = 0;
220    for read in 0..occupied.len() {
221        if occupied[read] {
222            fingerprints[write] = fingerprints[read];
223            write += 1;
224        }
225    }
226    fingerprints.truncate(write);
227}
228
229/// Pack valid DFA rules into compact shared device tables.
230///
231/// Rules with identical `(transitions, accept, state_count)` share backing
232/// transition and accept storage while retaining distinct dense metadata slots.
233pub fn pack_rule_catalog(rules: &[BatchRuleProgram]) -> Result<PackedRuleCatalog, PipelineError> {
234    let mut scratch = RuleCatalogPackingScratch::default();
235    pack_rule_catalog_into(rules, &mut scratch)?;
236    Ok(PackedRuleCatalog {
237        rule_meta: scratch.rule_meta,
238        transitions: scratch.transitions,
239        accept: scratch.accept,
240        class_maps: scratch.class_maps,
241        rejected_rules: scratch.rejected_rules,
242    })
243}
244
245/// Pack valid DFA rules into caller-owned storage.
246///
247/// Existing vector and hash-map allocations in `scratch` are reused across
248/// calls. This is the hot-path form for resident megakernel dispatchers that
249/// refresh device rule buffers repeatedly.
250pub fn pack_rule_catalog_into(
251    rules: &[BatchRuleProgram],
252    scratch: &mut RuleCatalogPackingScratch,
253) -> Result<(), PipelineError> {
254    scratch.unique_storage.clear();
255    reserve_catalog_map(
256        &mut scratch.unique_storage,
257        rules.len(),
258        "unique DFA storage",
259    )?;
260    // Inert rule slot 0: a 1-state DFA that self-loops on every byte and never
261    // accepts. Compressed it is num_classes=1, a 1-word transition row `[0]`,
262    // and a 256-entry all-zero byte→class map. Rejected / missing rules point
263    // their metadata here so the kernel reads a well-formed (no-match) DFA
264    // instead of out-of-bounds storage.
265    scratch.transitions.clear();
266    reserve_catalog_vec(&mut scratch.transitions, 1, "inert transition row")?;
267    scratch.transitions.push(0);
268    scratch.accept.clear();
269    reserve_catalog_vec(&mut scratch.accept, 1, "inert accept row")?;
270    scratch.accept.push(0);
271    scratch.class_maps.clear();
272    reserve_catalog_vec(
273        &mut scratch.class_maps,
274        ALPHABET_SIZE_USIZE,
275        "inert byte-class map",
276    )?;
277    scratch.class_maps.resize(ALPHABET_SIZE_USIZE, 0);
278    scratch.rule_meta.clear();
279    reserve_catalog_vec(&mut scratch.rule_meta, rules.len(), "rule metadata")?;
280    scratch.rule_meta.resize(
281        rules.len(),
282        RuleMeta {
283            transition_base: 0,
284            accept_base: 0,
285            state_count: 1,
286            class_map_base: 0,
287            num_classes: 1,
288        },
289    );
290    scratch.rejected_rules.clear();
291    reserve_catalog_vec(
292        &mut scratch.rejected_rules,
293        rules.len(),
294        "rule rejection rows",
295    )?;
296    scratch.occupied.clear();
297    reserve_catalog_vec(&mut scratch.occupied, rules.len(), "dense occupancy bitmap")?;
298    scratch.occupied.resize(rules.len(), false);
299    scratch.addressed.clear();
300    reserve_catalog_vec(
301        &mut scratch.addressed,
302        rules.len(),
303        "dense addressed bitmap",
304    )?;
305    scratch.addressed.resize(rules.len(), false);
306
307    for rule in rules {
308        mark_addressed(&mut scratch.addressed, rule.rule_idx);
309        if let Err(error) = validate_rule_shape(
310            rule.rule_idx,
311            &rule.transitions,
312            &rule.accept,
313            rule.state_count,
314        ) {
315            scratch.rejected_rules.push(BatchRuleRejection {
316                rule_idx: Some(rule.rule_idx),
317                reason: error.to_string(),
318            });
319            continue;
320        }
321
322        let meta_index = match claim_dense_index(
323            &mut scratch.occupied,
324            rule.rule_idx,
325            scratch.rule_meta.len(),
326        ) {
327            Ok(index) => index,
328            Err(rejection) => {
329                scratch.rejected_rules.push(rejection);
330                continue;
331            }
332        };
333
334        let storage_fingerprint = dfa_storage_fingerprint(rule);
335        let layout = if let Some(layout) = scratch.unique_storage.get(&storage_fingerprint) {
336            *layout
337        } else {
338            // Build the LOSSLESS byte→class map for this DFA into reusable
339            // scratch, then emit the compressed `state * num_classes + class`
340            // transition block. `num_classes <= 256`, with equality only when
341            // every byte transitions differently in some state — the common
342            // secret-detector DFAs collapse to a handful of classes.
343            let num_classes = build_byte_class_map(rule, &mut scratch.class_scratch);
344
345            let class_map_base =
346                u32::try_from(scratch.class_maps.len()).map_err(|_| PipelineError::QueueFull {
347                    queue: "submission",
348                    fix: "flattened byte-class map table exceeds u32::MAX words; split the rule catalog into smaller groups",
349                })?;
350            let class_map_target = scratch
351                .class_maps
352                .len()
353                .checked_add(ALPHABET_SIZE_USIZE)
354                .ok_or(PipelineError::QueueFull {
355                    queue: "submission",
356                    fix: "flattened byte-class map length overflows usize; split the rule catalog into smaller groups",
357                })?;
358            reserve_catalog_vec(
359                &mut scratch.class_maps,
360                class_map_target,
361                "flattened byte-class map storage",
362            )?;
363            scratch.class_maps.extend_from_slice(&scratch.class_scratch);
364
365            let transition_base =
366                u32::try_from(scratch.transitions.len()).map_err(|_| PipelineError::QueueFull {
367                    queue: "submission",
368                    fix: "flattened transition table exceeds u32::MAX words; split the rule catalog into smaller groups",
369                })?;
370            let accept_base = u32::try_from(scratch.accept.len()).map_err(|_| PipelineError::QueueFull {
371                queue: "submission",
372                fix: "flattened accept table exceeds u32::MAX words; split the rule catalog into smaller groups",
373            })?;
374            // Compressed block size = state_count * num_classes. Both are
375            // bounded (state_count is validated, num_classes <= 256), so the
376            // product cannot exceed the dense state_count * 256 size that
377            // already validated.
378            let compressed_words = (rule.state_count as usize)
379                .checked_mul(num_classes as usize)
380                .ok_or(PipelineError::QueueFull {
381                    queue: "submission",
382                    fix: "compressed transition block size overflows usize; split the rule catalog into smaller groups",
383                })?;
384            let transition_target = scratch
385                .transitions
386                .len()
387                .checked_add(compressed_words)
388                .ok_or(PipelineError::QueueFull {
389                    queue: "submission",
390                    fix: "flattened transition table length overflows usize; split the rule catalog into smaller groups",
391                })?;
392            reserve_catalog_vec(
393                &mut scratch.transitions,
394                transition_target,
395                "flattened transition storage",
396            )?;
397            let accept_target = scratch
398                .accept
399                .len()
400                .checked_add(rule.accept.len())
401                .ok_or(PipelineError::QueueFull {
402                    queue: "submission",
403                    fix: "flattened accept table length overflows usize; split the rule catalog into smaller groups",
404                })?;
405            reserve_catalog_vec(
406                &mut scratch.accept,
407                accept_target,
408                "flattened accept storage",
409            )?;
410            // Emit one compressed transition per (state, class). For class `c`
411            // pick ANY byte that maps to it (the first); the dense column for
412            // every byte in that class is identical by construction, so the
413            // value is well-defined and lossless.
414            let mut class_representative = vec![0usize; num_classes as usize];
415            let mut seen = vec![false; num_classes as usize];
416            for (byte, &class) in scratch.class_scratch.iter().enumerate() {
417                let class = class as usize;
418                if !seen[class] {
419                    seen[class] = true;
420                    class_representative[class] = byte;
421                }
422            }
423            for state in 0..rule.state_count as usize {
424                let dense_row = state * ALPHABET_SIZE_USIZE;
425                for &rep_byte in &class_representative {
426                    scratch.transitions.push(rule.transitions[dense_row + rep_byte]);
427                }
428            }
429            scratch.accept.extend_from_slice(&rule.accept);
430
431            let layout = UniqueStorageLayout {
432                transition_base,
433                accept_base,
434                state_count: rule.state_count,
435                class_map_base,
436                num_classes,
437            };
438            scratch
439                .unique_storage
440                .insert(storage_fingerprint, layout);
441            layout
442        };
443        scratch.rule_meta[meta_index] = RuleMeta {
444            transition_base: layout.transition_base,
445            accept_base: layout.accept_base,
446            state_count: layout.state_count,
447            class_map_base: layout.class_map_base,
448            num_classes: layout.num_classes,
449        };
450    }
451
452    extend_missing_rejections(
453        &scratch.occupied,
454        &scratch.addressed,
455        &mut scratch.rejected_rules,
456    );
457    Ok(())
458}
459
460/// Build the LOSSLESS byte→class map for one dense DFA into `out` (resized to
461/// 256) and return the class count.
462///
463/// Two bytes share a class iff their transition COLUMN is identical across every
464/// state: `transitions[s*256 + a] == transitions[s*256 + b]` for all `s`. Class
465/// ids are assigned in order of first byte appearance, so the map is `0` for
466/// byte 0's class and deterministic. The returned width `num_classes` is `<=
467/// 256`; for the secret-detector DFAs (long fixed prefixes + a few char
468/// classes) it collapses to a handful, shrinking the per-state transition row
469/// from 256 words to `num_classes` words — a lossless ~16x reduction on the
470/// ~987 MB catalog without changing a single firing.
471///
472/// `out` is cleared/reused so the hot resident-refresh path allocates nothing.
473fn build_byte_class_map(rule: &BatchRuleProgram, out: &mut Vec<u32>) -> u32 {
474    out.clear();
475    out.resize(ALPHABET_SIZE_USIZE, 0);
476    let state_count = rule.state_count as usize;
477    // Column signature per byte = its next-state across every state. Group by
478    // signature. `FxHashMap` keyed on the column bytes; the first byte to
479    // produce a signature owns a fresh class id.
480    let mut signature_to_class: FxHashMap<Vec<u32>, u32> = FxHashMap::default();
481    let mut next_class: u32 = 0;
482    let mut signature = Vec::with_capacity(state_count);
483    for byte in 0..ALPHABET_SIZE_USIZE {
484        signature.clear();
485        for state in 0..state_count {
486            signature.push(rule.transitions[state * ALPHABET_SIZE_USIZE + byte]);
487        }
488        let class = match signature_to_class.get(&signature) {
489            Some(&class) => class,
490            None => {
491                let class = next_class;
492                next_class += 1;
493                signature_to_class.insert(signature.clone(), class);
494                class
495            }
496        };
497        out[byte] = class;
498    }
499    next_class
500}
501
502fn validate_rule_shape(
503    rule_idx: u32,
504    transitions: &[u32],
505    accept: &[u32],
506    state_count: u32,
507) -> Result<(), PipelineError> {
508    let expected_transitions = usize::try_from(state_count)
509        .ok()
510        .and_then(|count| count.checked_mul(ALPHABET_SIZE_USIZE))
511        .ok_or_else(|| {
512            PipelineError::Backend("rule transition table size overflowed usize".to_string())
513        })?;
514    if transitions.len() != expected_transitions {
515        return Err(PipelineError::Backend(format!(
516            "rule {rule_idx} transition table has {} words, expected {expected_transitions}. Fix: compile a dense state_count * 256 DFA table before batch dispatch.",
517            transitions.len()
518        )));
519    }
520    let state_count_usize = usize::try_from(state_count).map_err(|source| {
521        PipelineError::Backend(format!(
522            "rule {rule_idx} state_count {state_count} cannot fit usize: {source}. Fix: shard the DFA state space before batch dispatch."
523        ))
524    })?;
525    if accept.len() != state_count_usize {
526        return Err(PipelineError::Backend(format!(
527            "rule {rule_idx} accept table has {} words, expected {state_count}. Fix: emit one accept entry per DFA state before batch dispatch.",
528            accept.len()
529        )));
530    }
531    Ok(())
532}
533
534fn rule_fingerprint(rule: &BatchRuleProgram) -> [u8; 32] {
535    let mut hasher = blake3::Hasher::new();
536    hasher.update(&rule.rule_idx.to_le_bytes());
537    hasher.update(bytemuck::cast_slice(&rule.transitions));
538    hasher.update(bytemuck::cast_slice(&rule.accept));
539    hasher.update(&rule.state_count.to_le_bytes());
540    *hasher.finalize().as_bytes()
541}
542
543fn dfa_storage_fingerprint(rule: &BatchRuleProgram) -> [u8; 32] {
544    let mut hasher = blake3::Hasher::new();
545    hasher.update(bytemuck::cast_slice(&rule.transitions));
546    hasher.update(bytemuck::cast_slice(&rule.accept));
547    hasher.update(&rule.state_count.to_le_bytes());
548    *hasher.finalize().as_bytes()
549}
550
551fn mark_addressed(addressed: &mut [bool], rule_idx: u32) {
552    if let Some(index) = usize::try_from(rule_idx)
553        .ok()
554        .filter(|index| *index < addressed.len())
555    {
556        addressed[index] = true;
557    }
558}
559
560fn claim_dense_index(
561    occupied: &mut [bool],
562    rule_idx: u32,
563    slot_count: usize,
564) -> Result<usize, BatchRuleRejection> {
565    let Some(meta_index) = usize::try_from(rule_idx).ok() else {
566        return Err(BatchRuleRejection {
567            rule_idx: Some(rule_idx),
568            reason: "rule_idx exceeds usize. Fix: rebuild the batch with a smaller rule catalog"
569                .to_string(),
570        });
571    };
572    if meta_index >= slot_count {
573        return Err(BatchRuleRejection {
574            rule_idx: Some(rule_idx),
575            reason: format!(
576                "rule_idx {rule_idx} falls outside 0..{slot_count}. Fix: keep the rule catalog dense so the batch work queue can address every rule"
577            ),
578        });
579    }
580    if occupied[meta_index] {
581        return Err(BatchRuleRejection {
582            rule_idx: Some(rule_idx),
583            reason: format!(
584                "duplicate rule_idx {rule_idx}. Fix: keep exactly one rule per dense rule-table slot"
585            ),
586        });
587    }
588    occupied[meta_index] = true;
589    Ok(meta_index)
590}
591
592fn extend_missing_rejections(
593    occupied: &[bool],
594    addressed: &[bool],
595    out: &mut Vec<BatchRuleRejection>,
596) {
597    for (rule_idx, (occupied, addressed)) in occupied
598        .iter()
599        .copied()
600        .zip(addressed.iter().copied())
601        .enumerate()
602    {
603        if !occupied && !addressed {
604            let Ok(rule_idx_u32) = u32::try_from(rule_idx) else {
605                continue;
606            };
607            out.push(BatchRuleRejection {
608                rule_idx: Some(rule_idx_u32),
609                reason: format!(
610                    "rule_idx {rule_idx} has no valid catalog entry. Fix: provide a well-formed DFA for every dense rule slot before batch dispatch"
611                ),
612            });
613        }
614    }
615}
616
617#[cfg(test)]
618
619mod tests {
620    use super::*;
621
622    /// Resolve the next state the COMPRESSED packed catalog yields for
623    /// `(rule, state, byte)` — mirrors the GPU kernel's index math exactly so
624    /// the parity tests can prove byte-for-byte equivalence to the dense table.
625    fn packed_next_state(packed: &PackedRuleCatalog, meta_index: usize, state: u32, byte: u8) -> u32 {
626        let meta = packed.rule_meta[meta_index];
627        let class = packed.class_maps[meta.class_map_base as usize + byte as usize];
628        let idx = meta.transition_base as usize
629            + state as usize * meta.num_classes as usize
630            + class as usize;
631        packed.transitions[idx]
632    }
633
634    #[test]
635    fn duplicate_dfas_share_catalog_storage() {
636        let first = BatchRuleProgram::new(0, vec![0; 256], vec![0], 1).unwrap();
637        let second = BatchRuleProgram::new(1, vec![0; 256], vec![0], 1).unwrap();
638        let packed = pack_rule_catalog(&[first, second]).unwrap();
639        // Identical DFAs share compressed transition, accept AND class-map storage.
640        assert_eq!(
641            packed.rule_meta[0].transition_base,
642            packed.rule_meta[1].transition_base
643        );
644        assert_eq!(
645            packed.rule_meta[0].accept_base,
646            packed.rule_meta[1].accept_base
647        );
648        assert_eq!(
649            packed.rule_meta[0].class_map_base,
650            packed.rule_meta[1].class_map_base
651        );
652        assert_eq!(packed.rule_meta[0].num_classes, packed.rule_meta[1].num_classes);
653        // An all-zero 1-state DFA collapses to a SINGLE byte class (every byte
654        // self-loops to state 0), so its compressed row is exactly one word, not
655        // 256. transition_base points just past the 1-word inert row.
656        assert_eq!(packed.rule_meta[0].num_classes, 1);
657        assert_eq!(packed.rule_meta[0].transition_base, 1);
658        assert_eq!(
659            packed.transitions.len(),
660            packed.rule_meta[0].transition_base as usize + 1
661        );
662        assert_eq!(
663            packed.accept.len(),
664            packed.rule_meta[0].accept_base as usize + 1
665        );
666        assert!(packed.rejected_rules.is_empty());
667    }
668
669    #[test]
670    fn duplicate_dfas_do_not_reserve_raw_duplicate_storage() {
671        let rules = (0..32)
672            .map(|rule_idx| BatchRuleProgram::new(rule_idx, vec![0; 256], vec![0], 1).unwrap())
673            .collect::<Vec<_>>();
674
675        let packed = pack_rule_catalog(&rules).unwrap();
676
677        // 1-word inert row + 1-word shared compressed row for all 32 duplicates.
678        assert_eq!(packed.transitions.len(), 2);
679        assert!(
680            packed.transitions.capacity() < ALPHABET_SIZE as usize * rules.len(),
681            "Fix: duplicate DFA catalogs must not reserve memory as if every rule had unique transition storage."
682        );
683        assert_eq!(packed.accept.len(), 2);
684        assert!(
685            packed.accept.capacity() < rules.len(),
686            "Fix: duplicate DFA catalogs must not reserve accept storage for every duplicate rule."
687        );
688        // One inert + one shared class map, not 32.
689        assert_eq!(packed.class_maps.len(), ALPHABET_SIZE as usize * 2);
690    }
691
692    /// The compressed catalog yields byte-for-byte identical next-states to the
693    /// dense `state * 256 + byte` table for EVERY (state, byte) of a non-trivial
694    /// multi-class DFA — the lossless parity contract the GPU kernel depends on.
695    #[test]
696    fn byte_class_compression_is_lossless() {
697        // 3-state DFA. byte 0x41 ('A') advances 0->1->2->2; byte 0x42 ('B')
698        // advances 1->2 only; all other bytes reset to 0. This forces THREE
699        // distinct byte classes (A, B, everything-else) so num_classes < 256
700        // and the compression is exercised, not a degenerate single class.
701        let states = 3usize;
702        let mut dense = vec![0u32; states * 256];
703        // state 0: 'A' -> 1, else -> 0
704        dense[0 * 256 + 0x41] = 1;
705        // state 1: 'A' -> 2, 'B' -> 2, else -> 0
706        dense[1 * 256 + 0x41] = 2;
707        dense[1 * 256 + 0x42] = 2;
708        // state 2: 'A' -> 2, else -> 0
709        dense[2 * 256 + 0x41] = 2;
710        let accept = vec![0u32, 0, 1];
711        let rule = BatchRuleProgram::new(0, dense.clone(), accept, states as u32).unwrap();
712        let packed = pack_rule_catalog(&[rule]).unwrap();
713
714        assert_eq!(packed.rejected_rules.len(), 0);
715        // 'A', 'B', and the rest are three behaviourally-distinct columns.
716        assert_eq!(packed.rule_meta[0].num_classes, 3);
717        assert!(
718            packed.transitions.len() < 1 + states * 256,
719            "compressed transitions must be smaller than the dense table"
720        );
721
722        for state in 0..states as u32 {
723            for byte in 0u16..256 {
724                let byte = byte as u8;
725                let expected = dense[state as usize * 256 + byte as usize];
726                let got = packed_next_state(&packed, 0, state, byte);
727                assert_eq!(
728                    got, expected,
729                    "compressed transition mismatch at state {state} byte {byte:#x}: dense={expected} packed={got}"
730                );
731            }
732        }
733    }
734
735    /// A DFA whose every byte transitions differently in some state must NOT be
736    /// over-compressed: it keeps all 256 classes and still round-trips losslessly.
737    #[test]
738    fn full_alphabet_dfa_keeps_all_classes_and_is_lossless() {
739        // 2-state DFA where state 0 sends byte b -> (b as state is impossible
740        // with 2 states), so instead: state 0 sends EVERY byte to a distinct
741        // value by using state 1 vs 0 based on parity — that only yields 2
742        // classes. To force 256 classes we need 256 distinct columns, which
743        // needs >=256 states. Use a 256-state identity: state s, byte b -> b.
744        let states = 256usize;
745        let mut dense = vec![0u32; states * 256];
746        for s in 0..states {
747            for b in 0..256 {
748                dense[s * 256 + b] = b as u32; // column for byte b is constant = b across all states
749            }
750        }
751        // Every byte's column is the constant vector [b; 256], all distinct, so
752        // 256 classes.
753        let accept = vec![0u32; states];
754        let rule = BatchRuleProgram::new(0, dense.clone(), accept, states as u32).unwrap();
755        let packed = pack_rule_catalog(&[rule]).unwrap();
756        assert_eq!(packed.rule_meta[0].num_classes, 256);
757        for state in 0..states as u32 {
758            for byte in 0u16..256 {
759                let byte = byte as u8;
760                let expected = dense[state as usize * 256 + byte as usize];
761                assert_eq!(packed_next_state(&packed, 0, state, byte), expected);
762            }
763        }
764    }
765
766    #[test]
767    fn accepted_rule_fingerprints_into_reuses_caller_storage() {
768        let rules = (0..8)
769            .map(|rule_idx| BatchRuleProgram::new(rule_idx, vec![0; 256], vec![0], 1).unwrap())
770            .collect::<Vec<_>>();
771        let mut fingerprints = Vec::with_capacity(16);
772        let mut occupied = Vec::with_capacity(16);
773        let mut addressed = Vec::with_capacity(16);
774        let fingerprint_ptr = fingerprints.as_ptr();
775        let occupied_ptr = occupied.as_ptr();
776        let addressed_ptr = addressed.as_ptr();
777
778        let rejections = accepted_rule_fingerprints_into(
779            &rules,
780            &mut fingerprints,
781            &mut occupied,
782            &mut addressed,
783        );
784
785        assert!(rejections.is_empty());
786        assert_eq!(fingerprints.len(), rules.len());
787        assert_eq!(fingerprints.as_ptr(), fingerprint_ptr);
788        assert_eq!(occupied.as_ptr(), occupied_ptr);
789        assert_eq!(addressed.as_ptr(), addressed_ptr);
790    }
791
792    #[test]
793    fn invalid_rules_are_isolated_to_inert_catalog_entries() {
794        let valid = BatchRuleProgram::new(0, vec![0; 256], vec![1], 1).unwrap();
795        let invalid = BatchRuleProgram {
796            rule_idx: 1,
797            transitions: vec![0; 8],
798            accept: vec![0],
799            state_count: 1,
800        };
801
802        let packed = pack_rule_catalog(&[valid, invalid]).unwrap();
803        assert_eq!(packed.rejected_rules.len(), 1);
804        assert_eq!(packed.rejected_rules[0].rule_idx, Some(1));
805        // Valid rule (slot 0) points at a REAL compressed block past the inert
806        // row; the inert/rejected slot 1 points back at the inert row 0.
807        assert_eq!(packed.rule_meta[0].state_count, 1);
808        assert!(packed.rule_meta[0].transition_base >= 1);
809        assert_eq!(packed.rule_meta[1].transition_base, 0);
810        assert_eq!(packed.rule_meta[1].accept_base, 0);
811        assert_eq!(packed.rule_meta[1].state_count, 1);
812        assert_eq!(packed.rule_meta[1].class_map_base, 0);
813        assert_eq!(packed.rule_meta[1].num_classes, 1);
814        // Inert row 0: a single self-loop word and an all-zero 256-entry class
815        // map — the rejected slot reads a well-formed no-match DFA.
816        assert_eq!(packed.transitions[0], 0);
817        assert_eq!(packed.accept[0], 0);
818        assert_eq!(
819            &packed.class_maps[..ALPHABET_SIZE as usize],
820            &vec![0; ALPHABET_SIZE as usize]
821        );
822        // The rejected slot, driven on ANY byte from state 0, stays in state 0
823        // (never matches) — the loud-isolation contract.
824        assert_eq!(packed_next_state(&packed, 1, 0, b'X'), 0);
825    }
826}