vyre-libs 0.6.2

//! Regex set → dense DFA GPU pipeline.
//!
//! Composes three existing primitives end-to-end:
//!
//! 1. `compile_regex_set` (this crate) - regex sources → bit-vector NFA.
//! 2. `vyre_primitives::matching::nfa_to_dfa` - NFA → dense
//!    `CompiledDfa` via subset construction.
//! 3. `build_ac_bounded_ranges_program` (this crate) - `CompiledDfa`
//!    → GPU `Program` with O(1)-per-byte AC scan semantics.
//!
//! The output [`RegexDfaPipeline`] is the regex-aware counterpart of
//! the literal-pattern `ClassicAcAutomaton`: same `Program` shape, same
//! per-byte transition cost, same `(pid, start, end)` hit triple
//! output. Consumers that already dispatch
//! `classic_ac_bounded_ranges_program` can swap their literal pattern
//! source for a regex set and pay no extra per-byte cost on the GPU.
//!
//! # When this beats `RulePipeline`
//!
//! `RulePipeline` (the bit-vector NFA scan kernel) is O(LANES²) per
//! byte regardless of pattern set size - fine for short-buffer per-
//! anchor confirmation, catastrophic for whole-buffer scans at >1 MiB.
//! `RegexDfaPipeline` is O(1) per byte but pays a state-explosion risk
//! in the subset construction. Use it when:
//!
//! * The pattern set's subset-constructed DFA stays under
//!   `max_dfa_states` (typical for high-volume detector regexes:
//!   bounded character classes + bounded repetitions).
//! * The haystack is large enough that per-byte cost dominates per-
//!   dispatch overhead.
//!
//! Fall back to `RulePipeline` when the subset construction returns
//! [`vyre_primitives::matching::NfaToDfaError::StateExplosion`].

use std::error::Error;
use std::fmt;

use vyre_foundation::ir::Program;

use vyre_primitives::matching::{nfa_to_dfa, CompiledDfa, NfaTables, NfaToDfaError};

use crate::scan::classic_ac::try_build_ac_bounded_ranges_program_ext;
use crate::scan::regex_compile::{compile_regex_set, CompiledRegexSet, RegexCompileError};

/// Ready-to-dispatch regex DFA pipeline.
///
/// `program` is built by `build_ac_bounded_ranges_program` and has the
/// same buffer contract as a literal-AC dispatch: `haystack`,
/// `transitions`, `output_offsets`, `output_records`, `pattern_lengths`,
/// `haystack_len`, `match_count`, `matches`. Upload the corresponding
/// fields of [`RegexDfaPipeline::dfa`] and [`RegexDfaPipeline::pattern_lengths`]
/// to those buffers and dispatch.
#[derive(Debug, Clone)]
pub struct RegexDfaPipeline {
    /// Dispatchable GPU program. Same shape as
    /// `classic_ac_bounded_ranges_program` output - caller wires the
    /// existing AC kernel host-side, no new dispatch path needed.
    pub program: Program,
    /// Dense DFA produced by NFA → DFA subset construction. Owns the
    /// transition / accept / output_offsets / output_records buffers
    /// the GPU program reads from.
    pub dfa: CompiledDfa,
    /// One entry per input regex; `pattern_lengths[pid]` is the longest
    /// possible match length for that pattern. Uploaded to the
    /// `pattern_lengths` buffer the AC kernel uses to derive each
    /// match's `start` offset from the accepting `end` offset.
    pub pattern_lengths: Vec<u32>,
}

/// Failures from [`build_regex_dfa_pipeline`].
#[derive(Debug)]
#[non_exhaustive]
pub enum RegexDfaError {
    /// Regex parsing or NFA construction rejected a pattern.
    Compile(RegexCompileError),
    /// Subset construction couldn't lower the NFA - typically state
    /// explosion. The caller should either raise `max_dfa_states`,
    /// shard the pattern set, or fall back to `RulePipeline`.
    Lower(NfaToDfaError),
    /// Regex/DFA metadata exceeded the GPU program's u32 ABI or host-side
    /// staging allocation budget.
    Size {
        /// Actionable sizing diagnostic.
        message: String,
    },
}

impl fmt::Display for RegexDfaError {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Compile(error) => write!(formatter, "regex NFA compile failed: {error}"),
            Self::Lower(error) => {
                write!(formatter, "NFA → DFA subset construction failed: {error}")
            }
            Self::Size { message } => write!(formatter, "regex DFA sizing failed: {message}"),
        }
    }
}

impl Error for RegexDfaError {}

impl From<RegexCompileError> for RegexDfaError {
    fn from(error: RegexCompileError) -> Self {
        Self::Compile(error)
    }
}

impl From<NfaToDfaError> for RegexDfaError {
    fn from(error: NfaToDfaError) -> Self {
        Self::Lower(error)
    }
}

/// Build a [`RegexDfaPipeline`] from a list of regex sources.
///
/// `max_matches` is the per-dispatch hit-buffer cap (passed through to
/// `build_ac_bounded_ranges_program`). `max_dfa_states` is the subset-
/// construction state cap (see
/// [`vyre_primitives::matching::nfa_to_dfa`]). The default of 16k
/// states matches `DEFAULT_DFA_BUDGET_BYTES = 16 MiB` (16k × 256 × 4 B).
///
/// The match-append strategy is the default `append_match_subgroup`
/// (I.17 - one atomic per subgroup leader). On backends that can't
/// lower `subgroup_ballot` / `subgroup_shuffle` yet (currently
/// `vyre-driver-cuda`) use [`build_regex_dfa_pipeline_ext`] with
/// `use_subgroup_coalesce = false`.
///
/// # Errors
/// See [`RegexDfaError`].
pub fn build_regex_dfa_pipeline(
    patterns: &[&str],
    max_matches: u32,
    max_dfa_states: usize,
) -> Result<RegexDfaPipeline, RegexDfaError> {
    build_regex_dfa_pipeline_ext(patterns, max_matches, max_dfa_states, true)
}

/// [`build_regex_dfa_pipeline`] with explicit `use_subgroup_coalesce`
/// control. Pass `false` on backends whose IR lowering cannot yet emit
/// `subgroup_ballot` + `subgroup_shuffle` - currently `vyre-driver-cuda`
/// rejects the subgroup form during canonical pre-emit lowering. Either
/// flag produces bit-identical match output; the difference is purely
/// the atomic-coalescing strategy at hit-buffer append time.
///
/// # Errors
/// See [`RegexDfaError`].
pub fn build_regex_dfa_pipeline_ext(
    patterns: &[&str],
    max_matches: u32,
    max_dfa_states: usize,
    use_subgroup_coalesce: bool,
) -> Result<RegexDfaPipeline, RegexDfaError> {
    let regex_set = compile_regex_set(patterns)?;
    finish_regex_dfa_pipeline(
        regex_set,
        patterns,
        max_matches,
        max_dfa_states,
        use_subgroup_coalesce,
    )
}

/// **Unanchored (find-anywhere)** counterpart of [`build_regex_dfa_pipeline`].
///
/// [`build_regex_dfa_pipeline`] compiles an *anchored* DFA: it only matches a
/// pattern starting at the scan origin (a secret at byte 9 of a file is missed).
/// This variant adds the implicit `.*` prefix at the **NFA-table level** — it
/// self-loops the NFA start state on every byte so the automaton stays live at
/// every position (Aho-Corasick semantics), then runs the same subset
/// construction. Match offsets are reported at the match END, exactly as the
/// literal AC path.
///
/// This is done on the bit-table, NOT by prepending `(?s).*?` to the regex
/// source: the regex-text approach explodes NFA/DFA construction for complex
/// patterns (measured OOM across a 1.7k-pattern set), while the start self-loop
/// is O(256) and leaves the rest of the automaton untouched.
///
/// # Errors
/// See [`RegexDfaError`].
pub fn build_regex_dfa_unanchored(
    patterns: &[&str],
    max_matches: u32,
    max_dfa_states: usize,
) -> Result<RegexDfaPipeline, RegexDfaError> {
    let mut regex_set = compile_regex_set(patterns)?;
    add_implicit_dotstar_prefix(
        &mut regex_set.transition_table,
        regex_set.plan.num_states as usize,
    );
    finish_regex_dfa_pipeline(regex_set, patterns, max_matches, max_dfa_states, true)
}

/// Add an implicit `.*` prefix to a subgroup-NFA transition table: self-loop the
/// start state (state 0 — lane 0, bit 0) on every byte so it remains active at
/// each input position. This is the standard unanchored/Aho-Corasick transform,
/// applied to the lane-major `[num_states × 256 × LANES]` table where entry
/// `trans[src*256*LANES + byte*LANES + lane]` holds the destination-state bits
/// lane `lane` owns. For `src = 0, lane = 0` over every byte we OR in bit 0.
fn add_implicit_dotstar_prefix(transition_table: &mut [u32], num_states: usize) {
    if num_states == 0 {
        return;
    }
    // LANES = table_len / (num_states * 256); derive it so this stays correct if
    // LANES_PER_SUBGROUP ever changes, with no extra feature import.
    let denom = num_states.saturating_mul(256);
    if denom == 0 || transition_table.len() % denom != 0 {
        return; // malformed table — leave it to nfa_to_dfa to reject.
    }
    let lanes = transition_table.len() / denom;
    for byte in 0..256usize {
        // src = 0, lane = 0  →  index = 0*256*lanes + byte*lanes + 0
        let idx = byte * lanes;
        if idx < transition_table.len() {
            transition_table[idx] |= 1; // bit 0 = state 0 (start) self-loop
        }
    }
}

/// Shared tail of the regex→DFA build: turn a compiled NFA regex set into a
/// dispatchable [`RegexDfaPipeline`] (subset construction + AC program). Called
/// by both the anchored and unanchored entry points.
fn finish_regex_dfa_pipeline(
    regex_set: CompiledRegexSet,
    patterns: &[&str],
    max_matches: u32,
    max_dfa_states: usize,
    use_subgroup_coalesce: bool,
) -> Result<RegexDfaPipeline, RegexDfaError> {
    // The NFA `plan` carries accept_states as `(pattern_id, match_len)`
    // tuples. nfa_to_dfa wants the pattern ids and the max len
    // separately; max_pattern_len doubles as the AC kernel's per-
    // position replay window cap.
    let mut accept_pattern_ids: Vec<u32> = Vec::new();
    reserve_regex_vec(
        &mut accept_pattern_ids,
        regex_set.plan.accept_states.len(),
        "accept pattern id table",
    )?;
    accept_pattern_ids.extend(regex_set.plan.accept_states.iter().map(|(pid, _)| *pid));
    let max_pattern_len = regex_set
        .plan
        .accept_states
        .iter()
        .map(|(_, len)| *len)
        .max()
        .unwrap_or(0);
    // pattern_lengths is per-pattern indexed; build it from the accept
    // table. A pattern with multiple accept states (alternation) takes
    // the longest match length - same convention `dfa_compile` uses.
    let pattern_count = u32::try_from(patterns.len()).map_err(|source| RegexDfaError::Size {
        message: format!(
            "pattern count {} exceeds u32 GPU buffer metadata: {source}. Fix: shard the regex set before building a DFA dispatch.",
            patterns.len()
        ),
    })?;
    let mut pattern_lengths = Vec::new();
    reserve_regex_vec(&mut pattern_lengths, patterns.len(), "pattern length table")?;
    pattern_lengths.resize(patterns.len(), 0);
    for (pid, len) in &regex_set.plan.accept_states {
        let idx = usize::try_from(*pid).map_err(|source| RegexDfaError::Size {
            message: format!(
                "accept pattern id {pid} cannot fit usize for pattern-length indexing: {source}. Fix: shard the regex set before building a DFA dispatch."
            ),
        })?;
        if idx < pattern_lengths.len() && *len > pattern_lengths[idx] {
            pattern_lengths[idx] = *len;
        }
    }

    let tables = NfaTables {
        num_states: regex_set.plan.num_states,
        transition_table: &regex_set.transition_table,
        epsilon_table: &regex_set.epsilon_table,
        accept_state_ids: &regex_set.plan.accept_state_ids,
        accept_pattern_ids: &accept_pattern_ids,
        max_pattern_len,
    };
    let dfa = nfa_to_dfa(&tables, max_dfa_states)?;

    let program = try_build_ac_bounded_ranges_program_ext(
        &dfa,
        pattern_count,
        max_matches,
        use_subgroup_coalesce,
    )
    .map_err(|message| RegexDfaError::Size { message })?;

    Ok(RegexDfaPipeline {
        program,
        dfa,
        pattern_lengths,
    })
}

fn reserve_regex_vec<T>(
    vec: &mut Vec<T>,
    requested: usize,
    label: &'static str,
) -> Result<(), RegexDfaError> {
    vyre_foundation::allocation::try_reserve_vec_to_capacity(vec, requested).map_err(|source| {
        RegexDfaError::Size {
            message: format!(
                "regex DFA {label} reservation failed for {requested} item(s): {source}. Fix: shard the regex set or lower the DFA budget before dispatch."
            ),
        }
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Single-pass DFA replay from the start state — the exact semantics the
    /// megakernel batch dispatcher uses (one pass per file, no per-position
    /// restart). Returns the end offsets where the DFA accepts.
    fn single_pass_accept_ends(dfa: &CompiledDfa, haystack: &[u8]) -> Vec<usize> {
        let mut state = 0u32;
        let mut ends = Vec::new();
        for (i, &b) in haystack.iter().enumerate() {
            state = dfa.transitions[state as usize * 256 + b as usize];
            if dfa.accept[state as usize] != 0 {
                ends.push(i + 1);
            }
        }
        ends
    }

    /// The unanchored build must match a pattern at ANY offset under a single
    /// forward pass (find-anywhere), while the anchored build dies on a
    /// non-matching prefix. This is the property the keyhog megakernel fallback
    /// port depends on (a secret is rarely at byte 0).
    #[test]
    fn unanchored_dfa_matches_at_any_offset_single_pass() {
        let anchored = build_regex_dfa_pipeline(&["abc"], 1024, 1024).expect("anchored compiles");
        let unanchored =
            build_regex_dfa_unanchored(&["abc"], 1024, 1024).expect("unanchored compiles");

        // Unanchored: one pass over "xxabc" accepts at end=5 (abc at bytes 2..4).
        assert_eq!(
            single_pass_accept_ends(&unanchored.dfa, b"xxabc"),
            vec![5],
            "unanchored DFA must match `abc` after a non-matching prefix"
        );
        // Anchored: the leading 'x' drives state 0 to a dead state → no accept.
        assert!(
            single_pass_accept_ends(&anchored.dfa, b"xxabc").is_empty(),
            "anchored DFA must NOT match `abc` after a non-matching prefix"
        );
        // Both match at the start.
        assert_eq!(single_pass_accept_ends(&unanchored.dfa, b"abc"), vec![3]);
        assert_eq!(single_pass_accept_ends(&anchored.dfa, b"abc"), vec![3]);
        // Unanchored finds every occurrence in one pass.
        assert_eq!(
            single_pass_accept_ends(&unanchored.dfa, b"abcxabc"),
            vec![3, 7],
            "unanchored DFA must find all occurrences"
        );
    }

    /// Regression: the GPU parity gate (keyhog) missed a real `ghp_` token whose
    /// 36-char body contains g/h/p (the prefix chars) — a prefix/body overlap
    /// under the `.*` self-loop. This CPU single-pass DFA check isolates whether
    /// the miss is in THIS primitive's construction or downstream on the GPU.
    #[test]
    fn unanchored_dfa_finds_overlap_body_token_single_pass() {
        let dfa = build_regex_dfa_unanchored(&["ghp_[A-Za-z0-9]{36}"], 1024, 16384)
            .expect("compiles")
            .dfa;
        // Exact missed content from the keyhog cpu_parity gate (file 120).
        let hay = b"at = \"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\"";
        let ends = single_pass_accept_ends(&dfa, hay);
        assert_eq!(
            ends,
            vec![hay.len() - 1],
            "unanchored DFA must accept the ghp_ token exactly before the closing quote"
        );
    }

    /// Isolation for the 6 GPU parity-gate misses: run the EXACT missed contexts
    /// through the dense `CompiledDfa` on the CPU with the kernel's single-pass
    /// semantics. If these all accept here but the GPU drops them, the bug is in
    /// the megakernel dispatch, not this primitive's DFA construction.
    #[test]
    fn unanchored_dfa_finds_all_parity_gate_misses_single_pass() {
        // (pattern, exact missed match content from the cpu_parity gate run)
        let cases: &[(&str, &[u8])] = &[
            (
                "ghp_[A-Za-z0-9]{36}",
                b"at = \"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\"",
            ),
            (
                "gho_[A-Za-z0-9]{36}",
                b"ken: \"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\"",
            ),
            (
                "ghu_[A-Za-z0-9]{36}",
                b"Key: \"ghu_m7BOv2Uj0AZZK088M7RQJkZX3EgBVV1Xt7i2\"",
            ),
            (
                "ghu_[A-Za-z0-9]{36}",
                b"OKEN: ghu_4u5ef0rIhtKpPV1F0dPwwhXNMpEXkB0tWWQv",
            ),
            (
                "xox[baprs]-[A-Za-z0-9-]{10,48}",
                b"Key: \"xoxb-1234567890-1234567890-EXAMPLE-TOKEN\"",
            ),
            (
                "xox[baprs]-[A-Za-z0-9-]{10,48}",
                b"_KEY=\"xoxb-32790994721-16118213278-q5KLPWcLboh0tchHpJPgWhuC\"",
            ),
        ];
        for (pat, hay) in cases {
            let dfa = build_regex_dfa_unanchored(&[pat], 1024, 16384)
                .unwrap_or_else(|e| panic!("pattern {pat:?} must compile: {e:?}"))
                .dfa;
            let ends = single_pass_accept_ends(&dfa, hay);
            let expected_end = if hay.ends_with(b"\"") {
                hay.len() - 1
            } else {
                hay.len()
            };
            assert!(
                ends.contains(&expected_end),
                "dense CompiledDfa for {pat:?} must accept the full token end {expected_end} \
                 in {:?} (single-pass); got {ends:?}. state_count={}",
                String::from_utf8_lossy(hay),
                dfa.state_count,
            );
            assert!(
                ends.iter().all(|end| *end <= expected_end),
                "dense CompiledDfa for {pat:?} must not accept past token boundary {expected_end} \
                 in {:?}; got {ends:?}. state_count={}",
                String::from_utf8_lossy(hay),
                dfa.state_count,
            );
        }
    }

    /// End-to-end: a literal regex set should produce a Program whose
    /// CompiledDfa accepts the literal at the expected end offset. The
    /// CompiledDfa accept table is the load-bearing assertion - if it's
    /// empty, the composition didn't propagate accept metadata through
    /// the subset construction.
    #[test]
    fn literal_pattern_set_lowers_through_to_dfa_program() {
        let pipeline =
            build_regex_dfa_pipeline(&["abc"], 1024, 1024).expect("Fix: literal must compile");
        assert!(
            pipeline.dfa.state_count >= 4,
            "literal 'abc' DFA must have at least 4 states (entry + 3 progress); got {}",
            pipeline.dfa.state_count
        );
        assert_eq!(
            pipeline.pattern_lengths,
            vec![3],
            "single literal 'abc' must have pattern_lengths = [3]"
        );
        assert!(
            pipeline
                .dfa
                .accept
                .iter()
                .any(|&pid_plus_one| pid_plus_one == 1),
            "at least one DFA state must accept pattern 0 (encoded as accept = 1)"
        );
        // Program buffer surface matches the AC kernel's contract:
        // haystack, transitions, output_offsets, output_records,
        // pattern_lengths, haystack_len, match_count, matches.
        let names: Vec<&str> = pipeline.program.buffers.iter().map(|b| b.name()).collect();
        for expected in [
            "haystack",
            "transitions",
            "output_offsets",
            "output_records",
            "pattern_lengths",
            "haystack_len",
            "match_count",
            "matches",
        ] {
            assert!(
                names.contains(&expected),
                "RegexDfaPipeline program must declare buffer `{expected}` for AC dispatch; got {names:?}"
            );
        }
    }

    /// Multi-pattern union: two literals must end up in two distinct
    /// accept states (each tied to its own pattern id), not collapsed
    /// into one.
    #[test]
    fn multi_literal_set_emits_distinct_accept_pids() {
        let pipeline = build_regex_dfa_pipeline(&["abc", "xyz"], 1024, 1024)
            .expect("Fix: two literals must compile");
        assert_eq!(pipeline.pattern_lengths, vec![3, 3]);
        // accept[s] = pid + 1, so a multi-pattern set should produce
        // both `1` (pid 0) and `2` (pid 1) somewhere in the accept
        // table. If either is missing, the subset construction lost
        // an accept's pattern_id.
        let has_pid0 = pipeline.dfa.accept.iter().any(|&value| value == 1);
        let has_pid1 = pipeline.dfa.accept.iter().any(|&value| value == 2);
        assert!(has_pid0, "no DFA state accepts pid 0 - 'abc' lost in lower");
        assert!(has_pid1, "no DFA state accepts pid 1 - 'xyz' lost in lower");
    }

    /// State-explosion path: setting `max_dfa_states` to 1 must surface
    /// as a structured error, not a panic.
    #[test]
    fn state_explosion_surfaces_as_error_not_panic() {
        let err = build_regex_dfa_pipeline(&["abc"], 1024, 1)
            .expect_err("max_dfa_states=1 must trip state explosion");
        match err {
            RegexDfaError::Lower(NfaToDfaError::StateExplosion { .. }) => {}
            other => panic!("expected Lower(StateExplosion), got {other:?}"),
        }
    }

    /// A regex with a character class should also lower - this is the
    /// case `RulePipeline` would scan via NFA bit-vector. The DFA path
    /// must produce an accept somewhere so the consumer gets a hit.
    #[test]
    fn character_class_pattern_lowers_to_acceptor_dfa() {
        let pipeline = build_regex_dfa_pipeline(&["[ab]c"], 1024, 1024)
            .expect("Fix: character class must compile");
        assert!(
            pipeline.dfa.accept.iter().any(|&value| value != 0),
            "DFA for '[ab]c' must accept at least one state"
        );
    }

    #[test]
    fn regex_dfa_pipeline_uses_checked_size_conversions() {
        let production = include_str!("regex_dfa.rs")
            .split("\n#[cfg(test)]\nmod tests")
            .next()
            .expect("Fix: regex DFA production section should precede tests");

        assert!(
            production.contains("RegexDfaError::Size"),
            "Fix: regex DFA sizing failures must be structured errors, not panics or unchecked casts."
        );
        assert!(
            production.contains("u32::try_from(patterns.len())"),
            "Fix: regex DFA pattern count must use checked conversion for GPU ABI metadata."
        );
        assert!(
            production.contains("usize::try_from(*pid)"),
            "Fix: regex DFA accept pattern ids must use checked host indexing conversion."
        );
        assert!(
            production.contains("try_build_ac_bounded_ranges_program_ext"),
            "Fix: regex DFA must call the fallible AC program builder."
        );
        assert!(
            !production.contains("patterns.len() as u32"),
            "Fix: regex DFA must not narrow pattern counts with unchecked casts."
        );
    }
}