keyhog_scanner/engine/
gpu_lazy.rs

1use super::*;
2
3impl CompiledScanner {
4    /// Lazily compile the GPU literal-set on first call. Returns `None`
5    /// when no compatible adapter was detected at probe time.
6    ///
7    /// Persists the compiled matcher to `~/.cache/keyhog/programs/<hash>.bin`.
8    /// On a cache hit the matcher is loaded from disk and the GPU
9    /// recompile is skipped entirely - biggest cold-start win on
10    /// `keyhog scan` / `scan-system` runs that re-launch repeatedly.
11    /// Cache misses (no file, version-mismatch, corrupt blob) silently
12    /// recompile and re-cache.
13    pub fn gpu_matcher(&self) -> Option<&vyre_libs::scan::GpuLiteralSet> {
14        self.gpu_matcher
15            .get_or_init(|| {
16                let Some(literals) = &self.gpu_literals else {
17                    return None;
18                };
19                let literal_refs: Vec<&[u8]> = literals.iter().map(|v| v.as_slice()).collect();
20                let cache_dir = super::gpu_cache::gpu_matcher_cache_dir()?;
21                let cache_key = format!(
22                    "lit-{}",
23                    super::gpu_cache::gpu_matcher_cache_key(&literal_refs)
24                );
25                let started = std::time::Instant::now();
26                // One-line lego-block cache wiring courtesy of
27                // `vyre_libs::scan::cached_load_or_compile`. The
28                // helper handles atomic-rename, stale-blob deletion,
29                // and silent fall-through on cache-side I/O errors -
30                // every behaviour the previous hand-rolled
31                // load/save pair tried to match. We log compile cost
32                // here so the operator can still see warm-vs-cold
33                // start latency in `--verbose` output.
34                let matcher =
35                    vyre_libs::scan::cached_load_or_compile(&cache_dir, &cache_key, || {
36                        vyre_libs::scan::GpuLiteralSet::compile(&literal_refs)
37                    });
38                tracing::debug!(
39                    target: "keyhog::routing",
40                    patterns = literal_refs.len(),
41                    elapsed_ms = started.elapsed().as_millis() as u64,
42                    "GpuLiteralSet ready (warm cache or compiled)"
43                );
44                Some(matcher)
45            })
46            .as_ref()
47    }
48
49    /// Lazily build the Aho-Corasick bounded-ranges dispatch Program
50    /// from the GpuLiteralSet's CompiledDfa. The two engines share the
51    /// same DFA - only the dispatch Program (and therefore the
52    /// per-byte algorithm) differs:
53    ///
54    /// * `gpu_matcher().program` - `build_literal_set_program`:
55    ///   walks every pattern × every literal byte per haystack
56    ///   position. `O(N × L) per byte`. Works for any pattern set
57    ///   that fits the DFA budget.
58    /// * `ac_gpu_program()` - `classic_ac_bounded_ranges_program`:
59    ///   walks the AC transition table forward `L_max` bytes per
60    ///   position, emits every pattern in the accepting state's
61    ///   flat output_links. `O(L_max) per byte` regardless of N.
62    ///
63    /// Selected at scan time via `KEYHOG_GPU_KERNEL=ac`. Returns
64    /// `None` when no GPU matcher is available; callers fall through
65    /// to the literal-set path or non-GPU backend.
66    ///
67    /// Cap of `super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH` triples per shard
68    /// dispatch matches the existing literal-set output-buffer cap.
69    /// Truncation (count > cap on readback) is handled by the same
70    /// fall-back-to-CPU branch the literal-set path uses.
71    pub fn ac_gpu_program(&self) -> Option<&vyre::Program> {
72        self.ac_gpu_program
73            .get_or_init(|| {
74                let matcher = self.gpu_matcher()?;
75                let pattern_count = matcher.pattern_lengths.len() as u32;
76                // Pick the match-append strategy. The subgroup form
77                // (subgroup_ballot + subgroup_shuffle producing
78                // _vyre_match_leader) was originally gated to wgpu
79                // only because vyre-driver-cuda rejects it during
80                // canonical pre-emit lowering. Runtime testing on
81                // Apple Silicon M4 Pro with vyre v0.4.2 confirmed
82                // the SAME "_vyre_match_leader referenced before
83                // binding" rejection on the wgpu path: the lowering
84                // gap is in vyre's substrate-neutral pre-emit step,
85                // not the driver-specific emitter. Until the IR gap
86                // is closed, use_subgroup_coalesce stays false on
87                // every backend. We lose the ~32x atomic-contention
88                // reduction the subgroup form would have provided
89                // (Innovation I.17), but recall and correctness are
90                // preserved; the plain append_match path produces
91                // bit-identical match output, just with more atomic
92                // pressure on the shared count buffer.
93                let backend_id = self.gpu_backend.as_ref().map(|b| b.id()).unwrap_or("none");
94                let use_subgroup_coalesce = false;
95                let program = vyre_libs::scan::classic_ac::build_ac_bounded_ranges_program_ext(
96                    &matcher.dfa,
97                    pattern_count,
98                    super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH,
99                    use_subgroup_coalesce,
100                );
101                tracing::debug!(
102                    target: "keyhog::routing",
103                    pattern_count,
104                    state_count = matcher.dfa.state_count,
105                    max_pattern_len = matcher.dfa.max_pattern_len,
106                    backend = backend_id,
107                    use_subgroup_coalesce,
108                    "AC GPU dispatch Program built"
109                );
110                Some(program)
111            })
112            .as_ref()
113    }
114
115    /// Lazily compile the regex-NFA `RulePipeline` on first call.
116    /// Returns `None` once the OnceLock has fired when the regex
117    /// compile failed - typically because the combined NFA exceeds
118    /// vyre's per-subgroup state cap (`LANES * 32`) or because one
119    /// of the detector regexes uses a feature the byte-NFA frontend
120    /// can't represent (Unicode classes, lookaround, backrefs).
121    /// Callers should fall back to the literal-set GPU dispatch on
122    /// `None`.
123    ///
124    /// Pipeline is sized for [`super::rule_pipeline::megascan_input_len()`] bytes; batches
125    /// larger than that must take a different path. The orchestrator
126    /// caps batches at the same value (256 MiB default, up to 1 GiB
127    /// on 24+ GiB-VRAM cards) so this matches normal scan flow.
128    pub fn rule_pipeline(&self) -> Option<&vyre_libs::scan::RulePipeline> {
129        self.rule_pipeline
130            .get_or_init(|| {
131                let pattern_strs: Vec<&str> = self
132                    .ac_map
133                    .iter()
134                    .map(|p| p.regex.as_str())
135                    .chain(self.fallback.iter().map(|(p, _)| p.regex.as_str()))
136                    .collect();
137                if pattern_strs.is_empty() {
138                    return None;
139                }
140                let started = std::time::Instant::now();
141                let input_cap = super::rule_pipeline::megascan_input_len();
142                match super::rule_pipeline::rule_pipeline_cached(&pattern_strs, input_cap as u32) {
143                    Ok(pipe) => {
144                        tracing::info!(
145                            target: "keyhog::routing",
146                            patterns = pattern_strs.len(),
147                            input_len = input_cap,
148                            elapsed_ms = started.elapsed().as_millis() as u64,
149                            "MegaScan RulePipeline compiled"
150                        );
151                        Some(pipe)
152                    }
153                    Err(error) => {
154                        // Demoted from `warn` to `debug` - the
155                        // fallback to literal-set GPU dispatch is the
156                        // designed degradation when vyre's byte-NFA
157                        // frontend can't represent every pattern (e.g.
158                        // lookaround in pattern 990 of the bundled
159                        // detector corpus). The user can't fix it, and
160                        // hitting this WARN once per `--backend mega-
161                        // scan` invocation creates noise without
162                        // signal. kimi-dogfood-3 #138.
163                        tracing::debug!(
164                            patterns = pattern_strs.len(),
165                            error = %format!("{error:?}"),
166                            "MegaScan RulePipeline compile failed - falling back to literal-set GPU dispatch. \
167                             Common causes: regex set exceeds vyre's per-subgroup state cap, or one or more \
168                             patterns use Unicode classes / lookaround / backrefs that the byte-NFA frontend \
169                             can't represent."
170                        );
171                        None
172                    }
173                }
174            })
175            .as_ref()
176    }
177
178    /// Lazily build fused GPU decode→scan programs (base64 + hex).
179    ///
180    /// Returns `None` when no GPU matcher is available (no literals, no
181    /// adapter). The fused programs share the same DFA transition tables
182    /// as the literal-set engine but prepend an on-GPU decode stage,
183    /// eliminating the CPU→GPU round-trip for encoded content.
184    pub fn fused_decode_programs(
185        &self,
186    ) -> Option<&super::gpu_decode_scan::FusedDecodeScanPrograms> {
187        self.fused_decode_programs
188            .get_or_init(|| {
189                let matcher = self.gpu_matcher()?;
190                let state_count = matcher.dfa.state_count;
191                let input_len = super::rule_pipeline::megascan_input_len() as u32;
192                let programs = super::gpu_decode_scan::build_fused_programs(state_count, input_len);
193                if programs.any_available() {
194                    tracing::info!(
195                        target: "keyhog::gpu",
196                        base64 = programs.base64_program.is_some(),
197                        hex = programs.hex_program.is_some(),
198                        state_count,
199                        input_len,
200                        "fused decode+scan programs built"
201                    );
202                    Some(programs)
203                } else {
204                    tracing::debug!(
205                        target: "keyhog::gpu",
206                        "fused decode+scan programs not available - CPU decode path will be used"
207                    );
208                    None
209                }
210            })
211            .as_ref()
212    }
213}
keyhog_scanner/engine/gpu_lazy.rs

keyhog_scanner/engine/
gpu_lazy.rs