keyhog_scanner/engine/gpu_lazy.rs
1use super::*;
2
3impl CompiledScanner {
4 /// Lazily compile the GPU literal-set on first call. Returns `None`
5 /// when no compatible adapter was detected at probe time.
6 ///
7 /// Persists the compiled matcher to `~/.cache/keyhog/programs/<hash>.bin`.
8 /// On a cache hit the matcher is loaded from disk and the GPU
9 /// recompile is skipped entirely - biggest cold-start win on
10 /// `keyhog scan` / `scan-system` runs that re-launch repeatedly.
11 /// Cache misses (no file, version-mismatch, corrupt blob) silently
12 /// recompile and re-cache.
13 pub fn gpu_matcher(&self) -> Option<&vyre_libs::scan::GpuLiteralSet> {
14 self.gpu_matcher
15 .get_or_init(|| {
16 let Some(literals) = &self.gpu_literals else {
17 return None;
18 };
19 let literal_refs: Vec<&[u8]> = literals.iter().map(|v| v.as_slice()).collect();
20 let cache_dir = super::gpu_cache::gpu_matcher_cache_dir()?;
21 let cache_key = format!(
22 "lit-{}",
23 super::gpu_cache::gpu_matcher_cache_key(&literal_refs)
24 );
25 let started = std::time::Instant::now();
26 // One-line lego-block cache wiring courtesy of
27 // `vyre_libs::scan::cached_load_or_compile`. The
28 // helper handles atomic-rename, stale-blob deletion,
29 // and silent fall-through on cache-side I/O errors -
30 // every behaviour the previous hand-rolled
31 // load/save pair tried to match. We log compile cost
32 // here so the operator can still see warm-vs-cold
33 // start latency in `--verbose` output.
34 let matcher =
35 vyre_libs::scan::cached_load_or_compile(&cache_dir, &cache_key, || {
36 vyre_libs::scan::GpuLiteralSet::compile(&literal_refs)
37 });
38 tracing::debug!(
39 target: "keyhog::routing",
40 patterns = literal_refs.len(),
41 elapsed_ms = started.elapsed().as_millis() as u64,
42 "GpuLiteralSet ready (warm cache or compiled)"
43 );
44 Some(matcher)
45 })
46 .as_ref()
47 }
48
49 /// Lazily build the Aho-Corasick bounded-ranges dispatch Program
50 /// from the GpuLiteralSet's CompiledDfa. The two engines share the
51 /// same DFA - only the dispatch Program (and therefore the
52 /// per-byte algorithm) differs:
53 ///
54 /// * `gpu_matcher().program` - `build_literal_set_program`:
55 /// walks every pattern × every literal byte per haystack
56 /// position. `O(N × L) per byte`. Works for any pattern set
57 /// that fits the DFA budget.
58 /// * `ac_gpu_program()` - `classic_ac_bounded_ranges_program`:
59 /// walks the AC transition table forward `L_max` bytes per
60 /// position, emits every pattern in the accepting state's
61 /// flat output_links. `O(L_max) per byte` regardless of N.
62 ///
63 /// Selected at scan time via `KEYHOG_GPU_KERNEL=ac`. Returns
64 /// `None` when no GPU matcher is available; callers fall through
65 /// to the literal-set path or non-GPU backend.
66 ///
67 /// Cap of `super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH` triples per shard
68 /// dispatch matches the existing literal-set output-buffer cap.
69 /// Truncation (count > cap on readback) is handled by the same
70 /// fall-back-to-CPU branch the literal-set path uses.
71 pub fn ac_gpu_program(&self) -> Option<&vyre::Program> {
72 self.ac_gpu_program
73 .get_or_init(|| {
74 let matcher = self.gpu_matcher()?;
75 let pattern_count = matcher.pattern_lengths.len() as u32;
76 // Pick the match-append strategy. The subgroup form
77 // (subgroup_ballot + subgroup_shuffle producing
78 // _vyre_match_leader) was originally gated to wgpu
79 // only because vyre-driver-cuda rejects it during
80 // canonical pre-emit lowering. Runtime testing on
81 // Apple Silicon M4 Pro with vyre v0.4.2 confirmed
82 // the SAME "_vyre_match_leader referenced before
83 // binding" rejection on the wgpu path: the lowering
84 // gap is in vyre's substrate-neutral pre-emit step,
85 // not the driver-specific emitter. Until the IR gap
86 // is closed, use_subgroup_coalesce stays false on
87 // every backend. We lose the ~32x atomic-contention
88 // reduction the subgroup form would have provided
89 // (Innovation I.17), but recall and correctness are
90 // preserved; the plain append_match path produces
91 // bit-identical match output, just with more atomic
92 // pressure on the shared count buffer.
93 let backend_id = self.gpu_backend.as_ref().map(|b| b.id()).unwrap_or("none");
94 let use_subgroup_coalesce = false;
95 let program = vyre_libs::scan::classic_ac::build_ac_bounded_ranges_program_ext(
96 &matcher.dfa,
97 pattern_count,
98 super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH,
99 use_subgroup_coalesce,
100 );
101 tracing::debug!(
102 target: "keyhog::routing",
103 pattern_count,
104 state_count = matcher.dfa.state_count,
105 max_pattern_len = matcher.dfa.max_pattern_len,
106 backend = backend_id,
107 use_subgroup_coalesce,
108 "AC GPU dispatch Program built"
109 );
110 Some(program)
111 })
112 .as_ref()
113 }
114
115 /// Lazily compile the regex-NFA `RulePipeline` on first call.
116 /// Returns `None` once the OnceLock has fired when the regex
117 /// compile failed - typically because the combined NFA exceeds
118 /// vyre's per-subgroup state cap (`LANES * 32`) or because one
119 /// of the detector regexes uses a feature the byte-NFA frontend
120 /// can't represent (Unicode classes, lookaround, backrefs).
121 /// Callers should fall back to the literal-set GPU dispatch on
122 /// `None`.
123 ///
124 /// Pipeline is sized for [`super::rule_pipeline::megascan_input_len()`] bytes; batches
125 /// larger than that must take a different path. The orchestrator
126 /// caps batches at the same value (256 MiB default, up to 1 GiB
127 /// on 24+ GiB-VRAM cards) so this matches normal scan flow.
128 pub fn rule_pipeline(&self) -> Option<&vyre_libs::scan::RulePipeline> {
129 self.rule_pipeline
130 .get_or_init(|| {
131 let pattern_strs: Vec<&str> = self
132 .ac_map
133 .iter()
134 .map(|p| p.regex.as_str())
135 .chain(self.fallback.iter().map(|(p, _)| p.regex.as_str()))
136 .collect();
137 if pattern_strs.is_empty() {
138 return None;
139 }
140 let started = std::time::Instant::now();
141 let input_cap = super::rule_pipeline::megascan_input_len();
142 match super::rule_pipeline::rule_pipeline_cached(&pattern_strs, input_cap as u32) {
143 Ok(pipe) => {
144 tracing::info!(
145 target: "keyhog::routing",
146 patterns = pattern_strs.len(),
147 input_len = input_cap,
148 elapsed_ms = started.elapsed().as_millis() as u64,
149 "MegaScan RulePipeline compiled"
150 );
151 Some(pipe)
152 }
153 Err(error) => {
154 // Demoted from `warn` to `debug` - the
155 // fallback to literal-set GPU dispatch is the
156 // designed degradation when vyre's byte-NFA
157 // frontend can't represent every pattern (e.g.
158 // lookaround in pattern 990 of the bundled
159 // detector corpus). The user can't fix it, and
160 // hitting this WARN once per `--backend mega-
161 // scan` invocation creates noise without
162 // signal. kimi-dogfood-3 #138.
163 tracing::debug!(
164 patterns = pattern_strs.len(),
165 error = %format!("{error:?}"),
166 "MegaScan RulePipeline compile failed - falling back to literal-set GPU dispatch. \
167 Common causes: regex set exceeds vyre's per-subgroup state cap, or one or more \
168 patterns use Unicode classes / lookaround / backrefs that the byte-NFA frontend \
169 can't represent."
170 );
171 None
172 }
173 }
174 })
175 .as_ref()
176 }
177
178 /// Lazily build fused GPU decode→scan programs (base64 + hex).
179 ///
180 /// Returns `None` when no GPU matcher is available (no literals, no
181 /// adapter). The fused programs share the same DFA transition tables
182 /// as the literal-set engine but prepend an on-GPU decode stage,
183 /// eliminating the CPU→GPU round-trip for encoded content.
184 pub fn fused_decode_programs(
185 &self,
186 ) -> Option<&super::gpu_decode_scan::FusedDecodeScanPrograms> {
187 self.fused_decode_programs
188 .get_or_init(|| {
189 let matcher = self.gpu_matcher()?;
190 let state_count = matcher.dfa.state_count;
191 let input_len = super::rule_pipeline::megascan_input_len() as u32;
192 let programs = super::gpu_decode_scan::build_fused_programs(state_count, input_len);
193 if programs.any_available() {
194 tracing::info!(
195 target: "keyhog::gpu",
196 base64 = programs.base64_program.is_some(),
197 hex = programs.hex_program.is_some(),
198 state_count,
199 input_len,
200 "fused decode+scan programs built"
201 );
202 Some(programs)
203 } else {
204 tracing::debug!(
205 target: "keyhog::gpu",
206 "fused decode+scan programs not available - CPU decode path will be used"
207 );
208 None
209 }
210 })
211 .as_ref()
212 }
213}