keyhog_scanner/engine/compile.rs
1use super::*;
2
3impl CompiledScanner {
4 pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
5 Self::compile_with_gpu_policy(detectors, GpuInitPolicy::FromEnvironment)
6 }
7
8 pub fn compile_with_gpu_policy(
9 detectors: Vec<DetectorSpec>,
10 gpu_policy: GpuInitPolicy,
11 ) -> Result<Self> {
12 // `state` is only mutated under `feature = "simd"` (the
13 // Hyperscan-reject reroute below). Lean builds would lint it
14 // unused-mut otherwise.
15 #[cfg_attr(not(feature = "simd"), allow(unused_mut))]
16 let mut state = build_compile_state(&detectors)?;
17 let ac = build_ac_pattern_set(&state.ac_literals)?;
18 // GPU is unconditional in the build; runtime probe decides whether to
19 // actually use it. `gpu_available` is set by hw_probe based on adapter
20 // detection (excluding software renderers like llvmpipe/lavapipe).
21 // Resolve the active GPU backend with the cascade
22 // CUDA (when `cuda` feature on + libcuda.so loadable)
23 // → wgpu (any-vendor cross-platform fallback)
24 // → None (auto-routes to SIMD/CPU).
25 // CUDA bypasses the wgpu validation layers + naga IR + WGSL
26 // text + driver shader compile; the path through CUDA driver
27 // API + PTX is empirically 5-10× faster on NVIDIA hardware
28 // and is the headline path. CUDA acquisition is opaque to
29 // failures: if libcuda.so is missing or the driver refuses,
30 // `acquire()` returns Err and we fall through to wgpu so
31 // nothing regresses on non-CUDA hosts.
32 // `crate::gpu::env_no_gpu()` is the single source of truth for
33 // "skip every GPU init path". Explicit KEYHOG_NO_GPU wins both
34 // directions; in its absence the helper auto-detects CI runners
35 // (CI=true + a dozen platform-specific markers) and returns
36 // true, since CI runners have no discrete GPU - the wgpu probe
37 // would enumerate llvmpipe, get rejected as software, and the
38 // operator would see a confusing "GPU MoE init failed" warning
39 // after burning ~250ms on cold-start. Set KEYHOG_NO_GPU=0 in CI
40 // to opt back in on self-hosted GPU runners.
41 let gpu_disabled = match gpu_policy {
42 GpuInitPolicy::FromEnvironment => crate::gpu::env_no_gpu(),
43 GpuInitPolicy::ForceEnabled => false,
44 GpuInitPolicy::ForceDisabled => true,
45 };
46 if gpu_disabled {
47 let disabled_by_policy = matches!(gpu_policy, GpuInitPolicy::ForceDisabled);
48 let in_ci = !disabled_by_policy
49 && crate::gpu::is_ci_environment()
50 && std::env::var("KEYHOG_NO_GPU").is_err();
51 if disabled_by_policy {
52 tracing::info!(
53 target: "keyhog::routing",
54 "GPU init bypassed by caller policy; scanner will use CPU/SIMD paths"
55 );
56 } else if in_ci {
57 tracing::info!(
58 target: "keyhog::routing",
59 "CI environment detected (CI= or platform-specific marker set); bypassing CUDA/wgpu init. \
60 Set KEYHOG_NO_GPU=0 to force GPU on self-hosted GPU runners."
61 );
62 } else {
63 tracing::info!(
64 target: "keyhog::routing",
65 "KEYHOG_NO_GPU set: bypassing CUDA/wgpu init, routing every chunk through the CPU/SIMD path"
66 );
67 }
68 }
69 #[cfg(feature = "gpu")]
70 let (gpu_literals, gpu_backend, wgpu_backend) =
71 if !gpu_disabled && crate::hw_probe::probe_hardware().gpu_available {
72 let literals = build_gpu_literals(&state.ac_literals);
73 let cuda_backend: Option<Arc<dyn vyre::VyreBackend>> = {
74 #[cfg(target_os = "linux")]
75 {
76 match vyre_driver_cuda::cuda_factory() {
77 Ok(boxed) => {
78 tracing::info!(
79 target: "keyhog::routing",
80 "CUDA backend acquired, bypassing wgpu/naga/WGSL path"
81 );
82 Some(Arc::from(boxed))
83 }
84 Err(error) => {
85 surface_cuda_acquisition_failure(&error);
86 None
87 }
88 }
89 }
90 #[cfg(not(target_os = "linux"))]
91 {
92 None
93 }
94 };
95 match cuda_backend {
96 Some(cuda) => (literals, Some(cuda), None),
97 None => match vyre_driver_wgpu::WgpuBackend::shared() {
98 Ok(wgpu) => {
99 let trait_obj: Arc<dyn vyre::VyreBackend> = wgpu.clone();
100 (literals, Some(trait_obj), Some(wgpu))
101 }
102 Err(error) => {
103 tracing::warn!(
104 target: "keyhog::routing",
105 %error,
106 "wgpu backend unavailable; scan will use CPU-only path"
107 );
108 (literals, None, None)
109 }
110 },
111 }
112 } else {
113 (None, None, None)
114 };
115
116 // Lean (no-`gpu`) build: never link the wgpu / CUDA drivers, never
117 // probe Vulkan at startup. The hw_probe still reports its findings so
118 // downstream routing surfaces `KEYHOG_NO_GPU` semantics, but no
119 // backend is acquired. `gpu_disabled` stays read so the cfg-aware
120 // dead-code warning is suppressed without an `_ =` decoration.
121 #[cfg(not(feature = "gpu"))]
122 let (gpu_literals, gpu_backend): (
123 Option<Arc<Vec<Vec<u8>>>>,
124 Option<Arc<dyn vyre::VyreBackend>>,
125 ) = {
126 let _ = gpu_disabled;
127 (None, None)
128 };
129 let prefix_propagation = CsrU32::from(build_prefix_propagation(&state.ac_literals));
130 let same_prefix_patterns = CsrU32::from(build_same_prefix_patterns(&state.ac_literals));
131
132 // Build the Hyperscan scanner BEFORE the keyword fallback so we
133 // learn which ac_map patterns Hyperscan rejected (over-long, or an
134 // unsupported construct like a large `{100,200}` bounded repeat).
135 // A rejected pattern produces zero HS matches, and because it took
136 // the literal-prefix (ac_map) branch in build_compile_state it is
137 // NOT in the keyword fallback either - so it is silently dead under
138 // the HS backend (the default on Linux/CI). Reroute each one into
139 // the keyword fallback, gated by its detector's keywords, so it
140 // fires via the backend-independent regex sweep. Closes the
141 // contracts_runner recall hole on line/paloalto/tower/keystonejs/
142 // snowflake/bandwidth and the matching adversarial-wrapper misses.
143 #[cfg(feature = "simd")]
144 let (simd_prefilter, hs_index_map) =
145 match super::build_simd_scanner(&state.ac_map, &state.fallback) {
146 Some((scanner, index_map, unsupported_ac)) => {
147 for ac_idx in unsupported_ac {
148 let pattern = state.ac_map[ac_idx].clone();
149 let keywords = detectors[pattern.detector_index].keywords.clone();
150 state.fallback.push((pattern, keywords));
151 }
152 (Some(scanner), CsrU32::from(index_map))
153 }
154 None => (None, CsrU32::default()),
155 };
156
157 let (fallback_keyword_ac, fallback_keyword_to_patterns) =
158 build_fallback_keyword_ac(&state.fallback);
159 let fallback_keyword_to_patterns = CsrU32::from(fallback_keyword_to_patterns);
160 // Precompute always-active fallback indices so the per-chunk hot path
161 // seeds the sparse active set without scanning the full fallback table.
162 let fallback_always_active_indices: Vec<usize> = state
163 .fallback
164 .iter()
165 .enumerate()
166 // Mirrors `compiler::build_fallback_keyword_ac`'s
167 // 4-char floor - see the rationale comment there. The
168 // experimental 3-char floor measured a net F1 regression
169 // on SecretBench-medium, so both checks stay at 4.
170 .filter_map(|(index, (_, keywords))| {
171 (!keywords.iter().any(|k| k.len() >= 4)).then_some(index)
172 })
173 .collect();
174
175 log_quality_warnings(&state.quality_warnings);
176
177 let mut alphabet_targets = state.ac_literals.clone();
178 for (_, keywords) in &state.fallback {
179 alphabet_targets.extend(keywords.clone());
180 }
181 let alphabet_screen = if alphabet_targets.is_empty() {
182 None
183 } else {
184 Some(crate::alphabet_filter::AlphabetScreen::new(
185 &alphabet_targets,
186 ))
187 };
188
189 let bigram_bloom =
190 crate::bigram_bloom::BigramBloom::from_literal_prefixes(&alphabet_targets);
191 tracing::debug!(
192 popcount = bigram_bloom.popcount(),
193 "bigram bloom built (4096 bits, lower popcount = stronger filter)"
194 );
195
196 // Pre-intern detector metadata strings into a CHD perfect
197 // hash so per-scan `intern_metadata` calls hand out shared
198 // `Arc<str>` without touching the global allocator. Built
199 // once per scanner; lock-free on read.
200 let static_intern_strings: Vec<&str> = detectors
201 .iter()
202 .flat_map(|d| [d.id.as_str(), d.name.as_str(), d.service.as_str()].into_iter())
203 .collect();
204 let static_intern = Arc::new(crate::static_intern::StaticInterner::from_detector_strings(
205 static_intern_strings,
206 ));
207
208 // Resolve each detector's interned (id, name, service) triple ONCE,
209 // indexed by detector index, so the per-match emission sites clone by
210 // index instead of re-hashing the same three strings through the CHD
211 // perfect hash on every finding (PERF-locality_intern-1). The strings
212 // are exactly the arena entries the per-match `lookup` would return;
213 // every detector field was just fed into `from_detector_strings`
214 // above, so each lookup is guaranteed `Some`. The `unwrap_or_else`
215 // fallback (interning the source string directly) is unreachable in
216 // practice but keeps the build total — a future detector field that
217 // somehow missed the interner universe still emits its true string,
218 // never an empty or wrong one.
219 let metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)> = detectors
220 .iter()
221 .map(|d| {
222 (
223 static_intern
224 .lookup(&d.id)
225 .unwrap_or_else(|| Arc::from(d.id.as_str())),
226 static_intern
227 .lookup(&d.name)
228 .unwrap_or_else(|| Arc::from(d.name.as_str())),
229 static_intern
230 .lookup(&d.service)
231 .unwrap_or_else(|| Arc::from(d.service.as_str())),
232 )
233 })
234 .collect();
235
236 // Pre-intern the four synthetic entropy-fallback metadata triples once
237 // (PERF-locality_intern-1). These are not detector specs, so they are
238 // not in the StaticInterner universe; intern them directly into shared
239 // Arc<str> here so the entropy emit path clones by index rather than
240 // re-allocating/re-hashing the same four constants per finding. String
241 // values are byte-identical to the prior `intern_metadata` results.
242 #[cfg(feature = "entropy")]
243 let entropy_metadata_by_index: [(Arc<str>, Arc<str>, Arc<str>); 4] = {
244 use crate::engine::fallback_entropy_helpers::ENTROPY_DETECTOR_METADATA;
245 std::array::from_fn(|i| {
246 let (id, name, service) = ENTROPY_DETECTOR_METADATA[i];
247 (
248 static_intern.lookup(id).unwrap_or_else(|| Arc::from(id)),
249 static_intern
250 .lookup(name)
251 .unwrap_or_else(|| Arc::from(name)),
252 static_intern
253 .lookup(service)
254 .unwrap_or_else(|| Arc::from(service)),
255 )
256 })
257 };
258
259 // Precise-regex validators for the simdsieve hot fast-path. Built here
260 // (before `detectors` is moved into the struct) so the fast path can
261 // reject literal-prefix candidates the detector's own regex would not
262 // match - see `build_hot_pattern_validators`.
263 #[cfg(feature = "simdsieve")]
264 let hot_pattern_validators =
265 crate::simdsieve_prefilter::build_hot_pattern_validators(&detectors);
266
267 // Pre-intern the hot-pattern metadata constants ONCE, index-parallel
268 // with HOT_PATTERNS, so the simdsieve fast path clones by slot index
269 // instead of re-hashing the same three `&'static str`s through the CHD
270 // interner on every hot hit (PERF-locality_intern-1). These constants
271 // name real detectors whose id/name/service are already in the interner
272 // universe; the `unwrap_or_else` only fires for the one synthetic slot
273 // (square) with no canonical detector, where it interns the static
274 // string directly — still byte-identical to what the per-match
275 // `intern_metadata` call would have produced.
276 #[cfg(feature = "simdsieve")]
277 let hot_metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)> = {
278 use crate::simdsieve_prefilter::{
279 HOT_PATTERN_DETECTOR_IDS, HOT_PATTERN_DISPLAY_NAMES, HOT_PATTERN_NAMES,
280 };
281 (0..HOT_PATTERN_NAMES.len())
282 .map(|i| {
283 let id = HOT_PATTERN_DETECTOR_IDS[i];
284 let name = HOT_PATTERN_DISPLAY_NAMES[i];
285 let service = HOT_PATTERN_NAMES[i];
286 (
287 static_intern.lookup(id).unwrap_or_else(|| Arc::from(id)),
288 static_intern
289 .lookup(name)
290 .unwrap_or_else(|| Arc::from(name)),
291 static_intern
292 .lookup(service)
293 .unwrap_or_else(|| Arc::from(service)),
294 )
295 })
296 .collect()
297 };
298
299 let scanner = Self {
300 ac,
301 gpu_backend,
302 #[cfg(feature = "gpu")]
303 wgpu_backend,
304 gpu_literals,
305 gpu_matcher: OnceLock::new(),
306 gpu_const_packs: OnceLock::new(),
307 gpu_ac_const_packs: OnceLock::new(),
308 ac_gpu_program: OnceLock::new(),
309 gpu_last_degrade_reason: std::sync::Mutex::new(None),
310
311 rule_pipeline: OnceLock::new(),
312 fused_program: OnceLock::new(),
313 fused_decode_programs: OnceLock::new(),
314 static_intern,
315 metadata_by_index,
316 ac_map: state.ac_map,
317 prefix_propagation,
318 fallback: state.fallback,
319 companions: state.companions,
320 detectors,
321 same_prefix_patterns,
322 fallback_keyword_ac,
323 fallback_keyword_to_patterns,
324 fallback_always_active_indices,
325 #[cfg(feature = "simd")]
326 simd_prefilter,
327 #[cfg(feature = "simd")]
328 hs_index_map,
329 #[cfg(feature = "simdsieve")]
330 hot_pattern_validators,
331 #[cfg(feature = "simdsieve")]
332 hot_metadata_by_index,
333 #[cfg(feature = "entropy")]
334 entropy_metadata_by_index,
335 config: ScannerConfig::default(),
336 alphabet_screen,
337 bigram_bloom,
338 fragment_cache: crate::fragment_cache::FragmentCache::new(1000),
339 };
340
341 Ok(scanner)
342 }
343
344 /// Apply a custom configuration to the compiled scanner.
345 pub fn with_config(mut self, config: ScannerConfig) -> Self {
346 self.config = config;
347 self
348 }
349}
350
/// One-shot guard so the CUDA-acquisition-failed stderr notice fires
/// at most once per process, not on every recompile. The CUDA factory
/// is called inside `compile()` and a binary that re-compiles a
/// scanner per-job (daemon mode, watch mode) would otherwise spam.
///
/// Only the `eprintln!` notice in `surface_cuda_acquisition_failure`
/// is gated on this cell (the first successful `set(())` wins); the
/// `tracing` event emitted by that function is not gated by it.
#[cfg(all(target_os = "linux", feature = "gpu"))]
static CUDA_FALLBACK_WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
357
/// Surface a CUDA-backend acquisition failure when the host looks
/// like it should have a working CUDA stack. We don't want to warn
/// on plain non-NVIDIA Linux (the wgpu fall-through is the right
/// path); we DO want to warn when the user is on an NVIDIA box with
/// libcuda.so or /proc/driver/nvidia present, because in that case
/// they paid for the CUDA stack and we just dropped them onto the
/// 5-10x slower wgpu path silently. KEYHOG_REQUIRE_GPU=1 turns the
/// warning into a hard exit, matching the contract used by the MoE
/// init and the scan dispatch paths.
///
/// Behaviour, in order:
/// 1. `KEYHOG_REQUIRE_GPU=1` on an NVIDIA host: print to stderr and
///    exit the process with status 2.
/// 2. `KEYHOG_NO_GPU=1`: stay completely silent.
/// 3. Non-NVIDIA host: record the fall-through at `debug` level only,
///    so the expected wgpu path does not produce a warning.
/// 4. NVIDIA host: print the stderr notice once per process (guarded
///    by `CUDA_FALLBACK_WARNED`) and emit a `warn`-level event.
///
/// `error` is the acquisition failure, rendered via `Display` into
/// every message.
#[cfg(all(target_os = "linux", feature = "gpu"))]
fn surface_cuda_acquisition_failure(error: &dyn std::fmt::Display) {
    let on_nvidia_host = nvidia_userland_present();
    let require_gpu = std::env::var("KEYHOG_REQUIRE_GPU").as_deref() == Ok("1");
    let no_gpu = std::env::var("KEYHOG_NO_GPU").as_deref() == Ok("1");

    // The hard-fail contract is checked first so it wins over
    // KEYHOG_NO_GPU when both are set.
    if require_gpu && on_nvidia_host {
        eprintln!(
            "keyhog: KEYHOG_REQUIRE_GPU=1 but CUDA backend acquisition failed on \
             an NVIDIA host: {error}. Refusing to fall back to WGPU."
        );
        std::process::exit(2);
    }

    if no_gpu {
        return;
    }

    if !on_nvidia_host {
        // Expected path on AMD/Intel/etc.: wgpu is the right backend, so
        // this is diagnostic detail rather than a warning.
        tracing::debug!("CUDA backend unavailable, falling back to wgpu: {error}");
        return;
    }

    // First caller in the process wins the `set`; later recompiles skip
    // the stderr notice.
    if CUDA_FALLBACK_WARNED.set(()).is_ok() {
        eprintln!(
            "keyhog: CUDA backend unavailable on this NVIDIA host ({error}); \
             falling back to WGPU (typically 5-10x slower than CUDA on the same hardware). \
             This is usually a libcuda.so version mismatch or a driver upgrade pending a \
             reboot. Set KEYHOG_NO_GPU=1 to silence this warning, or KEYHOG_REQUIRE_GPU=1 \
             to hard-fail next time."
        );
    }
    tracing::warn!("CUDA backend unavailable, falling back to wgpu: {error}");
}
396
/// Report whether this host appears to have an NVIDIA CUDA userland
/// installed, by probing /proc/driver/nvidia and the common libcuda.so
/// locations. Mirrors the probes install.sh uses so the runtime view
/// matches the install-time view.
///
/// Returns `true` as soon as any probed path exists, `false` when none do.
#[cfg(all(target_os = "linux", feature = "gpu"))]
fn nvidia_userland_present() -> bool {
    // Probed in order; the kernel driver node comes first, then the
    // libcuda.so install locations.
    const PROBE_PATHS: [&str; 7] = [
        "/proc/driver/nvidia",
        "/usr/lib/x86_64-linux-gnu/libcuda.so",
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        "/usr/lib64/libcuda.so",
        "/usr/lib64/libcuda.so.1",
        "/usr/local/cuda/lib64/libcuda.so",
        "/opt/cuda/lib64/libcuda.so",
    ];

    PROBE_PATHS
        .iter()
        .any(|candidate| std::path::Path::new(candidate).exists())
}