keyhog_scanner/engine/compile.rs
1use super::*;
2
3impl CompiledScanner {
4 pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
5 let mut state = build_compile_state(&detectors)?;
6 let ac = build_ac_pattern_set(&state.ac_literals)?;
7 // GPU is unconditional in the build; runtime probe decides whether to
8 // actually use it. `gpu_available` is set by hw_probe based on adapter
9 // detection (excluding software renderers like llvmpipe/lavapipe).
10 // Resolve the active GPU backend with the cascade
11 // CUDA (when `cuda` feature on + libcuda.so loadable)
12 // → wgpu (any-vendor cross-platform fallback)
13 // → None (auto-routes to SIMD/CPU).
14 // CUDA bypasses the wgpu validation layers + naga IR + WGSL
15 // text + driver shader compile; the path through CUDA driver
16 // API + PTX is empirically 5-10× faster on NVIDIA hardware
17 // and is the headline path. CUDA acquisition is opaque to
18 // failures: if libcuda.so is missing or the driver refuses,
19 // `acquire()` returns Err and we fall through to wgpu so
20 // nothing regresses on non-CUDA hosts.
21 // `crate::gpu::env_no_gpu()` is the single source of truth for
22 // "skip every GPU init path". Explicit KEYHOG_NO_GPU wins both
23 // directions; in its absence the helper auto-detects CI runners
24 // (CI=true + a dozen platform-specific markers) and returns
25 // true, since CI runners have no discrete GPU - the wgpu probe
26 // would enumerate llvmpipe, get rejected as software, and the
27 // operator would see a confusing "GPU MoE init failed" warning
28 // after burning ~250ms on cold-start. Set KEYHOG_NO_GPU=0 in CI
29 // to opt back in on self-hosted GPU runners.
30 let gpu_disabled = crate::gpu::env_no_gpu();
31 if gpu_disabled {
32 let in_ci = crate::gpu::is_ci_environment() && std::env::var("KEYHOG_NO_GPU").is_err();
33 if in_ci {
34 tracing::info!(
35 target: "keyhog::routing",
36 "CI environment detected (CI= or platform-specific marker set); bypassing CUDA/wgpu init. \
37 Set KEYHOG_NO_GPU=0 to force GPU on self-hosted GPU runners."
38 );
39 } else {
40 tracing::info!(
41 target: "keyhog::routing",
42 "KEYHOG_NO_GPU set: bypassing CUDA/wgpu init, routing every chunk through the CPU/SIMD path"
43 );
44 }
45 }
46 let (gpu_literals, gpu_backend, wgpu_backend) =
47 if !gpu_disabled && crate::hw_probe::probe_hardware().gpu_available {
48 let literals = build_gpu_literals(&state.ac_literals);
49 let cuda_backend: Option<Arc<dyn vyre::VyreBackend>> = {
50 #[cfg(target_os = "linux")]
51 {
52 match vyre_driver_cuda::cuda_factory() {
53 Ok(boxed) => {
54 tracing::info!(
55 target: "keyhog::routing",
56 "CUDA backend acquired, bypassing wgpu/naga/WGSL path"
57 );
58 Some(Arc::from(boxed))
59 }
60 Err(error) => {
61 surface_cuda_acquisition_failure(&error);
62 None
63 }
64 }
65 }
66 #[cfg(not(target_os = "linux"))]
67 {
68 None
69 }
70 };
71 match cuda_backend {
72 Some(cuda) => (literals, Some(cuda), None),
73 None => match vyre_driver_wgpu::WgpuBackend::shared() {
74 Ok(wgpu) => {
75 let trait_obj: Arc<dyn vyre::VyreBackend> = wgpu.clone();
76 (literals, Some(trait_obj), Some(wgpu))
77 }
78 Err(error) => {
79 tracing::warn!(
80 target: "keyhog::routing",
81 %error,
82 "wgpu backend unavailable; scan will use CPU-only path"
83 );
84 (literals, None, None)
85 }
86 },
87 }
88 } else {
89 (None, None, None)
90 };
91 let prefix_propagation = build_prefix_propagation(&state.ac_literals);
92 let same_prefix_patterns = build_same_prefix_patterns(&state.ac_literals);
93
94 // Build the Hyperscan scanner BEFORE the keyword fallback so we
95 // learn which ac_map patterns Hyperscan rejected (over-long, or an
96 // unsupported construct like a large `{100,200}` bounded repeat).
97 // A rejected pattern produces zero HS matches, and because it took
98 // the literal-prefix (ac_map) branch in build_compile_state it is
99 // NOT in the keyword fallback either - so it is silently dead under
100 // the HS backend (the default on Linux/CI). Reroute each one into
101 // the keyword fallback, gated by its detector's keywords, so it
102 // fires via the backend-independent regex sweep. Closes the
103 // contracts_runner recall hole on line/paloalto/tower/keystonejs/
104 // snowflake/bandwidth and the matching adversarial-wrapper misses.
105 #[cfg(feature = "simd")]
106 let (simd_prefilter, hs_index_map) =
107 match super::build_simd_scanner(&state.ac_map, &state.fallback) {
108 Some((scanner, index_map, unsupported_ac)) => {
109 for ac_idx in unsupported_ac {
110 let pattern = state.ac_map[ac_idx].clone();
111 let keywords = detectors[pattern.detector_index].keywords.clone();
112 state.fallback.push((pattern, keywords));
113 }
114 (Some(scanner), index_map)
115 }
116 None => (None, Vec::new()),
117 };
118
119 let (fallback_keyword_ac, fallback_keyword_to_patterns) =
120 build_fallback_keyword_ac(&state.fallback);
121 // Precompute the per-pattern "always-active" bitmap so the per-chunk
122 // hot path avoids walking every pattern's keyword list. See the
123 // doc comment on the field for rationale.
124 let fallback_always_active: Vec<bool> = state
125 .fallback
126 .iter()
127 // Mirrors `compiler::build_fallback_keyword_ac`'s
128 // 4-char floor - see the rationale comment there. The
129 // experimental 3-char floor measured a net F1 regression
130 // on SecretBench-medium, so both checks stay at 4.
131 .map(|(_, keywords)| !keywords.iter().any(|k| k.len() >= 4))
132 .collect();
133
134 log_quality_warnings(&state.quality_warnings);
135
136 let mut alphabet_targets = state.ac_literals.clone();
137 for (_, keywords) in &state.fallback {
138 alphabet_targets.extend(keywords.clone());
139 }
140 let alphabet_screen = if alphabet_targets.is_empty() {
141 None
142 } else {
143 Some(crate::alphabet_filter::AlphabetScreen::new(
144 &alphabet_targets,
145 ))
146 };
147
148 let bigram_bloom =
149 crate::bigram_bloom::BigramBloom::from_literal_prefixes(&alphabet_targets);
150 tracing::debug!(
151 popcount = bigram_bloom.popcount(),
152 "bigram bloom built (4096 bits, lower popcount = stronger filter)"
153 );
154
155 // Pre-intern detector metadata strings into a CHD perfect
156 // hash so per-scan `intern_metadata` calls hand out shared
157 // `Arc<str>` without touching the global allocator. Built
158 // once per scanner; lock-free on read.
159 let static_intern_strings: Vec<&str> = detectors
160 .iter()
161 .flat_map(|d| [d.id.as_str(), d.name.as_str(), d.service.as_str()].into_iter())
162 .collect();
163 let static_intern = Arc::new(crate::static_intern::StaticInterner::from_detector_strings(
164 static_intern_strings,
165 ));
166
167 // Precise-regex validators for the simdsieve hot fast-path. Built here
168 // (before `detectors` is moved into the struct) so the fast path can
169 // reject literal-prefix candidates the detector's own regex would not
170 // match - see `build_hot_pattern_validators`.
171 #[cfg(feature = "simdsieve")]
172 let hot_pattern_validators =
173 crate::simdsieve_prefilter::build_hot_pattern_validators(&detectors);
174
175 Ok(Self {
176 ac,
177 gpu_backend,
178 wgpu_backend,
179 gpu_literals,
180 gpu_matcher: OnceLock::new(),
181 gpu_const_packs: OnceLock::new(),
182 gpu_ac_const_packs: OnceLock::new(),
183 ac_gpu_program: OnceLock::new(),
184
185 rule_pipeline: OnceLock::new(),
186 fused_program: OnceLock::new(),
187 fused_decode_programs: OnceLock::new(),
188 static_intern,
189 ac_map: state.ac_map,
190 prefix_propagation,
191 fallback: state.fallback,
192 companions: state.companions,
193 detectors,
194 same_prefix_patterns,
195 fallback_keyword_ac,
196 fallback_keyword_to_patterns,
197 fallback_always_active,
198 #[cfg(feature = "simd")]
199 simd_prefilter,
200 #[cfg(feature = "simd")]
201 hs_index_map,
202 #[cfg(feature = "simdsieve")]
203 hot_pattern_validators,
204 config: ScannerConfig::default(),
205 alphabet_screen,
206 bigram_bloom,
207 fragment_cache: crate::fragment_cache::FragmentCache::new(1000),
208 })
209 }
210
211 /// Apply a custom configuration to the compiled scanner.
212 pub fn with_config(mut self, config: ScannerConfig) -> Self {
213 self.config = config;
214 self
215 }
216}
217
/// One-shot guard for the stderr "CUDA backend unavailable" notice
/// printed by `surface_cuda_acquisition_failure`. Only the first
/// successful `set(())` prints, so the notice appears at most once per
/// process rather than on every recompile. The CUDA factory is called
/// inside `compile()`, and a binary that re-compiles a scanner per-job
/// (daemon mode, watch mode) would otherwise spam.
#[cfg(target_os = "linux")]
static CUDA_FALLBACK_WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
224
225/// Surface a CUDA-backend acquisition failure when the host looks
226/// like it should have a working CUDA stack. We don't want to warn
227/// on plain non-NVIDIA Linux (the wgpu fall-through is the right
228/// path); we DO want to warn when the user is on an NVIDIA box with
229/// libcuda.so or /proc/driver/nvidia present, because in that case
230/// they paid for the CUDA stack and we just dropped them onto the
231/// 5-10x slower wgpu path silently. KEYHOG_REQUIRE_GPU=1 turns the
232/// warning into a hard exit, matching the contract used by the MoE
233/// init and the scan dispatch paths.
234#[cfg(target_os = "linux")]
235fn surface_cuda_acquisition_failure(error: &dyn std::fmt::Display) {
236 let on_nvidia_host = nvidia_userland_present();
237 let require_gpu = std::env::var("KEYHOG_REQUIRE_GPU").as_deref() == Ok("1");
238 let no_gpu = std::env::var("KEYHOG_NO_GPU").as_deref() == Ok("1");
239
240 if require_gpu && on_nvidia_host {
241 eprintln!(
242 "keyhog: KEYHOG_REQUIRE_GPU=1 but CUDA backend acquisition failed on \
243an NVIDIA host: {error}. Refusing to fall back to WGPU."
244 );
245 std::process::exit(2);
246 }
247
248 if no_gpu {
249 return;
250 }
251
252 if on_nvidia_host && CUDA_FALLBACK_WARNED.set(()).is_ok() {
253 eprintln!(
254 "keyhog: CUDA backend unavailable on this NVIDIA host ({error}); \
255falling back to WGPU (typically 5-10x slower than CUDA on the same hardware). \
256This is usually a libcuda.so version mismatch or a driver upgrade pending a \
257reboot. Set KEYHOG_NO_GPU=1 to silence this warning, or KEYHOG_REQUIRE_GPU=1 \
258to hard-fail next time."
259 );
260 }
261 tracing::warn!("CUDA backend unavailable, falling back to wgpu: {error}");
262}
263
/// Report whether this host appears to have an NVIDIA CUDA userland
/// installed.
///
/// Returns `true` as soon as `/proc/driver/nvidia` or any of the
/// well-known `libcuda.so` locations below exists, and `false` when
/// none of them do. Only path existence is checked; nothing is loaded.
/// The candidate list is meant to mirror the probes install.sh uses so
/// the runtime view matches the install-time view (install.sh is not
/// visible from this file; keep the two in sync when editing).
#[cfg(target_os = "linux")]
fn nvidia_userland_present() -> bool {
    use std::path::Path;

    // Probed in this order, after the /proc driver node.
    const LIBCUDA_CANDIDATES: [&str; 6] = [
        "/usr/lib/x86_64-linux-gnu/libcuda.so",
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        "/usr/lib64/libcuda.so",
        "/usr/lib64/libcuda.so.1",
        "/usr/local/cuda/lib64/libcuda.so",
        "/opt/cuda/lib64/libcuda.so",
    ];

    Path::new("/proc/driver/nvidia").exists()
        || LIBCUDA_CANDIDATES
            .iter()
            .any(|candidate| Path::new(candidate).exists())
}