Skip to main content

keyhog_scanner/engine/
compile.rs

1use super::*;
2
3impl CompiledScanner {
4    pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
5        Self::compile_with_gpu_policy(detectors, GpuInitPolicy::FromEnvironment)
6    }
7
8    pub fn compile_with_gpu_policy(
9        detectors: Vec<DetectorSpec>,
10        gpu_policy: GpuInitPolicy,
11    ) -> Result<Self> {
12        // `state` is only mutated under `feature = "simd"` (the
13        // Hyperscan-reject reroute below). Lean builds would lint it
14        // unused-mut otherwise.
15        #[cfg_attr(not(feature = "simd"), allow(unused_mut))]
16        let mut state = build_compile_state(&detectors)?;
17        let ac = build_ac_pattern_set(&state.ac_literals)?;
18        // GPU is unconditional in the build; runtime probe decides whether to
19        // actually use it. `gpu_available` is set by hw_probe based on adapter
20        // detection (excluding software renderers like llvmpipe/lavapipe).
21        // Resolve the active GPU backend with the cascade
22        //     CUDA (when `cuda` feature on + libcuda.so loadable)
23        //     → wgpu (any-vendor cross-platform fallback)
24        //     → None (auto-routes to SIMD/CPU).
25        // CUDA bypasses the wgpu validation layers + naga IR + WGSL
26        // text + driver shader compile; the path through CUDA driver
27        // API + PTX is empirically 5-10× faster on NVIDIA hardware
28        // and is the headline path. CUDA acquisition is opaque to
29        // failures: if libcuda.so is missing or the driver refuses,
30        // `acquire()` returns Err and we fall through to wgpu so
31        // nothing regresses on non-CUDA hosts.
32        // `crate::gpu::env_no_gpu()` is the single source of truth for
33        // "skip every GPU init path". Explicit KEYHOG_NO_GPU wins both
34        // directions; in its absence the helper auto-detects CI runners
35        // (CI=true + a dozen platform-specific markers) and returns
36        // true, since CI runners have no discrete GPU - the wgpu probe
37        // would enumerate llvmpipe, get rejected as software, and the
38        // operator would see a confusing "GPU MoE init failed" warning
39        // after burning ~250ms on cold-start. Set KEYHOG_NO_GPU=0 in CI
40        // to opt back in on self-hosted GPU runners.
41        let gpu_disabled = match gpu_policy {
42            GpuInitPolicy::FromEnvironment => crate::gpu::env_no_gpu(),
43            GpuInitPolicy::ForceEnabled => false,
44            GpuInitPolicy::ForceDisabled => true,
45        };
46        if gpu_disabled {
47            let disabled_by_policy = matches!(gpu_policy, GpuInitPolicy::ForceDisabled);
48            let in_ci = !disabled_by_policy
49                && crate::gpu::is_ci_environment()
50                && std::env::var("KEYHOG_NO_GPU").is_err();
51            if disabled_by_policy {
52                tracing::info!(
53                    target: "keyhog::routing",
54                    "GPU init bypassed by caller policy; scanner will use CPU/SIMD paths"
55                );
56            } else if in_ci {
57                tracing::info!(
58                    target: "keyhog::routing",
59                    "CI environment detected (CI= or platform-specific marker set); bypassing CUDA/wgpu init. \
60                     Set KEYHOG_NO_GPU=0 to force GPU on self-hosted GPU runners."
61                );
62            } else {
63                tracing::info!(
64                    target: "keyhog::routing",
65                    "KEYHOG_NO_GPU set: bypassing CUDA/wgpu init, routing every chunk through the CPU/SIMD path"
66                );
67            }
68        }
69        #[cfg(feature = "gpu")]
70        let (gpu_literals, gpu_backend, wgpu_backend) =
71            if !gpu_disabled && crate::hw_probe::probe_hardware().gpu_available {
72                let literals = build_gpu_literals(&state.ac_literals);
73                let cuda_backend: Option<Arc<dyn vyre::VyreBackend>> = {
74                    #[cfg(target_os = "linux")]
75                    {
76                        match vyre_driver_cuda::cuda_factory() {
77                            Ok(boxed) => {
78                                tracing::info!(
79                                    target: "keyhog::routing",
80                                    "CUDA backend acquired, bypassing wgpu/naga/WGSL path"
81                                );
82                                Some(Arc::from(boxed))
83                            }
84                            Err(error) => {
85                                surface_cuda_acquisition_failure(&error);
86                                None
87                            }
88                        }
89                    }
90                    #[cfg(not(target_os = "linux"))]
91                    {
92                        None
93                    }
94                };
95                match cuda_backend {
96                    Some(cuda) => (literals, Some(cuda), None),
97                    None => match vyre_driver_wgpu::WgpuBackend::shared() {
98                        Ok(wgpu) => {
99                            let trait_obj: Arc<dyn vyre::VyreBackend> = wgpu.clone();
100                            (literals, Some(trait_obj), Some(wgpu))
101                        }
102                        Err(error) => {
103                            tracing::warn!(
104                                target: "keyhog::routing",
105                                %error,
106                                "wgpu backend unavailable; scan will use CPU-only path"
107                            );
108                            (literals, None, None)
109                        }
110                    },
111                }
112            } else {
113                (None, None, None)
114            };
115
116        // Lean (no-`gpu`) build: never link the wgpu / CUDA drivers, never
117        // probe Vulkan at startup. The hw_probe still reports its findings so
118        // downstream routing surfaces `KEYHOG_NO_GPU` semantics, but no
119        // backend is acquired. `gpu_disabled` stays read so the cfg-aware
120        // dead-code warning is suppressed without an `_ =` decoration.
121        #[cfg(not(feature = "gpu"))]
122        let (gpu_literals, gpu_backend): (
123            Option<Arc<Vec<Vec<u8>>>>,
124            Option<Arc<dyn vyre::VyreBackend>>,
125        ) = {
126            let _ = gpu_disabled;
127            (None, None)
128        };
129        let prefix_propagation = CsrU32::from(build_prefix_propagation(&state.ac_literals));
130        let same_prefix_patterns = CsrU32::from(build_same_prefix_patterns(&state.ac_literals));
131
132        // Build the Hyperscan scanner BEFORE the keyword fallback so we
133        // learn which ac_map patterns Hyperscan rejected (over-long, or an
134        // unsupported construct like a large `{100,200}` bounded repeat).
135        // A rejected pattern produces zero HS matches, and because it took
136        // the literal-prefix (ac_map) branch in build_compile_state it is
137        // NOT in the keyword fallback either - so it is silently dead under
138        // the HS backend (the default on Linux/CI). Reroute each one into
139        // the keyword fallback, gated by its detector's keywords, so it
140        // fires via the backend-independent regex sweep. Closes the
141        // contracts_runner recall hole on line/paloalto/tower/keystonejs/
142        // snowflake/bandwidth and the matching adversarial-wrapper misses.
143        #[cfg(feature = "simd")]
144        let (simd_prefilter, hs_index_map) =
145            match super::build_simd_scanner(&state.ac_map, &state.fallback) {
146                Some((scanner, index_map, unsupported_ac)) => {
147                    for ac_idx in unsupported_ac {
148                        let pattern = state.ac_map[ac_idx].clone();
149                        let keywords = detectors[pattern.detector_index].keywords.clone();
150                        state.fallback.push((pattern, keywords));
151                    }
152                    (Some(scanner), CsrU32::from(index_map))
153                }
154                None => (None, CsrU32::default()),
155            };
156
157        let (fallback_keyword_ac, fallback_keyword_to_patterns) =
158            build_fallback_keyword_ac(&state.fallback);
159        let fallback_keyword_to_patterns = CsrU32::from(fallback_keyword_to_patterns);
160        // Precompute always-active fallback indices so the per-chunk hot path
161        // seeds the sparse active set without scanning the full fallback table.
162        let fallback_always_active_indices: Vec<usize> = state
163            .fallback
164            .iter()
165            .enumerate()
166            // Mirrors `compiler::build_fallback_keyword_ac`'s
167            // 4-char floor - see the rationale comment there. The
168            // experimental 3-char floor measured a net F1 regression
169            // on SecretBench-medium, so both checks stay at 4.
170            .filter_map(|(index, (_, keywords))| {
171                (!keywords.iter().any(|k| k.len() >= 4)).then_some(index)
172            })
173            .collect();
174
175        log_quality_warnings(&state.quality_warnings);
176
177        let mut alphabet_targets = state.ac_literals.clone();
178        for (_, keywords) in &state.fallback {
179            alphabet_targets.extend(keywords.clone());
180        }
181        let alphabet_screen = if alphabet_targets.is_empty() {
182            None
183        } else {
184            Some(crate::alphabet_filter::AlphabetScreen::new(
185                &alphabet_targets,
186            ))
187        };
188
189        let bigram_bloom =
190            crate::bigram_bloom::BigramBloom::from_literal_prefixes(&alphabet_targets);
191        tracing::debug!(
192            popcount = bigram_bloom.popcount(),
193            "bigram bloom built (4096 bits, lower popcount = stronger filter)"
194        );
195
196        // Pre-intern detector metadata strings into a CHD perfect
197        // hash so per-scan `intern_metadata` calls hand out shared
198        // `Arc<str>` without touching the global allocator. Built
199        // once per scanner; lock-free on read.
200        let static_intern_strings: Vec<&str> = detectors
201            .iter()
202            .flat_map(|d| [d.id.as_str(), d.name.as_str(), d.service.as_str()].into_iter())
203            .collect();
204        let static_intern = Arc::new(crate::static_intern::StaticInterner::from_detector_strings(
205            static_intern_strings,
206        ));
207
208        // Resolve each detector's interned (id, name, service) triple ONCE,
209        // indexed by detector index, so the per-match emission sites clone by
210        // index instead of re-hashing the same three strings through the CHD
211        // perfect hash on every finding (PERF-locality_intern-1). The strings
212        // are exactly the arena entries the per-match `lookup` would return;
213        // every detector field was just fed into `from_detector_strings`
214        // above, so each lookup is guaranteed `Some`. The `unwrap_or_else`
215        // fallback (interning the source string directly) is unreachable in
216        // practice but keeps the build total — a future detector field that
217        // somehow missed the interner universe still emits its true string,
218        // never an empty or wrong one.
219        let metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)> = detectors
220            .iter()
221            .map(|d| {
222                (
223                    static_intern
224                        .lookup(&d.id)
225                        .unwrap_or_else(|| Arc::from(d.id.as_str())),
226                    static_intern
227                        .lookup(&d.name)
228                        .unwrap_or_else(|| Arc::from(d.name.as_str())),
229                    static_intern
230                        .lookup(&d.service)
231                        .unwrap_or_else(|| Arc::from(d.service.as_str())),
232                )
233            })
234            .collect();
235
236        // Pre-intern the four synthetic entropy-fallback metadata triples once
237        // (PERF-locality_intern-1). These are not detector specs, so they are
238        // not in the StaticInterner universe; intern them directly into shared
239        // Arc<str> here so the entropy emit path clones by index rather than
240        // re-allocating/re-hashing the same four constants per finding. String
241        // values are byte-identical to the prior `intern_metadata` results.
242        #[cfg(feature = "entropy")]
243        let entropy_metadata_by_index: [(Arc<str>, Arc<str>, Arc<str>); 4] = {
244            use crate::engine::fallback_entropy_helpers::ENTROPY_DETECTOR_METADATA;
245            std::array::from_fn(|i| {
246                let (id, name, service) = ENTROPY_DETECTOR_METADATA[i];
247                (
248                    static_intern.lookup(id).unwrap_or_else(|| Arc::from(id)),
249                    static_intern
250                        .lookup(name)
251                        .unwrap_or_else(|| Arc::from(name)),
252                    static_intern
253                        .lookup(service)
254                        .unwrap_or_else(|| Arc::from(service)),
255                )
256            })
257        };
258
259        // Precise-regex validators for the simdsieve hot fast-path. Built here
260        // (before `detectors` is moved into the struct) so the fast path can
261        // reject literal-prefix candidates the detector's own regex would not
262        // match - see `build_hot_pattern_validators`.
263        #[cfg(feature = "simdsieve")]
264        let hot_pattern_validators =
265            crate::simdsieve_prefilter::build_hot_pattern_validators(&detectors);
266
267        // Pre-intern the hot-pattern metadata constants ONCE, index-parallel
268        // with HOT_PATTERNS, so the simdsieve fast path clones by slot index
269        // instead of re-hashing the same three `&'static str`s through the CHD
270        // interner on every hot hit (PERF-locality_intern-1). These constants
271        // name real detectors whose id/name/service are already in the interner
272        // universe; the `unwrap_or_else` only fires for the one synthetic slot
273        // (square) with no canonical detector, where it interns the static
274        // string directly — still byte-identical to what the per-match
275        // `intern_metadata` call would have produced.
276        #[cfg(feature = "simdsieve")]
277        let hot_metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)> = {
278            use crate::simdsieve_prefilter::{
279                HOT_PATTERN_DETECTOR_IDS, HOT_PATTERN_DISPLAY_NAMES, HOT_PATTERN_NAMES,
280            };
281            (0..HOT_PATTERN_NAMES.len())
282                .map(|i| {
283                    let id = HOT_PATTERN_DETECTOR_IDS[i];
284                    let name = HOT_PATTERN_DISPLAY_NAMES[i];
285                    let service = HOT_PATTERN_NAMES[i];
286                    (
287                        static_intern.lookup(id).unwrap_or_else(|| Arc::from(id)),
288                        static_intern
289                            .lookup(name)
290                            .unwrap_or_else(|| Arc::from(name)),
291                        static_intern
292                            .lookup(service)
293                            .unwrap_or_else(|| Arc::from(service)),
294                    )
295                })
296                .collect()
297        };
298
299        let scanner = Self {
300            ac,
301            gpu_backend,
302            #[cfg(feature = "gpu")]
303            wgpu_backend,
304            gpu_literals,
305            gpu_matcher: OnceLock::new(),
306            gpu_const_packs: OnceLock::new(),
307            gpu_ac_const_packs: OnceLock::new(),
308            ac_gpu_program: OnceLock::new(),
309            gpu_last_degrade_reason: std::sync::Mutex::new(None),
310
311            rule_pipeline: OnceLock::new(),
312            fused_program: OnceLock::new(),
313            fused_decode_programs: OnceLock::new(),
314            static_intern,
315            metadata_by_index,
316            ac_map: state.ac_map,
317            prefix_propagation,
318            fallback: state.fallback,
319            companions: state.companions,
320            detectors,
321            same_prefix_patterns,
322            fallback_keyword_ac,
323            fallback_keyword_to_patterns,
324            fallback_always_active_indices,
325            #[cfg(feature = "simd")]
326            simd_prefilter,
327            #[cfg(feature = "simd")]
328            hs_index_map,
329            #[cfg(feature = "simdsieve")]
330            hot_pattern_validators,
331            #[cfg(feature = "simdsieve")]
332            hot_metadata_by_index,
333            #[cfg(feature = "entropy")]
334            entropy_metadata_by_index,
335            config: ScannerConfig::default(),
336            alphabet_screen,
337            bigram_bloom,
338            fragment_cache: crate::fragment_cache::FragmentCache::new(1000),
339        };
340
341        Ok(scanner)
342    }
343
344    /// Apply a custom configuration to the compiled scanner.
345    pub fn with_config(mut self, config: ScannerConfig) -> Self {
346        self.config = config;
347        self
348    }
349}
350
/// One-shot guard so the CUDA-acquisition-failed notice printed to
/// stderr by `surface_cuda_acquisition_failure` appears at most once
/// per process, not on every recompile. The CUDA factory is called
/// inside `compile()` and a binary that re-compiles a scanner per-job
/// (daemon mode, watch mode) would otherwise spam. The cell is set the
/// first time a failure is reported on a host that looks like it has
/// the NVIDIA userland installed.
#[cfg(all(target_os = "linux", feature = "gpu"))]
static CUDA_FALLBACK_WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
357
/// Surface a CUDA-backend acquisition failure when the host looks
/// like it should have a working CUDA stack.
///
/// We don't want to warn on plain non-NVIDIA Linux (the wgpu
/// fall-through is the right path there); we DO want to warn when the
/// user is on an NVIDIA box with libcuda.so or /proc/driver/nvidia
/// present, because in that case they paid for the CUDA stack and we
/// just dropped them onto the 5-10x slower wgpu path silently.
///
/// Behaviour, in order:
/// * `KEYHOG_REQUIRE_GPU=1` on an NVIDIA host: print the reason to
///   stderr and exit the process with status 2.
/// * `KEYHOG_NO_GPU=1`: stay silent.
/// * First failure on an NVIDIA host in this process: print the
///   stderr notice and emit a warn-level tracing event.
/// * Everything else (non-NVIDIA host, or a repeat failure): emit the
///   same message at debug level only, so re-compiling binaries and
///   non-NVIDIA hosts are not spammed with warnings.
#[cfg(all(target_os = "linux", feature = "gpu"))]
fn surface_cuda_acquisition_failure(error: &dyn std::fmt::Display) {
    let on_nvidia_host = nvidia_userland_present();
    let require_gpu = std::env::var("KEYHOG_REQUIRE_GPU").as_deref() == Ok("1");
    let no_gpu = std::env::var("KEYHOG_NO_GPU").as_deref() == Ok("1");

    if require_gpu && on_nvidia_host {
        eprintln!(
            "keyhog: KEYHOG_REQUIRE_GPU=1 but CUDA backend acquisition failed on \
             an NVIDIA host: {error}. Refusing to fall back to WGPU."
        );
        std::process::exit(2);
    }

    if no_gpu {
        return;
    }

    // `set` succeeds only for the first caller, which makes the loud
    // (stderr + warn) report a once-per-process event.
    let first_report_on_nvidia = on_nvidia_host && CUDA_FALLBACK_WARNED.set(()).is_ok();
    if first_report_on_nvidia {
        eprintln!(
            "keyhog: CUDA backend unavailable on this NVIDIA host ({error}); \
             falling back to WGPU (typically 5-10x slower than CUDA on the same hardware). \
             This is usually a libcuda.so version mismatch or a driver upgrade pending a \
             reboot. Set KEYHOG_NO_GPU=1 to silence this warning, or KEYHOG_REQUIRE_GPU=1 \
             to hard-fail next time."
        );
        tracing::warn!("CUDA backend unavailable, falling back to wgpu: {error}");
    } else {
        // Expected on non-NVIDIA hosts and already reported on NVIDIA
        // ones: keep the breadcrumb without raising a warning.
        tracing::debug!("CUDA backend unavailable, falling back to wgpu: {error}");
    }
}
396
397/// Check the common libcuda.so locations + /proc/driver/nvidia to
398/// decide whether this host appears to have an NVIDIA CUDA userland
399/// installed. Mirrors the probes install.sh uses so the runtime view
400/// matches the install-time view.
401#[cfg(all(target_os = "linux", feature = "gpu"))]
fn nvidia_userland_present() -> bool {
    // Probed in order; the first path that exists settles the answer.
    // The kernel driver's procfs directory comes first, then the usual
    // libcuda.so install locations.
    const PROBE_PATHS: [&str; 7] = [
        "/proc/driver/nvidia",
        "/usr/lib/x86_64-linux-gnu/libcuda.so",
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        "/usr/lib64/libcuda.so",
        "/usr/lib64/libcuda.so.1",
        "/usr/local/cuda/lib64/libcuda.so",
        "/opt/cuda/lib64/libcuda.so",
    ];

    PROBE_PATHS
        .iter()
        .any(|candidate| std::path::Path::new(candidate).exists())
}