1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
use super::*;
impl CompiledScanner {
pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
let mut state = build_compile_state(&detectors)?;
let ac = build_ac_pattern_set(&state.ac_literals)?;
// GPU is unconditional in the build; runtime probe decides whether to
// actually use it. `gpu_available` is set by hw_probe based on adapter
// detection (excluding software renderers like llvmpipe/lavapipe).
// Resolve the active GPU backend with the cascade
// CUDA (when `cuda` feature on + libcuda.so loadable)
// → wgpu (any-vendor cross-platform fallback)
// → None (auto-routes to SIMD/CPU).
// CUDA bypasses the wgpu validation layers + naga IR + WGSL
// text + driver shader compile; the path through CUDA driver
// API + PTX is empirically 5-10× faster on NVIDIA hardware
// and is the headline path. CUDA acquisition is opaque to
// failures: if libcuda.so is missing or the driver refuses,
// `acquire()` returns Err and we fall through to wgpu so
// nothing regresses on non-CUDA hosts.
// `crate::gpu::env_no_gpu()` is the single source of truth for
// "skip every GPU init path". Explicit KEYHOG_NO_GPU wins both
// directions; in its absence the helper auto-detects CI runners
// (CI=true + a dozen platform-specific markers) and returns
// true, since CI runners have no discrete GPU - the wgpu probe
// would enumerate llvmpipe, get rejected as software, and the
// operator would see a confusing "GPU MoE init failed" warning
// after burning ~250ms on cold-start. Set KEYHOG_NO_GPU=0 in CI
// to opt back in on self-hosted GPU runners.
let gpu_disabled = crate::gpu::env_no_gpu();
if gpu_disabled {
let in_ci = crate::gpu::is_ci_environment() && std::env::var("KEYHOG_NO_GPU").is_err();
if in_ci {
tracing::info!(
target: "keyhog::routing",
"CI environment detected (CI= or platform-specific marker set); bypassing CUDA/wgpu init. \
Set KEYHOG_NO_GPU=0 to force GPU on self-hosted GPU runners."
);
} else {
tracing::info!(
target: "keyhog::routing",
"KEYHOG_NO_GPU set: bypassing CUDA/wgpu init, routing every chunk through the CPU/SIMD path"
);
}
}
let (gpu_literals, gpu_backend, wgpu_backend) =
if !gpu_disabled && crate::hw_probe::probe_hardware().gpu_available {
let literals = build_gpu_literals(&state.ac_literals);
let cuda_backend: Option<Arc<dyn vyre::VyreBackend>> = {
#[cfg(target_os = "linux")]
{
match vyre_driver_cuda::cuda_factory() {
Ok(boxed) => {
tracing::info!(
target: "keyhog::routing",
"CUDA backend acquired, bypassing wgpu/naga/WGSL path"
);
Some(Arc::from(boxed))
}
Err(error) => {
surface_cuda_acquisition_failure(&error);
None
}
}
}
#[cfg(not(target_os = "linux"))]
{
None
}
};
match cuda_backend {
Some(cuda) => (literals, Some(cuda), None),
None => match vyre_driver_wgpu::WgpuBackend::shared() {
Ok(wgpu) => {
let trait_obj: Arc<dyn vyre::VyreBackend> = wgpu.clone();
(literals, Some(trait_obj), Some(wgpu))
}
Err(error) => {
tracing::warn!(
target: "keyhog::routing",
%error,
"wgpu backend unavailable; scan will use CPU-only path"
);
(literals, None, None)
}
},
}
} else {
(None, None, None)
};
let prefix_propagation = build_prefix_propagation(&state.ac_literals);
let same_prefix_patterns = build_same_prefix_patterns(&state.ac_literals);
// Build the Hyperscan scanner BEFORE the keyword fallback so we
// learn which ac_map patterns Hyperscan rejected (over-long, or an
// unsupported construct like a large `{100,200}` bounded repeat).
// A rejected pattern produces zero HS matches, and because it took
// the literal-prefix (ac_map) branch in build_compile_state it is
// NOT in the keyword fallback either - so it is silently dead under
// the HS backend (the default on Linux/CI). Reroute each one into
// the keyword fallback, gated by its detector's keywords, so it
// fires via the backend-independent regex sweep. Closes the
// contracts_runner recall hole on line/paloalto/tower/keystonejs/
// snowflake/bandwidth and the matching adversarial-wrapper misses.
#[cfg(feature = "simd")]
let (simd_prefilter, hs_index_map) =
match super::build_simd_scanner(&state.ac_map, &state.fallback) {
Some((scanner, index_map, unsupported_ac)) => {
for ac_idx in unsupported_ac {
let pattern = state.ac_map[ac_idx].clone();
let keywords = detectors[pattern.detector_index].keywords.clone();
state.fallback.push((pattern, keywords));
}
(Some(scanner), index_map)
}
None => (None, Vec::new()),
};
let (fallback_keyword_ac, fallback_keyword_to_patterns) =
build_fallback_keyword_ac(&state.fallback);
// Precompute the per-pattern "always-active" bitmap so the per-chunk
// hot path avoids walking every pattern's keyword list. See the
// doc comment on the field for rationale.
let fallback_always_active: Vec<bool> = state
.fallback
.iter()
// Mirrors `compiler::build_fallback_keyword_ac`'s
// 4-char floor - see the rationale comment there. The
// experimental 3-char floor measured a net F1 regression
// on SecretBench-medium, so both checks stay at 4.
.map(|(_, keywords)| !keywords.iter().any(|k| k.len() >= 4))
.collect();
log_quality_warnings(&state.quality_warnings);
let mut alphabet_targets = state.ac_literals.clone();
for (_, keywords) in &state.fallback {
alphabet_targets.extend(keywords.clone());
}
let alphabet_screen = if alphabet_targets.is_empty() {
None
} else {
Some(crate::alphabet_filter::AlphabetScreen::new(
&alphabet_targets,
))
};
let bigram_bloom =
crate::bigram_bloom::BigramBloom::from_literal_prefixes(&alphabet_targets);
tracing::debug!(
popcount = bigram_bloom.popcount(),
"bigram bloom built (4096 bits, lower popcount = stronger filter)"
);
// Pre-intern detector metadata strings into a CHD perfect
// hash so per-scan `intern_metadata` calls hand out shared
// `Arc<str>` without touching the global allocator. Built
// once per scanner; lock-free on read.
let static_intern_strings: Vec<&str> = detectors
.iter()
.flat_map(|d| [d.id.as_str(), d.name.as_str(), d.service.as_str()].into_iter())
.collect();
let static_intern = Arc::new(crate::static_intern::StaticInterner::from_detector_strings(
static_intern_strings,
));
// Precise-regex validators for the simdsieve hot fast-path. Built here
// (before `detectors` is moved into the struct) so the fast path can
// reject literal-prefix candidates the detector's own regex would not
// match - see `build_hot_pattern_validators`.
#[cfg(feature = "simdsieve")]
let hot_pattern_validators =
crate::simdsieve_prefilter::build_hot_pattern_validators(&detectors);
Ok(Self {
ac,
gpu_backend,
wgpu_backend,
gpu_literals,
gpu_matcher: OnceLock::new(),
gpu_const_packs: OnceLock::new(),
gpu_ac_const_packs: OnceLock::new(),
ac_gpu_program: OnceLock::new(),
rule_pipeline: OnceLock::new(),
fused_program: OnceLock::new(),
fused_decode_programs: OnceLock::new(),
static_intern,
ac_map: state.ac_map,
prefix_propagation,
fallback: state.fallback,
companions: state.companions,
detectors,
same_prefix_patterns,
fallback_keyword_ac,
fallback_keyword_to_patterns,
fallback_always_active,
#[cfg(feature = "simd")]
simd_prefilter,
#[cfg(feature = "simd")]
hs_index_map,
#[cfg(feature = "simdsieve")]
hot_pattern_validators,
config: ScannerConfig::default(),
alphabet_screen,
bigram_bloom,
fragment_cache: crate::fragment_cache::FragmentCache::new(1000),
})
}
/// Apply a custom configuration to the compiled scanner.
pub fn with_config(mut self, config: ScannerConfig) -> Self {
self.config = config;
self
}
}
/// One-shot guard so the CUDA-acquisition-failed warning fires
/// exactly once per process, not on every recompile. The CUDA factory
/// is called inside `compile()` and a binary that re-compiles a
/// scanner per-job (daemon mode, watch mode) would otherwise spam.
#[cfg(target_os = "linux")]
static CUDA_FALLBACK_WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
/// Surface a CUDA-backend acquisition failure when the host looks
/// like it should have a working CUDA stack. We don't want to warn
/// on plain non-NVIDIA Linux (the wgpu fall-through is the right
/// path); we DO want to warn when the user is on an NVIDIA box with
/// libcuda.so or /proc/driver/nvidia present, because in that case
/// they paid for the CUDA stack and we just dropped them onto the
/// 5-10x slower wgpu path silently. KEYHOG_REQUIRE_GPU=1 turns the
/// warning into a hard exit, matching the contract used by the MoE
/// init and the scan dispatch paths.
#[cfg(target_os = "linux")]
fn surface_cuda_acquisition_failure(error: &dyn std::fmt::Display) {
let on_nvidia_host = nvidia_userland_present();
let require_gpu = std::env::var("KEYHOG_REQUIRE_GPU").as_deref() == Ok("1");
let no_gpu = std::env::var("KEYHOG_NO_GPU").as_deref() == Ok("1");
if require_gpu && on_nvidia_host {
eprintln!(
"keyhog: KEYHOG_REQUIRE_GPU=1 but CUDA backend acquisition failed on \
an NVIDIA host: {error}. Refusing to fall back to WGPU."
);
std::process::exit(2);
}
if no_gpu {
return;
}
if on_nvidia_host && CUDA_FALLBACK_WARNED.set(()).is_ok() {
eprintln!(
"keyhog: CUDA backend unavailable on this NVIDIA host ({error}); \
falling back to WGPU (typically 5-10x slower than CUDA on the same hardware). \
This is usually a libcuda.so version mismatch or a driver upgrade pending a \
reboot. Set KEYHOG_NO_GPU=1 to silence this warning, or KEYHOG_REQUIRE_GPU=1 \
to hard-fail next time."
);
}
tracing::warn!("CUDA backend unavailable, falling back to wgpu: {error}");
}
/// Check the common libcuda.so locations + /proc/driver/nvidia to
/// decide whether this host appears to have an NVIDIA CUDA userland
/// installed. Mirrors the probes install.sh uses so the runtime view
/// matches the install-time view.
#[cfg(target_os = "linux")]
fn nvidia_userland_present() -> bool {
if std::path::Path::new("/proc/driver/nvidia").exists() {
return true;
}
for p in [
"/usr/lib/x86_64-linux-gnu/libcuda.so",
"/usr/lib/x86_64-linux-gnu/libcuda.so.1",
"/usr/lib64/libcuda.so",
"/usr/lib64/libcuda.so.1",
"/usr/local/cuda/lib64/libcuda.so",
"/opt/cuda/lib64/libcuda.so",
] {
if std::path::Path::new(p).exists() {
return true;
}
}
false
}