keyhog_scanner/engine/mod.rs
1//! Core scanning engine implementation.
2
3mod backend;
4mod backend_dispatch;
5mod backend_pattern_hits;
6mod backend_prepared;
7mod backend_triggered;
8pub mod boundary;
9mod compile;
10mod extract;
11mod fallback;
12mod fallback_entropy;
13mod fallback_entropy_helpers;
14mod fallback_generic;
15mod gpu_ac_phase1;
16mod gpu_cache;
17mod gpu_coalesce;
18pub mod gpu_decode_scan;
19mod gpu_dispatch;
20mod gpu_forced;
21mod gpu_lazy;
22mod gpu_literal_phase1;
23mod gpu_megascan;
24mod gpu_phase2;
25pub(crate) mod gpu_postprocess;
26pub mod gpu_program_fusion;
27pub mod gpu_regex_dfa;
28mod gpu_scan_wrappers;
29mod hot_patterns;
30mod process;
31mod rule_pipeline;
32mod scan;
33mod scan_filters;
34mod scan_postprocess;
35pub mod segment_attribution;
36mod windowed;
37
38// `build_simd_scanner` only exists under the `simd` (Hyperscan) feature; its
39// sole call site in compile.rs is `#[cfg(feature = "simd")]` too. Gate the
40// import to match, or non-simd builds (the `portable` feature used for the
41// macOS/Windows/musl release assets) fail with E0432.
42#[cfg(feature = "simd")]
43pub(crate) use backend_prepared::build_simd_scanner;
44pub(crate) use backend_prepared::PreparedChunk;
45pub use gpu_cache::{AcConstPacks, GpuConstPacks};
46pub use gpu_coalesce::coalesce_chunks;
47pub use gpu_regex_dfa::{build_regex_dfa, RegexDfaError};
48pub use gpu_scan_wrappers::GpuPhase1Output;
49pub use rule_pipeline::{
50 build_rule_pipeline, megascan_input_len, rule_pipeline_cached, AC_GPU_MAX_MATCHES_PER_DISPATCH,
51 MEGASCAN_INPUT_LEN, MEGASCAN_INPUT_LEN_DEFAULT,
52};
53pub use windowed::{
54 floor_char_boundary, line_number_for_offset, next_window_offset, record_window_match,
55 window_chunk, window_end_offset,
56};
57
58use crate::compiler::*;
59use crate::error::Result;
60use crate::pipeline::*;
61use crate::types::*;
62use aho_corasick::AhoCorasick;
63use keyhog_core::{Chunk, DetectorSpec, RawMatch};
64use std::sync::Arc;
65use std::sync::OnceLock;
66
67pub use vyre_libs::scan::LiteralMatch;
68
/// Turn the `KEYHOG_PER_CHUNK_TIMEOUT_MS` environment variable into a
/// per-chunk deadline.
///
/// The variable is read and parsed once per process and the result is cached,
/// so changing it after the first call has no effect. Each call then derives
/// a fresh `Instant` that lies the configured number of milliseconds after
/// "now".
///
/// Returns `None` when the variable is unset, is not a valid `u64`, or is
/// zero. That is the historical "scan until done" behavior, and it remains
/// the default.
///
/// Wired into the public `scan` / `scan_with_backend` entry points so a
/// hostile or pathological input bails after the configured budget instead of
/// hanging the entire `keyhog scan <repo>` run. The motivating case was the
/// Apple Silicon regex-DFA construction stall surfaced during cross-platform
/// dogfood: a single 171-byte line shaped like
/// `var token = identifier.Flag(...)` spent minutes inside the multiline
/// preprocessor. The CLI orchestrator runs scans in parallel via rayon, so a
/// stuck worker would otherwise keep one core pinned at 100% indefinitely.
///
/// Recommended for production scans where bounded latency matters more than
/// scan completeness: `export KEYHOG_PER_CHUNK_TIMEOUT_MS=30000` (30 s).
fn env_per_chunk_deadline() -> Option<std::time::Instant> {
    use std::time::{Duration, Instant};

    // Parsed budget in milliseconds; `None` means "no timeout configured".
    static BUDGET_MS: std::sync::OnceLock<Option<u64>> = std::sync::OnceLock::new();

    let cached = *BUDGET_MS.get_or_init(|| {
        match std::env::var("KEYHOG_PER_CHUNK_TIMEOUT_MS") {
            Ok(raw) => match raw.parse::<u64>() {
                // Zero and unparsable values both mean "no timeout".
                Ok(0) | Err(_) => None,
                Ok(millis) => Some(millis),
            },
            Err(_) => None,
        }
    });

    let millis = cached?;
    Some(Instant::now() + Duration::from_millis(millis))
}
96
/// Result of scoring a match: either a finished score, or (only with the `ml`
/// feature) the inputs kept aside so the score can be computed in a batch.
pub enum MlScoreResult<'a> {
    /// The score is already final, so the match can be pushed immediately.
    Final(f64),
    /// Scoring is deferred: ML scoring is batched at the end of the scan.
    /// The two `Cow` fields are what tie this variant to the `'a` lifetime.
    #[cfg(feature = "ml")]
    Pending {
        heuristic_conf: f64,
        code_context: crate::context::CodeContext,
        credential: std::borrow::Cow<'a, str>,
        ml_context: std::borrow::Cow<'a, str>,
    },
    /// Zero-sized stand-in that keeps `'a` in use when ML batch scoring is
    /// compiled out (lean / `--no-default-features` build). It is never
    /// constructed; without it the enum would not mention `'a` at all,
    /// because only the borrowing `Pending` variant uses the lifetime.
    #[doc(hidden)]
    #[cfg(not(feature = "ml"))]
    _Lifetime(std::marker::PhantomData<&'a ()>),
}
116
/// Compressed-sparse-row (CSR) index table: a flattened replacement for a
/// `Vec<Vec<usize>>` whose rows are pattern/literal indices.
///
/// The detector-side index maps (`prefix_propagation`, `same_prefix_patterns`,
/// `fallback_keyword_to_patterns`, and the simd `hs_index_map`) are each
/// indexed parallel to the ~1000+ AC literals / fallback patterns. As a
/// `Vec<Vec<usize>>` that is ~1000+ separate heap allocations per table, each
/// inner `Vec` carrying a 24-byte (ptr+len+cap) header plus capacity slack,
/// even for the overwhelmingly common empty or single-element row. That
/// fragments the heap, forces pointer-chasing on the hot lookup path, and
/// spends an 8-byte `usize` on corpus-bounded indices that fit in `u32`.
///
/// CSR collapses each table to exactly two allocations: `data` holds every
/// row concatenated, and `offsets` (length `n + 1`) records where each row
/// starts, so `row(i) == &data[offsets[i]..offsets[i + 1]]`. Empty rows cost
/// zero data bytes, element width halves to `u32`, and lookups are
/// contiguous. Build it once via `From<Vec<Vec<usize>>>` or
/// [`CsrU32::from_rows`]; read rows through [`CsrU32::get`] (checked) or
/// indexing (panicking).
///
/// The `Default` value has no rows at all: `get` returns `None` for every
/// index and indexing panics.
#[derive(Clone, Debug, Default)]
pub(crate) struct CsrU32 {
    /// All rows concatenated, in row order.
    data: Vec<u32>,
    /// `offsets[i]..offsets[i + 1]` is the slice of `data` for row `i`.
    /// A table built from `n` rows has `n + 1` offsets, the first being 0.
    offsets: Vec<u32>,
}

impl CsrU32 {
    /// Narrow an index or offset to `u32`, panicking if it does not fit.
    ///
    /// A plain `as u32` cast would silently wrap, aliasing the value to an
    /// unrelated pattern index or producing bogus row bounds. Overflow means
    /// the "indices are corpus-bounded" invariant is broken, so fail loudly
    /// at build time instead.
    #[inline]
    fn narrow(value: usize) -> u32 {
        <u32 as std::convert::TryFrom<usize>>::try_from(value)
            .expect("CsrU32: index/offset exceeds u32::MAX (corpus-bounded invariant violated)")
    }

    /// Build a CSR table from per-row index lists in a single pass.
    ///
    /// Accepts any iterator of rows, so existing builders can feed their
    /// `Vec<Vec<usize>>` (or any other iterable of `usize`) straight in.
    ///
    /// # Panics
    ///
    /// Panics if any value, or the total number of stored values, does not
    /// fit in `u32`.
    pub(crate) fn from_rows<R, I>(rows: R) -> Self
    where
        R: IntoIterator<Item = I>,
        I: IntoIterator<Item = usize>,
    {
        let rows = rows.into_iter();
        // One offset per row plus the leading zero.
        let mut offsets = Vec::with_capacity(rows.size_hint().0.saturating_add(1));
        offsets.push(0u32);
        let mut data = Vec::new();
        for row in rows {
            data.extend(row.into_iter().map(Self::narrow));
            offsets.push(Self::narrow(data.len()));
        }
        Self { data, offsets }
    }

    /// Row `i` as a contiguous slice, or `None` when `i` is out of range.
    /// Replaces `Vec::get(i) -> Option<&Vec<usize>>` on the hot lookup path.
    #[inline]
    pub(crate) fn get(&self, i: usize) -> Option<&[u32]> {
        // The first lookup fails for any `i >= offsets.len()`, so `i + 1`
        // below is only evaluated for small `i` and cannot overflow.
        let start = *self.offsets.get(i)? as usize;
        let end = *self.offsets.get(i + 1)? as usize;
        Some(&self.data[start..end])
    }
}

impl From<Vec<Vec<usize>>> for CsrU32 {
    /// Flatten an owned `Vec<Vec<usize>>`; see [`CsrU32::from_rows`].
    fn from(rows: Vec<Vec<usize>>) -> Self {
        Self::from_rows(rows)
    }
}

impl std::ops::Index<usize> for CsrU32 {
    type Output = [u32];

    /// Row `i` as a slice.
    ///
    /// # Panics
    ///
    /// Panics when `i` is not a valid row index.
    #[inline]
    fn index(&self, i: usize) -> &[u32] {
        let start = self.offsets[i] as usize;
        let end = self.offsets[i + 1] as usize;
        &self.data[start..end]
    }
}
196
/// How GPU backend acquisition is decided for a scanner.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuInitPolicy {
    /// Defer to the environment: honor `KEYHOG_NO_GPU` and the CI
    /// auto-disable.
    FromEnvironment,
    /// Acquire a GPU backend whenever hardware is present, regardless of
    /// `KEYHOG_NO_GPU`. Used when the operator explicitly forces GPU.
    ForceEnabled,
    /// Skip CUDA/wgpu acquisition entirely. Used when the selected CLI path
    /// cannot route to GPU, which avoids startup and RSS overhead without
    /// changing scan results.
    ForceDisabled,
}
209
210pub struct CompiledScanner {
211 pub(crate) fragment_cache: crate::fragment_cache::FragmentCache,
212 pub(crate) ac: Option<AhoCorasick>,
213 pub(crate) gpu_backend: Option<Arc<dyn vyre::VyreBackend>>,
214 // Only the `gpu` build holds a concrete wgpu handle — its sole purpose
215 // is to reach `dispatch_borrowed_batch`, which the trait object can't
216 // express. Without the feature, the CUDA / wgpu drivers aren't linked
217 // at all and `gpu_backend` is always None.
218 #[cfg(feature = "gpu")]
219 pub(crate) wgpu_backend: Option<Arc<vyre_driver_wgpu::WgpuBackend>>,
220 pub(crate) gpu_literals: Option<Arc<Vec<Vec<u8>>>>,
221 pub(crate) gpu_matcher: OnceLock<Option<vyre_libs::scan::GpuLiteralSet>>,
222 pub(crate) gpu_const_packs: OnceLock<GpuConstPacks>,
223 pub(crate) gpu_ac_const_packs: OnceLock<AcConstPacks>,
224 pub(crate) ac_gpu_program: OnceLock<Option<vyre::Program>>,
225 pub(crate) gpu_last_degrade_reason: std::sync::Mutex<Option<String>>,
226 pub(crate) rule_pipeline: OnceLock<Option<vyre_libs::scan::RulePipeline>>,
227 /// Fused AC + rule pipeline program (single GPU dispatch instead of two).
228 /// Lazily built on first access via `fused_program()`.
229 pub(crate) fused_program: OnceLock<Option<vyre::Program>>,
230 /// Fused decode→scan programs for base64/hex GPU decode.
231 /// Lazily built on first access.
232 pub(crate) fused_decode_programs: OnceLock<Option<gpu_decode_scan::FusedDecodeScanPrograms>>,
233 pub(crate) static_intern: Arc<crate::static_intern::StaticInterner>,
234 /// Per-detector interned `(id, name, service)` metadata triple, indexed by
235 /// `detector_index`. Built ONCE at scanner construction from the same
236 /// frozen `StaticInterner` the per-match path used to re-hash against.
237 /// Every emission site has the detector index in hand, so emitting metadata
238 /// is three `Arc::clone`s (atomic refcount bumps) instead of three CHD
239 /// perfect-hash lookups (2x FNV-1a + verify-hash + full string compare per
240 /// field). The strings are byte-identical to `static_intern.lookup(...)`
241 /// because they ARE its arena entries — see `perf_locality_intern.rs`.
242 pub(crate) metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)>,
243 pub(crate) ac_map: Vec<CompiledPattern>,
244 pub(crate) prefix_propagation: CsrU32,
245 pub(crate) fallback: Vec<(CompiledPattern, Vec<String>)>,
246 pub(crate) companions: Vec<Vec<CompiledCompanion>>,
247 pub(crate) detectors: Vec<DetectorSpec>,
248 pub(crate) same_prefix_patterns: CsrU32,
249 pub(crate) fallback_keyword_ac: Option<AhoCorasick>,
250 pub(crate) fallback_keyword_to_patterns: CsrU32,
251 pub(crate) fallback_always_active_indices: Vec<usize>,
252 #[cfg(feature = "simd")]
253 pub(crate) simd_prefilter: Option<crate::simd::backend::HsScanner>,
254 #[cfg(feature = "simd")]
255 pub(crate) hs_index_map: CsrU32,
256 /// Precise-regex validator per hot-pattern slot (index-parallel with
257 /// `simdsieve_prefilter::HOT_PATTERNS`). The hot fast-path runs each
258 /// literal-prefix candidate through these before emitting so it can never
259 /// surface a token the detector's own regex rejects (the length floor
260 /// alone let `ghp_…_…`/`xoxp-123-456-789-abc` through). `None` for the one
261 /// slot with no canonical detector (square).
262 #[cfg(feature = "simdsieve")]
263 pub(crate) hot_pattern_validators: Vec<Option<regex::Regex>>,
264 /// Pre-interned `(detector_id, detector_name, service)` triple per
265 /// hot-pattern slot, index-parallel with `simdsieve_prefilter::HOT_PATTERNS`
266 /// / `HOT_PATTERN_NAMES`. The simdsieve fast path emits directly and used to
267 /// re-hash the three `&'static str` metadata constants through the CHD
268 /// interner on every hot hit; this caches the resolved `Arc<str>` once so
269 /// each emission is three `Arc::clone`s (PERF-locality_intern-1). Byte-
270 /// identical to `static_intern.lookup(HOT_PATTERN_*[idx])`.
271 #[cfg(feature = "simdsieve")]
272 pub(crate) hot_metadata_by_index: Vec<(Arc<str>, Arc<str>, Arc<str>)>,
273 /// Pre-interned `(detector_id, detector_name, service)` triple for each of
274 /// the four synthetic entropy-fallback classes, indexed by
275 /// `classify_entropy_detector_index` (0 generic / 1 password / 2 token /
276 /// 3 api-key). The entropy fallback emits directly and used to re-intern
277 /// these fixed `&'static str` constants per finding; caching the four
278 /// `Arc<str>` triples once turns each emit into three `Arc::clone`s
279 /// (PERF-locality_intern-1). String values are unchanged.
280 #[cfg(feature = "entropy")]
281 pub(crate) entropy_metadata_by_index: [(Arc<str>, Arc<str>, Arc<str>); 4],
282 pub config: ScannerConfig,
283 pub alphabet_screen: Option<crate::alphabet_filter::AlphabetScreen>,
284 pub(crate) bigram_bloom: crate::bigram_bloom::BigramBloom,
285}
286
287const _: () = {
288 const fn assert_send_sync<T: Send + Sync>() {}
289 let _ = assert_send_sync::<CompiledScanner>;
290};
291
292impl CompiledScanner {
293 /// Whether a SIMD (Hyperscan/Vectorscan) prefilter is compiled in and live.
294 ///
295 /// The GPU phase-1 paths reroute a batch through the SIMD coalesced scan
296 /// when the GPU prefix output is too dense for phase 2. That reroute only
297 /// exists when the `simd` feature is on; in `--no-default-features`
298 /// (portable / macOS no-system-libs) builds the `simd_prefilter` field is
299 /// `#[cfg]`-compiled out entirely, so there is nothing to reroute into and
300 /// the answer is always `false`. This accessor keeps the reroute guards
301 /// compiling in every feature combination without scattering
302 /// `#[cfg(feature = "simd")]` across each call site.
303 #[cfg(feature = "simd")]
304 #[inline]
305 pub(crate) fn has_simd_prefilter(&self) -> bool {
306 self.simd_prefilter.is_some()
307 }
308
309 #[cfg(not(feature = "simd"))]
310 #[inline]
311 pub(crate) fn has_simd_prefilter(&self) -> bool {
312 false
313 }
314
315 /// Number of loaded detectors.
316 pub fn detector_count(&self) -> usize {
317 self.detectors.len()
318 }
319
320 /// Pre-interned `(detector_id, detector_name, service)` triple for the
321 /// detector at `detector_index`. Three `Arc::clone`s, zero hashing — the
322 /// hot-path replacement for three `ScanState::intern_metadata` calls on
323 /// frozen detector metadata (PERF-locality_intern-1). Returns byte-for-byte
324 /// the same `Arc<str>` values `static_intern.lookup(...)` would, because
325 /// they ARE the same arena entries, so emitted findings are unchanged.
326 #[inline]
327 pub(crate) fn interned_detector_metadata(
328 &self,
329 detector_index: usize,
330 ) -> (Arc<str>, Arc<str>, Arc<str>) {
331 let (id, name, service) = &self.metadata_by_index[detector_index];
332 (Arc::clone(id), Arc::clone(name), Arc::clone(service))
333 }
334
335 /// Total number of patterns (AC + fallback).
336 pub fn pattern_count(&self) -> usize {
337 self.ac_map.len() + self.fallback.len()
338 }
339
340 /// Eagerly compile every pattern's regex, in parallel, up front.
341 ///
342 /// Patterns compile lazily on first use (see [`crate::types::LazyRegex`]),
343 /// which makes a one-shot CLI scan start in milliseconds instead of
344 /// paying ~450ms-2.3s to build the whole corpus. For a LONG-lived or
345 /// LARGE scan - the daemon, `watch`, `scan-system`, or a big repo where a
346 /// detector fires across thousands of files - it's better to pay the
347 /// compile once, in parallel, before the hot loop rather than stalling
348 /// the first file that touches each detector. Callers on those paths
349 /// should `warm()` after building the scanner.
350 ///
351 /// Idempotent and cheap to repeat: an already-compiled pattern is a
352 /// `OnceLock` hit. Also the correct setup for a per-scan perf benchmark,
353 /// which means to measure match throughput, not one-time compilation.
354 pub fn warm(&self) {
355 use rayon::prelude::*;
356 // Warm the lazy regex transition caches in parallel so the first real
357 // source batch does not serialize DFA first-touch under worker load.
358 const WARM_SAMPLE: &str = concat!(
359 "int main(void){ char *buf = malloc(4096); for(size_t i=0;i<len;i++){ ",
360 "config.timeout_ms = 30000; user_id=0x1f3b9c; const KEY = \"abcDEF0123456789\"; ",
361 "https://example.org/api/v2?token=eyJhbGciOi&id=550e8400-e29b-41d4-a716; ",
362 "base64=QUtJQUlPU0ZPRE5ON0VYQU1QTEU= sha=da39a3ee5e6b4b0d3255bfef95601890; ",
363 "snake_case_name camelCaseName SCREAMING_CASE path/to/file.rs node_modules ",
364 "} /* comment */ // trailing\n\t<xml attr='v'>text</xml> {\"json\":true,\"n\":42}"
365 );
366 self.ac_map.par_iter().for_each(|p| {
367 let _ = p.regex.get().find(WARM_SAMPLE);
368 });
369 self.fallback.par_iter().for_each(|(p, _)| {
370 let _ = p.regex.get().find(WARM_SAMPLE);
371 });
372 crate::shared_regexes::warm_runtime_regexes();
373 fallback_generic::warm_generic_assignment_runtime();
374 crate::multiline::warm_runtime_regexes();
375 crate::checksum::warm_runtime_regexes();
376 }
377
378 /// Iterator over the FINAL regex source strings (post anchoring /
379 /// group extraction / normalization) the scanner uses.
380 pub fn pattern_regex_strs(&self) -> Vec<&str> {
381 let mut out = Vec::with_capacity(self.ac_map.len() + self.fallback.len());
382 out.extend(self.ac_map.iter().map(|p| p.regex.as_str()));
383 out.extend(self.fallback.iter().map(|(p, _)| p.regex.as_str()));
384 out
385 }
386
387 /// Return the preferred backend for a file of the given size.
388 #[must_use]
389 pub fn select_backend_for_file(&self, file_size: u64) -> crate::hw_probe::ScanBackend {
390 crate::hw_probe::select_backend(
391 crate::hw_probe::probe_hardware(),
392 file_size,
393 self.pattern_count(),
394 )
395 }
396
397 /// Identifier of the GPU backend acquired at compile time, or
398 /// None if scanning routes to CPU/SIMD only. Mirrors
399 /// `VyreBackend::id()` which returns "cuda", "wgpu", or the
400 /// driver-defined name. The startup banner uses this so the
401 /// operator can tell at a glance whether they got CUDA (the
402 /// headline 5-10x faster path on NVIDIA hardware) or the WGPU
403 /// fallback, rather than just "Gpu" which collapses both.
404 #[must_use]
405 pub fn gpu_backend_label(&self) -> Option<&'static str> {
406 self.gpu_backend.as_ref().map(|b| b.id())
407 }
408
409 /// Most recent concrete GPU runtime-degrade reason for this compiled
410 /// scanner, if one has occurred. Used by health probes to emit
411 /// machine-readable failure causes without scraping stderr.
412 pub fn last_gpu_degrade_reason(&self) -> Option<String> {
413 self.gpu_last_degrade_reason
414 .lock()
415 .ok()
416 .and_then(|guard| guard.clone())
417 }
418
419 /// Return the steady-state backend label used for startup reporting.
420 #[must_use]
421 pub fn preferred_backend_label(&self) -> &'static str {
422 self.select_backend_for_file(0).label()
423 }
424
425 /// Warm backend resources that are initialized lazily during scanning.
426 pub fn warm_backend(&self, backend: crate::hw_probe::ScanBackend) -> bool {
427 let ready = match backend {
428 crate::hw_probe::ScanBackend::Gpu => self.gpu_stack_usable(),
429 crate::hw_probe::ScanBackend::MegaScan => {
430 let pipeline_ready = self.rule_pipeline().is_some();
431 let stack_ready = self.gpu_stack_usable();
432 if !pipeline_ready && stack_ready {
433 gpu_forced::deny_silent_megascan_degrade(
434 "regex pipeline compile rejected the detector set",
435 );
436 }
437 pipeline_ready && stack_ready
438 }
439 crate::hw_probe::ScanBackend::SimdCpu | crate::hw_probe::ScanBackend::CpuFallback => {
440 true
441 }
442 };
443 if !ready {
444 gpu_forced::deny_silent_gpu_degrade(self, backend);
445 }
446 ready
447 }
448
449 /// Scan a chunk of text and return all raw credential matches.
450 pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
451 self.scan_with_deadline(chunk, env_per_chunk_deadline())
452 }
453
454 /// Scan a chunk using a caller-selected backend.
455 pub fn scan_with_backend(
456 &self,
457 chunk: &Chunk,
458 backend: crate::hw_probe::ScanBackend,
459 ) -> Vec<RawMatch> {
460 self.scan_with_deadline_and_backend(chunk, env_per_chunk_deadline(), Some(backend))
461 }
462
463 /// Scan multiple chunks using a caller-selected backend.
464 pub fn scan_chunks_with_backend(
465 &self,
466 chunks: &[Chunk],
467 backend: crate::hw_probe::ScanBackend,
468 ) -> Vec<Vec<RawMatch>> {
469 gpu_forced::deny_silent_gpu_degrade(self, backend);
470 self.scan_chunks_with_backend_internal(chunks, backend)
471 }
472
473 /// Reset the cross-file fragment-reassembly cache.
474 pub fn clear_fragment_cache(&self) {
475 self.fragment_cache.clear();
476 }
477
478 /// Scan a chunk of text against all compiled detectors.
479 pub fn scan_with_deadline(
480 &self,
481 chunk: &Chunk,
482 deadline: Option<std::time::Instant>,
483 ) -> Vec<RawMatch> {
484 self.scan_with_deadline_and_backend(chunk, deadline, None)
485 }
486
487 pub fn scan_with_deadline_and_backend(
488 &self,
489 chunk: &Chunk,
490 deadline: Option<std::time::Instant>,
491 backend: Option<crate::hw_probe::ScanBackend>,
492 ) -> Vec<RawMatch> {
493 if let Some(path) = chunk.metadata.path.as_deref() {
494 let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
495 if filename == ".keyhog"
496 || filename == ".keyhogignore"
497 || path.split(['/', '\\']).any(|c| c == "detectors")
498 {
499 crate::telemetry::record_file_skipped();
500 return Vec::new();
501 }
502 }
503
504 // Direct-match prefilters: skip chunks that carry none of any
505 // detector's literal bytes (`AlphabetScreen`) or bigrams (bloom). A
506 // FULLY-ENCODED secret (e.g. `data = "<base64-of-ghp_…>"`) carries none
507 // of those - its plaintext prefix only appears AFTER decoding - so the
508 // prefilters would drop it before decode-through could recover it,
509 // silently defeating the decode-through feature on the encoded-only
510 // case. When the prefilter rejects but decode is enabled AND the chunk
511 // carries a long base64/hex run, fall through to a DECODE-ONLY pass
512 // instead of skipping. Bounded: only encoded-looking rejected chunks
513 // pay the decode cost, so normal traffic keeps the fast skip.
514 let alphabet_ok = self
515 .alphabet_screen
516 .as_ref()
517 .map_or(true, |screen| screen.screen(chunk.data.as_bytes()));
518 let bigram_ok =
519 chunk.data.len() < 64 || self.bigram_bloom.maybe_overlaps(chunk.data.as_bytes());
520 if !(alphabet_ok && bigram_ok) {
521 #[cfg(feature = "decode")]
522 if self.config.max_decode_depth > 0
523 && chunk.data.len() <= self.config.max_decode_bytes
524 && crate::decode::has_decodable_payload(chunk.data.as_bytes())
525 {
526 // Direct scan is skipped (the outer bytes match nothing); only
527 // the decoded sub-chunks are scanned, inside post_process.
528 let mut matches = Vec::new();
529 self.post_process_matches(chunk, &mut matches, deadline);
530 return matches;
531 }
532 crate::telemetry::record_file_skipped();
533 return Vec::new();
534 }
535
536 let selected_backend =
537 backend.unwrap_or_else(|| self.select_backend_for_file(chunk.data.len() as u64));
538 gpu_forced::deny_silent_gpu_degrade(self, selected_backend);
539 tracing::trace!(
540 target: "keyhog::routing",
541 backend = selected_backend.label(),
542 chunk_bytes = chunk.data.len(),
543 source_type = chunk.metadata.source_type.as_str(),
544 "scan dispatch"
545 );
546 let mut matches = if chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
547 self.scan_windowed(chunk, deadline)
548 } else {
549 self.scan_inner(chunk, selected_backend, deadline)
550 };
551
552 self.post_process_matches(chunk, &mut matches, deadline);
553
554 matches
555 }
556}