Skip to main content

keyhog_scanner/engine/
mod.rs

1//! Core scanning engine implementation.
2
3mod backend;
4mod backend_dispatch;
5mod backend_pattern_hits;
6mod backend_prepared;
7mod backend_triggered;
8pub mod boundary;
9mod compile;
10mod extract;
11mod fallback;
12mod fallback_entropy;
13mod fallback_entropy_helpers;
14mod fallback_generic;
15mod gpu_ac_phase1;
16mod gpu_cache;
17mod gpu_coalesce;
18#[allow(dead_code)]
19pub mod gpu_decode_scan;
20mod gpu_dispatch;
21mod gpu_forced;
22mod gpu_lazy;
23mod gpu_literal_phase1;
24mod gpu_megascan;
25mod gpu_phase2;
26pub(crate) mod gpu_postprocess;
27#[allow(dead_code)]
28pub mod gpu_program_fusion;
29#[allow(dead_code)]
30pub mod gpu_regex_dfa;
31mod gpu_scan_wrappers;
32mod hot_patterns;
33mod process;
34mod rule_pipeline;
35mod scan;
36mod scan_filters;
37mod scan_postprocess;
38pub mod segment_attribution;
39mod windowed;
40
41// `build_simd_scanner` only exists under the `simd` (Hyperscan) feature; its
42// sole call site in compile.rs is `#[cfg(feature = "simd")]` too. Gate the
43// import to match, or non-simd builds (the `portable` feature used for the
44// macOS/Windows/musl release assets) fail with E0432.
45#[cfg(feature = "simd")]
46pub(crate) use backend_prepared::build_simd_scanner;
47pub(crate) use backend_prepared::PreparedChunk;
48pub use gpu_cache::{AcConstPacks, GpuConstPacks};
49pub use gpu_coalesce::coalesce_chunks;
50pub use gpu_regex_dfa::{build_regex_dfa, RegexDfaError};
51pub use gpu_scan_wrappers::GpuPhase1Output;
52pub use rule_pipeline::{
53    build_rule_pipeline, megascan_input_len, rule_pipeline_cached, AC_GPU_MAX_MATCHES_PER_DISPATCH,
54    MEGASCAN_INPUT_LEN, MEGASCAN_INPUT_LEN_DEFAULT,
55};
56pub use windowed::{
57    floor_char_boundary, line_number_for_offset, next_window_offset, record_window_match,
58    window_chunk, window_end_offset,
59};
60
61use crate::compiler::*;
62use crate::error::Result;
63use crate::pipeline::*;
64use crate::types::*;
65use aho_corasick::AhoCorasick;
66use keyhog_core::{Chunk, DetectorSpec, RawMatch};
67use std::sync::Arc;
68use std::sync::OnceLock;
69
70pub use vyre_libs::scan::LiteralMatch;
71
/// Turn the `KEYHOG_PER_CHUNK_TIMEOUT_MS` environment variable into a
/// per-chunk deadline `Instant`.
///
/// Returns `None` (no deadline, the historical "scan until done" behavior)
/// when the variable is unset, is not a valid `u64`, is `0`, or is so large
/// that `now + timeout` cannot be represented as an `Instant`.
///
/// The variable is read and parsed once per process; later changes to the
/// environment are not observed. The deadline itself is recomputed from
/// "now" on every call, so each chunk gets a fresh budget.
///
/// Wired into the public `scan` / `scan_with_backend` entry points
/// so a hostile or pathological input (e.g. the Apple Silicon
/// regex-DFA construction stall surfaced during cross-platform
/// dogfood - a single 171-byte line with `var token = identifier.Flag(...)`
/// shape spends minutes inside the multiline preprocessor) bails
/// after the configured budget instead of hanging the entire
/// `keyhog scan <repo>` run. The CLI orchestrator path runs scans
/// in parallel via rayon; a stuck worker would otherwise keep one
/// core pinned at 100% indefinitely.
///
/// Recommended: `export KEYHOG_PER_CHUNK_TIMEOUT_MS=30000` (30 s) for
/// production scans where bounded latency matters more than scan
/// completeness.
fn env_per_chunk_deadline() -> Option<std::time::Instant> {
    use std::time::{Duration, Instant};

    // Parsed once; `None` covers unset, malformed, and zero.
    static TIMEOUT_MS: std::sync::OnceLock<Option<u64>> = std::sync::OnceLock::new();
    let timeout_ms = *TIMEOUT_MS.get_or_init(|| {
        std::env::var("KEYHOG_PER_CHUNK_TIMEOUT_MS")
            .ok()
            .and_then(|raw| raw.parse::<u64>().ok())
            .filter(|&ms| ms > 0)
    });

    // `Instant + Duration` panics when the sum is unrepresentable. A timeout
    // that large is indistinguishable from "no timeout", so degrade to `None`
    // instead of crashing the scan on a misconfigured environment.
    timeout_ms.and_then(|ms| Instant::now().checked_add(Duration::from_millis(ms)))
}
99
/// Result of scoring a candidate match: either a score that is already
/// final, or (only with the `ml` feature) the inputs for a deferred batch
/// ML scoring pass.
///
/// The `'a` lifetime is the borrow carried by the `Cow` fields of `Pending`.
pub enum MlScoreResult<'a> {
    /// The score is settled; the match can be pushed right away.
    Final(f64),
    /// Scoring is postponed to a batch ML call at the end of the scan.
    #[cfg(feature = "ml")]
    Pending {
        heuristic_conf: f64,
        code_context: crate::context::CodeContext,
        credential: std::borrow::Cow<'a, str>,
        ml_context: std::borrow::Cow<'a, str>,
    },
    /// Zero-sized stand-in that keeps `'a` in use when the `ml` feature is
    /// compiled out (lean / `--no-default-features` build), where the only
    /// variant that borrows with `'a` - `Pending` - does not exist. It is
    /// never constructed.
    #[cfg(not(feature = "ml"))]
    #[doc(hidden)]
    _Lifetime(std::marker::PhantomData<&'a ()>),
}
119
/// A compiled detector set together with the matcher state used to scan
/// chunks against it.
///
/// Several GPU-related members are wrapped in `OnceLock`, i.e. they are
/// filled in at most once after construction instead of up front. The type
/// is required to be `Send + Sync` (checked at compile time in this module).
pub struct CompiledScanner {
    /// Fragment cache; emptied by `clear_fragment_cache`.
    pub(crate) fragment_cache: crate::fragment_cache::FragmentCache,
    pub(crate) ac: Option<AhoCorasick>,
    /// GPU backend handle; `gpu_backend_label` reports its `id()`.
    pub(crate) gpu_backend: Option<Arc<dyn vyre::VyreBackend>>,
    pub(crate) wgpu_backend: Option<Arc<vyre_driver_wgpu::WgpuBackend>>,
    pub(crate) gpu_literals: Option<Arc<Vec<Vec<u8>>>>,
    pub(crate) gpu_matcher: OnceLock<Option<vyre_libs::scan::GpuLiteralSet>>,
    pub(crate) gpu_const_packs: OnceLock<GpuConstPacks>,
    pub(crate) gpu_ac_const_packs: OnceLock<AcConstPacks>,
    pub(crate) ac_gpu_program: OnceLock<Option<vyre::Program>>,
    pub(crate) rule_pipeline: OnceLock<Option<vyre_libs::scan::RulePipeline>>,
    /// Fused AC + rule pipeline program (single GPU dispatch instead of two).
    /// Lazily built on first access via `fused_program()`.
    pub(crate) fused_program: OnceLock<Option<vyre::Program>>,
    /// Fused decode→scan programs for base64/hex GPU decode.
    /// Lazily built on first access.
    pub(crate) fused_decode_programs: OnceLock<Option<gpu_decode_scan::FusedDecodeScanPrograms>>,
    pub(crate) static_intern: Arc<crate::static_intern::StaticInterner>,
    /// AC-side patterns; counted by `pattern_count` and compiled by `warm`.
    pub(crate) ac_map: Vec<CompiledPattern>,
    pub(crate) prefix_propagation: Vec<Vec<usize>>,
    /// Fallback patterns; counted by `pattern_count` and compiled by `warm`.
    // NOTE(review): the paired `Vec<String>` looks like per-pattern keywords - confirm.
    pub(crate) fallback: Vec<(CompiledPattern, Vec<String>)>,
    pub(crate) companions: Vec<Vec<CompiledCompanion>>,
    /// Loaded detector specs; `detector_count` reports the length.
    pub(crate) detectors: Vec<DetectorSpec>,
    pub(crate) same_prefix_patterns: Vec<Vec<usize>>,
    pub(crate) fallback_keyword_ac: Option<AhoCorasick>,
    pub(crate) fallback_keyword_to_patterns: Vec<Vec<usize>>,
    pub(crate) fallback_always_active: Vec<bool>,
    #[cfg(feature = "simd")]
    pub(crate) simd_prefilter: Option<crate::simd::backend::HsScanner>,
    #[cfg(feature = "simd")]
    pub(crate) hs_index_map: Vec<Vec<usize>>,
    /// Precise-regex validator per hot-pattern slot (index-parallel with
    /// `simdsieve_prefilter::HOT_PATTERNS`). The hot fast-path runs each
    /// literal-prefix candidate through these before emitting so it can never
    /// surface a token the detector's own regex rejects (the length floor
    /// alone let `ghp_…_…`/`xoxp-123-456-789-abc` through). `None` for the one
    /// slot with no canonical detector (square).
    #[cfg(feature = "simdsieve")]
    pub(crate) hot_pattern_validators: Vec<Option<regex::Regex>>,
    pub config: ScannerConfig,
    /// Optional whole-chunk screen. When present and it rejects a chunk's
    /// bytes, the scan entry point returns no matches for that chunk.
    pub alphabet_screen: Option<crate::alphabet_filter::AlphabetScreen>,
    /// Consulted for chunks of at least 64 bytes; a chunk it reports as not
    /// overlapping is skipped without scanning.
    pub(crate) bigram_bloom: crate::bigram_bloom::BigramBloom,
}
163
164const _: () = {
165    const fn assert_send_sync<T: Send + Sync>() {}
166    let _ = assert_send_sync::<CompiledScanner>;
167};
168
169impl CompiledScanner {
170    /// Number of loaded detectors.
171    pub fn detector_count(&self) -> usize {
172        self.detectors.len()
173    }
174
175    /// Total number of patterns (AC + fallback).
176    pub fn pattern_count(&self) -> usize {
177        self.ac_map.len() + self.fallback.len()
178    }
179
180    /// Eagerly compile every pattern's regex, in parallel, up front.
181    ///
182    /// Patterns compile lazily on first use (see [`crate::types::LazyRegex`]),
183    /// which makes a one-shot CLI scan start in milliseconds instead of
184    /// paying ~450ms-2.3s to build the whole corpus. For a LONG-lived or
185    /// LARGE scan - the daemon, `watch`, `scan-system`, or a big repo where a
186    /// detector fires across thousands of files - it's better to pay the
187    /// compile once, in parallel, before the hot loop rather than stalling
188    /// the first file that touches each detector. Callers on those paths
189    /// should `warm()` after building the scanner.
190    ///
191    /// Idempotent and cheap to repeat: an already-compiled pattern is a
192    /// `OnceLock` hit. Also the correct setup for a per-scan perf benchmark,
193    /// which means to measure match throughput, not one-time compilation.
194    pub fn warm(&self) {
195        use rayon::prelude::*;
196        self.ac_map.par_iter().for_each(|p| {
197            let _ = p.regex.get();
198        });
199        self.fallback.par_iter().for_each(|(p, _)| {
200            let _ = p.regex.get();
201        });
202    }
203
204    /// Iterator over the FINAL regex source strings (post anchoring /
205    /// group extraction / normalization) the scanner uses.
206    pub fn pattern_regex_strs(&self) -> Vec<&str> {
207        let mut out = Vec::with_capacity(self.ac_map.len() + self.fallback.len());
208        out.extend(self.ac_map.iter().map(|p| p.regex.as_str()));
209        out.extend(self.fallback.iter().map(|(p, _)| p.regex.as_str()));
210        out
211    }
212
213    /// Return the preferred backend for a file of the given size.
214    #[must_use]
215    pub fn select_backend_for_file(&self, file_size: u64) -> crate::hw_probe::ScanBackend {
216        crate::hw_probe::select_backend(
217            crate::hw_probe::probe_hardware(),
218            file_size,
219            self.pattern_count(),
220        )
221    }
222
223    /// Identifier of the GPU backend acquired at compile time, or
224    /// None if scanning routes to CPU/SIMD only. Mirrors
225    /// `VyreBackend::id()` which returns "cuda", "wgpu", or the
226    /// driver-defined name. The startup banner uses this so the
227    /// operator can tell at a glance whether they got CUDA (the
228    /// headline 5-10x faster path on NVIDIA hardware) or the WGPU
229    /// fallback, rather than just "Gpu" which collapses both.
230    #[must_use]
231    pub fn gpu_backend_label(&self) -> Option<&'static str> {
232        self.gpu_backend.as_ref().map(|b| b.id())
233    }
234
235    /// Return the steady-state backend label used for startup reporting.
236    #[must_use]
237    pub fn preferred_backend_label(&self) -> &'static str {
238        self.select_backend_for_file(0).label()
239    }
240
241    /// Warm backend resources that are initialized lazily during scanning.
242    pub fn warm_backend(&self, backend: crate::hw_probe::ScanBackend) -> bool {
243        let ready = match backend {
244            crate::hw_probe::ScanBackend::Gpu => self.gpu_stack_usable(),
245            crate::hw_probe::ScanBackend::MegaScan => {
246                let pipeline_ready = self.rule_pipeline().is_some();
247                let stack_ready = self.gpu_stack_usable();
248                if !pipeline_ready && stack_ready {
249                    gpu_forced::deny_silent_megascan_degrade(
250                        "regex pipeline compile rejected the detector set",
251                    );
252                }
253                pipeline_ready && stack_ready
254            }
255            crate::hw_probe::ScanBackend::SimdCpu | crate::hw_probe::ScanBackend::CpuFallback => {
256                true
257            }
258        };
259        if !ready {
260            gpu_forced::deny_silent_gpu_degrade(self, backend);
261        }
262        ready
263    }
264
265    /// Scan a chunk of text and return all raw credential matches.
266    pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
267        self.scan_with_deadline(chunk, env_per_chunk_deadline())
268    }
269
270    /// Scan a chunk using a caller-selected backend.
271    pub fn scan_with_backend(
272        &self,
273        chunk: &Chunk,
274        backend: crate::hw_probe::ScanBackend,
275    ) -> Vec<RawMatch> {
276        self.scan_with_deadline_and_backend(chunk, env_per_chunk_deadline(), Some(backend))
277    }
278
279    /// Scan multiple chunks using a caller-selected backend.
280    pub fn scan_chunks_with_backend(
281        &self,
282        chunks: &[Chunk],
283        backend: crate::hw_probe::ScanBackend,
284    ) -> Vec<Vec<RawMatch>> {
285        gpu_forced::deny_silent_gpu_degrade(self, backend);
286        self.scan_chunks_with_backend_internal(chunks, backend)
287    }
288
289    /// Reset the cross-file fragment-reassembly cache.
290    pub fn clear_fragment_cache(&self) {
291        self.fragment_cache.clear();
292    }
293
294    /// Scan a chunk of text against all compiled detectors.
295    pub fn scan_with_deadline(
296        &self,
297        chunk: &Chunk,
298        deadline: Option<std::time::Instant>,
299    ) -> Vec<RawMatch> {
300        self.scan_with_deadline_and_backend(chunk, deadline, None)
301    }
302
303    pub fn scan_with_deadline_and_backend(
304        &self,
305        chunk: &Chunk,
306        deadline: Option<std::time::Instant>,
307        backend: Option<crate::hw_probe::ScanBackend>,
308    ) -> Vec<RawMatch> {
309        if let Some(path) = chunk.metadata.path.as_deref() {
310            let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
311            if filename == ".keyhog"
312                || filename == ".keyhogignore"
313                || path.split(['/', '\\']).any(|c| c == "detectors")
314            {
315                crate::telemetry::record_file_skipped();
316                return Vec::new();
317            }
318        }
319
320        if let Some(screen) = &self.alphabet_screen {
321            if !screen.screen(chunk.data.as_bytes()) {
322                crate::telemetry::record_file_skipped();
323                return Vec::new();
324            }
325        }
326
327        if chunk.data.len() >= 64 && !self.bigram_bloom.maybe_overlaps(chunk.data.as_bytes()) {
328            crate::telemetry::record_file_skipped();
329            return Vec::new();
330        }
331
332        let selected_backend =
333            backend.unwrap_or_else(|| self.select_backend_for_file(chunk.data.len() as u64));
334        gpu_forced::deny_silent_gpu_degrade(self, selected_backend);
335        tracing::trace!(
336            target: "keyhog::routing",
337            backend = selected_backend.label(),
338            chunk_bytes = chunk.data.len(),
339            source_type = chunk.metadata.source_type.as_str(),
340            "scan dispatch"
341        );
342        let mut matches = if chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
343            self.scan_windowed(chunk, deadline)
344        } else {
345            self.scan_inner(chunk, selected_backend, deadline)
346        };
347
348        self.post_process_matches(chunk, &mut matches, deadline);
349
350        matches
351    }
352}