Skip to main content

keyhog_scanner/engine/
mod.rs

1//! Core scanning engine implementation.
2
3mod backend;
4mod fallback;
5mod hot_patterns;
6mod scan;
7mod windowed;
8
9pub use windowed::{
10    floor_char_boundary, line_number_for_offset, next_window_offset, record_window_match,
11    window_chunk, window_end_offset,
12};
13
14use crate::compiler::*;
15use crate::context::{self, CodeContext};
16use crate::error::Result;
17use crate::pipeline::*;
18use crate::types::*;
19use crate::unicode_hardening;
20use aho_corasick::AhoCorasick;
21use keyhog_core::{Chunk, DetectorSpec, RawMatch};
22#[cfg(feature = "entropy")]
23use keyhog_core::{MatchLocation, Severity};
24#[cfg(feature = "ml")]
25use sha2::Digest;
26use std::collections::{HashMap, HashSet, VecDeque};
27use std::sync::{Arc, OnceLock};
28use warpstate::PatternSet;
29
/// Outcome of computing the confidence score for a single match.
///
/// Scoring either completes on the spot, or (with the `ml` feature) hands the
/// inputs back so a later batch call can produce the ML part of the score.
pub enum MlScoreResult {
    /// The score is already final; the match can be pushed immediately.
    Final(f64),
    /// ML scoring is deferred to a batch call at the end of the scan.
    #[cfg(feature = "ml")]
    Pending {
        /// Confidence from the non-ML heuristics.
        heuristic_conf: f64,
        /// Code context the match was found in.
        code_context: CodeContext,
        /// Credential text to be scored.
        credential: String,
        /// Context text passed to the ML scorer together with the credential.
        ml_context: String,
    },
}
43
44/// A pre-compiled set of rules for fast execution.
45pub struct CompiledScanner {
46    pub(crate) ac: Option<PatternSet>,
47    /// Complete pattern set (AC + fallback regexes) wired to the GPU matcher.
48    pub(crate) gpu_pattern_set: Option<warpstate::PatternSet>,
49    pub(crate) gpu_matcher: OnceLock<Option<warpstate::AutoMatcher>>,
50    pub(crate) ac_map: Vec<CompiledPattern>,
51    pub(crate) prefix_propagation: Vec<Vec<usize>>,
52    pub(crate) fallback: Vec<(CompiledPattern, Vec<String>)>,
53    pub(crate) companions: Vec<Vec<CompiledCompanion>>,
54    pub(crate) detectors: Vec<DetectorSpec>,
55    pub(crate) detector_to_patterns: Vec<Vec<usize>>,
56    pub(crate) same_prefix_patterns: Vec<Vec<usize>>,
57    #[allow(dead_code)]
58    pub(crate) fallback_keyword_ac: Option<AhoCorasick>,
59    #[allow(dead_code)]
60    pub(crate) fallback_keyword_to_patterns: Vec<Vec<usize>>,
61    #[cfg(feature = "simd")]
62    pub(crate) simd_prefilter: Option<crate::simd::backend::HsScanner>,
63    /// HS pattern ID → original ac_map indices.
64    #[cfg(feature = "simd")]
65    pub(crate) hs_index_map: Vec<Vec<usize>>,
66    #[cfg(feature = "simdsieve")]
67    pub(crate) simdsieve_prefilter: crate::simdsieve_prefilter::SimdPrefilter,
68    pub config: ScannerConfig,
69    pub alphabet_screen: Option<crate::alphabet_filter::AlphabetScreen>,
70}
71
/// Score `credential` in `context` with the ML scorer, memoising the result.
///
/// The cache lives in `scan_state` and is keyed by the `(credential, context)`
/// pair. It is bounded by `MAX_ML_CACHE_ENTRIES` entries and
/// `MAX_ML_CACHE_BYTES` bytes of key text; when a new entry would exceed either
/// limit the oldest entries are evicted first.
///
/// An entry whose key text alone is larger than `MAX_ML_CACHE_BYTES` is scored
/// but never cached: it could not fit even in an empty cache, and making room
/// for it would only throw away every other cached score.
///
/// Returns the score produced by `ml_scorer::score_with_config`.
#[cfg(feature = "ml")]
pub fn cached_ml_score(
    scan_state: &mut ScanState,
    credential: &str,
    context: &str,
    config: &ScannerConfig,
) -> f64 {
    let cache_key = (credential.to_string(), context.to_string());
    if let Some(&score) = scan_state.ml_score_cache.get(&cache_key) {
        return score;
    }

    let score = crate::ml_scorer::score_with_config(
        credential,
        context,
        &config.known_prefixes,
        &config.secret_keywords,
        &config.test_keywords,
        &config.placeholder_keywords,
    );

    let entry_bytes = credential.len() + context.len();
    if entry_bytes > MAX_ML_CACHE_BYTES {
        // Can never fit within the byte budget; leave the cache untouched.
        return score;
    }

    // Evict oldest-first until the new entry fits within both limits.
    while scan_state.ml_cache_bytes.saturating_add(entry_bytes) > MAX_ML_CACHE_BYTES
        || scan_state.ml_score_cache.len() >= MAX_ML_CACHE_ENTRIES
    {
        let Some(oldest) = scan_state.ml_cache_order.pop_front() else {
            break;
        };
        // Only adjust the byte count for keys that were actually still cached.
        if scan_state.ml_score_cache.remove(&oldest).is_some() {
            scan_state.ml_cache_bytes = scan_state
                .ml_cache_bytes
                .saturating_sub(oldest.0.len() + oldest.1.len());
        }
    }

    scan_state.ml_score_cache.insert(cache_key.clone(), score);
    scan_state.ml_cache_order.push_back(cache_key);
    scan_state.ml_cache_bytes = scan_state.ml_cache_bytes.saturating_add(entry_bytes);
    score
}
120
121const _: () = {
122    const fn assert_send_sync<T: Send + Sync>() {}
123    let _ = assert_send_sync::<CompiledScanner>;
124};
125
126impl CompiledScanner {
127    /// Compile all detector specs into a single scanner.
128    #[must_use = "the scanner is expensive to compile — use it for scanning"]
129    pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
130        let state = build_compile_state(&detectors)?;
131        let ac = build_ac_pattern_set(&state.ac_literals)?;
132        // Only compile GPU PatternSet if GPU hardware is actually available.
133        let gpu_pattern_set = if crate::hw_probe::probe_hardware().gpu_available {
134            build_gpu_pattern_set(&state.ac_literals)
135        } else {
136            None
137        };
138        let prefix_propagation = build_prefix_propagation(&state.ac_literals);
139        let same_prefix_patterns = build_same_prefix_patterns(&state.ac_literals);
140        let detector_to_patterns = build_detector_to_patterns(&state.ac_map, detectors.len());
141        let (fallback_keyword_ac, fallback_keyword_to_patterns) =
142            build_fallback_keyword_ac(&state.fallback);
143
144        log_quality_warnings(&state.quality_warnings);
145
146        #[cfg(feature = "simdsieve")]
147        let simdsieve_prefilter = crate::simdsieve_prefilter::SimdPrefilter::new();
148
149        #[cfg(feature = "simd")]
150        let (simd_prefilter, hs_index_map) =
151            backend::build_simd_scanner(&state.ac_map, &state.fallback)
152                .map(|(s, m)| (Some(s), m))
153                .unwrap_or((None, Vec::new()));
154
155        let mut alphabet_targets = state.ac_literals.clone();
156        for (_, keywords) in &state.fallback {
157            alphabet_targets.extend(keywords.clone());
158        }
159        let alphabet_screen = if alphabet_targets.is_empty() {
160            None
161        } else {
162            Some(crate::alphabet_filter::AlphabetScreen::new(
163                &alphabet_targets,
164            ))
165        };
166
167        Ok(Self {
168            ac,
169            gpu_pattern_set,
170            gpu_matcher: OnceLock::new(),
171            ac_map: state.ac_map,
172            prefix_propagation,
173            fallback: state.fallback,
174            companions: state.companions,
175            detectors,
176            detector_to_patterns,
177            same_prefix_patterns,
178            fallback_keyword_ac,
179            fallback_keyword_to_patterns,
180            #[cfg(feature = "simd")]
181            simd_prefilter,
182            #[cfg(feature = "simd")]
183            hs_index_map,
184            #[cfg(feature = "simdsieve")]
185            simdsieve_prefilter,
186            config: ScannerConfig::default(),
187            alphabet_screen,
188        })
189    }
190
191    /// Apply a custom configuration to the compiled scanner.
192    pub fn with_config(mut self, config: ScannerConfig) -> Self {
193        self.config = config;
194        self
195    }
196
197    /// Number of loaded detectors.
198    pub fn detector_count(&self) -> usize {
199        self.detectors.len()
200    }
201
202    /// Total number of patterns (AC + fallback).
203    pub fn pattern_count(&self) -> usize {
204        self.ac_map.len() + self.fallback.len()
205    }
206
207    /// Return the preferred backend for a file of the given size.
208    #[must_use]
209    pub fn select_backend_for_file(&self, file_size: u64) -> crate::hw_probe::ScanBackend {
210        crate::hw_probe::select_backend(
211            crate::hw_probe::probe_hardware(),
212            file_size,
213            self.pattern_count(),
214        )
215    }
216
217    /// Return the steady-state backend label used for startup reporting.
218    #[must_use]
219    pub fn preferred_backend_label(&self) -> &'static str {
220        self.select_backend_for_file(0).label()
221    }
222
223    /// Scan a chunk of text and return all raw credential matches.
224    pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
225        self.scan_with_deadline(chunk, None)
226    }
227
228    /// Scan a chunk using a caller-selected backend.
229    pub fn scan_with_backend(
230        &self,
231        chunk: &Chunk,
232        backend: crate::hw_probe::ScanBackend,
233    ) -> Vec<RawMatch> {
234        self.scan_with_deadline_and_backend(chunk, None, Some(backend))
235    }
236
237    /// Scan multiple chunks using a caller-selected backend.
238    pub fn scan_chunks_with_backend(
239        &self,
240        chunks: &[Chunk],
241        backend: crate::hw_probe::ScanBackend,
242    ) -> Vec<Vec<RawMatch>> {
243        self.scan_chunks_with_backend_internal(chunks, backend)
244    }
245
246    /// Scan a chunk of text against all compiled detectors.
247    pub fn scan_with_deadline(
248        &self,
249        chunk: &Chunk,
250        deadline: Option<std::time::Instant>,
251    ) -> Vec<RawMatch> {
252        self.scan_with_deadline_and_backend(chunk, deadline, None)
253    }
254
255    pub fn scan_with_deadline_and_backend(
256        &self,
257        chunk: &Chunk,
258        deadline: Option<std::time::Instant>,
259        backend: Option<crate::hw_probe::ScanBackend>,
260    ) -> Vec<RawMatch> {
261        if let Some(path) = chunk.metadata.path.as_deref() {
262            let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
263            if filename == ".keyhog"
264                || filename == ".keyhogignore"
265                || path.split(['/', '\\']).any(|c| c == "detectors")
266            {
267                return Vec::new();
268            }
269        }
270
271        if let Some(screen) = &self.alphabet_screen
272            && !screen.screen(chunk.data.as_bytes())
273        {
274            return Vec::new();
275        }
276
277        #[cfg(feature = "simdsieve")]
278        let _simdsieve_hint = if chunk.data.len() > 100_000 {
279            let (should_scan, _confidence) =
280                self.simdsieve_prefilter.quick_screen(chunk.data.as_bytes());
281            should_scan
282        } else {
283            true
284        };
285
286        let selected_backend =
287            backend.unwrap_or_else(|| self.select_backend_for_file(chunk.data.len() as u64));
288        let mut matches = if chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
289            self.scan_windowed(chunk, deadline)
290        } else {
291            self.scan_inner(chunk, selected_backend, deadline)
292        };
293
294        self.scan_cross_chunk_fragments(chunk, &mut matches, deadline);
295
296        #[cfg(feature = "decode")]
297        if chunk.data.len() <= self.config.max_decode_bytes {
298            let mut seen: HashSet<(String, String)> = matches
299                .iter()
300                .map(|m| (m.detector_id.to_string(), m.credential.to_string()))
301                .collect();
302            for decoded_chunk in crate::decode::decode_chunk(
303                chunk,
304                self.config.max_decode_depth,
305                self.config.validate_decode,
306                deadline,
307                self.alphabet_screen.as_ref(),
308            ) {
309                let decoded_matches = if decoded_chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
310                    self.scan_windowed(&decoded_chunk, deadline)
311                } else {
312                    let decoded_backend =
313                        self.select_backend_for_file(decoded_chunk.data.len() as u64);
314                    self.scan_inner(&decoded_chunk, decoded_backend, deadline)
315                };
316                for m in decoded_matches {
317                    if seen.insert((m.detector_id.to_string(), m.credential.to_string())) {
318                        matches.push(m);
319                    }
320                }
321            }
322        }
323
324        matches
325    }
326
327    fn scan_cross_chunk_fragments(
328        &self,
329        chunk: &Chunk,
330        matches: &mut Vec<RawMatch>,
331        deadline: Option<std::time::Instant>,
332    ) {
333        static ASSIGN_RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
334            regex::Regex::new(
335                r#"(?i)([a-z0-9_-]{2,32})\s*[:=]\s*["'`]([a-zA-Z0-9/+=_-]{4,})["'`](?:;|,)?$"#,
336            )
337            .expect("hardcoded regex must compile")
338        });
339        let assign_re = &*ASSIGN_RE;
340
341        for (line_idx, line) in chunk.data.lines().enumerate() {
342            if let Some(caps) = assign_re.captures(line) {
343                let Some(var_name_match) = caps.get(1) else {
344                    continue;
345                };
346                let Some(value_match) = caps.get(2) else {
347                    continue;
348                };
349
350                let fragment = crate::fragment_cache::SecretFragment {
351                    prefix: crate::multiline::extract_prefix(var_name_match.as_str()),
352                    var_name: var_name_match.as_str().to_string(),
353                    value: value_match.as_str().to_string(),
354                    line: line_idx + 1,
355                    path: chunk.metadata.path.clone(),
356                };
357
358                let candidates =
359                    crate::fragment_cache::get_fragment_cache().record_and_reassemble(fragment);
360                for candidate in candidates {
361                    // Only reassemble candidates with enough entropy to be plausible secrets.
362                    // Low-entropy reassemblies (concatenated variable names, prose) are noise.
363                    let entropy = crate::pipeline::match_entropy(candidate.as_bytes());
364                    if entropy < 3.0 || candidate.len() < 16 {
365                        continue;
366                    }
367
368                    let dummy_chunk = Chunk {
369                        data: format!("reassembled_key = \"{}\"", candidate),
370                        metadata: chunk.metadata.clone(),
371                    };
372
373                    let backend = self.select_backend_for_file(dummy_chunk.data.len() as u64);
374                    for mut reassembled_match in self.scan_inner(&dummy_chunk, backend, deadline) {
375                        reassembled_match.detector_id =
376                            format!("{}:reassembled", reassembled_match.detector_id).into();
377                        matches.push(reassembled_match);
378                    }
379                }
380            }
381        }
382    }
383
384    fn expand_triggered_patterns(&self, triggered_patterns: &[u64]) -> Vec<u64> {
385        let mut expanded = triggered_patterns.to_vec();
386        for (word_idx, &word) in triggered_patterns.iter().enumerate() {
387            if word == 0 {
388                continue;
389            }
390            let mut bits = word;
391            while bits != 0 {
392                let bit = bits.trailing_zeros() as usize;
393                let pat_idx = word_idx * 64 + bit;
394                if pat_idx >= self.ac_map.len() {
395                    break;
396                }
397                for &other_idx in &self.same_prefix_patterns[pat_idx] {
398                    expanded[other_idx / 64] |= 1 << (other_idx % 64);
399                }
400                let det_idx = self.ac_map[pat_idx].detector_index;
401                for &other_idx in &self.detector_to_patterns[det_idx] {
402                    expanded[other_idx / 64] |= 1 << (other_idx % 64);
403                }
404                bits &= bits - 1; // clear lowest set bit
405            }
406        }
407        expanded
408    }
409
410    #[allow(clippy::too_many_arguments)]
411    fn extract_confirmed_patterns(
412        &self,
413        confirmed_patterns: &[usize],
414        preprocessed: &ScannerPreprocessedText,
415        line_offsets: &[usize],
416        code_lines: &[&str],
417        documentation_lines: &[bool],
418        chunk: &Chunk,
419        scan_state: &mut ScanState,
420        deadline: Option<std::time::Instant>,
421    ) {
422        for &pat_idx in confirmed_patterns {
423            if let Some(deadline) = deadline
424                && std::time::Instant::now() > deadline
425            {
426                break;
427            }
428            let entry = if pat_idx < self.ac_map.len() {
429                &self.ac_map[pat_idx]
430            } else {
431                let fallback_idx = pat_idx - self.ac_map.len();
432                if fallback_idx >= self.fallback.len() {
433                    continue;
434                }
435                &self.fallback[fallback_idx].0
436            };
437            self.extract_matches(
438                entry,
439                preprocessed,
440                line_offsets,
441                code_lines,
442                documentation_lines,
443                chunk,
444                scan_state,
445                0,
446                0,
447            );
448        }
449    }
450
451    #[cfg(feature = "ml")]
452    fn apply_ml_batch_scores(&self, scan_state: &mut ScanState) {
453        if scan_state.ml_pending.is_empty() {
454            return;
455        }
456
457        let candidates: Vec<(String, String)> = scan_state
458            .ml_pending
459            .iter()
460            .map(|pending| (pending.credential.clone(), pending.ml_context.clone()))
461            .collect();
462
463        let scores = crate::gpu::batch_ml_inference(&candidates, &self.config);
464        let pending_matches: Vec<_> = scan_state.ml_pending.drain(..).collect();
465        for (pending, ml_conf) in pending_matches.into_iter().zip(scores.into_iter()) {
466            let mut final_score = (crate::types::ML_WEIGHT * ml_conf)
467                + (crate::types::HEURISTIC_WEIGHT * pending.heuristic_conf);
468            final_score = final_score.max(pending.heuristic_conf).max(ml_conf);
469
470            if matches!(
471                pending.code_context,
472                crate::context::CodeContext::TestCode
473                    | crate::context::CodeContext::Documentation
474                    | crate::context::CodeContext::Comment
475            ) && final_score < 0.95
476            {
477                final_score *= pending.code_context.confidence_multiplier();
478            }
479
480            let final_score =
481                crate::confidence::apply_post_ml_penalties(final_score, &pending.credential);
482            let final_score = crate::confidence::apply_path_confidence_penalties(
483                final_score,
484                pending.raw_match.location.file_path.as_deref(),
485            );
486            let final_score = if let Some(floor) =
487                crate::confidence::known_prefix_confidence_floor(&pending.credential)
488            {
489                final_score.max(floor)
490            } else {
491                final_score
492            };
493
494            if !pending.code_context.should_hard_suppress(final_score) {
495                let mut raw_match = pending.raw_match;
496                raw_match.confidence = Some(final_score);
497                scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
498            }
499        }
500    }
501}