keyhog_scanner/
lib.rs

1//! Two-phase secret scanning engine.
2//!
3//! Phase 1 builds an Aho-Corasick automaton from literal prefixes extracted from
4//! detector regex patterns and runs a single O(n) pass over the input. Phase 2
5//! confirms candidate regions with the full regex. Patterns without extractable
6//! prefixes fall back to sequential regex scanning.
7//!
8//! # Feature flags
9//!
10//! - `ml` — MoE ML classifier for confidence scoring (default: on)
11//! - `entropy` — Shannon entropy-based detection (default: on)
12//! - `decode` — Decode-through scanning: base64, hex, URL, HTML, MIME (default: on)
13//! - `multiline` — Multi-line concatenation joining (default: on)
14//! - `gpu` — GPU-accelerated batch ML inference (optional)
15//!
16//! Additional layers: base64/hex decode-through, ML confidence scoring,
17//! structural context analysis, and multi-match resolution.
18
/// Confidence scoring helpers for combining heuristic signals.
pub mod confidence;
/// Structural code-context inference used to adjust confidence.
pub mod context;
/// Decode-through scanning helpers for layered encodings.
#[cfg(feature = "decode")]
pub mod decode;
/// Entropy-based fallback detection for unknown secret formats.
#[cfg(feature = "entropy")]
pub mod entropy;
/// GPU-accelerated batch ML inference (optional).
#[cfg(feature = "gpu")]
pub mod gpu;
/// Embedded ML scorer used to downrank likely placeholders and noise.
// NOTE(review): the lint is presumably silenced because embedded model
// constants carry more digits than the float type can hold — confirm.
#[allow(clippy::excessive_precision)]
#[cfg(feature = "ml")]
pub mod ml_scorer;
/// Multi-line preprocessing for string concatenation and line continuations.
#[cfg(feature = "multiline")]
pub mod multiline;
/// Prefix propagation tables for literal-prefix matching.
pub mod prefix_trie;
/// Match-resolution helpers for suppressing lower-quality overlaps.
pub mod resolution;
/// Vectorscan/Hyperscan SIMD regex backend (optional, feature-gated).
// NOTE(review): this declaration itself is unconditional; the `simd` feature
// gate is presumably applied inside the module — confirm.
pub mod simd;
44
45#[cfg(test)]
46#[allow(clippy::manual_range_contains, clippy::useless_format)]
47mod adversarial_tests;
48
49use aho_corasick::AhoCorasick;
50use keyhog_core::{Chunk, CompanionSpec, DetectorSpec, MatchLocation, PatternSpec, RawMatch};
51use multimatch::{MatchError, PatternSet, PatternSetBuilder};
52use regex::Regex;
53use std::borrow::Cow;
54use std::collections::{HashMap, VecDeque};
55use thiserror::Error;
56use unicode_normalization::UnicodeNormalization;
57
/// Fallback regex-only scanning switches to per-line mode once a chunk grows
/// beyond 10 KB. Prefixless regexes over larger blobs are expensive and secrets
/// are short enough that line-local scanning preserves recall.
const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Hard cap on the dedup set to prevent unbounded memory growth when scanning
/// repositories with millions of duplicate credential-like strings.
const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Maximum bytes scanned in a single chunk. Files larger than this are split
/// into overlapping windows. 1 MiB keeps peak RSS predictable under parallel
/// scanning with `rayon` (N threads × 1 MiB per chunk = bounded memory).
const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;

/// Overlap between adjacent scan windows when a file exceeds
/// `MAX_SCAN_CHUNK_BYTES`. Must be larger than the longest secret the scanner
/// can detect to avoid missing secrets that straddle a chunk boundary. 4 KiB
/// covers PEM-encoded RSA-4096 keys (~3,200 chars base64) with margin.
const WINDOW_OVERLAP_BYTES: usize = 4096;

/// Minimum line length considered for fallback pattern scanning. Lines shorter
/// than 8 bytes cannot contain a credential prefix plus a meaningful secret.
const MIN_FALLBACK_LINE_LENGTH: usize = 8;
81
/// Index of the whole-match group.
/// NOTE(review): presumably handed to the regex capture API, where group 0 is
/// the full match — the use site is outside this view; confirm.
const FULL_MATCH_INDEX: usize = 0;
/// Index of the first explicit capture group.
/// NOTE(review): presumably the default group for grouped extraction — confirm.
const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Line numbers reported by the scanner start at 1.
/// NOTE(review): inferred from the name and value; confirm at the use site.
const FIRST_LINE_NUMBER: usize = 1;
/// Subtracted (saturating) from a match's line number before the line is
/// handed to the `context` helpers.
/// NOTE(review): looks like a 1-based → 0-based index conversion — confirm.
const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
const MIN_LITERAL_PREFIX_CHARS: usize = 3;
89
/// Compiled regex AST size limit. 10 MiB is large enough for complex detectors
/// while preventing pathological patterns from consuming unbounded memory
/// during regex compilation.
const REGEX_SIZE_LIMIT_BYTES: usize = 10 << 20;

/// How many characters around a hex match to inspect for structural context
/// (assignment operators, quotes, keywords).
const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Minimum length for a standalone hex string to qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum number of hex digits a match must contain.
/// NOTE(review): inferred from the name; it currently equals
/// `MIN_HEX_MATCH_LEN`, and the use site is outside this view — confirm how the
/// two thresholds differ.
const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Minimum hex digits required in the context window around a match to trigger
/// hex-aware false-positive suppression.
const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Maximum non-hex separators (colons, dashes) tolerated within a hex context
/// window before the match is treated as a non-hex string.
const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
111
// Tuning knobs for ML-assisted confidence scoring; compiled only with the
// `ml` feature. None of their use sites are visible from this part of the
// file, so the notes below are assumptions to be confirmed.

/// NOTE(review): presumably the maximum number of entries kept in the per-scan
/// ML score cache — confirm.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// NOTE(review): presumably the byte budget (256 KiB) for keys held in the ML
/// score cache — confirm.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// NOTE(review): presumably the number of lines on each side of a match that
/// are given to the ML scorer as context — confirm.
#[cfg(feature = "ml")]
const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// NOTE(review): presumably the share of the final confidence taken from the
/// ML score; together with `HEURISTIC_WEIGHT` it sums to 1.0 — confirm.
#[cfg(feature = "ml")]
const ML_WEIGHT: f64 = 0.6;
/// NOTE(review): presumably the share of the final confidence taken from the
/// heuristic score; together with `ML_WEIGHT` it sums to 1.0 — confirm.
#[cfg(feature = "ml")]
const HEURISTIC_WEIGHT: f64 = 0.4;
122
/// Half-open byte range `[start_offset, end_offset)` of the preprocessed text,
/// paired with the line number reported for offsets inside that range.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct LineMapping {
    start_offset: usize,
    end_offset: usize,
    line_number: usize,
}

/// Text plus offset → line mappings. This local definition is compiled only
/// when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct PreprocessedText {
    text: String,
    mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the line number of the first mapping whose half-open range
    /// contains `offset`, or `None` when no mapping covers it.
    fn line_for_offset(&self, offset: usize) -> Option<usize> {
        self.mappings.iter().find_map(|span| {
            (span.start_offset..span.end_offset)
                .contains(&offset)
                .then_some(span.line_number)
        })
    }

    /// Wraps `line` unchanged, with a single mapping that spans the whole
    /// text and reports it as line 1.
    fn passthrough(line: &str) -> Self {
        let whole_text = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_text],
        }
    }
}
158
/// Preprocessed-text type used throughout the scanner: the `multiline`
/// module's implementation when the `multiline` feature is enabled.
#[cfg(feature = "multiline")]
type ScannerPreprocessedText = multiline::PreprocessedText;

/// Preprocessed-text type used throughout the scanner: the local
/// `PreprocessedText` defined in this file when the `multiline` feature is
/// disabled.
#[cfg(not(feature = "multiline"))]
type ScannerPreprocessedText = PreprocessedText;
164
/// Errors returned while compiling detector patterns into a scanner.
///
/// Every message ends with a `Fix:` hint describing how to resolve the
/// problem.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::ScanError;
///
/// let error = ScanError::RegexSetCompile(regex::Error::Syntax("bad regex".into()));
/// assert!(error.to_string().contains("Fix"));
/// ```
#[derive(Debug, Error)]
pub enum ScanError {
    /// A single detector pattern failed to compile. Carries the detector id
    /// and the pattern index that appear in the message, plus the underlying
    /// regex error.
    #[error(
        "failed to compile regex for detector {detector_id} pattern {index}: {source}. Fix: correct the detector regex or capture group configuration"
    )]
    RegexCompile {
        detector_id: String,
        index: usize,
        source: regex::Error,
    },
    /// A `regex::Error` with no detector context attached; `?` converts into
    /// this variant automatically through `#[from]`.
    #[error(
        "failed to compile scanner regex set: {0}. Fix: simplify the detector regex set or remove the invalid pattern"
    )]
    RegexSetCompile(#[from] regex::Error),
    /// The `multimatch` pattern set could not be built.
    #[error(
        "failed to build multimatch automaton: {0}. Fix: reduce detector complexity or remove unsupported regex constructs"
    )]
    Multimatch(#[from] MatchError),
    /// The Aho-Corasick automaton could not be built.
    #[error(
        "failed to build Aho-Corasick automaton: {0}. Fix: shorten overly broad prefixes or reduce detector count"
    )]
    AhoCorasick(#[from] aho_corasick::BuildError),
}
198
/// A compiled entry: one pattern from one detector.
struct CompiledPattern {
    /// Index of the owning detector in `CompiledScanner::detectors`.
    detector_index: usize,
    /// Compiled regex that confirms a match and extracts the credential.
    regex: Regex,
    /// Capture group holding the credential value. `Some` routes the pattern
    /// to grouped extraction, `None` to plain (full-match) extraction.
    group: Option<usize>,
}
205
/// An optional compiled companion pattern for a detector.
///
/// NOTE(review): none of these fields is read in the part of the file
/// reviewed here; the field notes below are inferred from names and types and
/// should be confirmed against `match_companion`.
struct CompiledCompanion {
    /// Compiled companion regex.
    regex: Regex,
    /// Presumably the capture group that holds the companion value, if any.
    capture_group: Option<usize>,
    /// Presumably the maximum line distance from the primary match within
    /// which the companion is searched.
    within_lines: usize,
}
212
/// The compiled scanner: all detector patterns fused into a single
/// Aho-Corasick automaton for prefiltering, backed by individual
/// regexes for extraction.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::{Chunk, ChunkMetadata, DetectorSpec, PatternSpec, Severity};
/// use keyhog_scanner::CompiledScanner;
///
/// let scanner = CompiledScanner::compile(vec![DetectorSpec {
///     id: "demo-token".into(),
///     name: "Demo Token".into(),
///     service: "demo".into(),
///     severity: Severity::High,
///     patterns: vec![PatternSpec {
///         regex: "demo_[A-Z0-9]{8}".into(),
///         description: None,
///         group: None,
///     }],
///     companion: None,
///     verify: None,
///     keywords: vec!["demo_".into()],
/// }])
/// .unwrap();
///
/// let chunk = Chunk {
///     data: "TOKEN=demo_ABC12345".into(),
///     metadata: ChunkMetadata {
///         source_type: "filesystem".into(),
///         path: Some(".env".into()),
///         commit: None,
///         author: None,
///         date: None,
///     },
/// };
///
/// assert_eq!(scanner.scan(&chunk).len(), 1);
/// ```
pub struct CompiledScanner {
    /// Pattern matcher built from literal prefixes of patterns.
    ac: Option<PatternSet>,
    /// Maps AC pattern index → compiled pattern entry.
    ac_map: Vec<CompiledPattern>,
    /// Prefix propagation: for each AC pattern, list of OTHER ac_map indices
    /// whose prefix is a superstring. Pre-computed at compile time.
    /// When AC matches pattern i, also check all patterns in propagation[i].
    prefix_propagation: Vec<Vec<usize>>,
    /// Patterns without extractable literal prefixes — checked via regex only.
    /// Each entry pairs the compiled pattern with its detector's keywords for
    /// chunk-level prefiltering (skip pattern if no keywords found in chunk).
    fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// Compiled companion patterns, indexed by detector index.
    companions: Vec<Option<CompiledCompanion>>,
    /// Original detector specs for metadata.
    detectors: Vec<DetectorSpec>,
    /// Pre-computed: detector_index → list of AC pattern indices for that detector.
    /// Eliminates O(N²) expansion during scan.
    detector_to_patterns: Vec<Vec<usize>>,
    /// Pre-computed: AC pattern index → list of other pattern indices with same literal prefix.
    /// Eliminates O(N²) prefix comparison during scan.
    same_prefix_patterns: Vec<Vec<usize>>,
    /// Aho-Corasick automaton for fallback pattern keywords.
    /// Single-pass keyword scan replaces per-pattern contains() loops.
    fallback_keyword_ac: Option<AhoCorasick>,
    /// Maps keyword AC match index → list of fallback pattern indices that use this keyword.
    fallback_keyword_to_patterns: Vec<Vec<usize>>,
    /// Optional Hyperscan SIMD scanner for 3-5x throughput.
    /// When available, replaces both AC prefilter and fallback scanning
    /// with a single SIMD pass over all patterns simultaneously.
    #[cfg(feature = "simd")]
    hs_scanner: Option<simd::backend::HsScanner>,
}
288
289impl CompiledScanner {
290    /// Compile all detector specs into a single scanner.
291    ///
292    /// # Examples
293    ///
294    /// ```rust
295    /// use keyhog_core::{DetectorSpec, PatternSpec, Severity};
296    /// use keyhog_scanner::CompiledScanner;
297    ///
298    /// let scanner = CompiledScanner::compile(vec![DetectorSpec {
299    ///     id: "demo-token".into(),
300    ///     name: "Demo Token".into(),
301    ///     service: "demo".into(),
302    ///     severity: Severity::High,
303    ///     patterns: vec![PatternSpec {
304    ///         regex: "demo_[A-Z0-9]{8}".into(),
305    ///         description: None,
306    ///         group: None,
307    ///     }],
308    ///     companion: None,
309    ///     verify: None,
310    ///     keywords: vec!["demo_".into()],
311    /// }])
312    /// .unwrap();
313    ///
314    /// assert_eq!(scanner.detector_count(), 1);
315    /// ```
316    pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self, ScanError> {
317        let CompileState {
318            ac_literals,
319            ac_map,
320            fallback,
321            companions,
322            quality_warnings,
323        } = build_compile_state(&detectors)?;
324        log_quality_warnings(&quality_warnings);
325        tracing::info!(
326            ac_patterns = ac_map.len(),
327            fallback_patterns = fallback.len(),
328            detectors = detectors.len(),
329            "scanner compiled"
330        );
331
332        let ac = build_ac_pattern_set(&ac_literals)?;
333        let prefix_propagation = prefix_trie::build_propagation_table(&ac_literals);
334        let detector_to_patterns = build_detector_to_patterns(&ac_map, detectors.len());
335        let same_prefix_patterns = build_same_prefix_patterns(&ac_literals);
336
337        // Build keyword AC for fallback pattern prefiltering
338        let (fallback_keyword_ac, fallback_keyword_to_patterns) =
339            build_fallback_keyword_ac(&fallback);
340
341        // Build Hyperscan SIMD database when feature is enabled
342        #[cfg(feature = "simd")]
343        let hs_scanner = {
344            // Collect ALL patterns (AC + fallback) for Hyperscan compilation
345            let mut all_patterns: Vec<(usize, usize, &str, bool)> = Vec::new();
346            for (i, entry) in ac_map.iter().enumerate() {
347                all_patterns.push((
348                    entry.detector_index,
349                    i,
350                    entry.regex.as_str(),
351                    entry.group.is_some(),
352                ));
353            }
354            for (i, (entry, _)) in fallback.iter().enumerate() {
355                all_patterns.push((
356                    entry.detector_index,
357                    ac_map.len() + i,
358                    entry.regex.as_str(),
359                    entry.group.is_some(),
360                ));
361            }
362            match simd::backend::HsScanner::compile(&all_patterns) {
363                Ok((hs, unsupported)) => {
364                    tracing::info!(
365                        hs_patterns = hs.pattern_count(),
366                        unsupported = unsupported.len(),
367                        "hyperscan SIMD database compiled"
368                    );
369                    Some(hs)
370                }
371                Err(e) => {
372                    tracing::warn!("hyperscan compilation failed, using AC fallback: {e}");
373                    None
374                }
375            }
376        };
377
378        Ok(Self {
379            ac,
380            ac_map,
381            prefix_propagation,
382            fallback,
383            companions,
384            detectors,
385            detector_to_patterns,
386            same_prefix_patterns,
387            fallback_keyword_ac,
388            fallback_keyword_to_patterns,
389            #[cfg(feature = "simd")]
390            hs_scanner,
391        })
392    }
393
    /// Number of loaded detectors, i.e. the number of detector specs the
    /// scanner was compiled from.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::{DetectorSpec, PatternSpec, Severity};
    /// use keyhog_scanner::CompiledScanner;
    ///
    /// let scanner = CompiledScanner::compile(vec![DetectorSpec {
    ///     id: "demo-token".into(),
    ///     name: "Demo Token".into(),
    ///     service: "demo".into(),
    ///     severity: Severity::High,
    ///     patterns: vec![PatternSpec {
    ///         regex: "demo_[A-Z0-9]{8}".into(),
    ///         description: None,
    ///         group: None,
    ///     }],
    ///     companion: None,
    ///     verify: None,
    ///     keywords: vec!["demo_".into()],
    /// }])
    /// .unwrap();
    ///
    /// assert_eq!(scanner.detector_count(), 1);
    /// ```
    pub fn detector_count(&self) -> usize {
        self.detectors.len()
    }
423
    /// Total number of compiled patterns: those reachable through the
    /// literal-prefix matcher (`ac_map`) plus the regex-only `fallback`
    /// patterns.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::{DetectorSpec, PatternSpec, Severity};
    /// use keyhog_scanner::CompiledScanner;
    ///
    /// let scanner = CompiledScanner::compile(vec![DetectorSpec {
    ///     id: "demo-token".into(),
    ///     name: "Demo Token".into(),
    ///     service: "demo".into(),
    ///     severity: Severity::High,
    ///     patterns: vec![PatternSpec {
    ///         regex: "demo_[A-Z0-9]{8}".into(),
    ///         description: None,
    ///         group: None,
    ///     }],
    ///     companion: None,
    ///     verify: None,
    ///     keywords: vec!["demo_".into()],
    /// }])
    /// .unwrap();
    ///
    /// assert_eq!(scanner.pattern_count(), 1);
    /// ```
    pub fn pattern_count(&self) -> usize {
        self.ac_map.len() + self.fallback.len()
    }
453
    /// Maximum chunk size, in bytes, scanned in a single pass (1 MiB, taken
    /// from `MAX_SCAN_CHUNK_BYTES`). Larger chunks are split into overlapping
    /// windows.
    ///
    /// 1 MiB balances memory usage against split-boundary risk; windows
    /// overlap by `WINDOW_OVERLAP` bytes to avoid missing secrets at
    /// boundaries. Validated: 200KB adversarial test passes.
    pub(crate) const MAX_SCAN_CHUNK: usize = MAX_SCAN_CHUNK_BYTES;
    /// Overlap, in bytes, between adjacent windows to avoid missing secrets at
    /// boundaries (taken from `WINDOW_OVERLAP_BYTES`).
    const WINDOW_OVERLAP: usize = WINDOW_OVERLAP_BYTES;
462
463    /// Scan a chunk of text and return all raw credential matches.
464    /// Applies multi-line preprocessing to detect secrets split across lines.
465    /// Large chunks are split into overlapping windows for bounded scan time.
466    ///
467    /// # Examples
468    ///
469    /// ```rust
470    /// use keyhog_core::{Chunk, ChunkMetadata, DetectorSpec, PatternSpec, Severity};
471    /// use keyhog_scanner::CompiledScanner;
472    ///
473    /// let scanner = CompiledScanner::compile(vec![DetectorSpec {
474    ///     id: "demo-token".into(),
475    ///     name: "Demo Token".into(),
476    ///     service: "demo".into(),
477    ///     severity: Severity::High,
478    ///     patterns: vec![PatternSpec {
479    ///         regex: "demo_[A-Z0-9]{8}".into(),
480    ///         description: None,
481    ///         group: None,
482    ///     }],
483    ///     companion: None,
484    ///     verify: None,
485    ///     keywords: vec!["demo_".into()],
486    /// }])
487    /// .unwrap();
488    ///
489    /// let matches = scanner.scan(&Chunk {
490    ///     data: "TOKEN=demo_ABC12345".into(),
491    ///     metadata: ChunkMetadata {
492    ///         source_type: "filesystem".into(),
493    ///         path: Some(".env".into()),
494    ///         commit: None,
495    ///         author: None,
496    ///         date: None,
497    ///     },
498    /// });
499    ///
500    /// assert_eq!(matches.len(), 1);
501    /// ```
502    pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
503        // For large chunks, split into overlapping windows.
504        let mut matches = if chunk.data.len() > Self::MAX_SCAN_CHUNK {
505            self.scan_windowed(chunk)
506        } else {
507            self.scan_inner(chunk)
508        };
509
510        // Decode-through: scan base64/hex/URL decoded variants of the chunk.
511        // Skip for large chunks — decode is O(N) per line and cascading scans
512        // on decoded chunks can be expensive on multiline-heavy content.
513        #[cfg(feature = "decode")]
514        if chunk.data.len() <= 64 * 1024 {
515            let mut seen: std::collections::HashSet<(String, String)> = matches
516                .iter()
517                .map(|m| (m.detector_id.clone(), m.credential.clone()))
518                .collect();
519            for decoded_chunk in decode::decode_chunk(chunk) {
520                let decoded_matches = if decoded_chunk.data.len() > Self::MAX_SCAN_CHUNK {
521                    self.scan_windowed(&decoded_chunk)
522                } else {
523                    self.scan_inner(&decoded_chunk)
524                };
525                for m in decoded_matches {
526                    if seen.insert((m.detector_id.clone(), m.credential.clone())) {
527                        matches.push(m);
528                    }
529                }
530            }
531        }
532
533        matches
534    }
535
536    /// Split a large chunk into overlapping windows and scan each.
537    ///
538    /// # Window Layout
539    ///
540    /// ```text
541    /// ├────── MAX_SCAN_CHUNK (1 MiB) ──────┤
542    /// │ window 0                            │
543    /// │                     ├─ OVERLAP (4K) ┤
544    /// │                     │ window 1      │──── MAX_SCAN_CHUNK ────│
545    /// │                     │               │                       │
546    /// ```
547    ///
548    /// Windows advance by `MAX_SCAN_CHUNK - WINDOW_OVERLAP` bytes. The 4 KiB
549    /// overlap ensures secrets up to ~3,200 chars (PEM RSA-4096 base64) that
550    /// straddle a boundary are fully contained in at least one window.
551    ///
552    /// # Deduplication
553    ///
554    /// The `seen` set tracks `(credential, detector_id)` pairs across windows
555    /// so that a secret in the overlap region is only reported once. The set is
556    /// capped at [`MAX_WINDOW_DEDUP_ENTRIES`] and cleared on overflow to bound
557    /// memory for pathological inputs with millions of matches.
558    fn scan_windowed(&self, chunk: &Chunk) -> Vec<RawMatch> {
559        let chunk_text = &chunk.data;
560        let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
561        let mut seen = std::collections::HashSet::new();
562        let mut seen_order = VecDeque::new();
563        let mut offset = 0usize;
564
565        while offset < chunk_text.len() {
566            let end = window_end_offset(chunk_text, offset, Self::MAX_SCAN_CHUNK);
567            let window_chunk = window_chunk(chunk, offset, end);
568            for mut m in self.scan_inner(&window_chunk) {
569                if record_window_match(chunk_text, offset, &mut m, &mut seen, &mut seen_order) {
570                    all_matches.push(m);
571                }
572            }
573            if end >= chunk_text.len() {
574                break;
575            }
576            offset = next_window_offset(chunk_text, end, Self::WINDOW_OVERLAP);
577        }
578
579        all_matches
580    }
581
582    fn scan_inner(&self, chunk: &Chunk) -> Vec<RawMatch> {
583        let mut owned_normalized = None;
584        let chunk = if chunk.data.is_ascii() {
585            chunk
586        } else {
587            normalize_scannable_chunk(chunk, &mut owned_normalized)
588        };
589        #[cfg(feature = "multiline")]
590        let preprocessed = if crate::multiline::has_concatenation_indicators(&chunk.data) {
591            multiline::preprocess_multiline(&chunk.data, &multiline::MultilineConfig::default())
592        } else {
593            ScannerPreprocessedText::passthrough(&chunk.data)
594        };
595        #[cfg(not(feature = "multiline"))]
596        let preprocessed = ScannerPreprocessedText::passthrough(&chunk.data);
597
598        let line_offsets = compute_line_offsets(&preprocessed.text);
599        let code_lines: Vec<&str> = chunk.data.lines().collect();
600        let documentation_lines = context::documentation_line_flags(&code_lines);
601        let mut scan_state = ScanState {
602            matches: Vec::with_capacity((chunk.data.len() / 4096).max(16)),
603            ..Default::default()
604        };
605
606        // SIMD fast path: Hyperscan matches ALL patterns in a single SIMD pass.
607        // NOTE: Hyperscan SIMD prefilter is compiled and ready (1634 patterns)
608        // but disabled because it's 3x SLOWER than AC+keyword for our use case.
609        // Reason: HS matches all patterns simultaneously (good for IDS) but we still
610        // need Rust regex for capture group extraction. The double work (HS scan +
611        // regex confirmation) is more expensive than AC prefilter + selective regex.
612        // HS would help if we had 10K+ patterns where AC automaton size is the bottleneck.
613        #[cfg(feature = "simd")]
614        let used_simd = if let Some(hs) = &self.hs_scanner {
615            let hs_matches = hs.scan(preprocessed.text.as_bytes());
616            // Collect unique pattern indices that HS triggered
617            let mut triggered_set = std::collections::HashSet::new();
618            for &(hs_id, _start, _end) in &hs_matches {
619                if let Some((det_idx, pat_idx, _has_group)) = hs.pattern_info(hs_id) {
620                    triggered_set.insert((det_idx, pat_idx));
621                }
622            }
623            // Run the Rust regex for each triggered pattern to extract matches
624            let all_patterns: Vec<&CompiledPattern> = self
625                .ac_map
626                .iter()
627                .chain(self.fallback.iter().map(|(p, _)| p))
628                .collect();
629            for &(_det_idx, pat_idx) in &triggered_set {
630                if let Some(entry) = all_patterns.get(pat_idx) {
631                    self.extract_matches(
632                        entry,
633                        &preprocessed,
634                        &line_offsets,
635                        &code_lines,
636                        &documentation_lines,
637                        chunk,
638                        &mut scan_state.matches,
639                        &mut scan_state.ml_score_cache,
640                        &mut scan_state.ml_cache_order,
641                        &mut scan_state.ml_cache_bytes,
642                    );
643                }
644            }
645            true
646        } else {
647            false
648        };
649        #[cfg(not(feature = "simd"))]
650        let used_simd = false;
651
652        if !used_simd {
653            // Standard path: AC prefilter + fallback keyword scanning
654            let expanded_patterns = self.collect_expanded_patterns(&preprocessed.text);
655            let triggered: Vec<usize> = (0..self.ac_map.len())
656                .filter(|&i| (expanded_patterns[i / 64] & (1 << (i % 64))) != 0)
657                .collect();
658            self.scan_prefiltered_patterns(
659                &triggered,
660                &preprocessed,
661                &line_offsets,
662                &code_lines,
663                &documentation_lines,
664                chunk,
665                &mut scan_state.matches,
666                &mut scan_state.ml_score_cache,
667                &mut scan_state.ml_cache_order,
668                &mut scan_state.ml_cache_bytes,
669            );
670        }
671        if !used_simd {
672            self.scan_fallback_patterns(
673                &preprocessed,
674                &line_offsets,
675                &code_lines,
676                &documentation_lines,
677                chunk,
678                &mut scan_state.matches,
679                &mut scan_state.ml_score_cache,
680                &mut scan_state.ml_cache_order,
681                &mut scan_state.ml_cache_bytes,
682            );
683        }
684        scan_state.matches
685    }
686
687    /// Dispatch regex execution for a single compiled pattern against the
688    /// preprocessed text. Routes to either grouped extraction (when the
689    /// pattern has a capture group for the credential value) or plain
690    /// extraction (full-match mode).
691    ///
692    /// Matched credentials are appended to `matches` after confidence scoring
693    /// and false-positive filtering. The ML score cache is shared across
694    /// patterns to avoid redundant inference for the same credential string.
695    #[allow(clippy::too_many_arguments)]
696    fn extract_matches(
697        &self,
698        entry: &CompiledPattern,
699        preprocessed: &ScannerPreprocessedText,
700        line_offsets: &[usize],
701        code_lines: &[&str],
702        documentation_lines: &[bool],
703        chunk: &Chunk,
704        matches: &mut Vec<RawMatch>,
705        ml_score_cache: &mut HashMap<(String, String), f64>,
706        ml_cache_order: &mut VecDeque<(String, String)>,
707        ml_cache_bytes: &mut usize,
708    ) {
709        let detector = &self.detectors[entry.detector_index];
710        if let Some(group) = entry.group {
711            self.extract_grouped_matches(
712                entry,
713                detector,
714                group,
715                preprocessed,
716                line_offsets,
717                code_lines,
718                documentation_lines,
719                chunk,
720                matches,
721                ml_score_cache,
722                ml_cache_order,
723                ml_cache_bytes,
724            );
725            return;
726        }
727        self.extract_plain_matches(
728            entry,
729            detector,
730            preprocessed,
731            line_offsets,
732            code_lines,
733            documentation_lines,
734            chunk,
735            matches,
736            ml_score_cache,
737            ml_cache_order,
738            ml_cache_bytes,
739        );
740    }
741
742    /// Process a single regex match and push a `RawMatch` if it passes filters.
743    #[allow(clippy::too_many_arguments)]
744    fn process_match(
745        &self,
746        entry: &CompiledPattern,
747        detector: &DetectorSpec,
748        data: &str,
749        preprocessed: &ScannerPreprocessedText,
750        line_offsets: &[usize],
751        code_lines: &[&str],
752        documentation_lines: &[bool],
753        chunk: &Chunk,
754        matches: &mut Vec<RawMatch>,
755        ml_score_cache: &mut HashMap<(String, String), f64>,
756        ml_cache_order: &mut VecDeque<(String, String)>,
757        ml_cache_bytes: &mut usize,
758        credential: &str,
759        match_start: usize,
760        match_end: usize,
761    ) {
762        if is_within_hex_context(data, match_start, match_end) {
763            return;
764        }
765        let line = match_line_number(preprocessed, line_offsets, match_start);
766        if context::is_false_positive_context(
767            code_lines,
768            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
769            chunk.metadata.path.as_deref(),
770        ) || context::is_false_positive_match_context(
771            data,
772            match_start,
773            chunk.metadata.path.as_deref(),
774        ) {
775            return;
776        }
777        let inferred_context = context::infer_context_with_documentation(
778            code_lines,
779            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
780            chunk.metadata.path.as_deref(),
781            documentation_lines,
782        );
783        if should_suppress_known_example_credential(
784            credential,
785            chunk.metadata.path.as_deref(),
786            inferred_context,
787        ) {
788            return;
789        }
790        let companion = self.match_companion(entry, preprocessed, line);
791        let ent = match_entropy(credential.as_bytes());
792        let conf = self.match_confidence(
793            entry,
794            detector,
795            code_lines,
796            documentation_lines,
797            chunk,
798            credential,
799            data,
800            line,
801            ent,
802            companion.is_some(),
803            ml_score_cache,
804            ml_cache_order,
805            ml_cache_bytes,
806        );
807        matches.push(build_raw_match(
808            detector,
809            chunk,
810            credential,
811            companion,
812            match_start,
813            line,
814            ent,
815            conf,
816        ));
817    }
818
819    fn collect_expanded_patterns(&self, text: &str) -> Vec<u64> {
820        let triggered_patterns = self.collect_triggered_patterns(text);
821        self.expand_triggered_patterns(&triggered_patterns)
822    }
823
824    fn collect_triggered_patterns(&self, text: &str) -> Vec<u64> {
825        let mut triggered_patterns = vec![0u64; self.ac_map.len().div_ceil(64)];
826        if let Some(ac) = &self.ac {
827            for ac_match in ac.scan(text.as_bytes()) {
828                let pat_idx = ac_match.pattern_id;
829                if pat_idx >= self.ac_map.len() {
830                    continue;
831                }
832                // SAFETY: pat_idx is bounded by ac_map.len() which is checked at compile time.
833                // pat_idx % 64 is always 0..63, so the shift never overflows.
834                triggered_patterns[pat_idx / 64] |= 1u64 << (pat_idx % 64);
835                for &propagated_idx in &self.prefix_propagation[pat_idx] {
836                    triggered_patterns[propagated_idx / 64] |= 1 << (propagated_idx % 64);
837                }
838            }
839        }
840        triggered_patterns
841    }
842
843    fn expand_triggered_patterns(&self, triggered_patterns: &[u64]) -> Vec<u64> {
844        let mut expanded = triggered_patterns.to_vec();
845        for pat_idx in 0..self.ac_map.len() {
846            if (triggered_patterns[pat_idx / 64] & (1 << (pat_idx % 64))) != 0 {
847                for &other_idx in &self.same_prefix_patterns[pat_idx] {
848                    expanded[other_idx / 64] |= 1 << (other_idx % 64);
849                }
850                let det_idx = self.ac_map[pat_idx].detector_index;
851                for &other_idx in &self.detector_to_patterns[det_idx] {
852                    expanded[other_idx / 64] |= 1 << (other_idx % 64);
853                }
854            }
855        }
856        expanded
857    }
858
859    #[allow(clippy::too_many_arguments)]
860    fn scan_prefiltered_patterns(
861        &self,
862        confirmed_patterns: &[usize],
863        preprocessed: &ScannerPreprocessedText,
864        line_offsets: &[usize],
865        code_lines: &[&str],
866        documentation_lines: &[bool],
867        chunk: &Chunk,
868        matches: &mut Vec<RawMatch>,
869        ml_score_cache: &mut HashMap<(String, String), f64>,
870        ml_cache_order: &mut VecDeque<(String, String)>,
871        ml_cache_bytes: &mut usize,
872    ) {
873        for &pat_idx in confirmed_patterns {
874            let entry = &self.ac_map[pat_idx];
875            self.extract_matches(
876                entry,
877                preprocessed,
878                line_offsets,
879                code_lines,
880                documentation_lines,
881                chunk,
882                matches,
883                ml_score_cache,
884                ml_cache_order,
885                ml_cache_bytes,
886            );
887        }
888    }
889
890    #[allow(clippy::too_many_arguments)]
891    fn scan_fallback_patterns(
892        &self,
893        preprocessed: &ScannerPreprocessedText,
894        line_offsets: &[usize],
895        code_lines: &[&str],
896        documentation_lines: &[bool],
897        chunk: &Chunk,
898        matches: &mut Vec<RawMatch>,
899        ml_score_cache: &mut HashMap<(String, String), f64>,
900        ml_cache_order: &mut VecDeque<(String, String)>,
901        ml_cache_bytes: &mut usize,
902    ) {
903        if preprocessed.text.len() > LARGE_FALLBACK_SCAN_THRESHOLD && !self.fallback.is_empty() {
904            self.scan_large_fallback_patterns(
905                preprocessed,
906                line_offsets,
907                chunk,
908                matches,
909                ml_score_cache,
910                ml_cache_order,
911                ml_cache_bytes,
912            );
913            return;
914        }
915        // Single-pass keyword scan: find which fallback patterns are relevant for this chunk.
916        let active_patterns: Vec<bool> = if let Some(kw_ac) = &self.fallback_keyword_ac {
917            let mut active = vec![false; self.fallback.len()];
918            // Mark patterns whose keywords have NO usable keywords as always-active
919            for (i, (_pattern, keywords)) in self.fallback.iter().enumerate() {
920                if !keywords.iter().any(|kw| kw.len() >= 4) {
921                    active[i] = true;
922                }
923            }
924            // Single AC scan over chunk to find all keyword matches
925            for mat in kw_ac.find_iter(&chunk.data) {
926                let kw_idx = mat.pattern().as_usize();
927                if kw_idx < self.fallback_keyword_to_patterns.len() {
928                    for &pattern_idx in &self.fallback_keyword_to_patterns[kw_idx] {
929                        if pattern_idx < active.len() {
930                            active[pattern_idx] = true;
931                        }
932                    }
933                }
934            }
935            active
936        } else {
937            vec![true; self.fallback.len()]
938        };
939
940        for (i, (entry, _keywords)) in self.fallback.iter().enumerate() {
941            if !active_patterns[i] {
942                continue;
943            }
944            self.extract_matches(
945                entry,
946                preprocessed,
947                line_offsets,
948                code_lines,
949                documentation_lines,
950                chunk,
951                matches,
952                ml_score_cache,
953                ml_cache_order,
954                ml_cache_bytes,
955            );
956        }
957    }
958
959    #[allow(clippy::too_many_arguments)]
960    fn scan_large_fallback_patterns(
961        &self,
962        preprocessed: &ScannerPreprocessedText,
963        line_offsets: &[usize],
964        chunk: &Chunk,
965        matches: &mut Vec<RawMatch>,
966        ml_score_cache: &mut HashMap<(String, String), f64>,
967        ml_cache_order: &mut VecDeque<(String, String)>,
968        ml_cache_bytes: &mut usize,
969    ) {
970        // Use keyword AC for fast pre-filtering (same as scan_fallback_patterns)
971        let active_set: Vec<bool> = if let Some(kw_ac) = &self.fallback_keyword_ac {
972            let mut active = vec![false; self.fallback.len()];
973            for (i, (_, keywords)) in self.fallback.iter().enumerate() {
974                if !keywords.iter().any(|kw| kw.len() >= 4) {
975                    active[i] = true;
976                }
977            }
978            for mat in kw_ac.find_iter(&chunk.data) {
979                let kw_idx = mat.pattern().as_usize();
980                if kw_idx < self.fallback_keyword_to_patterns.len() {
981                    for &pattern_idx in &self.fallback_keyword_to_patterns[kw_idx] {
982                        if pattern_idx < active.len() {
983                            active[pattern_idx] = true;
984                        }
985                    }
986                }
987            }
988            active
989        } else {
990            vec![true; self.fallback.len()]
991        };
992        let active_fallback: Vec<&CompiledPattern> = self
993            .fallback
994            .iter()
995            .enumerate()
996            .filter(|(i, _)| active_set[*i])
997            .map(|(_, (entry, _))| entry)
998            .collect();
999
1000        if active_fallback.is_empty() {
1001            return;
1002        }
1003
1004        for (line_idx, line) in preprocessed.text.lines().enumerate() {
1005            if line.len() < MIN_FALLBACK_LINE_LENGTH {
1006                continue;
1007            }
1008            let start_len = matches.len();
1009            let line_pre = ScannerPreprocessedText::passthrough(line);
1010            let line_code_lines = [line];
1011            let line_documentation_lines = [false];
1012            for entry in &active_fallback {
1013                self.extract_matches(
1014                    entry,
1015                    &line_pre,
1016                    &[0],
1017                    &line_code_lines,
1018                    &line_documentation_lines,
1019                    chunk,
1020                    matches,
1021                    ml_score_cache,
1022                    ml_cache_order,
1023                    ml_cache_bytes,
1024                );
1025            }
1026            adjust_fallback_match_locations(
1027                &mut matches[start_len..],
1028                line_idx,
1029                line_offsets[line_idx],
1030            );
1031        }
1032    }
1033
1034    #[allow(clippy::too_many_arguments)]
1035    fn extract_grouped_matches(
1036        &self,
1037        entry: &CompiledPattern,
1038        detector: &DetectorSpec,
1039        group: usize,
1040        preprocessed: &ScannerPreprocessedText,
1041        line_offsets: &[usize],
1042        code_lines: &[&str],
1043        documentation_lines: &[bool],
1044        chunk: &Chunk,
1045        matches: &mut Vec<RawMatch>,
1046        ml_score_cache: &mut HashMap<(String, String), f64>,
1047        ml_cache_order: &mut VecDeque<(String, String)>,
1048        ml_cache_bytes: &mut usize,
1049    ) {
1050        // The preprocessed text contains original text + appended multiline joins.
1051        // Single-pass search covers both structural and multiline-joined patterns.
1052        let search_text = &preprocessed.text;
1053        for caps in entry.regex.captures_iter(search_text) {
1054            let Some(full_match) = caps.get(FULL_MATCH_INDEX) else {
1055                continue;
1056            };
1057            let credential = caps
1058                .get(group)
1059                .map(|capture| capture.as_str())
1060                .unwrap_or_else(|| full_match.as_str());
1061            self.process_match(
1062                entry,
1063                detector,
1064                search_text,
1065                preprocessed,
1066                line_offsets,
1067                code_lines,
1068                documentation_lines,
1069                chunk,
1070                matches,
1071                ml_score_cache,
1072                ml_cache_order,
1073                ml_cache_bytes,
1074                credential,
1075                full_match.start(),
1076                full_match.end(),
1077            );
1078        }
1079    }
1080
1081    #[allow(clippy::too_many_arguments)]
1082    fn extract_plain_matches(
1083        &self,
1084        entry: &CompiledPattern,
1085        detector: &DetectorSpec,
1086        preprocessed: &ScannerPreprocessedText,
1087        line_offsets: &[usize],
1088        code_lines: &[&str],
1089        documentation_lines: &[bool],
1090        chunk: &Chunk,
1091        matches: &mut Vec<RawMatch>,
1092        ml_score_cache: &mut HashMap<(String, String), f64>,
1093        ml_cache_order: &mut VecDeque<(String, String)>,
1094        ml_cache_bytes: &mut usize,
1095    ) {
1096        let search_text = &preprocessed.text;
1097        for matched in entry.regex.find_iter(search_text) {
1098            self.process_match(
1099                entry,
1100                detector,
1101                search_text,
1102                preprocessed,
1103                line_offsets,
1104                code_lines,
1105                documentation_lines,
1106                chunk,
1107                matches,
1108                ml_score_cache,
1109                ml_cache_order,
1110                ml_cache_bytes,
1111                matched.as_str(),
1112                matched.start(),
1113                matched.end(),
1114            );
1115        }
1116    }
1117
1118    fn match_companion(
1119        &self,
1120        entry: &CompiledPattern,
1121        preprocessed: &ScannerPreprocessedText,
1122        line: usize,
1123    ) -> Option<String> {
1124        self.companions
1125            .get(entry.detector_index)
1126            .and_then(|companion| companion.as_ref())
1127            .and_then(|companion| find_companion(preprocessed, line, companion))
1128    }
1129
1130    /// Compute the confidence score for a credential match.
1131    ///
1132    /// # Scoring Pipeline
1133    ///
1134    /// 1. **Heuristic signals** (`confidence::compute_confidence`): combines
1135    ///    literal prefix presence, capture-group anchoring, Shannon entropy,
1136    ///    keyword proximity, sensitive file paths, match length, and companion
1137    ///    secret presence into a raw score in `[0.0, 1.0]`.
1138    ///
1139    /// 2. **Context adjustment**: the surrounding code context (test files,
1140    ///    documentation, comments, example blocks) applies a multiplier that
1141    ///    reduces confidence for matches in non-production contexts.
1142    ///
1143    /// 3. **ML blending** (when `feature = "ml"` is enabled): a 41-feature
1144    ///    mixture-of-experts classifier produces an independent confidence
1145    ///    score. The final output is `max(blended, heuristic, ml)` — we take
1146    ///    the maximum so that a strong heuristic signal is never dragged down
1147    ///    by a weak ML prediction, and vice versa.
1148    ///
1149    /// When ML is disabled, returns the heuristic confidence directly.
1150    #[allow(clippy::too_many_arguments)]
1151    fn match_confidence(
1152        &self,
1153        entry: &CompiledPattern,
1154        detector: &DetectorSpec,
1155        code_lines: &[&str],
1156        documentation_lines: &[bool],
1157        chunk: &Chunk,
1158        credential: &str,
1159        data: &str,
1160        line: usize,
1161        ent: f64,
1162        has_companion: bool,
1163        ml_score_cache: &mut HashMap<(String, String), f64>,
1164        ml_cache_order: &mut VecDeque<(String, String)>,
1165        ml_cache_bytes: &mut usize,
1166    ) -> f64 {
1167        let raw_conf = confidence::compute_confidence(&confidence::ConfidenceSignals {
1168            has_literal_prefix: extract_literal_prefix(entry.regex.as_str()).is_some(),
1169            has_context_anchor: entry.group.is_some(),
1170            entropy: ent,
1171            keyword_nearby: detector
1172                .keywords
1173                .iter()
1174                .any(|keyword| chunk.data.contains(keyword.as_str())),
1175            sensitive_file: chunk
1176                .metadata
1177                .path
1178                .as_deref()
1179                .map(confidence::is_sensitive_path)
1180                .unwrap_or(false),
1181            match_length: credential.len(),
1182            has_companion,
1183        });
1184        let context = context::infer_context_with_documentation(
1185            code_lines,
1186            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
1187            chunk.metadata.path.as_deref(),
1188            documentation_lines,
1189        );
1190        let heuristic_conf = raw_conf * context.confidence_multiplier();
1191        #[cfg(not(feature = "ml"))]
1192        {
1193            let _ = (data, ml_score_cache, ml_cache_order, ml_cache_bytes);
1194            return heuristic_conf;
1195        }
1196
1197        #[cfg(feature = "ml")]
1198        {
1199            let text_context = local_context_window(data, line, ML_CONTEXT_RADIUS_LINES);
1200            // Prepend file path so the MoE gate can route to the right expert
1201            // based on file extension (.env → config, .yml → CI, .tf → infra, etc.)
1202            let ml_context = match chunk.metadata.path.as_deref() {
1203                Some(path) => format!("file:{path}\n{text_context}"),
1204                None => text_context,
1205            };
1206            let ml_conf = cached_ml_score(
1207                ml_score_cache,
1208                ml_cache_order,
1209                ml_cache_bytes,
1210                credential,
1211                &ml_context,
1212            );
1213            // Use the HIGHER of ML and heuristic scores. A strong heuristic match
1214            // (prefix + entropy + context) should never be dragged down by a weak ML
1215            // prediction, and a strong ML prediction should override weak heuristics.
1216            let blended = (ML_WEIGHT * ml_conf) + (HEURISTIC_WEIGHT * heuristic_conf);
1217            blended.max(heuristic_conf).max(ml_conf)
1218        }
1219    }
1220}
1221
/// Mutable state carried through a scan: the matches found so far plus the
/// bookkeeping for the ML score cache.
///
/// NOTE(review): the three cache fields have the same names and types as the
/// `ml_score_cache` / `ml_cache_order` / `ml_cache_bytes` arguments threaded
/// through the scanning methods; presumably they are what gets passed there.
#[derive(Default)]
struct ScanState {
    /// Raw matches collected so far.
    matches: Vec<RawMatch>,
    /// Cached ML scores keyed by a pair of strings (`cached_ml_score` uses
    /// `(credential, context)` as the key).
    ml_score_cache: HashMap<(String, String), f64>,
    /// Cache keys in eviction order: `cached_ml_score` evicts from the front
    /// and moves a key to the back on a hit.
    ml_cache_order: VecDeque<(String, String)>,
    /// Running total of the key byte lengths currently held in the cache.
    ml_cache_bytes: usize,
}
1229
/// Intermediate result of compiling all detectors, as produced by
/// `build_compile_state`.
struct CompileState {
    /// Literal prefixes for the Aho-Corasick prefilter; entry `i` belongs to
    /// `ac_map[i]` (both are pushed together in `compile_detector_pattern`).
    ac_literals: Vec<String>,
    /// Compiled patterns that have a literal prefix.
    ac_map: Vec<CompiledPattern>,
    /// Compiled patterns without a literal prefix, each paired with a copy of
    /// its detector's keywords.
    fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// One entry per detector, in detector order: its compiled companion, or
    /// `None` when the detector defines no companion.
    companions: Vec<Option<CompiledCompanion>>,
    /// Human-readable warnings about detector patterns that have neither a
    /// literal prefix nor keywords.
    quality_warnings: Vec<String>,
}
1237
1238fn build_compile_state(detectors: &[DetectorSpec]) -> Result<CompileState, ScanError> {
1239    let mut ac_literals = Vec::new();
1240    let mut ac_map = Vec::new();
1241    let mut fallback = Vec::new();
1242    let mut companions = Vec::with_capacity(detectors.len());
1243    let mut quality_warnings = Vec::new();
1244    for (detector_index, detector) in detectors.iter().enumerate() {
1245        companions.push(compile_detector_companion(detector)?);
1246        for (pattern_index, pattern) in detector.patterns.iter().enumerate() {
1247            compile_detector_pattern(
1248                detector_index,
1249                detector,
1250                pattern_index,
1251                pattern,
1252                &mut ac_literals,
1253                &mut ac_map,
1254                &mut fallback,
1255                &mut quality_warnings,
1256            )?;
1257        }
1258    }
1259    Ok(CompileState {
1260        ac_literals,
1261        ac_map,
1262        fallback,
1263        companions,
1264        quality_warnings,
1265    })
1266}
1267
1268fn compile_detector_companion(
1269    detector: &DetectorSpec,
1270) -> Result<Option<CompiledCompanion>, ScanError> {
1271    detector
1272        .companion
1273        .as_ref()
1274        .map(|companion| compile_companion(companion, &detector.id))
1275        .transpose()
1276}
1277
1278#[allow(clippy::too_many_arguments)]
1279fn compile_detector_pattern(
1280    detector_index: usize,
1281    detector: &DetectorSpec,
1282    pattern_index: usize,
1283    pattern: &PatternSpec,
1284    ac_literals: &mut Vec<String>,
1285    ac_map: &mut Vec<CompiledPattern>,
1286    fallback: &mut Vec<(CompiledPattern, Vec<String>)>,
1287    quality_warnings: &mut Vec<String>,
1288) -> Result<(), ScanError> {
1289    let prefix = extract_literal_prefix(&pattern.regex);
1290    if prefix.is_none() && detector.keywords.is_empty() {
1291        quality_warnings.push(format!(
1292            "detector '{}' pattern {} has no literal prefix and no keywords — will produce false positives. Add keywords for context anchoring.",
1293            detector.id, pattern_index
1294        ));
1295    }
1296    let compiled = compile_pattern(detector_index, pattern_index, pattern, &detector.id)?;
1297    match prefix {
1298        Some(prefix) => {
1299            ac_literals.push(prefix);
1300            ac_map.push(compiled);
1301        }
1302        _ => fallback.push((compiled, detector.keywords.clone())),
1303    }
1304    Ok(())
1305}
1306
1307/// Build an Aho-Corasick automaton over all unique fallback keywords (≥4 chars).
1308/// Returns the AC and a mapping from keyword-match-index → fallback-pattern-indices.
1309fn build_fallback_keyword_ac(
1310    fallback: &[(CompiledPattern, Vec<String>)],
1311) -> (Option<AhoCorasick>, Vec<Vec<usize>>) {
1312    // Collect unique keywords → pattern indices
1313    let mut keyword_map: std::collections::HashMap<String, Vec<usize>> =
1314        std::collections::HashMap::new();
1315    for (pattern_idx, (_pattern, keywords)) in fallback.iter().enumerate() {
1316        for kw in keywords {
1317            if kw.len() >= 4 {
1318                keyword_map
1319                    .entry(kw.to_ascii_lowercase())
1320                    .or_default()
1321                    .push(pattern_idx);
1322            }
1323        }
1324    }
1325    if keyword_map.is_empty() {
1326        return (None, Vec::new());
1327    }
1328    let keywords: Vec<String> = keyword_map.keys().cloned().collect();
1329    let mapping: Vec<Vec<usize>> = keywords.iter().map(|kw| keyword_map[kw].clone()).collect();
1330    let ac = AhoCorasick::builder()
1331        .ascii_case_insensitive(true)
1332        .build(&keywords)
1333        .ok();
1334    (ac, mapping)
1335}
1336
1337fn log_quality_warnings(warnings: &[String]) {
1338    for warning in warnings {
1339        tracing::warn!("{}", warning);
1340    }
1341}
1342
1343fn build_ac_pattern_set(ac_literals: &[String]) -> Result<Option<PatternSet>, ScanError> {
1344    if ac_literals.is_empty() {
1345        return Ok(None);
1346    }
1347
1348    let mut builder = PatternSetBuilder::new();
1349    for (index, literal) in ac_literals.iter().enumerate() {
1350        builder = builder.add_literal(literal, index);
1351    }
1352
1353    Ok(Some(builder.build()?))
1354}
1355
1356fn build_detector_to_patterns(
1357    ac_map: &[CompiledPattern],
1358    detector_count: usize,
1359) -> Vec<Vec<usize>> {
1360    let mut detector_to_patterns = vec![Vec::new(); detector_count];
1361    for (pattern_index, entry) in ac_map.iter().enumerate() {
1362        detector_to_patterns[entry.detector_index].push(pattern_index);
1363    }
1364    detector_to_patterns
1365}
1366
/// For each literal, list the indices of the *other* literals with exactly
/// the same text, in ascending order.
///
/// The result has one entry per element of `ac_literals`; a literal that is
/// unique gets an empty list.
fn build_same_prefix_patterns(ac_literals: &[String]) -> Vec<Vec<usize>> {
    // literal text -> every index at which it occurs (ascending)
    let mut by_literal: HashMap<&str, Vec<usize>> = HashMap::new();
    for (position, literal) in ac_literals.iter().enumerate() {
        by_literal.entry(literal.as_str()).or_default().push(position);
    }

    ac_literals
        .iter()
        .enumerate()
        .map(|(position, literal)| {
            by_literal[literal.as_str()]
                .iter()
                .copied()
                .filter(|&peer| peer != position)
                .collect()
        })
        .collect()
}
1387
1388fn normalize_scannable_chunk<'a>(
1389    chunk: &'a Chunk,
1390    owned_normalized: &'a mut Option<Chunk>,
1391) -> &'a Chunk {
1392    if chunk.data.is_ascii() {
1393        return chunk;
1394    }
1395
1396    match normalize_chunk_data(&chunk.data) {
1397        Cow::Borrowed(_) => chunk,
1398        Cow::Owned(normalized_chunk_text) => {
1399            *owned_normalized = Some(keyhog_core::Chunk {
1400                data: normalized_chunk_text,
1401                metadata: chunk.metadata.clone(),
1402            });
1403            // SAFETY: `owned_normalized` was set to `Some(...)` two lines
1404            // above, so `.as_ref()` is infallible here.
1405            match owned_normalized.as_ref() {
1406                Some(chunk) => chunk,
1407                None => chunk,
1408            }
1409        }
1410    }
1411}
1412
/// Compute the exclusive end byte offset of a scan window that starts at
/// `offset` and spans `window_size` bytes.
///
/// The end is clamped to `text.len()` and, if it lands inside a multi-byte
/// character, moved forward to the next char boundary so the window can be
/// sliced safely. The addition saturates, so an oversized `window_size`
/// simply yields `text.len()` instead of overflowing.
fn window_end_offset(text: &str, offset: usize, window_size: usize) -> usize {
    let mut end = offset.saturating_add(window_size).min(text.len());
    // `text.len()` is always a char boundary, so this loop terminates.
    while end < text.len() && !text.is_char_boundary(end) {
        end += 1;
    }
    end
}
1420
1421fn window_chunk(chunk: &Chunk, offset: usize, end: usize) -> Chunk {
1422    Chunk {
1423        data: chunk.data[offset..end].to_string(),
1424        metadata: chunk.metadata.clone(),
1425    }
1426}
1427
1428fn record_window_match(
1429    chunk_text: &str,
1430    offset: usize,
1431    matched: &mut RawMatch,
1432    seen: &mut std::collections::HashSet<(String, String, usize)>,
1433    seen_order: &mut VecDeque<(String, String, usize)>,
1434) -> bool {
1435    matched.location.offset += offset;
1436    matched.location.line = Some(line_number_for_offset(chunk_text, matched.location.offset));
1437    let key = (
1438        matched.detector_id.clone(),
1439        matched.credential.clone(),
1440        matched.location.offset,
1441    );
1442    if !seen.insert(key.clone()) {
1443        return false;
1444    }
1445
1446    seen_order.push_back(key);
1447    while seen.len() > MAX_WINDOW_DEDUP_ENTRIES {
1448        let Some(oldest) = seen_order.pop_front() else {
1449            break;
1450        };
1451        seen.remove(&oldest);
1452    }
1453
1454    true
1455}
1456
/// Compute where the next scan window starts: `overlap` bytes before `end`
/// (saturating at 0), moved backwards to the nearest char boundary of `text`.
fn next_window_offset(text: &str, end: usize, overlap: usize) -> usize {
    let candidate = end.saturating_sub(overlap);
    // Offset 0 is always a char boundary, so the search always succeeds.
    (0..=candidate)
        .rev()
        .find(|&position| text.is_char_boundary(position))
        .unwrap_or(0)
}
1464
1465fn adjust_fallback_match_locations(matches: &mut [RawMatch], line_idx: usize, line_offset: usize) {
1466    for matched in matches {
1467        if matched.location.line == Some(FIRST_LINE_NUMBER) {
1468            matched.location.line = Some(line_idx + FIRST_LINE_NUMBER);
1469        }
1470        matched.location.offset += line_offset;
1471    }
1472}
1473
1474fn match_line_number(
1475    preprocessed: &ScannerPreprocessedText,
1476    line_offsets: &[usize],
1477    match_start: usize,
1478) -> usize {
1479    preprocessed
1480        .line_for_offset(match_start)
1481        .unwrap_or_else(|| line_number_for_offset_with_offsets(line_offsets, match_start))
1482}
1483
1484#[allow(clippy::too_many_arguments)]
1485fn build_raw_match(
1486    detector: &DetectorSpec,
1487    chunk: &Chunk,
1488    credential: &str,
1489    companion: Option<String>,
1490    match_start: usize,
1491    line: usize,
1492    entropy: f64,
1493    confidence: f64,
1494) -> RawMatch {
1495    RawMatch {
1496        detector_id: detector.id.clone(),
1497        detector_name: detector.name.clone(),
1498        service: detector.service.clone(),
1499        severity: detector.severity,
1500        credential: credential.to_string(),
1501        companion,
1502        location: MatchLocation {
1503            source: chunk.metadata.source_type.clone(),
1504            file_path: chunk.metadata.path.clone(),
1505            line: Some(line),
1506            offset: match_start,
1507            commit: chunk.metadata.commit.clone(),
1508            author: chunk.metadata.author.clone(),
1509            date: chunk.metadata.date.clone(),
1510        },
1511        entropy: Some(entropy),
1512        confidence: Some(confidence),
1513    }
1514}
1515
1516fn should_suppress_known_example_credential(
1517    credential: &str,
1518    file_path: Option<&str>,
1519    inferred_context: context::CodeContext,
1520) -> bool {
1521    if !context::is_known_example_credential(credential) {
1522        return false;
1523    }
1524
1525    let sensitive_file = file_path
1526        .map(confidence::is_sensitive_path)
1527        .unwrap_or(false);
1528    !(sensitive_file && matches!(inferred_context, context::CodeContext::Assignment))
1529}
1530
/// Return the ML score for `(credential, context)`, using a bounded cache.
///
/// On a hit the key is moved to the back of `ml_cache_order`, so the least
/// recently used entries are evicted first. On a miss, entries are evicted
/// from the front until the cache holds fewer than `MAX_ML_CACHE_ENTRIES`
/// entries and the new key fits within `MAX_ML_CACHE_BYTES`; the score is then
/// computed with `ml_scorer::score` and stored. `ml_cache_bytes` tracks the
/// summed byte length of the cached keys.
///
/// This function only exists with `feature = "ml"`, so it carries no
/// non-ML fallback branch.
#[cfg(feature = "ml")]
fn cached_ml_score(
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
    credential: &str,
    context: &str,
) -> f64 {
    let cache_key = (credential.to_string(), context.to_string());

    if let Some(&score) = ml_score_cache.get(&cache_key) {
        // Hit: refresh the key's position so it is evicted last.
        if let Some(position) = ml_cache_order.iter().position(|key| key == &cache_key) {
            ml_cache_order.remove(position);
        }
        ml_cache_order.push_back(cache_key);
        return score;
    }

    // Miss: make room before inserting. The loop also stops once the order
    // queue is empty, which covers a single key larger than the byte budget.
    let entry_bytes = cache_key.0.len().saturating_add(cache_key.1.len());
    while ml_score_cache.len() >= MAX_ML_CACHE_ENTRIES
        || ml_cache_bytes.saturating_add(entry_bytes) > MAX_ML_CACHE_BYTES
    {
        let Some(evicted) = ml_cache_order.pop_front() else {
            break;
        };
        if ml_score_cache.remove(&evicted).is_some() {
            *ml_cache_bytes =
                ml_cache_bytes.saturating_sub(evicted.0.len().saturating_add(evicted.1.len()));
        }
    }

    let score = ml_scorer::score(credential, context);
    ml_score_cache.insert(cache_key.clone(), score);
    ml_cache_order.push_back(cache_key);
    *ml_cache_bytes = ml_cache_bytes.saturating_add(entry_bytes);
    score
}
1583
1584#[cfg(feature = "ml")]
/// Return the lines surrounding `line` in `data`, joined with `\n`.
///
/// The window covers the zero-based line indices
/// `line.saturating_sub(radius + 1) .. line + radius`, truncated to the lines
/// `data` actually has. A `line` beyond the end of `data` yields an empty
/// string rather than panicking on an inverted slice range, and only the
/// lines inside the window are collected.
fn local_context_window(data: &str, line: usize, radius: usize) -> String {
    let start = line.saturating_sub(radius.saturating_add(1));
    let end = line.saturating_add(radius);
    data.lines()
        .skip(start)
        .take(end.saturating_sub(start))
        .collect::<Vec<_>>()
        .join("\n")
}
1595
/// Clamp `offset` to `text.len()` and move it backwards to the nearest char
/// boundary, so the result is always safe to slice `text` at.
fn floor_char_boundary(text: &str, offset: usize) -> usize {
    let clamped = offset.min(text.len());
    // Offset 0 is always a char boundary, so the search always succeeds.
    (0..=clamped)
        .rev()
        .find(|&position| text.is_char_boundary(position))
        .unwrap_or(0)
}
1603
1604fn line_number_for_offset(text: &str, offset: usize) -> usize {
1605    let safe_offset = floor_char_boundary(text, offset);
1606    memchr::memchr_iter(b'\n', &text.as_bytes()[..safe_offset])
1607        .count()
1608        .saturating_add(1)
1609}
1610
/// Return the 1-based line number of `offset`, given the ascending start
/// offsets of every line: the number of line starts that are `<= offset`.
fn line_number_for_offset_with_offsets(line_offsets: &[usize], offset: usize) -> usize {
    // The comparator never reports `Equal`, so the search always ends in
    // `Err(i)`, where `i` is the index of the first line start beyond `offset`.
    line_offsets
        .binary_search_by(|line_start| {
            if *line_start <= offset {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Greater
            }
        })
        .unwrap_or_else(|insertion_point| insertion_point)
}
1614
1615fn compute_line_offsets(text: &str) -> Vec<usize> {
1616    let mut offsets = Vec::with_capacity(128);
1617    offsets.push(0);
1618    for idx in memchr::memchr_iter(b'\n', text.as_bytes()) {
1619        offsets.push(idx + 1);
1620    }
1621    offsets
1622}
1623
1624fn normalize_chunk_data(data: &str) -> Cow<'_, str> {
1625    if data.is_ascii() {
1626        return Cow::Borrowed(data);
1627    }
1628
1629    let normalized = data.nfc().collect::<String>();
1630    if normalized == data {
1631        Cow::Borrowed(data)
1632    } else {
1633        Cow::Owned(normalized)
1634    }
1635}
1636
1637/// Extract a literal prefix from a regex pattern for Aho-Corasick.
1638/// Takes consecutive non-metacharacters from the start.
1639/// Returns `None` if fewer than 3 literal chars.
1640fn extract_literal_prefix(pattern: &str) -> Option<String> {
1641    let mut prefix = String::new();
1642    let mut chars = pattern.chars();
1643    while let Some(ch) = chars.next() {
1644        match ch {
1645            '\\' => {
1646                let Some(next) = chars.next() else {
1647                    break;
1648                };
1649                if is_escaped_literal(next) {
1650                    prefix.push(next);
1651                } else {
1652                    break;
1653                }
1654            }
1655            '[' | '(' | '.' | '*' | '+' | '?' | '{' | '|' | '^' | '$' => break,
1656            _ => {
1657                prefix.push(ch);
1658            }
1659        }
1660    }
1661    if prefix.len() >= MIN_LITERAL_PREFIX_CHARS {
1662        Some(prefix)
1663    } else {
1664        None
1665    }
1666}
1667
/// Return `true` when `ch` is a regex metacharacter that, written after a
/// backslash, stands for itself as a literal character.
fn is_escaped_literal(ch: char) -> bool {
    const ESCAPABLE_METACHARACTERS: &str = "[]().*+?{}\\|^$";
    ESCAPABLE_METACHARACTERS.contains(ch)
}
1674
1675/// Search for a companion pattern within N lines of a given line number.
1676fn find_companion(
1677    preprocessed: &ScannerPreprocessedText,
1678    primary_line: usize,
1679    companion: &CompiledCompanion,
1680) -> Option<String> {
1681    let start = primary_line.saturating_sub(companion.within_lines);
1682    let end = primary_line.saturating_add(companion.within_lines);
1683    let (window_start, window_end) =
1684        line_window_offsets(preprocessed, start + FIRST_LINE_NUMBER, end)?;
1685    let haystack = &preprocessed.text[window_start..window_end];
1686
1687    for captures in companion.regex.captures_iter(haystack) {
1688        let Some(m) = captures.get(companion.capture_group.unwrap_or(FIRST_CAPTURE_GROUP_INDEX))
1689        else {
1690            continue;
1691        };
1692        if m.len() > 4096 {
1693            continue; // Prevent memory issues from excessively long companion matches
1694        }
1695        if let Some(line) = preprocessed.line_for_offset(window_start + m.start())
1696            && (start + FIRST_LINE_NUMBER..=end).contains(&line)
1697        {
1698            return Some(m.as_str().to_string());
1699        }
1700    }
1701    None
1702}
1703
1704fn line_window_offsets(
1705    preprocessed: &ScannerPreprocessedText,
1706    start_line: usize,
1707    end_line: usize,
1708) -> Option<(usize, usize)> {
1709    let mut start_offset = None;
1710    let mut end_offset = None;
1711
1712    for mapping in &preprocessed.mappings {
1713        if start_offset.is_none() && mapping.line_number >= start_line {
1714            start_offset = Some(mapping.start_offset);
1715        }
1716        if mapping.line_number <= end_line {
1717            end_offset = Some(mapping.end_offset);
1718        }
1719    }
1720
1721    Some((start_offset?, end_offset?))
1722}
1723
/// Shannon entropy of `data` in bits per byte, used when the `entropy`
/// feature (and its module) is compiled out.
///
/// Returns `0.0` for empty input.
#[cfg(not(feature = "entropy"))]
fn fallback_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u64; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = data.len() as f64;
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |bits, &occurrences| {
            let probability = occurrences as f64 / total;
            bits - probability * probability.log2()
        })
}
1745
1746fn match_entropy(data: &[u8]) -> f64 {
1747    #[cfg(feature = "entropy")]
1748    {
1749        entropy::shannon_entropy(data)
1750    }
1751
1752    #[cfg(not(feature = "entropy"))]
1753    {
1754        fallback_entropy(data)
1755    }
1756}
1757
1758/// Check if a match is within a hex-encoded context (i.e., surrounded by hex digits).
1759/// This prevents false positives where a secret pattern matches inside hex-encoded data.
1760/// We look at up to 20 chars before and after the match to determine context.
1761fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
1762    if !valid_match_bounds(data, match_start, match_end) {
1763        return false;
1764    }
1765    let matched = &data[match_start..match_end];
1766    let matched_hex_digits = matched.chars().filter(|c| c.is_ascii_hexdigit()).count();
1767    if matched.len() < MIN_HEX_MATCH_LEN || matched_hex_digits < MIN_HEX_DIGITS_IN_MATCH {
1768        return false;
1769    }
1770    let (before, after) = surrounding_hex_context(data, match_start, match_end);
1771    let hex_before = formatted_hex_run(before.chars().rev());
1772    let hex_after = formatted_hex_run(after.chars());
1773    hex_before >= MIN_HEX_CONTEXT_DIGITS && hex_after >= MIN_HEX_CONTEXT_DIGITS
1774}
1775
/// Check that `match_start..match_end` is a non-empty range whose two ends
/// both fall on character boundaries of `data`, so it can be sliced safely.
fn valid_match_bounds(data: &str, match_start: usize, match_end: usize) -> bool {
    if match_end <= match_start {
        return false;
    }
    [match_start, match_end]
        .iter()
        .all(|&index| data.is_char_boundary(index))
}
1781
1782fn surrounding_hex_context(data: &str, match_start: usize, match_end: usize) -> (&str, &str) {
1783    let context_start =
1784        floor_char_boundary(data, match_start.saturating_sub(HEX_CONTEXT_RADIUS_CHARS));
1785    let context_end = {
1786        let mut end = (match_end + HEX_CONTEXT_RADIUS_CHARS).min(data.len());
1787        while end < data.len() && !data.is_char_boundary(end) {
1788            end += 1; // Advance by 1 byte to find char boundary
1789        }
1790        end.min(data.len())
1791    };
1792    (
1793        &data[context_start..match_start],
1794        &data[match_end..context_end],
1795    )
1796}
1797
1798fn formatted_hex_run(iter: impl Iterator<Item = char>) -> usize {
1799    let mut hex_digits = 0usize;
1800    let mut separators = 0usize;
1801    let mut seen_hex = false;
1802
1803    for ch in iter {
1804        if ch.is_ascii_hexdigit() {
1805            hex_digits += 1;
1806            seen_hex = true;
1807            continue;
1808        }
1809        if matches!(ch, ' ' | '\t' | ':' | '-')
1810            && (!seen_hex || separators < MAX_HEX_CONTEXT_SEPARATORS)
1811        {
1812            separators += 1;
1813            continue;
1814        }
1815        break;
1816    }
1817
1818    hex_digits
1819}
1820
1821fn compile_pattern(
1822    detector_index: usize,
1823    pattern_index: usize,
1824    spec: &PatternSpec,
1825    detector_id: &str,
1826) -> Result<CompiledPattern, ScanError> {
1827    let regex = regex::RegexBuilder::new(&spec.regex)
1828        .size_limit(REGEX_SIZE_LIMIT_BYTES)
1829        .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES)
1830        .build()
1831        .map_err(|e| ScanError::RegexCompile {
1832            detector_id: detector_id.to_string(),
1833            index: pattern_index,
1834            source: e,
1835        })?;
1836    Ok(CompiledPattern {
1837        detector_index,
1838        regex,
1839        group: spec.group,
1840    })
1841}
1842
1843fn compile_companion(
1844    spec: &CompanionSpec,
1845    detector_id: &str,
1846) -> Result<CompiledCompanion, ScanError> {
1847    let regex = regex::RegexBuilder::new(&spec.regex)
1848        .size_limit(REGEX_SIZE_LIMIT_BYTES)
1849        .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES)
1850        .build()
1851        .map_err(|e| ScanError::RegexCompile {
1852            detector_id: detector_id.to_string(),
1853            index: FIRST_CAPTURE_GROUP_INDEX,
1854            source: e,
1855        })?;
1856    let capture_group = (regex.captures_len() > 1).then_some(FIRST_CAPTURE_GROUP_INDEX);
1857    Ok(CompiledCompanion {
1858        regex,
1859        capture_group,
1860        within_lines: spec.within_lines,
1861    })
1862}
1863
#[cfg(test)]
mod tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, Severity};

    /// Build a test chunk holding `data`, attributed to `test.txt`.
    fn make_chunk(data: &str) -> Chunk {
        make_chunk_at("test.txt", data)
    }

    /// Build a test chunk holding `data`, attributed to `path`.
    fn make_chunk_at(path: &'static str, data: &str) -> Chunk {
        Chunk {
            data: data.to_string(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some(path.into()),
                commit: None,
                author: None,
                date: None,
            },
        }
    }

    /// Build a detector with a single ungrouped regex pattern and neither a
    /// companion nor a verifier. Tests needing more override fields with
    /// struct-update syntax.
    fn single_pattern_detector(
        id: &'static str,
        name: &'static str,
        service: &'static str,
        severity: Severity,
        regex: &'static str,
        keywords: &[&'static str],
    ) -> DetectorSpec {
        DetectorSpec {
            id: id.into(),
            name: name.into(),
            service: service.into(),
            severity,
            patterns: vec![PatternSpec {
                regex: regex.into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: keywords.iter().map(|&keyword| keyword.into()).collect(),
        }
    }

    #[test]
    fn literal_prefix_extraction() {
        assert_eq!(
            extract_literal_prefix("AKIA[0-9A-Z]{16}"),
            Some("AKIA".into())
        );
        assert_eq!(
            extract_literal_prefix("xoxb-[0-9]{10}"),
            Some("xoxb-".into())
        );
        assert_eq!(
            extract_literal_prefix("ghp_[A-Za-z0-9]{36}"),
            Some("ghp_".into())
        );
        assert_eq!(extract_literal_prefix("[a-z]+"), None);
        assert_eq!(extract_literal_prefix("ab"), None);
        assert_eq!(
            extract_literal_prefix(r"foo\.bar[0-9]+"),
            Some("foo.bar".into())
        );
        assert_eq!(
            extract_literal_prefix(r"abc\*def[0-9]+"),
            Some("abc*def".into())
        );
    }

    #[test]
    fn scan_detects_slack_bot_token_from_single_line_literal() {
        let detector = single_pattern_detector(
            "slack-bot",
            "Slack Bot Token",
            "slack",
            Severity::Critical,
            "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}",
            &[],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("token = \"xoxb-1234567890-1234567890-abcdefghijABCDEFGHIJklmn\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    #[test]
    fn scan_attaches_companion_secret_near_aws_access_key() {
        let detector = DetectorSpec {
            companion: Some(CompanionSpec {
                regex: "AWS_SECRET_ACCESS_KEY[=:\\s]+([0-9a-zA-Z/+=]{40})".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            ..single_pattern_detector(
                "aws-key",
                "AWS Access Key",
                "aws",
                Severity::Critical,
                "AKIA[0-9A-Z]{16}",
                &[],
            )
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        // Fixtures are assembled at runtime so the key never appears verbatim.
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let secret_key = format!("kR4vN8pW2cF6gH0j{}", "L3mQsT7uX9yAbDe12fG5nP8Z");
        let chunk = make_chunk(
            &format!("AWS_ACCESS_KEY_ID={access_key}\nAWS_SECRET_ACCESS_KEY={secret_key}"),
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
        assert!(matches[0].companion.is_some());
    }

    #[test]
    fn scan_extracts_captured_companion_value_without_anchor_text() {
        // Spelled out in full: the primary pattern uses a capture group.
        let detector = DetectorSpec {
            id: "anchored-companion".into(),
            name: "Anchored Companion".into(),
            service: "test".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "client_id[=:\\s\"']+([a-z0-9]{8})".into(),
                description: None,
                group: Some(1),
            }],
            companion: Some(CompanionSpec {
                regex: "client_secret[=:\\s\"']+([A-Za-z0-9]{16})".into(),
                within_lines: 1,
                name: "client_secret".into(),
            }),
            verify: None,
            keywords: vec!["client_id".into(), "client_secret".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("client_id=deadbeef\nclient_secret=ABCDEFGHIJKLMNOP");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].companion.as_deref(), Some("ABCDEFGHIJKLMNOP"));
    }

    #[test]
    fn empty_input_produces_no_matches() {
        let detector = single_pattern_detector(
            "test",
            "Test",
            "test",
            Severity::Low,
            "SECRET_[A-Z]{10}",
            &[],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("");
        assert!(scanner.scan(&chunk).is_empty());
    }

    #[test]
    fn known_example_aws_key_is_allowed_in_sensitive_assignment_file() {
        let detector = single_pattern_detector(
            "aws-key",
            "AWS Key",
            "aws",
            Severity::Critical,
            "AKIA[0-9A-Z]{16}",
            &["AKIA"],
        );
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk_at("aws.env", "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\n");

        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, "AKIAIOSFODNN7EXAMPLE");
    }

    #[test]
    fn scan_detects_slack_bot_token_split_across_concat_lines() {
        // Slack token split into string fragments joined with the + operator.
        let detector = single_pattern_detector(
            "slack-bot",
            "Slack Bot Token",
            "slack",
            Severity::Critical,
            "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}",
            &["slack"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            "token = \"xoxb-1234567890-\" + \"1234567890-\" + \"abcdefghijABCDEFGHIJklmn\"",
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find token split with + operator");
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    #[test]
    fn scan_detects_aws_access_key_split_by_backslash_continuation() {
        // AWS key split with backslash continuation.
        let detector = single_pattern_detector(
            "aws-access-key",
            "AWS Access Key",
            "aws",
            Severity::Critical,
            "AKIA[0-9A-Z]{16}",
            &["aws", "access"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("AWS_ACCESS_KEY_ID = \"AKIA\" \\\n    \"R7VXNPLMQ3HSKWJT\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find AWS key with backslash continuation"
        );
        assert_eq!(matches[0].detector_id, "aws-access-key");
        assert!(matches[0].credential.starts_with("AKIA"));
    }

    #[test]
    fn scan_detects_python_style_multiline_api_key() {
        // Python-style multiline secret joined with + and a line continuation.
        let detector = single_pattern_detector(
            "generic-api-key",
            "Generic API Key",
            "generic",
            Severity::High,
            "sk-[a-z]{4}-[a-zA-Z0-9]{32}",
            &["api", "key"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            r#"api_key = "sk-proj-" + \
    "AbCdEfGhIjKlMnOpQrStUvWxYz123456""#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find Python multiline secret");
        assert_eq!(matches[0].detector_id, "generic-api-key");
        assert!(matches[0].credential.starts_with("sk-proj-"));
    }

    #[test]
    fn scan_detects_javascript_multiline_github_token() {
        // JavaScript-style multiline concatenation with the + operator.
        let detector = single_pattern_detector(
            "github-token",
            "GitHub Token",
            "github",
            Severity::Critical,
            "ghp_[a-zA-Z0-9]{36}",
            &["github", "token"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            r#"const token = "ghp_" +
    "kR4vN8pW2cF6gH0jL3" +
    "mQsT7uX9yAbDe12fG5";"#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find GitHub token split with + operator"
        );
        assert_eq!(matches[0].detector_id, "github-token");
        assert!(matches[0].credential.starts_with("ghp_"));
    }

    #[test]
    fn line_number_for_offset_clamps_to_char_boundary() {
        let text = "line1\ncaf\u{00e9}\nline3";
        let offset_inside_multibyte = text.find('\u{00e9}').unwrap() + 1;

        assert_eq!(line_number_for_offset(text, offset_inside_multibyte), 2);
    }

    #[test]
    fn line_number_for_offset_treats_newline_as_previous_line() {
        let text = "first\nsecond";
        let newline_offset = text.find('\n').unwrap();
        assert_eq!(line_number_for_offset(text, newline_offset), 1);
        assert_eq!(line_number_for_offset(text, newline_offset + 1), 2);
    }

    #[test]
    fn cached_ml_score_uses_context_in_cache_key() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;

        let first = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );
        let second = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "token: shared-credential",
        );
        let repeated = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );

        assert_eq!(cache.len(), 2);
        assert_eq!(order.len(), 2);
        assert_eq!(first, repeated);
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "password=shared-credential".to_string(),
            )),
            Some(&first)
        );
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "token: shared-credential".to_string(),
            )),
            Some(&second)
        );
    }

    #[test]
    fn cached_ml_score_obeys_byte_budget() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;

        for idx in 0..64 {
            let context = format!("ctx-{idx}-{}", "x".repeat(8_192));
            let _ = cached_ml_score(&mut cache, &mut order, &mut bytes, "cred", &context);
        }

        assert!(bytes <= MAX_ML_CACHE_BYTES);
        assert!(cache.len() < 64);
    }

    #[test]
    fn companion_search_uses_preprocessed_text() {
        let detector = DetectorSpec {
            companion: Some(CompanionSpec {
                regex: "[0-9a-zA-Z/+=]{40}".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            ..single_pattern_detector(
                "aws-key",
                "AWS Access Key",
                "aws",
                Severity::Critical,
                "AKIA[0-9A-Z]{16}",
                &[],
            )
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let chunk = make_chunk(
            "AWS_ACCESS_KEY_ID = \"AKIA\" + \"R7VXNPLMQ3HSKWJT\"\nAWS_SECRET_ACCESS_KEY = \"kR4vN8pW2cF6gH0jL3mQsT7uX9yAbDe12fG5nP8\"",
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
        // Note: companion may or may not be found depending on multiline
        // preprocessing — the line structure changes after joining string
        // concatenations, which can shift the companion out of within_lines range.
    }

    #[test]
    fn fallback_line_by_line_scan_preserves_absolute_location() {
        let detector = single_pattern_detector(
            "fallback",
            "Fallback",
            "generic",
            Severity::High,
            "[A-Z0-9]{32}",
            &["token"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let prefix = "a".repeat(LARGE_FALLBACK_SCAN_THRESHOLD + 1);
        let secret = "ABCDEFGHIJKLMNOPQRSTUVWX12345678";
        let chunk = make_chunk(&format!("{prefix}\ntoken = {secret}"));
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.line, Some(2));
        assert_eq!(
            matches[0].location.offset,
            prefix.len() + 1 + "token = ".len()
        );
    }

    #[test]
    fn hex_context_handles_formatted_hex_dump() {
        let text = "aa bb cc dd ee ff 0011223344556677 88 99 aa bb cc dd ee ff";
        let start = text.find("0011223344556677").unwrap();
        let end = start + "0011223344556677".len();
        assert!(is_within_hex_context(text, start, end));
    }

    #[test]
    fn windowed_scan_reports_boundary_spanning_secret_once() {
        let detector = single_pattern_detector(
            "boundary-gh",
            "Boundary GitHub Token",
            "github",
            Severity::Critical,
            "ghp_[A-Za-z0-9]{36}",
            &["github"],
        );

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let secret = "ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ";
        let prefix = "a".repeat(MAX_SCAN_CHUNK_BYTES - 16);
        let suffix = "z".repeat(WINDOW_OVERLAP_BYTES + 32);
        let chunk = make_chunk(&format!("{prefix}{secret}{suffix}"));

        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.offset, prefix.len());
    }
}
2350
#[cfg(test)]
mod regression_tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, DetectorSpec, PatternSpec, Severity};

    /// A chunk consisting solely of a long `sk-proj-` key must be reported,
    /// with the whole key as the credential.
    #[test]
    fn openai_key_detection() {
        let key = "sk-proj-abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890";

        let spec = DetectorSpec {
            id: "openai-api-key".into(),
            name: "OpenAI API Key".into(),
            service: "openai".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "sk-proj-[a-zA-Z0-9_-]{100,}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["sk-proj-".into()],
        };
        let engine = CompiledScanner::compile(vec![spec]).unwrap();

        let input = Chunk {
            data: key.into(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("test.txt".into()),
                commit: None,
                author: None,
                date: None,
            },
        };

        let findings = engine.scan(&input);
        assert!(
            !findings.is_empty(),
            "OpenAI key should be detected, got 0 matches. Preprocessed text starts with: {:?}",
            &input.data[..20]
        );
        assert_eq!(findings[0].detector_id, "openai-api-key");
        assert_eq!(findings[0].credential, key);
    }
}
keyhog_scanner/lib.rs

keyhog_scanner/
lib.rs