Skip to main content

keyhog_scanner/
scanner_config.rs

//! Scanner configuration and scan state types.
2
3use std::cmp::Reverse;
4use std::collections::{BinaryHeap, HashSet};
5#[cfg(feature = "ml")]
6use std::collections::{HashMap, VecDeque};
7use std::sync::Arc;
8
9/// Configuration for the scanner's decoding and processing heuristics.
10#[derive(Debug, Clone)]
11pub struct ScannerConfig {
12    /// Maximum recursion depth for decode-through (base64, hex, etc.)
13    pub max_decode_depth: usize,
14    /// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
15    pub validate_decode: bool,
16    /// Enable entropy-based detection
17    pub entropy_enabled: bool,
18    /// Threshold for entropy-based detection
19    pub entropy_threshold: f64,
20    /// Enable entropy-based detection in source code files
21    pub entropy_in_source_files: bool,
22    /// Route entropy-fallback candidates through the MoE with the model
23    /// AUTHORITATIVE (no entropy-magnitude floor) instead of the bare entropy
24    /// heuristic. Mirrors `keyhog_core::config::ScanConfig::entropy_ml_authoritative`
25    /// and the CLI `--no-entropy-ml-scoring` opt-out. No-op unless both
26    /// `entropy_enabled` and `ml_enabled` are set. See `apply_ml_batch_scores`
27    /// and `scan_entropy_fallback`.
28    pub entropy_ml_authoritative: bool,
29    /// Admit generic keyword-bridge values (`PASSWORD=`, `*_PASS=`, `secret:`,
30    /// `api_key=` ...) on the relaxed `generic-keyword-secret` entropy floor
31    /// instead of the high `generic-secret` floor. Mirrors
32    /// `keyhog_core::config::ScanConfig::generic_keyword_low_entropy` and the CLI
33    /// `--no-keyword-low-entropy` opt-out. The keyword key is the evidence;
34    /// precision is carried by the MoE + shape filters. See
35    /// `scan_generic_assignments`.
36    pub generic_keyword_low_entropy: bool,
37    /// Enable ML-based confidence scoring
38    pub ml_enabled: bool,
39    /// ML weight for confidence scoring, 0.0-1.0
40    pub ml_weight: f64,
41    /// Minimum confidence threshold for matches
42    pub min_confidence: f64,
43    /// Enable Unicode normalization
44    pub unicode_normalization: bool,
45    /// Maximum bytes for decode-through processing
46    pub max_decode_bytes: usize,
47    /// Maximum matches to collect per chunk before stopping.
48    /// Prevents OOM on extremely noisy files.
49    pub max_matches_per_chunk: usize,
50    /// When `true`, credentials inside source-code comments are
51    /// treated as first-class findings (no confidence downgrade,
52    /// no comment-context multiplier). Mirrors
53    /// `keyhog_core::config::ScanConfig::scan_comments` and the
54    /// CLI's `--scan-comments` flag. See that field's doc for why
55    /// the default is off.
56    pub scan_comments: bool,
57    /// Configuration for multiline concatenation
58    pub multiline: crate::multiline::MultilineConfig,
59    /// Known secret prefixes used to boost confidence.
60    pub known_prefixes: Vec<String>,
61    /// Keywords indicating a secret context (e.g. "api_key", "token").
62    pub secret_keywords: Vec<String>,
63    /// Keywords indicating a test/mock context (e.g. "test", "fake").
64    pub test_keywords: Vec<String>,
65    /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
66    pub placeholder_keywords: Vec<String>,
67    /// Apply test/example path confidence and hard-suppression heuristics.
68    /// The CLI disables this for `--no-suppress-test-fixtures`.
69    pub penalize_test_paths: bool,
70}
71
72impl Default for ScannerConfig {
73    fn default() -> Self {
74        keyhog_core::config::ScanConfig::default().into()
75    }
76}
77
78impl ScannerConfig {
79    /// Confidence floor for [`ScannerConfig::high_precision`]. Distinct from the
80    /// canonical `ScanConfig::default()` floor (0.40) on purpose: precision mode
81    /// trades recall for a near-zero false-positive rate at mass-scan scale.
82    pub const HIGH_PRECISION_MIN_CONFIDENCE: f64 = 0.85;
83
84    pub fn fast() -> Self {
85        Self {
86            max_decode_depth: 0,
87            ml_enabled: false,
88            entropy_enabled: false,
89            ..Default::default()
90        }
91    }
92
93    pub fn thorough() -> Self {
94        // `min_confidence` intentionally omitted: it inherits the canonical
95        // `ScanConfig::default()` floor (single source of truth) instead of
96        // forking a second literal. Deep scanning widens decode/entropy, not
97        // the confidence bar.
98        Self {
99            max_decode_depth: 10,
100            ml_enabled: true,
101            entropy_enabled: true,
102            ..Default::default()
103        }
104    }
105
106    /// High-precision mass-scan preset: minimise false positives at the cost of
107    /// some recall, for scanning huge corpora where every FP is expensive to
108    /// triage. Fully offline and fast (no ML, no entropy sweep, shallow decode).
109    ///
110    /// - `entropy_enabled = false`: generic high-entropy matching is the single
111    ///   largest FP source; precision mode drops it entirely.
112    /// - `ml_enabled = true` (inherited): ML is the confidence discriminator that
113    ///   lifts genuine secrets over the high floor while leaving FP-shaped tokens
114    ///   below it. Disabling it would crater the scores the 0.85 bar relies on,
115    ///   so precision KEEPS ML (this mode trades recall for precision, not for
116    ///   speed — use `--fast` when speed is the goal).
117    /// - `min_confidence = HIGH_PRECISION_MIN_CONFIDENCE` (0.85): combined with
118    ///   the engine's checksum policy (valid token → floored 0.9, invalid →
119    ///   capped 0.1) and clamped over every detector's self-declared floor, this
120    ///   bar admits checksum-validated tokens and strong ML-scored findings while
121    ///   dropping checksum-failures and weak-signal matches.
122    /// - `max_decode_depth = 1`: deep-decoded payloads are a FP source at scale.
123    ///
124    /// `penalize_test_paths` stays on (the default) to suppress fixture-shaped
125    /// hits. A `--min-confidence` override still layers on top of this preset.
126    pub fn high_precision() -> Self {
127        Self {
128            max_decode_depth: 1,
129            entropy_enabled: false,
130            // High-precision mode does not admit low-entropy keyword-anchored
131            // values: that surface trades precision for real-world recall, the
132            // opposite of this preset's contract. Restores the high
133            // `generic-secret` floor.
134            generic_keyword_low_entropy: false,
135            min_confidence: Self::HIGH_PRECISION_MIN_CONFIDENCE,
136            ..Default::default()
137        }
138    }
139
140    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
141        self.min_confidence = min_confidence;
142        self
143    }
144
145    /// Clamp every float field into its valid range and replace any
146    /// NaN with a safe default. A user-supplied
147    /// `--min-confidence=-5.0` or a corrupt config TOML feeding
148    /// `min_confidence = nan` would otherwise NaN-infect the
149    /// confidence-comparison path and silently drop every finding
150    /// (NaN comparisons are always false, so `conf < min_confidence`
151    /// is `false`, but `conf >= min_confidence` is also `false`,
152    /// behaviour-dependent on the call site).
153    ///
154    /// Idempotent - sanitising an already-sane config is a no-op.
155    /// Called inside `From<ScanConfig>` so any path that constructs
156    /// a ScannerConfig from a user-influenced source pays this
157    /// once at config-build time.
158    pub fn sanitise(&mut self) {
159        // Probabilities: clamp to [0.0, 1.0], NaN → canonical default. The
160        // NaN fallbacks READ FROM `ScanConfig::default()` rather than repeating
161        // a literal, so a corrupt-config scrub can never fork from the shipped
162        // floor (currently ml_weight 0.5, min_confidence 0.40) - one source.
163        let canon = keyhog_core::config::ScanConfig::default();
164        if !self.ml_weight.is_finite() {
165            self.ml_weight = canon.ml_weight;
166        } else {
167            self.ml_weight = self.ml_weight.clamp(0.0, 1.0);
168        }
169        if !self.min_confidence.is_finite() {
170            self.min_confidence = canon.min_confidence;
171        } else {
172            self.min_confidence = self.min_confidence.clamp(0.0, 1.0);
173        }
174        // Shannon entropy: 8.0 is the upper bound for byte-level
175        // entropy. NaN / negative → conservative default.
176        if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
177            self.entropy_threshold = 4.5;
178        } else if self.entropy_threshold > 8.0 {
179            self.entropy_threshold = 8.0;
180        }
181        // Recursion-depth + chunk-size caps. Production-bound the
182        // worst case: max_decode_depth > 32 risks stack overflow on
183        // pathological nested base64. max_matches_per_chunk has no
184        // theoretical upper bound but a billion is misconfiguration.
185        if self.max_decode_depth > 32 {
186            self.max_decode_depth = 32;
187        }
188        if self.max_matches_per_chunk > 1_000_000 {
189            self.max_matches_per_chunk = 1_000_000;
190        }
191        if self.max_matches_per_chunk == 0 {
192            self.max_matches_per_chunk = 1000;
193        }
194    }
195}
196
197impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
198    fn from(config: keyhog_core::config::ScanConfig) -> Self {
199        // Identity-style mapping: every shared knob carries 1:1 with the
200        // SAME field name on both sides. No rename, no invented value -
201        // the From must not introduce drift, or a tuning baked into one
202        // `Default` silently disagrees with the benched/shipped path.
203        //
204        // `multiline` has no `ScanConfig` counterpart (its type lives in
205        // this crate, and `keyhog-core` cannot depend on `keyhog-scanner`
206        // without a dependency cycle), so it takes the scanner default.
207        //
208        // `ScanConfig::{min_secret_len, max_file_size, dedup}` are NOT
209        // carried: the scanner reads none of them, so mapping them in
210        // would be dead state that could drift. `max_file_size` is
211        // enforced independently at the source walker (`keyhog-sources`,
212        // `FilesystemSource::with_max_file_size`); `dedup` is applied by
213        // the verifier via `DedupScope`; `min_secret_len` currently has no
214        // reader at all. They stay on `ScanConfig` (covered by core config
215        // tests) but have no `ScannerConfig` peer by design.
216        let mut out = Self {
217            max_decode_depth: config.max_decode_depth,
218            validate_decode: config.validate_decode,
219            entropy_enabled: config.entropy_enabled,
220            entropy_threshold: config.entropy_threshold,
221            entropy_in_source_files: config.entropy_in_source_files,
222            entropy_ml_authoritative: config.entropy_ml_authoritative,
223            generic_keyword_low_entropy: config.generic_keyword_low_entropy,
224            ml_enabled: config.ml_enabled,
225            ml_weight: config.ml_weight,
226            min_confidence: config.min_confidence,
227            unicode_normalization: config.unicode_normalization,
228            max_decode_bytes: config.max_decode_bytes,
229            max_matches_per_chunk: config.max_matches_per_chunk,
230            scan_comments: config.scan_comments,
231            multiline: crate::multiline::MultilineConfig::default(),
232            known_prefixes: config.known_prefixes,
233            secret_keywords: config.secret_keywords,
234            test_keywords: config.test_keywords,
235            placeholder_keywords: config.placeholder_keywords,
236            // Scanner-only knob; the CLI flips it off for
237            // `--no-suppress-test-fixtures`.
238            penalize_test_paths: true,
239        };
240        // Defensive clamp + NaN scrub on every user-influenced
241        // numeric field. Idempotent. See `ScannerConfig::sanitise`
242        // for rationale.
243        out.sanitise();
244        out
245    }
246}
247
/// A match parked until the end of a scan, when queued entries are scored
/// together by batch ML inference.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// Raw match, built using heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence as it stood before any ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context, used for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text fed to feature extraction.
    pub credential: String,
    /// Surrounding context handed to the ML scorer.
    pub ml_context: String,
    /// When true the MoE score is AUTHORITATIVE for this candidate: the
    /// final confidence is the model score itself, NOT `max(heuristic, ml)`.
    /// Set for entropy-fallback candidates, whose "heuristic" is bare entropy
    /// magnitude - precisely the signal that mislabels high-entropy
    /// non-secrets (FQDNs, git SHAs, base64 blobs) as findings. Flooring by
    /// that heuristic (as the detector path does, where the regex IS
    /// positive evidence) would defeat the model's ability to suppress those
    /// FPs. Detector/generic matches leave this false and keep the heuristic
    /// floor. See `apply_ml_batch_scores`.
    pub model_authoritative: bool,
}
272
273/// Internal state for a single scan operation (tracks matches and ML cache).
274#[derive(Default)]
275pub struct ScanState {
276    /// Matches collected for this chunk, prioritized by confidence.
277    /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
278    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
279    /// Interner for credentials found in this chunk to save memory on duplicates.
280    pub credential_interner: HashSet<Arc<str>>,
281    /// Static string cache for detector metadata. Uses
282    /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
283    /// cache miss allocates ONLY the `Arc<str>` - the prior shape
284    /// also allocated a `String` to serve as the HashMap key, paying
285    /// twice for what's a single dedup slot. `HashSet::get(&s)` works
286    /// via `Arc<str>: Borrow<str>`, no allocation on hits.
287    ///
288    /// Hit ONLY by dynamic strings now: the scanner-wide
289    /// `StaticInterner` (vyre CHD perfect hash) handles every
290    /// `(detector_id, detector_name, service, source_type)` lookup
291    /// without per-scan allocation.
292    pub metadata_interner: HashSet<Arc<str>>,
293    /// Optional reference to the scanner's frozen static-string
294    /// interner. When `Some`, `intern_metadata` checks here first
295    /// before falling through to the per-scan `metadata_interner`.
296    /// Lock-free on read so concurrent rayon workers share one
297    /// instance without contention.
298    pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
299    #[cfg(feature = "ml")]
300    pub ml_score_cache: HashMap<(String, String), f64>,
301    #[cfg(feature = "ml")]
302    pub ml_cache_order: VecDeque<(String, String)>,
303    #[cfg(feature = "ml")]
304    pub ml_cache_bytes: usize,
305    #[cfg(feature = "ml")]
306    /// Detector matches queued for batch ML scoring at the end of the scan.
307    pub ml_pending: Vec<MlPendingMatch>,
308}
309
310impl ScanState {
311    /// Intern a credential string, returning an `Arc<str>`.
312    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
313        if let Some(existing) = self.credential_interner.get(s) {
314            existing.clone()
315        } else {
316            let shared: Arc<str> = Arc::from(s);
317            self.credential_interner.insert(shared.clone());
318            shared
319        }
320    }
321
322    /// Intern a metadata string (detector_id, name, service, source_type, ...).
323    ///
324    /// Lookup order:
325    ///   1. Scanner-wide `StaticInterner` (vyre CHD perfect hash) for
326    ///      detector metadata that's frozen at scanner construction -
327    ///      O(1), no allocation, no lock contention.
328    ///   2. Per-scan `metadata_interner` `HashSet` for dynamic strings
329    ///      (file paths, commit SHAs, author names, dates).
330    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
331        if let Some(intern) = self.static_intern.as_ref() {
332            if let Some(arc) = intern.lookup(s) {
333                return arc;
334            }
335        }
336        if let Some(existing) = self.metadata_interner.get(s) {
337            return existing.clone();
338        }
339        let shared: Arc<str> = Arc::from(s);
340        self.metadata_interner.insert(shared.clone());
341        shared
342    }
343
344    /// Construct a `ScanState` that consults the scanner-wide static
345    /// interner first. Use this from any path that has a
346    /// `&CompiledScanner` in scope; falls back to `default()` for
347    /// stand-alone unit tests.
348    pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
349        Self {
350            static_intern: Some(intern),
351            ..Self::default()
352        }
353    }
354
355    /// Push a match to the state, maintaining priority and capacity.
356    /// High-confidence secrets will displace lower-confidence findings.
357    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
358        if self.matches.len() < limit {
359            self.matches.push(Reverse(m));
360        } else if let Some(mut lowest) = self.matches.peek_mut() {
361            if m > lowest.0 {
362                *lowest = Reverse(m);
363            }
364        }
365    }
366
367    /// Drain all matches into a sorted vector. Dedups identical findings
368    /// (same detector + same credential + same offset) - two engines can
369    /// produce the same finding for the same pattern (e.g. ac_map's
370    /// literal hit + homoglyph fallback variant both fire on plain ASCII
371    /// because the homoglyph char-class includes the original char). The
372    /// caller only wants one of them in the result set.
373    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
374        let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
375        // Sort descending by confidence for final output
376        matches.sort_by(|a, b| b.cmp(a));
377        // Dedup identical findings (same detector + credential + offset).
378        // 0 or 1 match cannot contain a duplicate, so skip all dedup work -
379        // no HashSet alloc, no refcount traffic - on the overwhelmingly
380        // common small-chunk case.
381        if matches.len() <= 1 {
382            return matches;
383        }
384        // For small N a sort-based adjacent dedup beats a HashSet: it adds
385        // no allocation and no `Arc::clone` (two atomics per match) - it
386        // only borrows the identity fields for comparison. The Vec is
387        // already sorted confidence-descending above; `sort_by` is a STABLE
388        // sort, so grouping by (detector_id, credential, offset) preserves
389        // that confidence-descending order within each identity group. The
390        // first element of each run is therefore the highest-confidence
391        // entry, which `dedup_by` keeps. A final `b.cmp(a)` restores the
392        // canonical output order. Same result as the HashSet path, no alloc.
393        if matches.len() <= 64 {
394            matches.sort_by(|a, b| {
395                a.detector_id
396                    .cmp(&b.detector_id)
397                    .then_with(|| a.credential.cmp(&b.credential))
398                    .then_with(|| a.location.offset.cmp(&b.location.offset))
399            });
400            matches.dedup_by(|a, b| {
401                a.detector_id == b.detector_id
402                    && a.credential == b.credential
403                    && a.location.offset == b.location.offset
404            });
405            // Restore confidence-descending order for output.
406            matches.sort_by(|a, b| b.cmp(a));
407            return matches;
408        }
409        // Large N: HashSet dedup amortises better than repeated sorts.
410        // Stable: keeps the highest-confidence entry of any duplicate set
411        // thanks to the confidence sort above.
412        let mut seen: std::collections::HashSet<(std::sync::Arc<str>, std::sync::Arc<str>, usize)> =
413            std::collections::HashSet::with_capacity(matches.len());
414        matches.retain(|m| {
415            seen.insert((
416                std::sync::Arc::clone(&m.detector_id),
417                std::sync::Arc::clone(&m.credential),
418                m.location.offset,
419            ))
420        });
421        matches
422    }
423}