Skip to main content

keyhog_scanner/
scanner_config.rs

1//! Scanner configuration and scan state types.
2
3use std::cmp::Reverse;
4use std::collections::{BinaryHeap, HashSet};
5#[cfg(feature = "ml")]
6use std::collections::{HashMap, VecDeque};
7use std::sync::Arc;
8
/// Configuration for the scanner's decoding and processing heuristics.
///
/// Normally built from `keyhog_core::config::ScanConfig` via the `From`
/// impl in this file, which also runs [`ScannerConfig::sanitise`] so the
/// numeric fields below start out inside their valid ranges. All fields
/// are public, so values assigned afterwards are NOT re-validated unless
/// `sanitise` is called again.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Maximum recursion depth for decode-through (base64, hex, etc.).
    /// `sanitise` caps this at 32.
    pub max_decode_depth: usize,
    /// Validate decoded strings (e.g. check if decoded base64 is UTF-8).
    /// Always set to `true` by the `From<ScanConfig>` conversion.
    pub validate_decode: bool,
    /// Enable entropy-based detection
    pub entropy_enabled: bool,
    /// Threshold for entropy-based detection. `sanitise` keeps it within
    /// `0.0..=8.0` (8.0 being the upper bound for byte-level Shannon
    /// entropy) and replaces NaN / infinite / negative values with 4.5.
    pub entropy_threshold: f64,
    /// Enable entropy-based detection in source code files
    pub entropy_in_source_files: bool,
    /// Enable ML-based confidence scoring
    pub ml_enabled: bool,
    /// ML weight for confidence scoring, 0.0-1.0. `sanitise` clamps into
    /// that range and replaces non-finite values with 0.6.
    pub ml_weight: f64,
    /// Minimum confidence threshold for matches, 0.0-1.0. `sanitise`
    /// clamps into that range and replaces non-finite values with 0.3.
    pub min_confidence: f64,
    /// Enable Unicode normalization
    pub unicode_normalization: bool,
    /// Maximum bytes for decode-through processing. Populated from
    /// `ScanConfig::decode_size_limit`; not touched by `sanitise`.
    pub max_decode_bytes: usize,
    /// Maximum matches to collect per chunk before stopping.
    /// Prevents OOM on extremely noisy files. `sanitise` caps this at
    /// 1_000_000 and turns 0 into 1000.
    pub max_matches_per_chunk: usize,
    /// When `true`, credentials inside source-code comments are
    /// treated as first-class findings (no confidence downgrade,
    /// no comment-context multiplier). Mirrors
    /// `keyhog_core::config::ScanConfig::scan_comments` and the
    /// CLI's `--scan-comments` flag. See that field's doc for why
    /// the default is off.
    pub scan_comments: bool,
    /// Configuration for multiline concatenation. The
    /// `From<ScanConfig>` conversion always uses
    /// `MultilineConfig::default()` here.
    pub multiline: crate::multiline::MultilineConfig,
    /// Known secret prefixes used to boost confidence.
    pub known_prefixes: Vec<String>,
    /// Keywords indicating a secret context (e.g. "api_key", "token").
    pub secret_keywords: Vec<String>,
    /// Keywords indicating a test/mock context (e.g. "test", "fake").
    pub test_keywords: Vec<String>,
    /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
    pub placeholder_keywords: Vec<String>,
}
53
54impl Default for ScannerConfig {
55    fn default() -> Self {
56        keyhog_core::config::ScanConfig::default().into()
57    }
58}
59
60impl ScannerConfig {
61    pub fn fast() -> Self {
62        Self {
63            max_decode_depth: 0,
64            ml_enabled: false,
65            entropy_enabled: false,
66            ..Default::default()
67        }
68    }
69
70    pub fn thorough() -> Self {
71        Self {
72            max_decode_depth: 10,
73            ml_enabled: true,
74            entropy_enabled: true,
75            min_confidence: 0.5,
76            ..Default::default()
77        }
78    }
79
80    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
81        self.min_confidence = min_confidence;
82        self
83    }
84
85    /// Clamp every float field into its valid range and replace any
86    /// NaN with a safe default. A user-supplied
87    /// `--min-confidence=-5.0` or a corrupt config TOML feeding
88    /// `min_confidence = nan` would otherwise NaN-infect the
89    /// confidence-comparison path and silently drop every finding
90    /// (NaN comparisons are always false, so `conf < min_confidence`
91    /// is `false`, but `conf >= min_confidence` is also `false`,
92    /// behaviour-dependent on the call site).
93    ///
94    /// Idempotent - sanitising an already-sane config is a no-op.
95    /// Called inside `From<ScanConfig>` so any path that constructs
96    /// a ScannerConfig from a user-influenced source pays this
97    /// once at config-build time.
98    pub fn sanitise(&mut self) {
99        // Probabilities: clamp to [0.0, 1.0], NaN → default.
100        if !self.ml_weight.is_finite() {
101            self.ml_weight = 0.6;
102        } else {
103            self.ml_weight = self.ml_weight.clamp(0.0, 1.0);
104        }
105        if !self.min_confidence.is_finite() {
106            self.min_confidence = 0.3;
107        } else {
108            self.min_confidence = self.min_confidence.clamp(0.0, 1.0);
109        }
110        // Shannon entropy: 8.0 is the upper bound for byte-level
111        // entropy. NaN / negative → conservative default.
112        if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
113            self.entropy_threshold = 4.5;
114        } else if self.entropy_threshold > 8.0 {
115            self.entropy_threshold = 8.0;
116        }
117        // Recursion-depth + chunk-size caps. Production-bound the
118        // worst case: max_decode_depth > 32 risks stack overflow on
119        // pathological nested base64. max_matches_per_chunk has no
120        // theoretical upper bound but a billion is misconfiguration.
121        if self.max_decode_depth > 32 {
122            self.max_decode_depth = 32;
123        }
124        if self.max_matches_per_chunk > 1_000_000 {
125            self.max_matches_per_chunk = 1_000_000;
126        }
127        if self.max_matches_per_chunk == 0 {
128            self.max_matches_per_chunk = 1000;
129        }
130    }
131}
132
133impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
134    fn from(config: keyhog_core::config::ScanConfig) -> Self {
135        let mut out = Self {
136            max_decode_depth: config.max_decode_depth,
137            validate_decode: true,
138            entropy_enabled: config.entropy_enabled,
139            entropy_threshold: config.entropy_threshold,
140            entropy_in_source_files: config.entropy_in_source_files,
141            ml_enabled: config.ml_enabled,
142            ml_weight: config.ml_weight,
143            min_confidence: config.min_confidence,
144            unicode_normalization: config.unicode_normalization,
145            max_decode_bytes: config.decode_size_limit,
146            max_matches_per_chunk: config.max_matches_per_chunk,
147            scan_comments: config.scan_comments,
148            multiline: crate::multiline::MultilineConfig::default(),
149            known_prefixes: config.known_prefixes,
150            secret_keywords: config.secret_keywords,
151            test_keywords: config.test_keywords,
152            placeholder_keywords: config.placeholder_keywords,
153        };
154        // Defensive clamp + NaN scrub on every user-influenced
155        // numeric field. Idempotent. See `ScannerConfig::sanitise`
156        // for rationale.
157        out.sanitise();
158        out
159    }
160}
161
/// Deferred ML match waiting for batch inference at the end of a scan.
///
/// Only compiled when the `ml` feature is enabled. Instances are
/// collected in `ScanState::ml_pending`.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// The raw match built with heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence before ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text for feature extraction.
    pub credential: String,
    /// Surrounding context passed to the ML scorer.
    pub ml_context: String,
}
177
/// Internal state for a single scan operation (tracks matches and ML cache).
///
/// `Default` yields an empty state with no static interner attached;
/// use `ScanState::with_static_intern` to attach one.
#[derive(Default)]
pub struct ScanState {
    /// Matches collected for this chunk, prioritized by confidence.
    /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
    /// Interner for credentials found in this chunk to save memory on duplicates.
    pub credential_interner: HashSet<Arc<str>>,
    /// Static string cache for detector metadata. Uses
    /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
    /// cache miss allocates ONLY the `Arc<str>` - the prior shape
    /// also allocated a `String` to serve as the HashMap key, paying
    /// twice for what's a single dedup slot. `HashSet::get(&s)` works
    /// via `Arc<str>: Borrow<str>`, no allocation on hits.
    ///
    /// Hit ONLY by dynamic strings now: the scanner-wide
    /// `StaticInterner` (vyre CHD perfect hash) handles every
    /// `(detector_id, detector_name, service, source_type)` lookup
    /// without per-scan allocation.
    pub metadata_interner: HashSet<Arc<str>>,
    /// Optional reference to the scanner's frozen static-string
    /// interner. When `Some`, `intern_metadata` checks here first
    /// before falling through to the per-scan `metadata_interner`.
    /// Lock-free on read so concurrent rayon workers share one
    /// instance without contention.
    pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
    /// Cached ML scores, present only with the `ml` feature.
    /// NOTE(review): the `(String, String)` key is presumably
    /// `(credential, context)` - confirm against the code that fills it.
    #[cfg(feature = "ml")]
    pub ml_score_cache: HashMap<(String, String), f64>,
    /// Keys of `ml_score_cache` held in a queue.
    /// NOTE(review): presumably insertion order, used for eviction -
    /// the eviction logic is not in this file; confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_order: VecDeque<(String, String)>,
    /// NOTE(review): presumably a running byte total for the ML cache -
    /// confirm against the code that updates it.
    #[cfg(feature = "ml")]
    pub ml_cache_bytes: usize,
    #[cfg(feature = "ml")]
    /// Detector matches deferred for batch ML scoring at the end of the scan.
    pub ml_pending: Vec<MlPendingMatch>,
}
214
215impl ScanState {
216    /// Intern a credential string, returning an `Arc<str>`.
217    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
218        if let Some(existing) = self.credential_interner.get(s) {
219            existing.clone()
220        } else {
221            let shared: Arc<str> = Arc::from(s);
222            self.credential_interner.insert(shared.clone());
223            shared
224        }
225    }
226
227    /// Intern a metadata string (detector_id, name, service, source_type, ...).
228    ///
229    /// Lookup order:
230    ///   1. Scanner-wide `StaticInterner` (vyre CHD perfect hash) for
231    ///      detector metadata that's frozen at scanner construction -
232    ///      O(1), no allocation, no lock contention.
233    ///   2. Per-scan `metadata_interner` `HashSet` for dynamic strings
234    ///      (file paths, commit SHAs, author names, dates).
235    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
236        if let Some(intern) = self.static_intern.as_ref() {
237            if let Some(arc) = intern.lookup(s) {
238                return arc;
239            }
240        }
241        if let Some(existing) = self.metadata_interner.get(s) {
242            return existing.clone();
243        }
244        let shared: Arc<str> = Arc::from(s);
245        self.metadata_interner.insert(shared.clone());
246        shared
247    }
248
249    /// Construct a `ScanState` that consults the scanner-wide static
250    /// interner first. Use this from any path that has a
251    /// `&CompiledScanner` in scope; falls back to `default()` for
252    /// stand-alone unit tests.
253    pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
254        Self {
255            static_intern: Some(intern),
256            ..Self::default()
257        }
258    }
259
260    /// Push a match to the state, maintaining priority and capacity.
261    /// High-confidence secrets will displace lower-confidence findings.
262    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
263        if self.matches.len() < limit {
264            self.matches.push(Reverse(m));
265        } else if let Some(mut lowest) = self.matches.peek_mut() {
266            if m > lowest.0 {
267                *lowest = Reverse(m);
268            }
269        }
270    }
271
272    /// Drain all matches into a sorted vector. Dedups identical findings
273    /// (same detector + same credential + same offset) - two engines can
274    /// produce the same finding for the same pattern (e.g. ac_map's
275    /// literal hit + homoglyph fallback variant both fire on plain ASCII
276    /// because the homoglyph char-class includes the original char). The
277    /// caller only wants one of them in the result set.
278    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
279        let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
280        // Sort descending by confidence for final output
281        matches.sort_by(|a, b| b.cmp(a));
282        // Dedup identical findings. Stable: keeps the highest-confidence
283        // entry of any duplicate set thanks to the sort above.
284        let mut seen: std::collections::HashSet<(std::sync::Arc<str>, std::sync::Arc<str>, usize)> =
285            std::collections::HashSet::with_capacity(matches.len());
286        matches.retain(|m| {
287            seen.insert((
288                std::sync::Arc::clone(&m.detector_id),
289                std::sync::Arc::clone(&m.credential),
290                m.location.offset,
291            ))
292        });
293        matches
294    }
295}