keyhog_scanner/scanner_config.rs
1//! Scanner configuration and scan state types.
2
3use std::cmp::Reverse;
4use std::collections::{BinaryHeap, HashSet};
5#[cfg(feature = "ml")]
6use std::collections::{HashMap, VecDeque};
7use std::sync::Arc;
8
9/// Configuration for the scanner's decoding and processing heuristics.
10#[derive(Debug, Clone)]
11pub struct ScannerConfig {
12 /// Maximum recursion depth for decode-through (base64, hex, etc.)
13 pub max_decode_depth: usize,
14 /// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
15 pub validate_decode: bool,
16 /// Enable entropy-based detection
17 pub entropy_enabled: bool,
18 /// Threshold for entropy-based detection
19 pub entropy_threshold: f64,
20 /// Enable entropy-based detection in source code files
21 pub entropy_in_source_files: bool,
22 /// Enable ML-based confidence scoring
23 pub ml_enabled: bool,
24 /// ML weight for confidence scoring, 0.0-1.0
25 pub ml_weight: f64,
26 /// Minimum confidence threshold for matches
27 pub min_confidence: f64,
28 /// Enable Unicode normalization
29 pub unicode_normalization: bool,
30 /// Maximum bytes for decode-through processing
31 pub max_decode_bytes: usize,
32 /// Maximum matches to collect per chunk before stopping.
33 /// Prevents OOM on extremely noisy files.
34 pub max_matches_per_chunk: usize,
35 /// When `true`, credentials inside source-code comments are
36 /// treated as first-class findings (no confidence downgrade,
37 /// no comment-context multiplier). Mirrors
38 /// `keyhog_core::config::ScanConfig::scan_comments` and the
39 /// CLI's `--scan-comments` flag. See that field's doc for why
40 /// the default is off.
41 pub scan_comments: bool,
42 /// Configuration for multiline concatenation
43 pub multiline: crate::multiline::MultilineConfig,
44 /// Known secret prefixes used to boost confidence.
45 pub known_prefixes: Vec<String>,
46 /// Keywords indicating a secret context (e.g. "api_key", "token").
47 pub secret_keywords: Vec<String>,
48 /// Keywords indicating a test/mock context (e.g. "test", "fake").
49 pub test_keywords: Vec<String>,
50 /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
51 pub placeholder_keywords: Vec<String>,
52}
53
54impl Default for ScannerConfig {
55 fn default() -> Self {
56 keyhog_core::config::ScanConfig::default().into()
57 }
58}
59
60impl ScannerConfig {
61 pub fn fast() -> Self {
62 Self {
63 max_decode_depth: 0,
64 ml_enabled: false,
65 entropy_enabled: false,
66 ..Default::default()
67 }
68 }
69
70 pub fn thorough() -> Self {
71 Self {
72 max_decode_depth: 10,
73 ml_enabled: true,
74 entropy_enabled: true,
75 min_confidence: 0.5,
76 ..Default::default()
77 }
78 }
79
80 pub fn min_confidence(mut self, min_confidence: f64) -> Self {
81 self.min_confidence = min_confidence;
82 self
83 }
84
85 /// Clamp every float field into its valid range and replace any
86 /// NaN with a safe default. A user-supplied
87 /// `--min-confidence=-5.0` or a corrupt config TOML feeding
88 /// `min_confidence = nan` would otherwise NaN-infect the
89 /// confidence-comparison path and silently drop every finding
90 /// (NaN comparisons are always false, so `conf < min_confidence`
91 /// is `false`, but `conf >= min_confidence` is also `false`,
92 /// behaviour-dependent on the call site).
93 ///
94 /// Idempotent - sanitising an already-sane config is a no-op.
95 /// Called inside `From<ScanConfig>` so any path that constructs
96 /// a ScannerConfig from a user-influenced source pays this
97 /// once at config-build time.
98 pub fn sanitise(&mut self) {
99 // Probabilities: clamp to [0.0, 1.0], NaN → default.
100 if !self.ml_weight.is_finite() {
101 self.ml_weight = 0.6;
102 } else {
103 self.ml_weight = self.ml_weight.clamp(0.0, 1.0);
104 }
105 if !self.min_confidence.is_finite() {
106 self.min_confidence = 0.3;
107 } else {
108 self.min_confidence = self.min_confidence.clamp(0.0, 1.0);
109 }
110 // Shannon entropy: 8.0 is the upper bound for byte-level
111 // entropy. NaN / negative → conservative default.
112 if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
113 self.entropy_threshold = 4.5;
114 } else if self.entropy_threshold > 8.0 {
115 self.entropy_threshold = 8.0;
116 }
117 // Recursion-depth + chunk-size caps. Production-bound the
118 // worst case: max_decode_depth > 32 risks stack overflow on
119 // pathological nested base64. max_matches_per_chunk has no
120 // theoretical upper bound but a billion is misconfiguration.
121 if self.max_decode_depth > 32 {
122 self.max_decode_depth = 32;
123 }
124 if self.max_matches_per_chunk > 1_000_000 {
125 self.max_matches_per_chunk = 1_000_000;
126 }
127 if self.max_matches_per_chunk == 0 {
128 self.max_matches_per_chunk = 1000;
129 }
130 }
131}
132
133impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
134 fn from(config: keyhog_core::config::ScanConfig) -> Self {
135 let mut out = Self {
136 max_decode_depth: config.max_decode_depth,
137 validate_decode: true,
138 entropy_enabled: config.entropy_enabled,
139 entropy_threshold: config.entropy_threshold,
140 entropy_in_source_files: config.entropy_in_source_files,
141 ml_enabled: config.ml_enabled,
142 ml_weight: config.ml_weight,
143 min_confidence: config.min_confidence,
144 unicode_normalization: config.unicode_normalization,
145 max_decode_bytes: config.decode_size_limit,
146 max_matches_per_chunk: config.max_matches_per_chunk,
147 scan_comments: config.scan_comments,
148 multiline: crate::multiline::MultilineConfig::default(),
149 known_prefixes: config.known_prefixes,
150 secret_keywords: config.secret_keywords,
151 test_keywords: config.test_keywords,
152 placeholder_keywords: config.placeholder_keywords,
153 };
154 // Defensive clamp + NaN scrub on every user-influenced
155 // numeric field. Idempotent. See `ScannerConfig::sanitise`
156 // for rationale.
157 out.sanitise();
158 out
159 }
160}
161
/// A match whose ML scoring has been postponed until batch inference runs
/// at the end of a scan.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// The raw match, built using heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Confidence from heuristics alone, before any ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context, kept for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text used for feature extraction.
    pub credential: String,
    /// Surrounding context handed to the ML scorer.
    pub ml_context: String,
}
177
178/// Internal state for a single scan operation (tracks matches and ML cache).
179#[derive(Default)]
180pub struct ScanState {
181 /// Matches collected for this chunk, prioritized by confidence.
182 /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
183 pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
184 /// Interner for credentials found in this chunk to save memory on duplicates.
185 pub credential_interner: HashSet<Arc<str>>,
186 /// Static string cache for detector metadata. Uses
187 /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
188 /// cache miss allocates ONLY the `Arc<str>` - the prior shape
189 /// also allocated a `String` to serve as the HashMap key, paying
190 /// twice for what's a single dedup slot. `HashSet::get(&s)` works
191 /// via `Arc<str>: Borrow<str>`, no allocation on hits.
192 ///
193 /// Hit ONLY by dynamic strings now: the scanner-wide
194 /// `StaticInterner` (vyre CHD perfect hash) handles every
195 /// `(detector_id, detector_name, service, source_type)` lookup
196 /// without per-scan allocation.
197 pub metadata_interner: HashSet<Arc<str>>,
198 /// Optional reference to the scanner's frozen static-string
199 /// interner. When `Some`, `intern_metadata` checks here first
200 /// before falling through to the per-scan `metadata_interner`.
201 /// Lock-free on read so concurrent rayon workers share one
202 /// instance without contention.
203 pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
204 #[cfg(feature = "ml")]
205 pub ml_score_cache: HashMap<(String, String), f64>,
206 #[cfg(feature = "ml")]
207 pub ml_cache_order: VecDeque<(String, String)>,
208 #[cfg(feature = "ml")]
209 pub ml_cache_bytes: usize,
210 #[cfg(feature = "ml")]
211 /// Detector matches deferred for batch ML scoring at the end of the scan.
212 pub ml_pending: Vec<MlPendingMatch>,
213}
214
215impl ScanState {
216 /// Intern a credential string, returning an `Arc<str>`.
217 pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
218 if let Some(existing) = self.credential_interner.get(s) {
219 existing.clone()
220 } else {
221 let shared: Arc<str> = Arc::from(s);
222 self.credential_interner.insert(shared.clone());
223 shared
224 }
225 }
226
227 /// Intern a metadata string (detector_id, name, service, source_type, ...).
228 ///
229 /// Lookup order:
230 /// 1. Scanner-wide `StaticInterner` (vyre CHD perfect hash) for
231 /// detector metadata that's frozen at scanner construction -
232 /// O(1), no allocation, no lock contention.
233 /// 2. Per-scan `metadata_interner` `HashSet` for dynamic strings
234 /// (file paths, commit SHAs, author names, dates).
235 pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
236 if let Some(intern) = self.static_intern.as_ref() {
237 if let Some(arc) = intern.lookup(s) {
238 return arc;
239 }
240 }
241 if let Some(existing) = self.metadata_interner.get(s) {
242 return existing.clone();
243 }
244 let shared: Arc<str> = Arc::from(s);
245 self.metadata_interner.insert(shared.clone());
246 shared
247 }
248
249 /// Construct a `ScanState` that consults the scanner-wide static
250 /// interner first. Use this from any path that has a
251 /// `&CompiledScanner` in scope; falls back to `default()` for
252 /// stand-alone unit tests.
253 pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
254 Self {
255 static_intern: Some(intern),
256 ..Self::default()
257 }
258 }
259
260 /// Push a match to the state, maintaining priority and capacity.
261 /// High-confidence secrets will displace lower-confidence findings.
262 pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
263 if self.matches.len() < limit {
264 self.matches.push(Reverse(m));
265 } else if let Some(mut lowest) = self.matches.peek_mut() {
266 if m > lowest.0 {
267 *lowest = Reverse(m);
268 }
269 }
270 }
271
272 /// Drain all matches into a sorted vector. Dedups identical findings
273 /// (same detector + same credential + same offset) - two engines can
274 /// produce the same finding for the same pattern (e.g. ac_map's
275 /// literal hit + homoglyph fallback variant both fire on plain ASCII
276 /// because the homoglyph char-class includes the original char). The
277 /// caller only wants one of them in the result set.
278 pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
279 let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
280 // Sort descending by confidence for final output
281 matches.sort_by(|a, b| b.cmp(a));
282 // Dedup identical findings. Stable: keeps the highest-confidence
283 // entry of any duplicate set thanks to the sort above.
284 let mut seen: std::collections::HashSet<(std::sync::Arc<str>, std::sync::Arc<str>, usize)> =
285 std::collections::HashSet::with_capacity(matches.len());
286 matches.retain(|m| {
287 seen.insert((
288 std::sync::Arc::clone(&m.detector_id),
289 std::sync::Arc::clone(&m.credential),
290 m.location.offset,
291 ))
292 });
293 matches
294 }
295}