keyhog_scanner/scanner_config.rs
1//! Scanner configuration and scan state types.
2
3use std::cmp::Reverse;
4use std::collections::{BinaryHeap, HashSet};
5#[cfg(feature = "ml")]
6use std::collections::{HashMap, VecDeque};
7use std::sync::Arc;
8
9/// Configuration for the scanner's decoding and processing heuristics.
10#[derive(Debug, Clone)]
11pub struct ScannerConfig {
12 /// Maximum recursion depth for decode-through (base64, hex, etc.)
13 pub max_decode_depth: usize,
14 /// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
15 pub validate_decode: bool,
16 /// Enable entropy-based detection
17 pub entropy_enabled: bool,
18 /// Threshold for entropy-based detection
19 pub entropy_threshold: f64,
20 /// Enable entropy-based detection in source code files
21 pub entropy_in_source_files: bool,
22 /// Route entropy-fallback candidates through the MoE with the model
23 /// AUTHORITATIVE (no entropy-magnitude floor) instead of the bare entropy
24 /// heuristic. Mirrors `keyhog_core::config::ScanConfig::entropy_ml_authoritative`
25 /// and the CLI `--no-entropy-ml-scoring` opt-out. No-op unless both
26 /// `entropy_enabled` and `ml_enabled` are set. See `apply_ml_batch_scores`
27 /// and `scan_entropy_fallback`.
28 pub entropy_ml_authoritative: bool,
29 /// Admit generic keyword-bridge values (`PASSWORD=`, `*_PASS=`, `secret:`,
30 /// `api_key=` ...) on the relaxed `generic-keyword-secret` entropy floor
31 /// instead of the high `generic-secret` floor. Mirrors
32 /// `keyhog_core::config::ScanConfig::generic_keyword_low_entropy` and the CLI
33 /// `--no-keyword-low-entropy` opt-out. The keyword key is the evidence;
34 /// precision is carried by the MoE + shape filters. See
35 /// `scan_generic_assignments`.
36 pub generic_keyword_low_entropy: bool,
37 /// Enable ML-based confidence scoring
38 pub ml_enabled: bool,
39 /// ML weight for confidence scoring, 0.0-1.0
40 pub ml_weight: f64,
41 /// Minimum confidence threshold for matches
42 pub min_confidence: f64,
43 /// Enable Unicode normalization
44 pub unicode_normalization: bool,
45 /// Maximum bytes for decode-through processing
46 pub max_decode_bytes: usize,
47 /// Maximum matches to collect per chunk before stopping.
48 /// Prevents OOM on extremely noisy files.
49 pub max_matches_per_chunk: usize,
50 /// When `true`, credentials inside source-code comments are
51 /// treated as first-class findings (no confidence downgrade,
52 /// no comment-context multiplier). Mirrors
53 /// `keyhog_core::config::ScanConfig::scan_comments` and the
54 /// CLI's `--scan-comments` flag. See that field's doc for why
55 /// the default is off.
56 pub scan_comments: bool,
57 /// Configuration for multiline concatenation
58 pub multiline: crate::multiline::MultilineConfig,
59 /// Known secret prefixes used to boost confidence.
60 pub known_prefixes: Vec<String>,
61 /// Keywords indicating a secret context (e.g. "api_key", "token").
62 pub secret_keywords: Vec<String>,
63 /// Keywords indicating a test/mock context (e.g. "test", "fake").
64 pub test_keywords: Vec<String>,
65 /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
66 pub placeholder_keywords: Vec<String>,
67 /// Apply test/example path confidence and hard-suppression heuristics.
68 /// The CLI disables this for `--no-suppress-test-fixtures`.
69 pub penalize_test_paths: bool,
70}
71
72impl Default for ScannerConfig {
73 fn default() -> Self {
74 keyhog_core::config::ScanConfig::default().into()
75 }
76}
77
78impl ScannerConfig {
79 /// Confidence floor for [`ScannerConfig::high_precision`]. Distinct from the
80 /// canonical `ScanConfig::default()` floor (0.40) on purpose: precision mode
81 /// trades recall for a near-zero false-positive rate at mass-scan scale.
82 pub const HIGH_PRECISION_MIN_CONFIDENCE: f64 = 0.85;
83
84 pub fn fast() -> Self {
85 Self {
86 max_decode_depth: 0,
87 ml_enabled: false,
88 entropy_enabled: false,
89 ..Default::default()
90 }
91 }
92
93 pub fn thorough() -> Self {
94 // `min_confidence` intentionally omitted: it inherits the canonical
95 // `ScanConfig::default()` floor (single source of truth) instead of
96 // forking a second literal. Deep scanning widens decode/entropy, not
97 // the confidence bar.
98 Self {
99 max_decode_depth: 10,
100 ml_enabled: true,
101 entropy_enabled: true,
102 ..Default::default()
103 }
104 }
105
106 /// High-precision mass-scan preset: minimise false positives at the cost of
107 /// some recall, for scanning huge corpora where every FP is expensive to
108 /// triage. Fully offline and fast (no ML, no entropy sweep, shallow decode).
109 ///
110 /// - `entropy_enabled = false`: generic high-entropy matching is the single
111 /// largest FP source; precision mode drops it entirely.
112 /// - `ml_enabled = true` (inherited): ML is the confidence discriminator that
113 /// lifts genuine secrets over the high floor while leaving FP-shaped tokens
114 /// below it. Disabling it would crater the scores the 0.85 bar relies on,
115 /// so precision KEEPS ML (this mode trades recall for precision, not for
116 /// speed — use `--fast` when speed is the goal).
117 /// - `min_confidence = HIGH_PRECISION_MIN_CONFIDENCE` (0.85): combined with
118 /// the engine's checksum policy (valid token → floored 0.9, invalid →
119 /// capped 0.1) and clamped over every detector's self-declared floor, this
120 /// bar admits checksum-validated tokens and strong ML-scored findings while
121 /// dropping checksum-failures and weak-signal matches.
122 /// - `max_decode_depth = 1`: deep-decoded payloads are a FP source at scale.
123 ///
124 /// `penalize_test_paths` stays on (the default) to suppress fixture-shaped
125 /// hits. A `--min-confidence` override still layers on top of this preset.
126 pub fn high_precision() -> Self {
127 Self {
128 max_decode_depth: 1,
129 entropy_enabled: false,
130 // High-precision mode does not admit low-entropy keyword-anchored
131 // values: that surface trades precision for real-world recall, the
132 // opposite of this preset's contract. Restores the high
133 // `generic-secret` floor.
134 generic_keyword_low_entropy: false,
135 min_confidence: Self::HIGH_PRECISION_MIN_CONFIDENCE,
136 ..Default::default()
137 }
138 }
139
140 pub fn min_confidence(mut self, min_confidence: f64) -> Self {
141 self.min_confidence = min_confidence;
142 self
143 }
144
145 /// Clamp every float field into its valid range and replace any
146 /// NaN with a safe default. A user-supplied
147 /// `--min-confidence=-5.0` or a corrupt config TOML feeding
148 /// `min_confidence = nan` would otherwise NaN-infect the
149 /// confidence-comparison path and silently drop every finding
150 /// (NaN comparisons are always false, so `conf < min_confidence`
151 /// is `false`, but `conf >= min_confidence` is also `false`,
152 /// behaviour-dependent on the call site).
153 ///
154 /// Idempotent - sanitising an already-sane config is a no-op.
155 /// Called inside `From<ScanConfig>` so any path that constructs
156 /// a ScannerConfig from a user-influenced source pays this
157 /// once at config-build time.
158 pub fn sanitise(&mut self) {
159 // Probabilities: clamp to [0.0, 1.0], NaN → canonical default. The
160 // NaN fallbacks READ FROM `ScanConfig::default()` rather than repeating
161 // a literal, so a corrupt-config scrub can never fork from the shipped
162 // floor (currently ml_weight 0.5, min_confidence 0.40) - one source.
163 let canon = keyhog_core::config::ScanConfig::default();
164 if !self.ml_weight.is_finite() {
165 self.ml_weight = canon.ml_weight;
166 } else {
167 self.ml_weight = self.ml_weight.clamp(0.0, 1.0);
168 }
169 if !self.min_confidence.is_finite() {
170 self.min_confidence = canon.min_confidence;
171 } else {
172 self.min_confidence = self.min_confidence.clamp(0.0, 1.0);
173 }
174 // Shannon entropy: 8.0 is the upper bound for byte-level
175 // entropy. NaN / negative → conservative default.
176 if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
177 self.entropy_threshold = 4.5;
178 } else if self.entropy_threshold > 8.0 {
179 self.entropy_threshold = 8.0;
180 }
181 // Recursion-depth + chunk-size caps. Production-bound the
182 // worst case: max_decode_depth > 32 risks stack overflow on
183 // pathological nested base64. max_matches_per_chunk has no
184 // theoretical upper bound but a billion is misconfiguration.
185 if self.max_decode_depth > 32 {
186 self.max_decode_depth = 32;
187 }
188 if self.max_matches_per_chunk > 1_000_000 {
189 self.max_matches_per_chunk = 1_000_000;
190 }
191 if self.max_matches_per_chunk == 0 {
192 self.max_matches_per_chunk = 1000;
193 }
194 }
195}
196
197impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
198 fn from(config: keyhog_core::config::ScanConfig) -> Self {
199 // Identity-style mapping: every shared knob carries 1:1 with the
200 // SAME field name on both sides. No rename, no invented value -
201 // the From must not introduce drift, or a tuning baked into one
202 // `Default` silently disagrees with the benched/shipped path.
203 //
204 // `multiline` has no `ScanConfig` counterpart (its type lives in
205 // this crate, and `keyhog-core` cannot depend on `keyhog-scanner`
206 // without a dependency cycle), so it takes the scanner default.
207 //
208 // `ScanConfig::{min_secret_len, max_file_size, dedup}` are NOT
209 // carried: the scanner reads none of them, so mapping them in
210 // would be dead state that could drift. `max_file_size` is
211 // enforced independently at the source walker (`keyhog-sources`,
212 // `FilesystemSource::with_max_file_size`); `dedup` is applied by
213 // the verifier via `DedupScope`; `min_secret_len` currently has no
214 // reader at all. They stay on `ScanConfig` (covered by core config
215 // tests) but have no `ScannerConfig` peer by design.
216 let mut out = Self {
217 max_decode_depth: config.max_decode_depth,
218 validate_decode: config.validate_decode,
219 entropy_enabled: config.entropy_enabled,
220 entropy_threshold: config.entropy_threshold,
221 entropy_in_source_files: config.entropy_in_source_files,
222 entropy_ml_authoritative: config.entropy_ml_authoritative,
223 generic_keyword_low_entropy: config.generic_keyword_low_entropy,
224 ml_enabled: config.ml_enabled,
225 ml_weight: config.ml_weight,
226 min_confidence: config.min_confidence,
227 unicode_normalization: config.unicode_normalization,
228 max_decode_bytes: config.max_decode_bytes,
229 max_matches_per_chunk: config.max_matches_per_chunk,
230 scan_comments: config.scan_comments,
231 multiline: crate::multiline::MultilineConfig::default(),
232 known_prefixes: config.known_prefixes,
233 secret_keywords: config.secret_keywords,
234 test_keywords: config.test_keywords,
235 placeholder_keywords: config.placeholder_keywords,
236 // Scanner-only knob; the CLI flips it off for
237 // `--no-suppress-test-fixtures`.
238 penalize_test_paths: true,
239 };
240 // Defensive clamp + NaN scrub on every user-influenced
241 // numeric field. Idempotent. See `ScannerConfig::sanitise`
242 // for rationale.
243 out.sanitise();
244 out
245 }
246}
247
/// A match parked until the end of a scan, when it is scored by batch ML
/// inference.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// Raw match as built from heuristic confidence alone.
    pub raw_match: keyhog_core::RawMatch,
    /// The heuristic confidence, before any ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context, used for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text handed to feature extraction.
    pub credential: String,
    /// Surrounding context handed to the ML scorer.
    pub ml_context: String,
    /// `true` makes the MoE score AUTHORITATIVE for this candidate: the final
    /// confidence is the model score itself, NOT `max(heuristic, ml)`.
    ///
    /// Set for entropy-fallback candidates, whose "heuristic" is bare entropy
    /// magnitude - exactly the signal that mislabels high-entropy non-secrets
    /// (FQDNs, git SHAs, base64 blobs) as findings. Flooring by that heuristic
    /// (as the detector path does, where the regex IS positive evidence) would
    /// defeat the model's ability to suppress those FPs.
    ///
    /// Detector/generic matches set this to `false` and keep the heuristic
    /// floor. See `apply_ml_batch_scores`.
    pub model_authoritative: bool,
}
272
273/// Internal state for a single scan operation (tracks matches and ML cache).
274#[derive(Default)]
275pub struct ScanState {
276 /// Matches collected for this chunk, prioritized by confidence.
277 /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
278 pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
279 /// Interner for credentials found in this chunk to save memory on duplicates.
280 pub credential_interner: HashSet<Arc<str>>,
281 /// Static string cache for detector metadata. Uses
282 /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
283 /// cache miss allocates ONLY the `Arc<str>` - the prior shape
284 /// also allocated a `String` to serve as the HashMap key, paying
285 /// twice for what's a single dedup slot. `HashSet::get(&s)` works
286 /// via `Arc<str>: Borrow<str>`, no allocation on hits.
287 ///
288 /// Hit ONLY by dynamic strings now: the scanner-wide
289 /// `StaticInterner` (vyre CHD perfect hash) handles every
290 /// `(detector_id, detector_name, service, source_type)` lookup
291 /// without per-scan allocation.
292 pub metadata_interner: HashSet<Arc<str>>,
293 /// Optional reference to the scanner's frozen static-string
294 /// interner. When `Some`, `intern_metadata` checks here first
295 /// before falling through to the per-scan `metadata_interner`.
296 /// Lock-free on read so concurrent rayon workers share one
297 /// instance without contention.
298 pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
299 #[cfg(feature = "ml")]
300 pub ml_score_cache: HashMap<(String, String), f64>,
301 #[cfg(feature = "ml")]
302 pub ml_cache_order: VecDeque<(String, String)>,
303 #[cfg(feature = "ml")]
304 pub ml_cache_bytes: usize,
305 #[cfg(feature = "ml")]
306 /// Detector matches queued for batch ML scoring at the end of the scan.
307 pub ml_pending: Vec<MlPendingMatch>,
308}
309
310impl ScanState {
311 /// Intern a credential string, returning an `Arc<str>`.
312 pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
313 if let Some(existing) = self.credential_interner.get(s) {
314 existing.clone()
315 } else {
316 let shared: Arc<str> = Arc::from(s);
317 self.credential_interner.insert(shared.clone());
318 shared
319 }
320 }
321
322 /// Intern a metadata string (detector_id, name, service, source_type, ...).
323 ///
324 /// Lookup order:
325 /// 1. Scanner-wide `StaticInterner` (vyre CHD perfect hash) for
326 /// detector metadata that's frozen at scanner construction -
327 /// O(1), no allocation, no lock contention.
328 /// 2. Per-scan `metadata_interner` `HashSet` for dynamic strings
329 /// (file paths, commit SHAs, author names, dates).
330 pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
331 if let Some(intern) = self.static_intern.as_ref() {
332 if let Some(arc) = intern.lookup(s) {
333 return arc;
334 }
335 }
336 if let Some(existing) = self.metadata_interner.get(s) {
337 return existing.clone();
338 }
339 let shared: Arc<str> = Arc::from(s);
340 self.metadata_interner.insert(shared.clone());
341 shared
342 }
343
344 /// Construct a `ScanState` that consults the scanner-wide static
345 /// interner first. Use this from any path that has a
346 /// `&CompiledScanner` in scope; falls back to `default()` for
347 /// stand-alone unit tests.
348 pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
349 Self {
350 static_intern: Some(intern),
351 ..Self::default()
352 }
353 }
354
355 /// Push a match to the state, maintaining priority and capacity.
356 /// High-confidence secrets will displace lower-confidence findings.
357 pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
358 if self.matches.len() < limit {
359 self.matches.push(Reverse(m));
360 } else if let Some(mut lowest) = self.matches.peek_mut() {
361 if m > lowest.0 {
362 *lowest = Reverse(m);
363 }
364 }
365 }
366
367 /// Drain all matches into a sorted vector. Dedups identical findings
368 /// (same detector + same credential + same offset) - two engines can
369 /// produce the same finding for the same pattern (e.g. ac_map's
370 /// literal hit + homoglyph fallback variant both fire on plain ASCII
371 /// because the homoglyph char-class includes the original char). The
372 /// caller only wants one of them in the result set.
373 pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
374 let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
375 // Sort descending by confidence for final output
376 matches.sort_by(|a, b| b.cmp(a));
377 // Dedup identical findings (same detector + credential + offset).
378 // 0 or 1 match cannot contain a duplicate, so skip all dedup work -
379 // no HashSet alloc, no refcount traffic - on the overwhelmingly
380 // common small-chunk case.
381 if matches.len() <= 1 {
382 return matches;
383 }
384 // For small N a sort-based adjacent dedup beats a HashSet: it adds
385 // no allocation and no `Arc::clone` (two atomics per match) - it
386 // only borrows the identity fields for comparison. The Vec is
387 // already sorted confidence-descending above; `sort_by` is a STABLE
388 // sort, so grouping by (detector_id, credential, offset) preserves
389 // that confidence-descending order within each identity group. The
390 // first element of each run is therefore the highest-confidence
391 // entry, which `dedup_by` keeps. A final `b.cmp(a)` restores the
392 // canonical output order. Same result as the HashSet path, no alloc.
393 if matches.len() <= 64 {
394 matches.sort_by(|a, b| {
395 a.detector_id
396 .cmp(&b.detector_id)
397 .then_with(|| a.credential.cmp(&b.credential))
398 .then_with(|| a.location.offset.cmp(&b.location.offset))
399 });
400 matches.dedup_by(|a, b| {
401 a.detector_id == b.detector_id
402 && a.credential == b.credential
403 && a.location.offset == b.location.offset
404 });
405 // Restore confidence-descending order for output.
406 matches.sort_by(|a, b| b.cmp(a));
407 return matches;
408 }
409 // Large N: HashSet dedup amortises better than repeated sorts.
410 // Stable: keeps the highest-confidence entry of any duplicate set
411 // thanks to the confidence sort above.
412 let mut seen: std::collections::HashSet<(std::sync::Arc<str>, std::sync::Arc<str>, usize)> =
413 std::collections::HashSet::with_capacity(matches.len());
414 matches.retain(|m| {
415 seen.insert((
416 std::sync::Arc::clone(&m.detector_id),
417 std::sync::Arc::clone(&m.credential),
418 m.location.offset,
419 ))
420 });
421 matches
422 }
423}