keyhog-scanner 0.5.40

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
//! Scanner configuration and scan state types.

use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashSet};
#[cfg(feature = "ml")]
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;

/// Configuration for the scanner's decoding and processing heuristics.
///
/// Usually obtained from `keyhog_core::config::ScanConfig` through the
/// `From` conversion in this file, which also runs
/// [`ScannerConfig::sanitise`]; `Default` delegates to that same conversion.
/// Every field is public, so code that assigns a numeric field directly
/// skips that scrub and should call `sanitise` itself afterwards.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Maximum recursion depth for decode-through (base64, hex, etc.).
    /// `sanitise` caps this at 32.
    pub max_decode_depth: usize,
    /// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
    pub validate_decode: bool,
    /// Enable entropy-based detection
    pub entropy_enabled: bool,
    /// Threshold for entropy-based detection. `sanitise` keeps it within
    /// `0.0..=8.0`; a non-finite or negative value is replaced by 4.5.
    pub entropy_threshold: f64,
    /// Enable entropy-based detection in source code files
    pub entropy_in_source_files: bool,
    /// Route entropy-fallback candidates through the MoE with the model
    /// AUTHORITATIVE (no entropy-magnitude floor) instead of the bare entropy
    /// heuristic. Mirrors `keyhog_core::config::ScanConfig::entropy_ml_authoritative`
    /// and the CLI `--no-entropy-ml-scoring` opt-out. No-op unless both
    /// `entropy_enabled` and `ml_enabled` are set. See `apply_ml_batch_scores`
    /// and `scan_entropy_fallback`.
    pub entropy_ml_authoritative: bool,
    /// Admit generic keyword-bridge values (`PASSWORD=`, `*_PASS=`, `secret:`,
    /// `api_key=` ...) on the relaxed `generic-keyword-secret` entropy floor
    /// instead of the high `generic-secret` floor. Mirrors
    /// `keyhog_core::config::ScanConfig::generic_keyword_low_entropy` and the CLI
    /// `--no-keyword-low-entropy` opt-out. The keyword key is the evidence;
    /// precision is carried by the MoE + shape filters. See
    /// `scan_generic_assignments`.
    pub generic_keyword_low_entropy: bool,
    /// Enable ML-based confidence scoring
    pub ml_enabled: bool,
    /// ML weight for confidence scoring, 0.0-1.0 (`sanitise` clamps a finite
    /// value into that range).
    pub ml_weight: f64,
    /// Minimum confidence threshold for matches, 0.0-1.0 (`sanitise` clamps a
    /// finite value into that range).
    pub min_confidence: f64,
    /// Enable Unicode normalization
    pub unicode_normalization: bool,
    /// Maximum bytes for decode-through processing
    pub max_decode_bytes: usize,
    /// Maximum matches to collect per chunk before stopping.
    /// Prevents OOM on extremely noisy files. `sanitise` caps this at
    /// 1_000_000 and replaces 0 with 1000.
    pub max_matches_per_chunk: usize,
    /// When `true`, credentials inside source-code comments are
    /// treated as first-class findings (no confidence downgrade,
    /// no comment-context multiplier). Mirrors
    /// `keyhog_core::config::ScanConfig::scan_comments` and the
    /// CLI's `--scan-comments` flag. See that field's doc for why
    /// the default is off.
    pub scan_comments: bool,
    /// Configuration for multiline concatenation. Has no `ScanConfig`
    /// counterpart: the `From<ScanConfig>` conversion fills in
    /// `MultilineConfig::default()`.
    pub multiline: crate::multiline::MultilineConfig,
    /// Known secret prefixes used to boost confidence.
    pub known_prefixes: Vec<String>,
    /// Keywords indicating a secret context (e.g. "api_key", "token").
    pub secret_keywords: Vec<String>,
    /// Keywords indicating a test/mock context (e.g. "test", "fake").
    pub test_keywords: Vec<String>,
    /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
    pub placeholder_keywords: Vec<String>,
    /// Apply test/example path confidence and hard-suppression heuristics.
    /// The CLI disables this for `--no-suppress-test-fixtures`. The
    /// `From<ScanConfig>` conversion always starts it at `true`.
    pub penalize_test_paths: bool,
}

impl Default for ScannerConfig {
    fn default() -> Self {
        // The core crate owns the canonical defaults; the scanner default is
        // simply that config run through the `From<ScanConfig>` conversion.
        let canonical = keyhog_core::config::ScanConfig::default();
        Self::from(canonical)
    }
}

impl ScannerConfig {
    /// Confidence floor for [`ScannerConfig::high_precision`]. Distinct from the
    /// canonical `ScanConfig::default()` floor (0.40) on purpose: precision mode
    /// trades recall for a near-zero false-positive rate at mass-scan scale.
    pub const HIGH_PRECISION_MIN_CONFIDENCE: f64 = 0.85;

    /// Speed-first preset: decode-through recursion is disabled
    /// (`max_decode_depth = 0`) and both ML scoring and entropy-based
    /// detection are switched off. Every other knob inherits `Default`.
    pub fn fast() -> Self {
        Self {
            max_decode_depth: 0,
            ml_enabled: false,
            entropy_enabled: false,
            ..Default::default()
        }
    }

    /// Deep-scan preset: decode-through depth 10 with ML scoring and
    /// entropy-based detection switched on. Every other knob inherits
    /// `Default`.
    pub fn thorough() -> Self {
        // `min_confidence` intentionally omitted: it inherits the canonical
        // `ScanConfig::default()` floor (single source of truth) instead of
        // forking a second literal. Deep scanning widens decode/entropy, not
        // the confidence bar.
        Self {
            max_decode_depth: 10,
            ml_enabled: true,
            entropy_enabled: true,
            ..Default::default()
        }
    }

    /// High-precision mass-scan preset: minimise false positives at the cost of
    /// some recall, for scanning huge corpora where every FP is expensive to
    /// triage. It drops the entropy sweep and keeps decoding shallow; ML
    /// scoring is NOT overridden here and stays at the inherited default.
    ///
    /// - `entropy_enabled = false`: generic high-entropy matching is the single
    ///   largest FP source; precision mode drops it entirely.
    /// - `ml_enabled` is inherited from `Default` (expected to be `true`): ML
    ///   is the confidence discriminator that lifts genuine secrets over the
    ///   high floor while leaving FP-shaped tokens below it. Disabling it would
    ///   crater the scores the 0.85 bar relies on, so precision KEEPS ML (this
    ///   mode trades recall for precision, not for speed — use `--fast` when
    ///   speed is the goal).
    /// - `min_confidence = HIGH_PRECISION_MIN_CONFIDENCE` (0.85): combined with
    ///   the engine's checksum policy (valid token → floored 0.9, invalid →
    ///   capped 0.1) and clamped over every detector's self-declared floor, this
    ///   bar admits checksum-validated tokens and strong ML-scored findings while
    ///   dropping checksum-failures and weak-signal matches.
    /// - `max_decode_depth = 1`: deep-decoded payloads are a FP source at scale.
    ///
    /// `penalize_test_paths` stays on (the default) to suppress fixture-shaped
    /// hits. A `--min-confidence` override still layers on top of this preset.
    pub fn high_precision() -> Self {
        Self {
            max_decode_depth: 1,
            entropy_enabled: false,
            // High-precision mode does not admit low-entropy keyword-anchored
            // values: that surface trades precision for real-world recall, the
            // opposite of this preset's contract. Restores the high
            // `generic-secret` floor.
            generic_keyword_low_entropy: false,
            min_confidence: Self::HIGH_PRECISION_MIN_CONFIDENCE,
            ..Default::default()
        }
    }

    /// Builder-style override of the confidence floor; consumes and returns
    /// the config so it can be chained onto a preset.
    ///
    /// The argument is validated instead of stored blindly, because this is
    /// the path a user-supplied override takes and it does not go through
    /// `From<ScanConfig>`:
    ///
    /// - a finite value is clamped into `0.0..=1.0`, matching what
    ///   [`ScannerConfig::sanitise`] does for probabilities;
    /// - a non-finite value (NaN, ±infinity) is ignored and the floor already
    ///   stored in `self` is kept, so a malformed override can neither
    ///   NaN-infect the comparison path nor silently loosen a preset such as
    ///   [`ScannerConfig::high_precision`].
    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
        if min_confidence.is_finite() {
            self.min_confidence = min_confidence.clamp(0.0, 1.0);
        }
        self
    }

    /// Clamp the numeric knobs into their valid ranges and replace non-finite
    /// floats with a safe default.
    ///
    /// A user-supplied `--min-confidence=-5.0` or a corrupt config TOML
    /// feeding `min_confidence = nan` would otherwise reach the
    /// confidence-comparison path unchecked. Every ordered comparison against
    /// NaN is false (`conf < min_confidence` and `conf >= min_confidence`
    /// alike), so whether findings are all kept or all dropped would depend
    /// on how each call site happens to phrase its check.
    ///
    /// Idempotent - sanitising an already-sane config is a no-op.
    /// Called inside `From<ScanConfig>` so any path that constructs
    /// a ScannerConfig from a user-influenced source pays this
    /// once at config-build time.
    pub fn sanitise(&mut self) {
        // Probabilities: clamp to [0.0, 1.0], non-finite → canonical default.
        // The fallbacks READ FROM `ScanConfig::default()` rather than
        // repeating a literal, so a corrupt-config scrub can never fork from
        // the shipped floor (currently ml_weight 0.5, min_confidence 0.40) -
        // one source.
        let canon = keyhog_core::config::ScanConfig::default();
        self.ml_weight = Self::scrub_probability(self.ml_weight, canon.ml_weight);
        self.min_confidence = Self::scrub_probability(self.min_confidence, canon.min_confidence);

        // Shannon entropy: 8.0 is the upper bound for byte-level
        // entropy. Non-finite / negative → conservative default.
        // NOTE(review): unlike the probabilities above, this fallback is a
        // literal rather than `canon.entropy_threshold`; kept as-is so the
        // scrubbed value does not change - confirm that is intentional.
        if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
            self.entropy_threshold = 4.5;
        } else if self.entropy_threshold > 8.0 {
            self.entropy_threshold = 8.0;
        }

        // Recursion-depth + chunk-size caps. Production-bound the
        // worst case: max_decode_depth > 32 risks stack overflow on
        // pathological nested base64. max_matches_per_chunk has no
        // theoretical upper bound but a billion is misconfiguration.
        self.max_decode_depth = self.max_decode_depth.min(32);
        self.max_matches_per_chunk = match self.max_matches_per_chunk {
            // A zero limit would admit no matches at all; use 1000 instead.
            0 => 1000,
            limit => limit.min(1_000_000),
        };
    }

    /// Scrub one probability-valued knob: a finite `value` is clamped into
    /// `0.0..=1.0`; anything else (NaN, ±infinity) yields `fallback` unchanged.
    fn scrub_probability(value: f64, fallback: f64) -> f64 {
        if value.is_finite() {
            value.clamp(0.0, 1.0)
        } else {
            fallback
        }
    }
}

impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
    fn from(config: keyhog_core::config::ScanConfig) -> Self {
        // Pull out exactly the knobs the scanner shares with the core config.
        // Each one keeps the SAME name on both sides: no rename and no
        // invented value, so this conversion cannot introduce drift between
        // the two `Default`s and the benched/shipped path.
        //
        // The trailing `..` skips `ScanConfig::{min_secret_len, max_file_size,
        // dedup}` on purpose: the scanner reads none of them, so carrying
        // them over would be dead state that could drift. `max_file_size` is
        // enforced independently at the source walker (`keyhog-sources`,
        // `FilesystemSource::with_max_file_size`); `dedup` is applied by the
        // verifier via `DedupScope`; `min_secret_len` currently has no reader
        // at all. They stay on `ScanConfig` (covered by core config tests)
        // but have no `ScannerConfig` peer by design.
        let keyhog_core::config::ScanConfig {
            max_decode_depth,
            validate_decode,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            entropy_ml_authoritative,
            generic_keyword_low_entropy,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            max_decode_bytes,
            max_matches_per_chunk,
            scan_comments,
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
            ..
        } = config;

        let mut scanner_config = Self {
            max_decode_depth,
            validate_decode,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            entropy_ml_authoritative,
            generic_keyword_low_entropy,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            max_decode_bytes,
            max_matches_per_chunk,
            scan_comments,
            // No `ScanConfig` counterpart: the type lives in this crate, and
            // `keyhog-core` cannot depend on `keyhog-scanner` without a
            // dependency cycle, so the scanner default is used.
            multiline: crate::multiline::MultilineConfig::default(),
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
            // Scanner-only knob; the CLI flips it off for
            // `--no-suppress-test-fixtures`.
            penalize_test_paths: true,
        };

        // Defensive clamp + NaN scrub on every user-influenced numeric
        // field. Idempotent; `ScannerConfig::sanitise` has the rationale.
        scanner_config.sanitise();
        scanner_config
    }
}

/// Queued ML match waiting for batch inference at the end of a scan.
///
/// Only compiled when the `ml` feature is enabled. Instances are collected
/// in `ScanState::ml_pending`.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// The raw match built with heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence before ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text for feature extraction.
    pub credential: String,
    /// Surrounding context passed to the ML scorer.
    pub ml_context: String,
    /// When true, the MoE score is AUTHORITATIVE for this candidate: the final
    /// confidence is the model score directly, NOT `max(heuristic, ml)`. Set for
    /// entropy-fallback candidates, whose "heuristic" is bare entropy magnitude -
    /// exactly the signal that mislabels high-entropy non-secrets (FQDNs, git
    /// SHAs, base64 blobs) as findings. Flooring by that heuristic (as the
    /// detector path does, where the regex IS positive evidence) would defeat the
    /// model's ability to suppress those FPs. Detector/generic matches set this
    /// false and keep the heuristic floor. See `apply_ml_batch_scores`.
    pub model_authoritative: bool,
}

/// Internal state for a single scan operation (tracks matches and ML cache).
///
/// `Default` yields an empty state with no static interner attached; use
/// `ScanState::with_static_intern` to attach one.
#[derive(Default)]
pub struct ScanState {
    /// Matches collected for this chunk, prioritized by confidence.
    /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
    /// Interner for credentials found in this chunk to save memory on duplicates.
    pub credential_interner: HashSet<Arc<str>>,
    /// Static string cache for detector metadata. Uses
    /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
    /// cache miss allocates ONLY the `Arc<str>` - the prior shape
    /// also allocated a `String` to serve as the HashMap key, paying
    /// twice for what's a single dedup slot. `HashSet::get(&s)` works
    /// via `Arc<str>: Borrow<str>`, no allocation on hits.
    ///
    /// Hit ONLY by dynamic strings now: the scanner-wide
    /// `StaticInterner` (vyre CHD perfect hash) handles every
    /// `(detector_id, detector_name, service, source_type)` lookup
    /// without per-scan allocation.
    pub metadata_interner: HashSet<Arc<str>>,
    /// Optional reference to the scanner's frozen static-string
    /// interner. When `Some`, `intern_metadata` checks here first
    /// before falling through to the per-scan `metadata_interner`.
    /// Lock-free on read so concurrent rayon workers share one
    /// instance without contention.
    pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
    /// Cached ML scores keyed by a pair of strings.
    /// NOTE(review): the pair is presumably (credential, context) - confirm
    /// against the ML scoring path, which is not in this file.
    #[cfg(feature = "ml")]
    pub ml_score_cache: HashMap<(String, String), f64>,
    /// Queue of cache keys kept alongside `ml_score_cache`.
    /// NOTE(review): presumably insertion order used for eviction - confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_order: VecDeque<(String, String)>,
    /// Byte counter kept alongside `ml_score_cache`.
    /// NOTE(review): presumably the running size of the cached keys, used to
    /// bound the cache - confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_bytes: usize,
    #[cfg(feature = "ml")]
    /// Detector matches queued for batch ML scoring at the end of the scan.
    pub ml_pending: Vec<MlPendingMatch>,
}

impl ScanState {
    /// Intern a credential string, returning an `Arc<str>`.
    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
        if let Some(existing) = self.credential_interner.get(s) {
            existing.clone()
        } else {
            let shared: Arc<str> = Arc::from(s);
            self.credential_interner.insert(shared.clone());
            shared
        }
    }

    /// Intern a metadata string (detector_id, name, service, source_type, ...).
    ///
    /// Lookup order:
    ///   1. Scanner-wide `StaticInterner` (vyre CHD perfect hash) for
    ///      detector metadata that's frozen at scanner construction -
    ///      O(1), no allocation, no lock contention.
    ///   2. Per-scan `metadata_interner` `HashSet` for dynamic strings
    ///      (file paths, commit SHAs, author names, dates).
    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
        if let Some(intern) = self.static_intern.as_ref() {
            if let Some(arc) = intern.lookup(s) {
                return arc;
            }
        }
        if let Some(existing) = self.metadata_interner.get(s) {
            return existing.clone();
        }
        let shared: Arc<str> = Arc::from(s);
        self.metadata_interner.insert(shared.clone());
        shared
    }

    /// Construct a `ScanState` that consults the scanner-wide static
    /// interner first. Use this from any path that has a
    /// `&CompiledScanner` in scope; falls back to `default()` for
    /// stand-alone unit tests.
    pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
        Self {
            static_intern: Some(intern),
            ..Self::default()
        }
    }

    /// Push a match to the state, maintaining priority and capacity.
    /// High-confidence secrets will displace lower-confidence findings.
    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
        if self.matches.len() < limit {
            self.matches.push(Reverse(m));
        } else if let Some(mut lowest) = self.matches.peek_mut() {
            if m > lowest.0 {
                *lowest = Reverse(m);
            }
        }
    }

    /// Drain all matches into a sorted vector. Dedups identical findings
    /// (same detector + same credential + same offset) - two engines can
    /// produce the same finding for the same pattern (e.g. ac_map's
    /// literal hit + homoglyph fallback variant both fire on plain ASCII
    /// because the homoglyph char-class includes the original char). The
    /// caller only wants one of them in the result set.
    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
        let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
        // Sort descending by confidence for final output
        matches.sort_by(|a, b| b.cmp(a));
        // Dedup identical findings (same detector + credential + offset).
        // 0 or 1 match cannot contain a duplicate, so skip all dedup work -
        // no HashSet alloc, no refcount traffic - on the overwhelmingly
        // common small-chunk case.
        if matches.len() <= 1 {
            return matches;
        }
        // For small N a sort-based adjacent dedup beats a HashSet: it adds
        // no allocation and no `Arc::clone` (two atomics per match) - it
        // only borrows the identity fields for comparison. The Vec is
        // already sorted confidence-descending above; `sort_by` is a STABLE
        // sort, so grouping by (detector_id, credential, offset) preserves
        // that confidence-descending order within each identity group. The
        // first element of each run is therefore the highest-confidence
        // entry, which `dedup_by` keeps. A final `b.cmp(a)` restores the
        // canonical output order. Same result as the HashSet path, no alloc.
        if matches.len() <= 64 {
            matches.sort_by(|a, b| {
                a.detector_id
                    .cmp(&b.detector_id)
                    .then_with(|| a.credential.cmp(&b.credential))
                    .then_with(|| a.location.offset.cmp(&b.location.offset))
            });
            matches.dedup_by(|a, b| {
                a.detector_id == b.detector_id
                    && a.credential == b.credential
                    && a.location.offset == b.location.offset
            });
            // Restore confidence-descending order for output.
            matches.sort_by(|a, b| b.cmp(a));
            return matches;
        }
        // Large N: HashSet dedup amortises better than repeated sorts.
        // Stable: keeps the highest-confidence entry of any duplicate set
        // thanks to the confidence sort above.
        let mut seen: std::collections::HashSet<(std::sync::Arc<str>, std::sync::Arc<str>, usize)> =
            std::collections::HashSet::with_capacity(matches.len());
        matches.retain(|m| {
            seen.insert((
                std::sync::Arc::clone(&m.detector_id),
                std::sync::Arc::clone(&m.credential),
                m.location.offset,
            ))
        });
        matches
    }
}