1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
//! Scanner configuration and scan state types.
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashSet};
#[cfg(feature = "ml")]
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
/// Configuration for the scanner's decoding and processing heuristics.
///
/// Values built through `Default` or `From<keyhog_core::config::ScanConfig>`
/// have already been passed through [`ScannerConfig::sanitise`]. All fields
/// are public, so code that assigns them directly afterwards should call
/// `sanitise` again before handing the config to the scanner.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
/// Maximum recursion depth for decode-through (base64, hex, etc.)
/// `sanitise` caps this at 32; the `fast()` preset uses 0 and
/// `thorough()` uses 10.
pub max_decode_depth: usize,
/// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
/// The `From<ScanConfig>` conversion always sets this to `true`.
pub validate_decode: bool,
/// Enable entropy-based detection
pub entropy_enabled: bool,
/// Threshold for entropy-based detection
/// `sanitise` replaces NaN, infinite or negative values with 4.5 and
/// caps the value at 8.0 (the upper bound for byte-level entropy).
pub entropy_threshold: f64,
/// Enable entropy-based detection in source code files
pub entropy_in_source_files: bool,
/// Enable ML-based confidence scoring
pub ml_enabled: bool,
/// ML weight for confidence scoring, 0.0-1.0
/// `sanitise` clamps into that range; NaN or infinite becomes 0.6.
pub ml_weight: f64,
/// Minimum confidence threshold for matches
/// `sanitise` clamps into 0.0-1.0; NaN or infinite becomes 0.3.
pub min_confidence: f64,
/// Enable Unicode normalization
pub unicode_normalization: bool,
/// Maximum bytes for decode-through processing
/// Filled from `ScanConfig::decode_size_limit` by the `From` conversion.
pub max_decode_bytes: usize,
/// Maximum matches to collect per chunk before stopping.
/// Prevents OOM on extremely noisy files.
/// `sanitise` caps this at 1_000_000 and replaces 0 with 1000.
pub max_matches_per_chunk: usize,
/// When `true`, credentials inside source-code comments are
/// treated as first-class findings (no confidence downgrade,
/// no comment-context multiplier). Mirrors
/// `keyhog_core::config::ScanConfig::scan_comments` and the
/// CLI's `--scan-comments` flag. See that field's doc for why
/// the default is off.
pub scan_comments: bool,
/// Configuration for multiline concatenation
/// The `From<ScanConfig>` conversion always uses
/// `MultilineConfig::default()` here.
pub multiline: crate::multiline::MultilineConfig,
/// Known secret prefixes used to boost confidence.
pub known_prefixes: Vec<String>,
/// Keywords indicating a secret context (e.g. "api_key", "token").
pub secret_keywords: Vec<String>,
/// Keywords indicating a test/mock context (e.g. "test", "fake").
pub test_keywords: Vec<String>,
/// Keywords indicating a placeholder value (e.g. "change_me", "todo").
pub placeholder_keywords: Vec<String>,
}
impl Default for ScannerConfig {
    /// Builds the default scanner configuration by converting the core
    /// crate's default `ScanConfig`; the `From` conversion sanitises the
    /// numeric fields on the way through.
    fn default() -> Self {
        let core_defaults = keyhog_core::config::ScanConfig::default();
        Self::from(core_defaults)
    }
}
impl ScannerConfig {
    /// Fallback `ml_weight` used when the supplied value is NaN or infinite.
    const FALLBACK_ML_WEIGHT: f64 = 0.6;
    /// Fallback `min_confidence` used when the supplied value is NaN or infinite.
    const FALLBACK_MIN_CONFIDENCE: f64 = 0.3;
    /// Fallback `entropy_threshold` used for NaN, infinite or negative input.
    const FALLBACK_ENTROPY_THRESHOLD: f64 = 4.5;
    /// Upper bound for byte-level Shannon entropy (8 bits per byte).
    const MAX_ENTROPY_THRESHOLD: f64 = 8.0;
    /// Deeper decode-through recursion risks stack overflow on
    /// pathological nested encodings.
    const MAX_DECODE_DEPTH: usize = 32;
    /// No theoretical upper bound exists, but anything above this is
    /// treated as misconfiguration.
    const MAX_MATCHES_PER_CHUNK: usize = 1_000_000;
    /// Replacement for a `max_matches_per_chunk` of zero.
    const FALLBACK_MATCHES_PER_CHUNK: usize = 1000;

    /// Speed-oriented preset: decode depth 0, ML scoring off and entropy
    /// detection off. Every other field keeps its default value.
    pub fn fast() -> Self {
        Self {
            max_decode_depth: 0,
            ml_enabled: false,
            entropy_enabled: false,
            ..Default::default()
        }
    }

    /// Coverage-oriented preset: decode depth 10, ML scoring and entropy
    /// detection on, and a minimum confidence of 0.5. Every other field
    /// keeps its default value.
    pub fn thorough() -> Self {
        Self {
            max_decode_depth: 10,
            ml_enabled: true,
            entropy_enabled: true,
            min_confidence: 0.5,
            ..Default::default()
        }
    }

    /// Builder-style setter for the minimum confidence threshold.
    ///
    /// The value goes through the same scrub as [`Self::sanitise`]: it is
    /// clamped to `[0.0, 1.0]`, and NaN or infinite input falls back to 0.3.
    /// Without this, `.min_confidence(f64::NAN)` would bypass the
    /// sanitisation done at construction time and poison every later
    /// confidence comparison. In-range values are stored unchanged.
    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
        self.min_confidence =
            Self::unit_interval_or(min_confidence, Self::FALLBACK_MIN_CONFIDENCE);
        self
    }

    /// Clamp every numeric field into its valid range and replace any
    /// non-finite float with a safe default.
    ///
    /// A user-supplied `--min-confidence=-5.0` or a corrupt config TOML
    /// feeding `min_confidence = nan` would otherwise reach the
    /// confidence-comparison path. Comparisons against NaN are always
    /// false, so the outcome would depend on which way each call site
    /// happens to write its comparison.
    ///
    /// Idempotent: sanitising an already-sane config is a no-op. Called
    /// inside `From<ScanConfig>`, so any config built from a
    /// user-influenced source pays this once at config-build time.
    pub fn sanitise(&mut self) {
        // Probabilities: clamp to [0.0, 1.0], non-finite → default.
        self.ml_weight = Self::unit_interval_or(self.ml_weight, Self::FALLBACK_ML_WEIGHT);
        self.min_confidence =
            Self::unit_interval_or(self.min_confidence, Self::FALLBACK_MIN_CONFIDENCE);

        // Shannon entropy: non-finite or negative → conservative default,
        // otherwise cap at the byte-level maximum.
        let threshold = self.entropy_threshold;
        self.entropy_threshold = if !threshold.is_finite() || threshold < 0.0 {
            Self::FALLBACK_ENTROPY_THRESHOLD
        } else {
            threshold.min(Self::MAX_ENTROPY_THRESHOLD)
        };

        // Recursion-depth and per-chunk caps bound the worst case.
        self.max_decode_depth = self.max_decode_depth.min(Self::MAX_DECODE_DEPTH);
        self.max_matches_per_chunk = match self.max_matches_per_chunk {
            // Zero would make the scanner collect nothing at all.
            0 => Self::FALLBACK_MATCHES_PER_CHUNK,
            requested => requested.min(Self::MAX_MATCHES_PER_CHUNK),
        };
    }

    /// Clamp a finite `value` into `[0.0, 1.0]`; NaN and ±infinity yield
    /// `fallback` instead.
    fn unit_interval_or(value: f64, fallback: f64) -> f64 {
        if value.is_finite() {
            value.clamp(0.0, 1.0)
        } else {
            fallback
        }
    }
}
impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
    /// Converts the core crate's scan configuration into the scanner's own
    /// config type, then sanitises the result.
    ///
    /// `validate_decode` is always switched on and `multiline` always
    /// starts from its default, because neither is read from `config`.
    fn from(config: keyhog_core::config::ScanConfig) -> Self {
        // Pull out exactly the fields this conversion consumes; the owned
        // keyword lists are moved, not cloned.
        let keyhog_core::config::ScanConfig {
            max_decode_depth,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            decode_size_limit,
            max_matches_per_chunk,
            scan_comments,
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
            ..
        } = config;

        let mut scanner_config = Self {
            max_decode_depth,
            validate_decode: true,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            max_decode_bytes: decode_size_limit,
            max_matches_per_chunk,
            scan_comments,
            multiline: crate::multiline::MultilineConfig::default(),
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
        };

        // Range-check every user-influenced numeric field before the value
        // escapes; `sanitise` is idempotent, so doing it here is always safe.
        scanner_config.sanitise();
        scanner_config
    }
}
/// Deferred ML match waiting for batch inference at the end of a scan.
///
/// Only compiled when the `ml` feature is enabled. Instances are queued in
/// `ScanState::ml_pending`.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
/// The raw match built with heuristic confidence only.
pub raw_match: keyhog_core::RawMatch,
/// Heuristic confidence before ML blending.
/// NOTE(review): presumably on the same 0.0-1.0 scale as
/// `ScannerConfig::min_confidence` - confirm against the scorer.
pub heuristic_conf: f64,
/// Inferred code context for post-ML adjustments.
pub code_context: crate::context::CodeContext,
/// Credential text for feature extraction.
pub credential: String,
/// Surrounding context passed to the ML scorer.
pub ml_context: String,
}
/// Internal state for a single scan operation (tracks matches and ML cache).
///
/// Build it with `ScanState::default()` or, when a scanner-wide static
/// interner is available, with `ScanState::with_static_intern`.
#[derive(Default)]
pub struct ScanState {
/// Matches collected for this chunk, prioritized by confidence.
/// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
/// Interner for credentials found in this chunk to save memory on duplicates.
pub credential_interner: HashSet<Arc<str>>,
/// Static string cache for detector metadata. Uses
/// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
/// cache miss allocates ONLY the `Arc<str>` - the prior shape
/// also allocated a `String` to serve as the HashMap key, paying
/// twice for what's a single dedup slot. `HashSet::get(&s)` works
/// via `Arc<str>: Borrow<str>`, no allocation on hits.
///
/// Hit ONLY by dynamic strings now: the scanner-wide
/// `StaticInterner` (vyre CHD perfect hash) handles every
/// `(detector_id, detector_name, service, source_type)` lookup
/// without per-scan allocation.
pub metadata_interner: HashSet<Arc<str>>,
/// Optional reference to the scanner's frozen static-string
/// interner. When `Some`, `intern_metadata` checks here first
/// before falling through to the per-scan `metadata_interner`.
/// Lock-free on read so concurrent rayon workers share one
/// instance without contention.
pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
#[cfg(feature = "ml")]
/// Cached ML scores keyed by a pair of strings.
/// NOTE(review): the key is presumably `(credential, context)` -
/// confirm against the ML scoring code, which is not in this file.
pub ml_score_cache: HashMap<(String, String), f64>,
#[cfg(feature = "ml")]
/// Queue of cache keys.
/// NOTE(review): presumably insertion order, used to evict the oldest
/// `ml_score_cache` entry - confirm against the cache maintenance code.
pub ml_cache_order: VecDeque<(String, String)>,
#[cfg(feature = "ml")]
/// Byte counter for the ML cache.
/// NOTE(review): presumably the running size of the cached keys, used
/// as an eviction budget - confirm against the cache maintenance code.
pub ml_cache_bytes: usize,
#[cfg(feature = "ml")]
/// Detector matches deferred for batch ML scoring at the end of the scan.
pub ml_pending: Vec<MlPendingMatch>,
}
impl ScanState {
    /// Return the shared `Arc<str>` for a credential, allocating it only
    /// the first time this exact text is seen during the scan.
    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
        Self::intern_into(&mut self.credential_interner, s)
    }

    /// Return the shared `Arc<str>` for a metadata string (detector_id,
    /// name, service, source_type, ...).
    ///
    /// Lookup order:
    /// 1. The scanner-wide `StaticInterner`, when one is attached. A hit
    ///    returns immediately and leaves the per-scan pool untouched.
    /// 2. The per-scan `metadata_interner` pool, which picks up everything
    ///    the static interner does not know (dynamic strings such as file
    ///    paths, commit SHAs, author names, dates).
    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
        let frozen_hit = self
            .static_intern
            .as_ref()
            .and_then(|frozen| frozen.lookup(s));
        match frozen_hit {
            Some(shared) => shared,
            None => Self::intern_into(&mut self.metadata_interner, s),
        }
    }

    /// Create a `ScanState` whose metadata lookups consult the given
    /// scanner-wide static interner first. Every other field starts at its
    /// default; code without a scanner in scope (stand-alone unit tests)
    /// can use `ScanState::default()` instead.
    pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
        let static_intern = Some(intern);
        Self {
            static_intern,
            ..Default::default()
        }
    }

    /// Record a match while holding at most `limit` entries.
    ///
    /// Below the limit the match is simply added. At the limit it replaces
    /// the current lowest-ranked entry, but only when it ranks strictly
    /// higher; otherwise it is dropped.
    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
        if self.matches.len() < limit {
            self.matches.push(Reverse(m));
            return;
        }
        // The heap is a min-heap over `Reverse`, so its top is the weakest
        // match currently held. An empty heap here means `limit` is zero.
        if let Some(mut weakest) = self.matches.peek_mut() {
            if m > weakest.0 {
                *weakest = Reverse(m);
            }
        }
    }

    /// Consume the state and return its matches, best first, with exact
    /// duplicates removed.
    ///
    /// Two findings are duplicates when detector id, credential and offset
    /// all agree. That happens when two engines fire on the same text, e.g.
    /// ac_map's literal hit plus the homoglyph fallback variant on plain
    /// ASCII. Only the first (highest-ranked) copy is kept.
    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
        let mut ranked: Vec<keyhog_core::RawMatch> = self
            .matches
            .into_iter()
            .map(|Reverse(found)| found)
            .collect();

        // Stable descending sort, so the best copy of any duplicate set
        // comes first and survives the dedup pass below.
        ranked.sort_by(|lhs, rhs| rhs.cmp(lhs));

        let mut emitted: HashSet<(Arc<str>, Arc<str>, usize)> =
            HashSet::with_capacity(ranked.len());
        ranked.retain(|found| {
            let identity = (
                Arc::clone(&found.detector_id),
                Arc::clone(&found.credential),
                found.location.offset,
            );
            // `insert` returns false for a key that was already present.
            emitted.insert(identity)
        });
        ranked
    }

    /// Look `text` up in `pool`, inserting a freshly allocated `Arc<str>`
    /// on a miss, and return the pooled handle.
    fn intern_into(pool: &mut HashSet<Arc<str>>, text: &str) -> Arc<str> {
        if let Some(known) = pool.get(text) {
            return Arc::clone(known);
        }
        let fresh: Arc<str> = Arc::from(text);
        pool.insert(Arc::clone(&fresh));
        fresh
    }
}