keyhog_core/config.rs
1//! Configuration for KeyHog scanning and verification.
2//!
3//! Provides the [`ScanConfig`] struct used to control decoding depth,
4//! entropy thresholds, deduplication strategy, and performance tuning.
5
6use serde::{Deserialize, Serialize};
7use thiserror::Error;
8
9use crate::DedupScope;
10
/// Configuration for a scan run.
///
/// Obtain a value from [`ScanConfig::default`] or one of the presets
/// ([`ScanConfig::fast`], [`ScanConfig::thorough`], [`ScanConfig::paranoid`])
/// and check it with [`ScanConfig::validate`].
///
/// Deserialization note: only `entropy_ml_authoritative`,
/// `generic_keyword_low_entropy` and `scan_comments` carry a
/// `#[serde(default ...)]` attribute, so those are the only fields with an
/// explicit fallback when they are missing from a serialized config.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum confidence (0.0 to 1.0) required to report a finding.
    ///
    /// Range-checked by [`ScanConfig::validate`].
    pub min_confidence: f64,
    /// Maximum recursive decoding depth (e.g. Base64(Hex(URL(secret)))).
    ///
    /// [`ScanConfig::validate`] rejects values above
    /// [`MAX_DECODE_DEPTH_LIMIT`].
    pub max_decode_depth: usize,
    /// Whether to enable Shannon entropy analysis for unknown high-entropy strings.
    pub entropy_enabled: bool,
    /// Whether to enable entropy analysis even in standard source code files.
    pub entropy_in_source_files: bool,
    /// When the entropy fallback fires, score its candidates through the MoE
    /// with the model AUTHORITATIVE (the entropy magnitude is NOT a confidence
    /// floor) instead of emitting the bare entropy heuristic. Default on: on the
    /// real-distribution-trained model this is a recall-safe precision win — the
    /// model scores real high-entropy secrets high and structured non-secrets
    /// (FQDNs, git SHAs, base64 blobs) low, so FPs fall below the report floor
    /// while genuine recall is preserved. Opt out with `--no-entropy-ml-scoring`.
    /// No-op when `entropy_enabled` or `ml_enabled` is false.
    #[serde(default = "default_entropy_ml_authoritative")]
    pub entropy_ml_authoritative: bool,
    /// When the generic keyword bridge (`PASSWORD=`, `*_PASS=`, `secret:`,
    /// `api_key=` ...) extracts a value, admit it on a far lower entropy floor
    /// (the `generic-keyword-secret` base, ~1.5 bits) than the bare
    /// `generic-secret` path (2.8/3.2/3.5). The credential KEYWORD in the key is
    /// the evidence; precision is carried by the MoE + shape filters, not by
    /// entropy. Default on: this is what lets keyhog surface the real-world
    /// low-entropy credentials (config passwords, `*_PASS=` values) that pin
    /// CredData recall near zero when gated on entropy alone. Opt out with
    /// `--no-keyword-low-entropy` to restore the high-entropy-only generic gate.
    /// No-op unless the keyword bridge fires.
    #[serde(default = "default_generic_keyword_low_entropy")]
    pub generic_keyword_low_entropy: bool,
    /// Shannon entropy threshold (typical secrets are 4.5+).
    ///
    /// Not checked by [`ScanConfig::validate`].
    pub entropy_threshold: f64,
    /// Minimum length for entropy-based secret detection.
    ///
    /// NOTE: not yet read by the live scan. `From<ScanConfig> for
    /// ScannerConfig` does not carry this field; the entropy length
    /// gate currently uses the engine's own length constants. Setting
    /// it in a deserialized config is a no-op until a reader is wired
    /// in. See the `From` impl on `ScannerConfig` for the canonical
    /// list of carried vs uncarried fields.
    pub min_secret_len: usize,
    /// Maximum file size to scan (bytes). Large files are skipped or sampled.
    ///
    /// NOTE: not read here on the live path. The effective cap is set
    /// at the source walker (`FilesystemSource::with_max_file_size`,
    /// fed from `ScanArgs.max_file_size`); this field is retained for
    /// the canonical config surface but is not carried into
    /// `ScannerConfig`.
    pub max_file_size: u64,
    /// Deduplication strategy.
    ///
    /// NOTE: not read here on the live path. The effective scope comes
    /// from `ScanArgs.dedup` and is applied by the verifier via
    /// `DedupScope`; this field is not carried into `ScannerConfig`.
    pub dedup: DedupScope,

    /// Whether to enable ML-based probabilistic gating.
    pub ml_enabled: bool,
    /// Weight given to the ML score (0.0 to 1.0).
    ///
    /// The documented range is not enforced by [`ScanConfig::validate`],
    /// which only checks `min_confidence` and `max_decode_depth`.
    pub ml_weight: f64,
    /// Whether to normalize Unicode characters before scanning.
    pub unicode_normalization: bool,
    /// Whether to validate decoded strings (e.g. that decoded base64 is
    /// UTF-8) before recursing into them.
    pub validate_decode: bool,
    /// Maximum bytes allowed from recursive decoding. Same field name on
    /// `ScannerConfig` so `From<ScanConfig>` is a 1:1 carry, not a rename.
    pub max_decode_bytes: usize,
    /// Maximum matches allowed per chunk to prevent OOM.
    pub max_matches_per_chunk: usize,

    /// When `true`, credentials inside source-code comments
    /// (//, #, /* */, <!-- -->) get the same confidence treatment as
    /// credentials in regular code. Default `false` - comment context
    /// downgrades confidence on the theory that examples are the
    /// common case. CLI exposes this as `--scan-comments`; opt-in
    /// because the rate of EXAMPLE secrets pasted into doc comments
    /// vastly outweighs the rate of real ones.
    #[serde(default)]
    pub scan_comments: bool,

    /// List of common secret prefixes to prioritize.
    pub known_prefixes: Vec<String>,
    /// List of keywords that strongly indicate a secret.
    pub secret_keywords: Vec<String>,
    /// Keywords used in test environments.
    pub test_keywords: Vec<String>,
    /// Keywords for placeholders and documentation.
    pub placeholder_keywords: Vec<String>,
}
104
/// Hard ceiling for [`ScanConfig::max_decode_depth`], bounding recursive
/// decoding to prevent infinite recursion or memory exhaustion.
///
/// [`ScanConfig::validate`] rejects any depth above this value with
/// [`ConfigError::DepthTooHigh`], and [`ScanConfig::paranoid`] uses it as its
/// decode depth.
pub const MAX_DECODE_DEPTH_LIMIT: usize = 16;
107
/// Fallback used by serde when [`ScanConfig::entropy_ml_authoritative`] is
/// absent from the input.
///
/// A TOML written before the field existed must pick up the shipped default
/// (on); without this hook the field would not default to on, and older
/// configs would silently lose the behaviour.
fn default_entropy_ml_authoritative() -> bool {
    // Mirrors the value assigned in `ScanConfig::default`.
    const SHIPPED_DEFAULT: bool = true;
    SHIPPED_DEFAULT
}
114
/// Fallback used by serde when [`ScanConfig::generic_keyword_low_entropy`] is
/// absent from the input.
///
/// Configs that predate the field must pick up the shipped default (on), so
/// that old TOMLs do not silently revert to the high-entropy-only generic
/// gate.
fn default_generic_keyword_low_entropy() -> bool {
    // Mirrors the value assigned in `ScanConfig::default`.
    const SHIPPED_DEFAULT: bool = true;
    SHIPPED_DEFAULT
}
121
/// Errors returned while validating a scan configuration.
///
/// Returned by [`ScanConfig::validate`]; each variant carries the offending
/// value so it appears in the rendered message.
#[derive(Debug, Error)]
pub enum ConfigError {
    /// `min_confidence` was outside the closed unit interval `[0.0, 1.0]`.
    ///
    /// A NaN value is reported through this variant as well, because the
    /// range `contains` check in [`ScanConfig::validate`] is false for NaN.
    #[error("min_confidence must be between 0.0 and 1.0, found {0}")]
    InvalidConfidence(f64),
    /// `max_decode_depth` exceeded the safety ceiling
    /// [`MAX_DECODE_DEPTH_LIMIT`].
    #[error("max_decode_depth exceeds limit of {MAX_DECODE_DEPTH_LIMIT}, found {0}")]
    DepthTooHigh(usize),
}
133
134impl Default for ScanConfig {
135 fn default() -> Self {
136 Self {
137 // Bench-tuned floor (SecretBench mirror grid-sweep 2026-05-30):
138 // 0.40 maximises F1 (0.8642, P=0.984, FP=37) and is the precision
139 // sweet spot. 0.30 admits a low-confidence FP band (FP 174); 0.50
140 // is WORSE on both axes (the floor is non-monotonic in FP - see
141 // the scan-time/ML entanglement bug tracked in backlog DET-08).
142 // This is the canonical tuned == benched == shipped floor; the
143 // post-scan gate (orchestrator/postprocess.rs) and the scan-time
144 // generic gate (engine/fallback_generic.rs) both resolve to it.
145 min_confidence: 0.40,
146 // Aligned with CLI / scanner defaults (`ScannerConfig` derives from this).
147 max_decode_depth: 10,
148 entropy_enabled: true,
149 entropy_in_source_files: false,
150 entropy_ml_authoritative: true,
151 generic_keyword_low_entropy: true,
152 entropy_threshold: 4.5,
153 min_secret_len: 20,
154 max_file_size: 10 * 1024 * 1024, // 10 MB
155 dedup: DedupScope::Credential,
156 ml_enabled: true,
157 ml_weight: 0.5,
158 unicode_normalization: true,
159 validate_decode: true,
160 // Per-chunk decode-through ceiling (conservative vs multi-MiB blobs).
161 max_decode_bytes: 512 * 1024,
162 max_matches_per_chunk: 1000,
163 scan_comments: false,
164 known_prefixes: vec!["AKIA".into(), "ASIA".into(), "ghp_".into(), "sk_".into()],
165 secret_keywords: vec![
166 "password".into(),
167 "passwd".into(),
168 "pwd".into(),
169 "secret".into(),
170 "token".into(),
171 "api_key".into(),
172 "apikey".into(),
173 "api-key".into(),
174 "access_key".into(),
175 "auth_token".into(),
176 "auth_key".into(),
177 "private_key".into(),
178 "client_secret".into(),
179 "encryption_key".into(),
180 "signing_key".into(),
181 "bearer".into(),
182 "credential".into(),
183 "license_key".into(),
184 ],
185 test_keywords: vec![
186 "test".into(),
187 "mock".into(),
188 "fake".into(),
189 "dummy".into(),
190 "stub".into(),
191 "fixture".into(),
192 "example".into(),
193 "sample".into(),
194 "sandbox".into(),
195 "staging".into(),
196 ],
197 placeholder_keywords: vec![
198 "change_me".into(),
199 "changeme".into(),
200 "replace_me".into(),
201 "todo".into(),
202 "fixme".into(),
203 "your_".into(),
204 "insert_".into(),
205 "put_your".into(),
206 "fill_in".into(),
207 "<your".into(),
208 ],
209 }
210 }
211}
212
213impl ScanConfig {
214 // PRESET ROUTING NOTE: these core presets are the canonical preset
215 // definitions, reachable in the engine only via
216 // `ScannerConfig::from(ScanConfig::fast()/thorough()/paranoid())`.
217 // The CLI's `build_scanner_config` currently selects the parallel
218 // `ScannerConfig::fast()/thorough()` instead, whose values DIVERGE
219 // from these (e.g. fast decode-depth 0 vs 2, thorough 10 vs 8). The
220 // single-source-of-truth fix is to route the CLI through these core
221 // presets and drop the scanner-side duplicates; until that lands,
222 // a reader auditing "what --fast does" must check the CLI path, not
223 // these methods. Values here are pinned by `crates/core/tests/unit`.
224
225 /// Fast configuration optimized for speed over exhaustive recall.
226 pub fn fast() -> Self {
227 Self {
228 max_decode_depth: 2,
229 entropy_enabled: false,
230 ml_enabled: false,
231 ..Default::default()
232 }
233 }
234
235 /// Thorough configuration for deep penetration into encoded layers.
236 pub fn thorough() -> Self {
237 Self {
238 max_decode_depth: 8,
239 entropy_in_source_files: true,
240 ml_enabled: true,
241 ..Default::default()
242 }
243 }
244
245 /// Maximum paranoia: deep decoding and aggressive entropy analysis.
246 pub fn paranoid() -> Self {
247 Self {
248 max_decode_depth: MAX_DECODE_DEPTH_LIMIT,
249 entropy_enabled: true,
250 entropy_in_source_files: true,
251 // Deliberately below the default of 20: paranoid mode trades
252 // precision for recall and accepts shorter candidates. Not a
253 // default disagreement - see `min_secret_len`'s field note on
254 // its (currently no-op) live-path status.
255 min_secret_len: 16,
256 ml_enabled: true,
257 ..Default::default()
258 }
259 }
260
261 /// Validate the configuration parameters.
262 pub fn validate(&self) -> Result<(), ConfigError> {
263 if !(0.0..=1.0).contains(&self.min_confidence) {
264 return Err(ConfigError::InvalidConfidence(self.min_confidence));
265 }
266 if self.max_decode_depth > MAX_DECODE_DEPTH_LIMIT {
267 return Err(ConfigError::DepthTooHigh(self.max_decode_depth));
268 }
269 Ok(())
270 }
271}
272
/// Returns the file names that typically contain secrets (e.g. `.env`,
/// `credentials.json`).
///
/// A fresh `Vec<String>` is built on every call, in the fixed order of the
/// table below.
///
/// NOTE(review): the table mixes lower-case spellings (`cargo.toml`,
/// `dockerfile`) with mixed-case ones (`Application.xml`, `App.config`);
/// whether matching is case-insensitive is up to the caller - confirm.
pub fn secret_filenames() -> Vec<String> {
    const NAMES: &[&str] = &[
        // Environment files.
        ".env",
        ".env.local",
        ".env.production",
        ".env.development",
        ".env.test",
        // Generic application configuration.
        "config.json",
        "config.yaml",
        "config.yml",
        "credentials.json",
        "secrets.json",
        "settings.json",
        "production.json",
        "development.json",
        "local.json",
        "appsettings.json",
        "web.config",
        "web.Debug.config",
        "web.Release.config",
        "Application.xml",
        "Settings.xml",
        "App.config",
        // Build and dependency manifests.
        "pom.xml",
        "build.gradle",
        "build.gradle.kts",
        "package.json",
        "package-lock.json",
        "yarn.lock",
        "composer.json",
        "composer.lock",
        "pipfile",
        "pipfile.lock",
        "requirements.txt",
        "gemfile",
        "gemfile.lock",
        "cargo.toml",
        "cargo.lock",
        "go.mod",
        "go.sum",
        // Container and deployment descriptors.
        "docker-compose.yml",
        "docker-compose.yaml",
        "dockerfile",
        "kubernetes.yml",
        "kubernetes.yaml",
        "k8s.yml",
        "k8s.yaml",
        "deploy.yml",
        "deploy.yaml",
        "service.yml",
        "service.yaml",
        "configmap.yml",
        "configmap.yaml",
        "secret.yml",
        "secret.yaml",
    ];

    NAMES.iter().map(|&name| String::from(name)).collect()
}
334}