1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
//! Configuration for KeyHog scanning and verification.
//!
//! Provides the [`ScanConfig`] struct used to control decoding depth,
//! entropy thresholds, deduplication strategy, and performance tuning.
use serde::{Deserialize, Serialize};
use thiserror::Error;
use crate::DedupScope;
/// Configuration for a scan run.
///
/// [`Default`] supplies the baseline values; [`ScanConfig::fast`],
/// [`ScanConfig::thorough`] and [`ScanConfig::paranoid`] are presets layered on
/// top of that baseline. [`ScanConfig::validate`] checks `min_confidence` and
/// `max_decode_depth` only; the remaining fields are accepted as given.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ScanConfig {
/// Minimum confidence (0.0 to 1.0) required to report a finding.
/// Values outside that inclusive range are rejected by [`ScanConfig::validate`].
pub min_confidence: f64,
/// Maximum recursive decoding depth (e.g. Base64(Hex(URL(secret)))).
/// Must not exceed [`MAX_DECODE_DEPTH_LIMIT`] to pass [`ScanConfig::validate`].
pub max_decode_depth: usize,
/// Whether to enable Shannon entropy analysis for unknown high-entropy strings.
pub entropy_enabled: bool,
/// Whether to enable entropy analysis even in standard source code files.
pub entropy_in_source_files: bool,
/// Shannon entropy threshold (typical secrets are 4.5+).
pub entropy_threshold: f64,
/// Minimum length for entropy-based secret detection.
pub min_secret_len: usize,
/// Maximum file size to scan (bytes). Large files are skipped or sampled.
pub max_file_size: u64,
/// Deduplication strategy.
pub dedup: DedupScope,
/// Whether to enable ML-based probabilistic gating.
pub ml_enabled: bool,
/// Weight given to the ML score (0.0 to 1.0).
/// NOTE(review): this range is documented but not checked by `validate`.
pub ml_weight: f64,
/// Whether to normalize Unicode characters before scanning.
pub unicode_normalization: bool,
/// Maximum bytes allowed from recursive decoding.
pub decode_size_limit: usize,
/// Maximum matches allowed per chunk to prevent OOM.
pub max_matches_per_chunk: usize,
/// List of common secret prefixes to prioritize.
pub known_prefixes: Vec<String>,
/// List of keywords that strongly indicate a secret.
pub secret_keywords: Vec<String>,
/// Keywords used in test environments.
pub test_keywords: Vec<String>,
/// Keywords for placeholders and documentation.
pub placeholder_keywords: Vec<String>,
}
/// Upper bound for [`ScanConfig::max_decode_depth`], capping recursive decoding
/// to prevent infinite recursion or memory exhaustion.
///
/// [`ScanConfig::validate`] rejects any depth greater than this value, and
/// [`ScanConfig::paranoid`] uses exactly this value.
pub const MAX_DECODE_DEPTH_LIMIT: usize = 16;
/// Errors returned while validating a scan configuration.
///
/// Produced by [`ScanConfig::validate`]; each variant carries the rejected value
/// so it can be shown in the error message.
#[derive(Debug, Error)]
pub enum ConfigError {
/// `min_confidence` was outside the inclusive range `0.0..=1.0`.
#[error("min_confidence must be between 0.0 and 1.0, found {0}")]
InvalidConfidence(f64),
/// `max_decode_depth` was greater than [`MAX_DECODE_DEPTH_LIMIT`].
#[error("max_decode_depth exceeds limit of {MAX_DECODE_DEPTH_LIMIT}, found {0}")]
DepthTooHigh(usize),
}
impl Default for ScanConfig {
    /// Builds the baseline configuration that the presets on [`ScanConfig`]
    /// start from.
    fn default() -> Self {
        /// Turns a slice of string literals into owned `String`s, keeping order.
        fn owned(words: &[&str]) -> Vec<String> {
            words.iter().map(|word| String::from(*word)).collect()
        }

        // Prefixes of well-known credential formats.
        let known_prefixes = owned(&["AKIA", "ASIA", "ghp_", "sk_"]);

        // Identifiers that strongly suggest the adjacent value is a secret.
        let secret_keywords = owned(&[
            "password",
            "passwd",
            "pwd",
            "secret",
            "token",
            "api_key",
            "apikey",
            "api-key",
            "access_key",
            "auth_token",
            "auth_key",
            "private_key",
            "client_secret",
            "encryption_key",
            "signing_key",
            "bearer",
            "credential",
            "license_key",
        ]);

        // Words associated with test environments.
        let test_keywords = owned(&[
            "test", "mock", "fake", "dummy", "stub", "fixture", "example", "sample", "sandbox",
            "staging",
        ]);

        // Markers of placeholders and documentation snippets.
        let placeholder_keywords = owned(&[
            "change_me",
            "changeme",
            "replace_me",
            "todo",
            "fixme",
            "your_",
            "insert_",
            "put_your",
            "fill_in",
            "<your",
        ]);

        Self {
            // Raised from 0.3 to 0.5 (kimi-wave3 section 4, LOW): at 0.3,
            // low-confidence generic-entropy matches drowned real findings in
            // noise. Detector configs wanting the looser bar can opt back in.
            min_confidence: 0.5,
            // Kept in line with the CLI / scanner defaults (`ScannerConfig`
            // derives from this).
            max_decode_depth: 10,
            entropy_enabled: true,
            entropy_in_source_files: false,
            entropy_threshold: 4.5,
            min_secret_len: 20,
            // 10 MB.
            max_file_size: 10 * 1024 * 1024,
            dedup: DedupScope::Credential,
            ml_enabled: true,
            ml_weight: 0.5,
            unicode_normalization: true,
            // Per-chunk decode-through ceiling (conservative vs multi-MiB blobs).
            decode_size_limit: 512 * 1024,
            max_matches_per_chunk: 1000,
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
        }
    }
}
impl ScanConfig {
    /// Fast configuration optimized for speed over exhaustive recall.
    ///
    /// Starts from [`Default`], then limits decoding to two layers and turns
    /// off both entropy analysis and ML gating.
    pub fn fast() -> Self {
        Self {
            max_decode_depth: 2,
            entropy_enabled: false,
            ml_enabled: false,
            ..Default::default()
        }
    }

    /// Thorough configuration for deep penetration into encoded layers.
    ///
    /// Starts from [`Default`], additionally enabling entropy analysis in
    /// source files and ML gating. The decode depth is at least 8 and never
    /// lower than the default profile's depth.
    pub fn thorough() -> Self {
        let baseline = Self::default();
        Self {
            // A hard-coded 8 here was shallower than the default depth, which
            // made the "thorough" preset decode fewer layers than the default
            // one. Use 8 as a floor and otherwise follow the default.
            max_decode_depth: baseline.max_decode_depth.max(8),
            entropy_in_source_files: true,
            ml_enabled: true,
            ..baseline
        }
    }

    /// Maximum paranoia: deep decoding and aggressive entropy analysis.
    ///
    /// Starts from [`Default`], decodes up to [`MAX_DECODE_DEPTH_LIMIT`]
    /// layers, enables entropy analysis everywhere (including source files),
    /// lowers the minimum secret length to 16 and keeps ML gating on.
    pub fn paranoid() -> Self {
        Self {
            max_decode_depth: MAX_DECODE_DEPTH_LIMIT,
            entropy_enabled: true,
            entropy_in_source_files: true,
            min_secret_len: 16,
            ml_enabled: true,
            ..Default::default()
        }
    }

    /// Validate the configuration parameters.
    ///
    /// Only `min_confidence` and `max_decode_depth` are checked; all other
    /// fields are accepted as given.
    ///
    /// # Errors
    ///
    /// * [`ConfigError::InvalidConfidence`] if `min_confidence` is not within
    ///   `0.0..=1.0` (a NaN value is also rejected, since it is contained in
    ///   no range).
    /// * [`ConfigError::DepthTooHigh`] if `max_decode_depth` is greater than
    ///   [`MAX_DECODE_DEPTH_LIMIT`].
    pub fn validate(&self) -> Result<(), ConfigError> {
        if !(0.0..=1.0).contains(&self.min_confidence) {
            return Err(ConfigError::InvalidConfidence(self.min_confidence));
        }
        if self.max_decode_depth > MAX_DECODE_DEPTH_LIMIT {
            return Err(ConfigError::DepthTooHigh(self.max_decode_depth));
        }
        Ok(())
    }
}
/// Returns the file names that typically contain secrets or deployment
/// configuration (e.g. `.env`, `credentials.json`, `docker-compose.yml`).
///
/// The names are returned in the order listed below and exactly as spelled;
/// no case folding is applied here, so any case-insensitive matching is up to
/// the caller.
pub fn secret_filenames() -> Vec<String> {
    // A static table avoids building a temporary `Vec<&str>` on every call.
    const NAMES: &[&str] = &[
        ".env",
        ".env.local",
        ".env.production",
        ".env.development",
        ".env.test",
        "config.json",
        "config.yaml",
        "config.yml",
        "credentials.json",
        "secrets.json",
        "settings.json",
        "production.json",
        "development.json",
        "local.json",
        "appsettings.json",
        "web.config",
        "web.Debug.config",
        "web.Release.config",
        "Application.xml",
        "Settings.xml",
        "App.config",
        "pom.xml",
        "build.gradle",
        "build.gradle.kts",
        "package.json",
        "package-lock.json",
        "yarn.lock",
        "composer.json",
        "composer.lock",
        "pipfile",
        "pipfile.lock",
        "requirements.txt",
        "gemfile",
        "gemfile.lock",
        "cargo.toml",
        "cargo.lock",
        "go.mod",
        "go.sum",
        "docker-compose.yml",
        "docker-compose.yaml",
        "dockerfile",
        "kubernetes.yml",
        "kubernetes.yaml",
        "k8s.yml",
        "k8s.yaml",
        "deploy.yml",
        "deploy.yaml",
        "service.yml",
        "service.yaml",
        "configmap.yml",
        "configmap.yaml",
        "secret.yml",
        "secret.yaml",
    ];

    NAMES.iter().copied().map(String::from).collect()
}