Skip to main content

keyhog_core/
config.rs

1//! Configuration for KeyHog scanning and verification.
2//!
3//! Provides the [`ScanConfig`] struct used to control decoding depth,
4//! entropy thresholds, deduplication strategy, and performance tuning.
5
6use serde::{Deserialize, Serialize};
7use thiserror::Error;
8
9use crate::DedupScope;
10
11/// Configuration for a scan run.
12#[derive(Clone, Debug, Serialize, Deserialize)]
13pub struct ScanConfig {
14    /// Minimum confidence (0.0 to 1.0) required to report a finding.
15    pub min_confidence: f64,
16    /// Maximum recursive decoding depth (e.g. Base64(Hex(URL(secret)))).
17    pub max_decode_depth: usize,
18    /// Whether to enable Shannon entropy analysis for unknown high-entropy strings.
19    pub entropy_enabled: bool,
20    /// Whether to enable entropy analysis even in standard source code files.
21    pub entropy_in_source_files: bool,
22    /// Shannon entropy threshold (typical secrets are 4.5+).
23    pub entropy_threshold: f64,
24    /// Minimum length for entropy-based secret detection.
25    pub min_secret_len: usize,
26    /// Maximum file size to scan (bytes). Large files are skipped or sampled.
27    pub max_file_size: u64,
28    /// Deduplication strategy.
29    pub dedup: DedupScope,
30
31    /// Whether to enable ML-based probabilistic gating.
32    pub ml_enabled: bool,
33    /// Weight given to the ML score (0.0 to 1.0).
34    pub ml_weight: f64,
35    /// Whether to normalize Unicode characters before scanning.
36    pub unicode_normalization: bool,
37    /// Maximum bytes allowed from recursive decoding.
38    pub decode_size_limit: usize,
39    /// Maximum matches allowed per chunk to prevent OOM.
40    pub max_matches_per_chunk: usize,
41
42    /// When `true`, credentials inside source-code comments
43    /// (//, #, /* */, <!-- -->) get the same confidence treatment as
44    /// credentials in regular code. Default `false` - comment context
45    /// downgrades confidence on the theory that examples are the
46    /// common case. CLI exposes this as `--scan-comments`; opt-in
47    /// because the rate of EXAMPLE secrets pasted into doc comments
48    /// vastly outweighs the rate of real ones.
49    #[serde(default)]
50    pub scan_comments: bool,
51
52    /// List of common secret prefixes to prioritize.
53    pub known_prefixes: Vec<String>,
54    /// List of keywords that strongly indicate a secret.
55    pub secret_keywords: Vec<String>,
56    /// Keywords used in test environments.
57    pub test_keywords: Vec<String>,
58    /// Keywords for placeholders and documentation.
59    pub placeholder_keywords: Vec<String>,
60}
61
/// Hard ceiling on recursive decoding depth, guarding against infinite
/// recursion and memory exhaustion.
pub const MAX_DECODE_DEPTH_LIMIT: usize = 16;
64
65/// Errors returned while validating a scan configuration.
66#[derive(Debug, Error)]
67pub enum ConfigError {
68    /// `min_confidence` was outside the closed unit interval `[0.0, 1.0]`.
69    #[error("min_confidence must be between 0.0 and 1.0, found {0}")]
70    InvalidConfidence(f64),
71    /// `max_decode_depth` exceeded the safety ceiling
72    /// [`MAX_DECODE_DEPTH_LIMIT`].
73    #[error("max_decode_depth exceeds limit of {MAX_DECODE_DEPTH_LIMIT}, found {0}")]
74    DepthTooHigh(usize),
75}
76
77impl Default for ScanConfig {
78    fn default() -> Self {
79        Self {
80            // Raised from 0.3 → 0.5 (kimi-wave3 §4 LOW). The previous
81            // 0.3 default let low-confidence generic-entropy matches
82            // through, drowning real findings in noise. Detector
83            // configs that want the looser bar can opt back in.
84            min_confidence: 0.5,
85            // Aligned with CLI / scanner defaults (`ScannerConfig` derives from this).
86            max_decode_depth: 10,
87            entropy_enabled: true,
88            entropy_in_source_files: false,
89            entropy_threshold: 4.5,
90            min_secret_len: 20,
91            max_file_size: 10 * 1024 * 1024, // 10 MB
92            dedup: DedupScope::Credential,
93            ml_enabled: true,
94            ml_weight: 0.5,
95            unicode_normalization: true,
96            // Per-chunk decode-through ceiling (conservative vs multi‑MiB blobs).
97            decode_size_limit: 512 * 1024,
98            max_matches_per_chunk: 1000,
99            scan_comments: false,
100            known_prefixes: vec!["AKIA".into(), "ASIA".into(), "ghp_".into(), "sk_".into()],
101            secret_keywords: vec![
102                "password".into(),
103                "passwd".into(),
104                "pwd".into(),
105                "secret".into(),
106                "token".into(),
107                "api_key".into(),
108                "apikey".into(),
109                "api-key".into(),
110                "access_key".into(),
111                "auth_token".into(),
112                "auth_key".into(),
113                "private_key".into(),
114                "client_secret".into(),
115                "encryption_key".into(),
116                "signing_key".into(),
117                "bearer".into(),
118                "credential".into(),
119                "license_key".into(),
120            ],
121            test_keywords: vec![
122                "test".into(),
123                "mock".into(),
124                "fake".into(),
125                "dummy".into(),
126                "stub".into(),
127                "fixture".into(),
128                "example".into(),
129                "sample".into(),
130                "sandbox".into(),
131                "staging".into(),
132            ],
133            placeholder_keywords: vec![
134                "change_me".into(),
135                "changeme".into(),
136                "replace_me".into(),
137                "todo".into(),
138                "fixme".into(),
139                "your_".into(),
140                "insert_".into(),
141                "put_your".into(),
142                "fill_in".into(),
143                "<your".into(),
144            ],
145        }
146    }
147}
148
149impl ScanConfig {
150    /// Fast configuration optimized for speed over exhaustive recall.
151    pub fn fast() -> Self {
152        Self {
153            max_decode_depth: 2,
154            entropy_enabled: false,
155            ml_enabled: false,
156            ..Default::default()
157        }
158    }
159
160    /// Thorough configuration for deep penetration into encoded layers.
161    pub fn thorough() -> Self {
162        Self {
163            max_decode_depth: 8,
164            entropy_in_source_files: true,
165            ml_enabled: true,
166            ..Default::default()
167        }
168    }
169
170    /// Maximum paranoia: deep decoding and aggressive entropy analysis.
171    pub fn paranoid() -> Self {
172        Self {
173            max_decode_depth: MAX_DECODE_DEPTH_LIMIT,
174            entropy_enabled: true,
175            entropy_in_source_files: true,
176            min_secret_len: 16,
177            ml_enabled: true,
178            ..Default::default()
179        }
180    }
181
182    /// Validate the configuration parameters.
183    pub fn validate(&self) -> Result<(), ConfigError> {
184        if !(0.0..=1.0).contains(&self.min_confidence) {
185            return Err(ConfigError::InvalidConfidence(self.min_confidence));
186        }
187        if self.max_decode_depth > MAX_DECODE_DEPTH_LIMIT {
188            return Err(ConfigError::DepthTooHigh(self.max_decode_depth));
189        }
190        Ok(())
191    }
192}
193
/// Returns the filenames that typically contain secrets (e.g. `.env`,
/// `credentials.json`).
///
/// The list is fixed at compile time. Each call allocates a fresh
/// `Vec<String>` holding the entries in the order they are declared.
pub fn secret_filenames() -> Vec<String> {
    // NOTE(review): the casing is mixed - some entries keep their
    // canonical capitalization (`web.Debug.config`, `App.config`) while
    // others are lowercased (`dockerfile`, `cargo.toml`, `gemfile`).
    // Confirm how callers compare names; under either a case-sensitive or
    // a lowercased comparison, some of these entries can never match.
    const NAMES: &[&str] = &[
        ".env",
        ".env.local",
        ".env.production",
        ".env.development",
        ".env.test",
        "config.json",
        "config.yaml",
        "config.yml",
        "credentials.json",
        "secrets.json",
        "settings.json",
        "production.json",
        "development.json",
        "local.json",
        "appsettings.json",
        "web.config",
        "web.Debug.config",
        "web.Release.config",
        "Application.xml",
        "Settings.xml",
        "App.config",
        "pom.xml",
        "build.gradle",
        "build.gradle.kts",
        "package.json",
        "package-lock.json",
        "yarn.lock",
        "composer.json",
        "composer.lock",
        "pipfile",
        "pipfile.lock",
        "requirements.txt",
        "gemfile",
        "gemfile.lock",
        "cargo.toml",
        "cargo.lock",
        "go.mod",
        "go.sum",
        "docker-compose.yml",
        "docker-compose.yaml",
        "dockerfile",
        "kubernetes.yml",
        "kubernetes.yaml",
        "k8s.yml",
        "k8s.yaml",
        "deploy.yml",
        "deploy.yaml",
        "service.yml",
        "service.yaml",
        "configmap.yml",
        "configmap.yaml",
        "secret.yml",
        "secret.yaml",
    ];

    NAMES.iter().map(|name| (*name).to_owned()).collect()
}