Skip to main content

keyhog_core/
config.rs

1//! Configuration for KeyHog scanning and verification.
2//!
3//! Provides the [`ScanConfig`] struct used to control decoding depth,
4//! entropy thresholds, deduplication strategy, and performance tuning.
5
6use serde::{Deserialize, Serialize};
7use thiserror::Error;
8
9use crate::DedupScope;
10
11/// Configuration for a scan run.
12#[derive(Clone, Debug, Serialize, Deserialize)]
13pub struct ScanConfig {
14    /// Minimum confidence (0.0 to 1.0) required to report a finding.
15    pub min_confidence: f64,
16    /// Maximum recursive decoding depth (e.g. Base64(Hex(URL(secret)))).
17    pub max_decode_depth: usize,
18    /// Whether to enable Shannon entropy analysis for unknown high-entropy strings.
19    pub entropy_enabled: bool,
20    /// Whether to enable entropy analysis even in standard source code files.
21    pub entropy_in_source_files: bool,
22    /// Shannon entropy threshold (typical secrets are 4.5+).
23    pub entropy_threshold: f64,
24    /// Minimum length for entropy-based secret detection.
25    pub min_secret_len: usize,
26    /// Maximum file size to scan (bytes). Large files are skipped or sampled.
27    pub max_file_size: u64,
28    /// Deduplication strategy.
29    pub dedup: DedupScope,
30
31    /// Whether to enable ML-based probabilistic gating.
32    pub ml_enabled: bool,
33    /// Weight given to the ML score (0.0 to 1.0).
34    pub ml_weight: f64,
35    /// Whether to normalize Unicode characters before scanning.
36    pub unicode_normalization: bool,
37    /// Maximum bytes allowed from recursive decoding.
38    pub decode_size_limit: usize,
39    /// Maximum matches allowed per chunk to prevent OOM.
40    pub max_matches_per_chunk: usize,
41
42    /// List of common secret prefixes to prioritize.
43    pub known_prefixes: Vec<String>,
44    /// List of keywords that strongly indicate a secret.
45    pub secret_keywords: Vec<String>,
46    /// Keywords used in test environments.
47    pub test_keywords: Vec<String>,
48    /// Keywords for placeholders and documentation.
49    pub placeholder_keywords: Vec<String>,
50}
51
/// Hard ceiling for `ScanConfig::max_decode_depth`.
///
/// Bounding the decode depth guards against infinite recursion and memory
/// exhaustion. `ScanConfig::validate` rejects any deeper setting, and
/// `ScanConfig::paranoid` uses exactly this value.
pub const MAX_DECODE_DEPTH_LIMIT: usize = 16;
54
55/// Errors returned while validating a scan configuration.
56#[derive(Debug, Error)]
57pub enum ConfigError {
58    #[error("min_confidence must be between 0.0 and 1.0, found {0}")]
59    InvalidConfidence(f64),
60    #[error("max_decode_depth exceeds limit of {MAX_DECODE_DEPTH_LIMIT}, found {0}")]
61    DepthTooHigh(usize),
62}
63
64impl Default for ScanConfig {
65    fn default() -> Self {
66        Self {
67            // Raised from 0.3 → 0.5 (kimi-wave3 §4 LOW). The previous
68            // 0.3 default let low-confidence generic-entropy matches
69            // through, drowning real findings in noise. Detector
70            // configs that want the looser bar can opt back in.
71            min_confidence: 0.5,
72            // Aligned with CLI / scanner defaults (`ScannerConfig` derives from this).
73            max_decode_depth: 10,
74            entropy_enabled: true,
75            entropy_in_source_files: false,
76            entropy_threshold: 4.5,
77            min_secret_len: 20,
78            max_file_size: 10 * 1024 * 1024, // 10 MB
79            dedup: DedupScope::Credential,
80            ml_enabled: true,
81            ml_weight: 0.5,
82            unicode_normalization: true,
83            // Per-chunk decode-through ceiling (conservative vs multi‑MiB blobs).
84            decode_size_limit: 512 * 1024,
85            max_matches_per_chunk: 1000,
86            known_prefixes: vec!["AKIA".into(), "ASIA".into(), "ghp_".into(), "sk_".into()],
87            secret_keywords: vec![
88                "password".into(),
89                "passwd".into(),
90                "pwd".into(),
91                "secret".into(),
92                "token".into(),
93                "api_key".into(),
94                "apikey".into(),
95                "api-key".into(),
96                "access_key".into(),
97                "auth_token".into(),
98                "auth_key".into(),
99                "private_key".into(),
100                "client_secret".into(),
101                "encryption_key".into(),
102                "signing_key".into(),
103                "bearer".into(),
104                "credential".into(),
105                "license_key".into(),
106            ],
107            test_keywords: vec![
108                "test".into(),
109                "mock".into(),
110                "fake".into(),
111                "dummy".into(),
112                "stub".into(),
113                "fixture".into(),
114                "example".into(),
115                "sample".into(),
116                "sandbox".into(),
117                "staging".into(),
118            ],
119            placeholder_keywords: vec![
120                "change_me".into(),
121                "changeme".into(),
122                "replace_me".into(),
123                "todo".into(),
124                "fixme".into(),
125                "your_".into(),
126                "insert_".into(),
127                "put_your".into(),
128                "fill_in".into(),
129                "<your".into(),
130            ],
131        }
132    }
133}
134
135impl ScanConfig {
136    /// Fast configuration optimized for speed over exhaustive recall.
137    pub fn fast() -> Self {
138        Self {
139            max_decode_depth: 2,
140            entropy_enabled: false,
141            ml_enabled: false,
142            ..Default::default()
143        }
144    }
145
146    /// Thorough configuration for deep penetration into encoded layers.
147    pub fn thorough() -> Self {
148        Self {
149            max_decode_depth: 8,
150            entropy_in_source_files: true,
151            ml_enabled: true,
152            ..Default::default()
153        }
154    }
155
156    /// Maximum paranoia: deep decoding and aggressive entropy analysis.
157    pub fn paranoid() -> Self {
158        Self {
159            max_decode_depth: MAX_DECODE_DEPTH_LIMIT,
160            entropy_enabled: true,
161            entropy_in_source_files: true,
162            min_secret_len: 16,
163            ml_enabled: true,
164            ..Default::default()
165        }
166    }
167
168    /// Validate the configuration parameters.
169    pub fn validate(&self) -> Result<(), ConfigError> {
170        if !(0.0..=1.0).contains(&self.min_confidence) {
171            return Err(ConfigError::InvalidConfidence(self.min_confidence));
172        }
173        if self.max_decode_depth > MAX_DECODE_DEPTH_LIMIT {
174            return Err(ConfigError::DepthTooHigh(self.max_decode_depth));
175        }
176        Ok(())
177    }
178}
179
/// Returns the filenames that typically contain secrets (e.g. `.env`, `config.json`).
///
/// A fresh `Vec<String>` is allocated on every call; the entries come back in
/// the fixed order listed below.
///
/// NOTE(review): the entries mix lowercase spellings (`cargo.toml`,
/// `dockerfile`) with mixed-case ones (`web.Debug.config`, `App.config`).
/// Confirm how callers compare names before relying on case sensitivity.
pub fn secret_filenames() -> Vec<String> {
    const NAMES: &[&str] = &[
        ".env",
        ".env.local",
        ".env.production",
        ".env.development",
        ".env.test",
        "config.json",
        "config.yaml",
        "config.yml",
        "credentials.json",
        "secrets.json",
        "settings.json",
        "production.json",
        "development.json",
        "local.json",
        "appsettings.json",
        "web.config",
        "web.Debug.config",
        "web.Release.config",
        "Application.xml",
        "Settings.xml",
        "App.config",
        "pom.xml",
        "build.gradle",
        "build.gradle.kts",
        "package.json",
        "package-lock.json",
        "yarn.lock",
        "composer.json",
        "composer.lock",
        "pipfile",
        "pipfile.lock",
        "requirements.txt",
        "gemfile",
        "gemfile.lock",
        "cargo.toml",
        "cargo.lock",
        "go.mod",
        "go.sum",
        "docker-compose.yml",
        "docker-compose.yaml",
        "dockerfile",
        "kubernetes.yml",
        "kubernetes.yaml",
        "k8s.yml",
        "k8s.yaml",
        "deploy.yml",
        "deploy.yaml",
        "service.yml",
        "service.yaml",
        "configmap.yml",
        "configmap.yaml",
        "secret.yml",
        "secret.yaml",
    ];

    let mut names = Vec::with_capacity(NAMES.len());
    for name in NAMES {
        names.push(String::from(*name));
    }
    names
}