Skip to main content

keyhog_scanner/
confidence.rs

//! Confidence scoring: combines multiple signals into a single 0.0–1.0 score.
//! A higher score means the match is more likely to be a real secret.
3
// --- Score bounds -----------------------------------------------------------

/// Starting value for the score accumulators.
const SCORE_ZERO: f64 = 0.0;
/// Lower bound of the final confidence score.
const CONFIDENCE_MIN: f64 = 0.0;
/// Upper bound of the final confidence score; also used as the neutral
/// (no-op) multiplier when the low-entropy penalty does not apply.
const CONFIDENCE_MAX: f64 = 1.0;

// --- Per-signal weights (nominally summing to 1.0) --------------------------

/// Weight of a distinctive literal prefix; the largest single factor.
const LITERAL_PREFIX_WEIGHT: f64 = 0.35;
/// Weight of a context anchor next to the value.
const CONTEXT_ANCHOR_WEIGHT: f64 = 0.20;
/// Full weight of the entropy signal (granted at very high entropy).
const ENTROPY_WEIGHT: f64 = 0.20;
/// Weight of a secret-related keyword appearing nearby.
const KEYWORD_NEARBY_WEIGHT: f64 = 0.10;
/// Weight of the match living in a sensitive-looking file.
const SENSITIVE_FILE_WEIGHT: f64 = 0.10;
/// Weight of a companion credential being found.
const COMPANION_WEIGHT: f64 = 0.05;

// --- Entropy tiers (checked from highest to lowest) -------------------------

/// Entropy at or above this earns the full `ENTROPY_WEIGHT`.
const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;
/// Entropy at or above this earns `HIGH_ENTROPY_PARTIAL_WEIGHT`.
const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;
/// Partial entropy credit for the "high" tier.
const HIGH_ENTROPY_PARTIAL_WEIGHT: f64 = 0.12;
/// Entropy at or above this earns `MODERATE_ENTROPY_WEIGHT`.
const MODERATE_ENTROPY_THRESHOLD: f64 = 3.0;
/// Small entropy credit for the "moderate" tier.
const MODERATE_ENTROPY_WEIGHT: f64 = 0.05;

// --- Low-entropy penalty ----------------------------------------------------

/// Entropy strictly below this is treated as a negative signal.
const LOW_ENTROPY_THRESHOLD: f64 = 2.0;
/// The penalty only applies to matches strictly longer than this.
const LOW_ENTROPY_MIN_MATCH_LENGTH: usize = 10;
/// Multiplier applied to the normalized score when the penalty triggers.
const LOW_ENTROPY_PENALTY: f64 = 0.6;
21
/// Confidence signals for a potential match.
///
/// Each field is one independent piece of evidence that
/// [`compute_confidence`] weighs when scoring a candidate secret.
///
/// The [`Default`] value has every flag cleared, zero entropy and zero
/// length, which makes it a convenient base for struct-update syntax.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::confidence::ConfidenceSignals;
///
/// let signals = ConfidenceSignals {
///     has_literal_prefix: true,
///     has_context_anchor: true,
///     entropy: 5.0,
///     keyword_nearby: true,
///     sensitive_file: true,
///     match_length: 32,
///     has_companion: false,
/// };
/// assert!(signals.has_literal_prefix);
///
/// // Unset signals can be filled in from the all-clear default.
/// let partial = ConfidenceSignals {
///     entropy: 4.0,
///     ..Default::default()
/// };
/// assert!(!partial.has_companion);
/// ```
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ConfidenceSignals {
    /// Pattern has a distinctive literal prefix (e.g., sk-proj-, ghp_)
    pub has_literal_prefix: bool,
    /// Pattern uses a capture group with context anchoring
    pub has_context_anchor: bool,
    /// Shannon entropy of the matched credential
    pub entropy: f64,
    /// A secret-related keyword appears nearby
    pub keyword_nearby: bool,
    /// File extension suggests config/env/secret file
    pub sensitive_file: bool,
    /// Matched credential length
    pub match_length: usize,
    /// Companion credential was found
    pub has_companion: bool,
}
56
57/// Compute a confidence score from 0.0 to 1.0.
58///
59/// The hand-tuned weights below were calibrated against the adversarial test
60/// corpus before the scanner blends this heuristic score with the ML model:
61/// literal prefixes dominate, context and entropy are secondary, and file/path
62/// hints plus companion matches provide smaller incremental adjustments.
63///
64/// # Examples
65///
66/// ```rust
67/// use keyhog_scanner::confidence::{ConfidenceSignals, compute_confidence};
68///
69/// let score = compute_confidence(&ConfidenceSignals {
70///     has_literal_prefix: true,
71///     has_context_anchor: true,
72///     entropy: 5.0,
73///     keyword_nearby: true,
74///     sensitive_file: true,
75///     match_length: 32,
76///     has_companion: false,
77/// });
78/// assert!(score > 0.5);
79/// ```
80pub fn compute_confidence(signals: &ConfidenceSignals) -> f64 {
81    let mut score = SCORE_ZERO;
82    let mut max_possible = SCORE_ZERO;
83
84    // Literal prefix: strongest signal. If it starts with "sk-proj-", it's almost certainly real.
85    // Literal prefix is the strongest signal: sk-proj-, ghp_, AKIA are nearly certain.
86    // Weight: 0.35 (largest single factor). Validated by ML classifier agreement.
87    max_possible += LITERAL_PREFIX_WEIGHT;
88    if signals.has_literal_prefix {
89        score += LITERAL_PREFIX_WEIGHT;
90    }
91
92    // Context anchor: "API_KEY=..." near the value.
93    max_possible += CONTEXT_ANCHOR_WEIGHT;
94    if signals.has_context_anchor {
95        score += CONTEXT_ANCHOR_WEIGHT;
96    }
97
98    // Entropy: high entropy = likely random/secret, low entropy = likely placeholder.
99    max_possible += ENTROPY_WEIGHT;
100    if signals.entropy >= VERY_HIGH_ENTROPY_THRESHOLD {
101        score += ENTROPY_WEIGHT;
102    } else if signals.entropy >= HIGH_ENTROPY_THRESHOLD {
103        score += HIGH_ENTROPY_PARTIAL_WEIGHT;
104    } else if signals.entropy >= MODERATE_ENTROPY_THRESHOLD {
105        score += MODERATE_ENTROPY_WEIGHT;
106    }
107    // Very low entropy is a negative signal: multiply down the score.
108    // Applied after normalization so the weighted-average math stays consistent.
109    let low_entropy_penalty = if signals.entropy < LOW_ENTROPY_THRESHOLD
110        && signals.match_length > LOW_ENTROPY_MIN_MATCH_LENGTH
111    {
112        LOW_ENTROPY_PENALTY
113    } else {
114        CONFIDENCE_MAX
115    };
116
117    // Keyword proximity.
118    max_possible += KEYWORD_NEARBY_WEIGHT;
119    if signals.keyword_nearby {
120        score += KEYWORD_NEARBY_WEIGHT;
121    }
122
123    // Sensitive file type (.env, .secrets, credentials.json, etc.)
124    max_possible += SENSITIVE_FILE_WEIGHT;
125    if signals.sensitive_file {
126        score += SENSITIVE_FILE_WEIGHT;
127    }
128
129    // Companion found (e.g., AWS secret key near access key).
130    max_possible += COMPANION_WEIGHT;
131    if signals.has_companion {
132        score += COMPANION_WEIGHT;
133    }
134
135    // Normalize to 0.0 - 1.0.
136    if max_possible == SCORE_ZERO {
137        return SCORE_ZERO;
138    }
139    let normalized_score: f64 = (score / max_possible) * low_entropy_penalty;
140    normalized_score.clamp(CONFIDENCE_MIN, CONFIDENCE_MAX)
141}
142
/// Check if a file path suggests a sensitive file.
///
/// A path is considered sensitive when it contains any of a fixed list of
/// name fragments anywhere in the string, or when it ends with one of a fixed
/// list of extensions. Both comparisons ignore ASCII case.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::confidence::is_sensitive_path;
///
/// assert!(is_sensitive_path(".env.production"));
/// assert!(!is_sensitive_path("src/main.rs"));
/// ```
pub fn is_sensitive_path(path: &str) -> bool {
    // Fragments that mark a path as sensitive wherever they occur in it.
    const NAME_FRAGMENTS: &[&[u8]] = &[
        b".env",
        b".env.local",
        b".env.production",
        b".env.staging",
        b"credentials",
        b"secrets",
        b"apikeys",
        b"api_keys",
        b".npmrc",
        b".pypirc",
        b".netrc",
        b".pgpass",
        b"terraform.tfvars",
        b"variables.tf",
        b"docker-compose",
        b"application.yml",
        b"application.properties",
        b"config.json",
        b"config.yaml",
    ];
    // Extensions that mark a path as sensitive when it ends with them.
    const SUFFIXES: &[&[u8]] = &[b".env", b".pem", b".key", b".p12", b".pfx", b".jks"];

    let haystack = path.as_bytes();

    // Case-insensitive substring search over the raw bytes.
    let contains_fragment = |needle: &[u8]| {
        haystack
            .windows(needle.len())
            .any(|window| window.eq_ignore_ascii_case(needle))
    };

    // Case-insensitive suffix test; a path shorter than the suffix never matches.
    let ends_with_suffix = |suffix: &[u8]| match haystack.len().checked_sub(suffix.len()) {
        Some(start) => haystack[start..].eq_ignore_ascii_case(suffix),
        None => false,
    };

    NAME_FRAGMENTS
        .iter()
        .any(|&needle| contains_fragment(needle))
        || SUFFIXES.iter().any(|&suffix| ends_with_suffix(suffix))
}
195
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing a score against an exact expectation.
    const EPSILON: f64 = 1e-9;

    /// Signals with every boolean flag cleared; individual tests switch on
    /// the flags they care about via struct-update syntax.
    fn quiet(entropy: f64, match_length: usize) -> ConfidenceSignals {
        ConfidenceSignals {
            has_literal_prefix: false,
            has_context_anchor: false,
            entropy,
            keyword_nearby: false,
            sensitive_file: false,
            match_length,
            has_companion: false,
        }
    }

    /// Scores `signals` and asserts the result is within `EPSILON` of `expected`.
    fn assert_score_close(signals: &ConfidenceSignals, expected: f64) {
        let score = compute_confidence(signals);
        assert!((score - expected).abs() < EPSILON, "score was {}", score);
    }

    // ----- Broad scoring scenarios -----

    #[test]
    fn high_confidence_with_prefix_and_entropy() {
        let score = compute_confidence(&ConfidenceSignals {
            has_literal_prefix: true,
            keyword_nearby: true,
            sensitive_file: true,
            ..quiet(5.2, 50)
        });
        assert!(score > 0.6, "score was {}", score);
    }

    #[test]
    fn low_confidence_generic_hex() {
        let score = compute_confidence(&quiet(3.5, 32));
        assert!(score < 0.3, "score was {}", score);
    }

    #[test]
    fn medium_confidence_with_context() {
        let score = compute_confidence(&ConfidenceSignals {
            has_context_anchor: true,
            keyword_nearby: true,
            ..quiet(4.8, 40)
        });
        assert!(score > 0.4 && score < 0.8, "score was {}", score);
    }

    #[test]
    fn sensitive_paths() {
        assert!(is_sensitive_path(".env.production"));
        assert!(is_sensitive_path("config/credentials.json"));
        assert!(is_sensitive_path("server.key"));
        assert!(!is_sensitive_path("src/main.rs"));
        assert!(!is_sensitive_path("README.md"));
    }

    #[test]
    fn low_entropy_penalty() {
        // Entropy 1.5 is very low: likely "aaaaaaa..." or a placeholder.
        let score = compute_confidence(&ConfidenceSignals {
            has_literal_prefix: true,
            ..quiet(1.5, 32)
        });
        // Should be penalized despite having prefix
        assert!(score < 0.5, "score was {}", score);
    }

    // ----- Range extremes -----

    #[test]
    fn confidence_is_zero_without_positive_signals() {
        assert_eq!(compute_confidence(&quiet(0.0, 0)), 0.0);
    }

    #[test]
    fn confidence_clamps_to_one_for_all_positive_signals() {
        let everything = ConfidenceSignals {
            has_literal_prefix: true,
            has_context_anchor: true,
            entropy: 8.0,
            keyword_nearby: true,
            sensitive_file: true,
            match_length: 128,
            has_companion: true,
        };
        assert_eq!(compute_confidence(&everything), 1.0);
    }

    // ----- Entropy tiers -----

    #[test]
    fn very_high_entropy_gets_full_entropy_weight() {
        assert_score_close(&quiet(VERY_HIGH_ENTROPY_THRESHOLD, 32), 0.2);
    }

    #[test]
    fn high_entropy_gets_partial_entropy_weight() {
        assert_score_close(&quiet(HIGH_ENTROPY_THRESHOLD, 32), 0.12);
    }

    #[test]
    fn moderate_entropy_gets_small_weight() {
        assert_score_close(&quiet(3.0, 32), 0.05);
    }

    #[test]
    fn entropy_below_moderate_threshold_adds_no_weight() {
        assert_eq!(compute_confidence(&quiet(2.99, 32)), 0.0);
    }

    // ----- Low-entropy penalty boundaries -----

    #[test]
    fn low_entropy_penalty_requires_length_above_threshold() {
        // Length exactly at the minimum does not trigger the penalty.
        let signals = ConfidenceSignals {
            has_literal_prefix: true,
            ..quiet(1.0, 10)
        };
        assert_score_close(&signals, 0.35);
    }

    #[test]
    fn low_entropy_penalty_applies_only_below_threshold() {
        // Entropy exactly at the threshold does not trigger the penalty.
        let signals = ConfidenceSignals {
            has_literal_prefix: true,
            ..quiet(2.0, 64)
        };
        assert_score_close(&signals, 0.35);
    }

    #[test]
    fn low_entropy_penalty_scales_nonzero_score() {
        let signals = ConfidenceSignals {
            has_literal_prefix: true,
            has_context_anchor: true,
            ..quiet(1.0, 11)
        };
        assert_score_close(&signals, 0.33);
    }

    // ----- Individual signal weights (penalized: entropy 0.0, long match) -----

    #[test]
    fn companion_signal_adds_expected_weight() {
        let signals = ConfidenceSignals {
            has_companion: true,
            ..quiet(0.0, 24)
        };
        assert_score_close(&signals, 0.03);
    }

    #[test]
    fn context_and_keyword_signals_stack_linearly() {
        let signals = ConfidenceSignals {
            has_context_anchor: true,
            keyword_nearby: true,
            ..quiet(0.0, 20)
        };
        assert_score_close(&signals, 0.18);
    }

    // ----- Path classification -----

    #[test]
    fn sensitive_path_matches_case_insensitively() {
        assert!(is_sensitive_path("CONFIG/.ENV.PRODUCTION"));
        assert!(is_sensitive_path("Secrets/CREDENTIALS.JSON"));
        assert!(is_sensitive_path("keys/CLIENT.P12"));
    }

    #[test]
    fn sensitive_path_rejects_empty_and_non_sensitive_values() {
        assert!(!is_sensitive_path(""));
        assert!(!is_sensitive_path("notes/environment.txt"));
        assert!(!is_sensitive_path("docs/secretary.txt"));
    }

    #[test]
    fn sensitive_path_detects_embedded_sensitive_names_with_special_characters() {
        assert!(is_sensitive_path("deploy/docker-compose.override.yml"));
        assert!(is_sensitive_path("dir/my api_keys-backup.txt"));
        assert!(is_sensitive_path("nested/application.properties.template"));
    }

    #[test]
    fn sensitive_path_handles_huge_input() {
        let long_prefix = "a/".repeat(4096);
        let long_sensitive = format!("{long_prefix}terraform.tfvars");
        let long_non_sensitive = format!("{long_prefix}plain-text-file.txt");
        assert!(is_sensitive_path(&long_sensitive));
        assert!(!is_sensitive_path(&long_non_sensitive));
    }
}