Skip to main content

keyhog_scanner/entropy/
mod.rs

1//! Shannon entropy analysis for distinguishing secrets from ordinary text.
2//!
3//! Real secrets have high entropy (4.5+), while hashes, UUIDs, and placeholders
4//! have characteristic entropy profiles that help separate true positives.
5
6pub(crate) mod keywords;
7mod scanner;
8
9#[cfg(test)]
10mod entropy_tests;
11
12pub use scanner::{find_entropy_secrets, find_entropy_secrets_with_threshold, is_sensitive_file};
13
/// Threshold for keyword-context entropy detection.
///
/// Like every threshold in this module it is expressed on the same scale as
/// [`shannon_entropy`], i.e. bits per byte.
pub const LOW_ENTROPY_THRESHOLD: f64 = 3.0;
/// Entropy level the module documentation associates with real secrets
/// ("4.5+").
///
/// NOTE(review): this constant had no doc comment of its own; how it is
/// applied is decided by the `scanner` submodule — confirm there.
pub const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;
/// Threshold for keyword-independent entropy detection.
pub const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.8;
/// Threshold for keyword-independent detection in clearly sensitive files.
///
/// Slightly lower than [`VERY_HIGH_ENTROPY_THRESHOLD`].
pub const SENSITIVE_FILE_VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;
21
22/// Shannon entropy in bits per byte.
23/// Compute Shannon entropy of a byte slice, with thread-local caching.
24///
25/// At scale, many matches in the same file produce identical or overlapping
26/// credential strings. The cache eliminates redundant entropy computations
27/// using a fast hash of the input as key. Cache is bounded to prevent
28/// unbounded memory growth on adversarial input.
29pub fn shannon_entropy(data: &[u8]) -> f64 {
30    use std::cell::RefCell;
31    use std::collections::HashMap;
32
33    const MAX_CACHE_ENTRIES: usize = 4096;
34
35    thread_local! {
36        static CACHE: RefCell<HashMap<u64, f64>> = RefCell::new(HashMap::with_capacity(256));
37    }
38
39    // Fast hash for cache key — FNV-1a, same as decode pipeline
40    let mut hash: u64 = 0xcbf29ce484222325;
41    for &byte in data {
42        hash ^= u64::from(byte);
43        hash = hash.wrapping_mul(0x100000001b3);
44    }
45
46    CACHE.with(|cache| {
47        let mut cache = cache.borrow_mut();
48        if let Some(&cached) = cache.get(&hash) {
49            return cached;
50        }
51        let entropy = shannon_entropy_uncached(data);
52        if cache.len() >= MAX_CACHE_ENTRIES {
53            cache.clear(); // simple eviction — bounded memory
54        }
55        cache.insert(hash, entropy);
56        entropy
57    })
58}
59
60fn shannon_entropy_uncached(data: &[u8]) -> f64 {
61    crate::entropy_fast::shannon_entropy_simd(data)
62}
63
64/// Compute entropy normalized to the range `0.0..=1.0`.
65pub fn normalized_entropy(data: &[u8]) -> f64 {
66    if data.is_empty() {
67        return 0.0;
68    }
69
70    let unique_chars = {
71        let mut seen = [false; 256];
72        for &byte in data {
73            seen[byte as usize] = true;
74        }
75        seen.iter().filter(|&&value| value).count()
76    };
77
78    if unique_chars <= 1 {
79        return 0.0;
80    }
81
82    let max_entropy = (unique_chars as f64).log2();
83    if max_entropy == 0.0 {
84        return 0.0;
85    }
86
87    shannon_entropy(data) / max_entropy
88}
89
/// A candidate secret produced by the entropy-based fallback detection.
#[derive(Clone, Debug)]
pub struct EntropyMatch {
    /// Text of the candidate that exceeded the entropy threshold.
    pub value: String,
    /// Shannon entropy that was measured for `value`.
    pub entropy: f64,
    /// Keyword context that led to the candidate being evaluated.
    pub keyword: String,
    /// Source line number of the match, counted from one.
    pub line: usize,
    /// Byte offset at which the line containing the match starts.
    pub offset: usize,
}
104
/// Decide whether entropy scanning should run for the given path.
///
/// The path is compared case-insensitively (it is lowercased first). Rules
/// are applied in this order:
///
/// 1. No path at all: scan.
/// 2. Path ends with `.json`, `.lock`, `.map`, `.min.js` or `.min.css`:
///    never scan, regardless of `allow_source_files`.
/// 3. `allow_source_files` is set: scan.
/// 4. Path ends with one of the configuration/secret extensions: scan.
/// 5. The final path component (split on `/` or `\`) starts with one of the
///    sensitive file-name prefixes: scan.
/// 6. Anything else: do not scan.
pub fn is_entropy_appropriate(path: Option<&str>, allow_source_files: bool) -> bool {
    const EXCLUDED_SUFFIXES: [&str; 5] = [".json", ".lock", ".map", ".min.js", ".min.css"];
    const CONFIG_SUFFIXES: [&str; 14] = [
        ".env",
        ".yaml",
        ".yml",
        ".toml",
        ".properties",
        ".cfg",
        ".conf",
        ".ini",
        ".config",
        ".secrets",
        ".pem",
        ".key",
        ".tfvars",
        ".hcl",
    ];
    const SENSITIVE_NAME_PREFIXES: [&str; 8] = [
        ".env",
        "credentials",
        "secrets",
        "apikeys",
        "docker-compose",
        ".npmrc",
        ".pypirc",
        ".netrc",
    ];

    let lower = match path {
        Some(raw) => raw.to_lowercase(),
        None => return true,
    };
    let ends_with_any =
        |suffixes: &[&str]| suffixes.iter().any(|suffix| lower.ends_with(*suffix));

    if ends_with_any(&EXCLUDED_SUFFIXES) {
        return false;
    }
    if allow_source_files || ends_with_any(&CONFIG_SUFFIXES) {
        return true;
    }

    // Only the last path component takes part in the name-prefix check.
    let filename = lower.rsplit(['/', '\\']).next().unwrap_or(&lower);
    SENSITIVE_NAME_PREFIXES
        .iter()
        .any(|prefix| filename.starts_with(*prefix))
}