Skip to main content

keyhog_scanner/entropy/
mod.rs

1//! Shannon entropy analysis for distinguishing secrets from ordinary text.
2//!
3//! Real secrets have high entropy (4.5+), while hashes, UUIDs, and placeholders
4//! have characteristic entropy profiles that help separate true positives.
5
6pub mod keywords;
7pub(crate) mod scanner;
8
9pub use scanner::{find_entropy_secrets, find_entropy_secrets_with_threshold, is_sensitive_file};
10
/// Entropy cutoff used for keyword-context detection, i.e. candidates found
/// next to a secret keyword.
pub const LOW_ENTROPY_THRESHOLD: f64 = 3.0;
/// Entropy level that the module docs associate with real secrets ("4.5+").
///
/// NOTE(review): this constant had no doc comment of its own; confirm which
/// detection path consumes it.
pub const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;
/// Entropy cutoff for keyword-independent detection (no keyword context
/// required).
pub const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.8;
/// Entropy cutoff for keyword-independent detection inside files that are
/// clearly sensitive; lower than [`VERY_HIGH_ENTROPY_THRESHOLD`].
pub const SENSITIVE_FILE_VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;
18
19/// Shannon entropy in bits per byte, with thread-local caching for repeat
20/// inputs ≤1KB (typical credential size). Cache evicts wholesale when full
21/// to bound memory under adversarial input.
22pub fn shannon_entropy(data: &[u8]) -> f64 {
23    // Length gate: don't cache entropy for massive buffers (e.g. minified JS)
24    // that won't repeat exactly. Just calculate directly.
25    if data.len() > 1024 {
26        return shannon_entropy_uncached(data);
27    }
28
29    use std::cell::RefCell;
30    use std::collections::HashMap;
31
32    thread_local! {
33        static CACHE: RefCell<HashMap<u64, f64>> = RefCell::new(HashMap::with_capacity(256));
34    }
35
36    // FNV-1a content key, shared seed with every other per-scan cache.
37    let hash = crate::util_hash::hash_fast(data);
38    crate::util_hash::memoize_by_hash(
39        &CACHE,
40        hash,
41        crate::util_hash::DEFAULT_MAX_CACHE_ENTRIES,
42        || shannon_entropy_uncached(data),
43    )
44}
45
46fn shannon_entropy_uncached(data: &[u8]) -> f64 {
47    crate::entropy_fast::shannon_entropy_simd(data)
48}
49
50/// Shannon entropy rescaled to `0.0..=1.0` by dividing by `log2(unique_bytes)`.
51pub fn normalized_entropy(data: &[u8]) -> f64 {
52    if data.is_empty() {
53        return 0.0;
54    }
55
56    let unique_chars = {
57        let mut seen = [false; 256];
58        for &byte in data {
59            seen[byte as usize] = true;
60        }
61        seen.iter().filter(|&&value| value).count()
62    };
63
64    if unique_chars <= 1 {
65        return 0.0;
66    }
67
68    let max_entropy = (unique_chars as f64).log2();
69    if max_entropy == 0.0 {
70        return 0.0;
71    }
72
73    shannon_entropy(data) / max_entropy
74}
75
/// A candidate secret reported by the entropy-based fallback detection.
#[derive(Debug, Clone)]
pub struct EntropyMatch {
    /// Text of the candidate whose entropy exceeded the threshold.
    pub value: String,
    /// Shannon entropy computed for `value`.
    pub entropy: f64,
    /// Keyword context that led to this candidate being evaluated.
    pub keyword: String,
    /// Source line number of the match, counted from one.
    pub line: usize,
    /// Byte offset at which the line containing the match begins.
    pub offset: usize,
}
90
91/// True if the file at `path` is worth running entropy scanning on.
92///
93/// Path-only gate: `.json` and all source-code extensions are hard-OFF here.
94/// For the keyword-anchored lift of those hard-OFFs (a `.json` body or a
95/// source file that carries a secret-keyword assignment line still holds
96/// real, unprefixed high-entropy secrets), call
97/// [`is_entropy_appropriate_with_content`], which the entropy fallback uses.
98pub fn is_entropy_appropriate(path: Option<&str>, allow_source_files: bool) -> bool {
99    is_entropy_appropriate_inner(path, allow_source_files, false)
100}
101
102/// Content-aware variant of [`is_entropy_appropriate`].
103///
104/// `has_secret_keyword_line` is true when the chunk text contains at least one
105/// secret-keyword assignment line (same predicate the entropy scanner uses to
106/// seed keyword contexts, [`keywords::find_keyword_assignment_lines`]). When
107/// set, two path-only hard-OFFs are lifted:
108///
109///   * `.json` files (the single biggest FN wrapper - `{"auth": "<40-char
110///     base64>"}` was scoring 0 while the identical `auth: "<same>"` in
111///     `.yaml` was caught), and
112///   * source-code files when `allow_source_files` is false (the dominant
113///     go/rust/js FN shape `const apiKey = "<base64-40>"` lives in a quoted
114///     RHS of a const/assignment with a secret keyword).
115///
116/// Both lifts are contract-safe: the keyword-assignment anchor confines the
117/// recall expansion to credential-shaped lines, away from prose / identifiers,
118/// and the per-candidate suppression gates on the emit path
119/// (pure-identifier, prose, kebab, filename-shape, ...) still run.
120///
121/// `.lock` / `.map` / minified bundles stay hard-OFF unconditionally - they
122/// are not credential wrappers, only alphabet-coincidence noise.
123pub fn is_entropy_appropriate_with_content(
124    path: Option<&str>,
125    allow_source_files: bool,
126    text: &str,
127    secret_keywords: &[String],
128) -> bool {
129    let has_secret_keyword_line = !keywords::find_keyword_assignment_lines(
130        &text.lines().collect::<Vec<_>>(),
131        secret_keywords,
132    )
133    .is_empty();
134    is_entropy_appropriate_inner(path, allow_source_files, has_secret_keyword_line)
135}
136
/// Shared path gate behind [`is_entropy_appropriate`] and
/// [`is_entropy_appropriate_with_content`].
///
/// Decision order (first match wins):
///
/// 1. no path                                   -> `true`
/// 2. `.lock` / `.map`                          -> `false`
/// 3. `.json` without a secret-keyword line     -> `false`
/// 4. `.min.js` / `.min.css`                    -> `false`
/// 5. `allow_source_files`                      -> `true`
/// 6. package-manifest filename                 -> `false`
/// 7. config / secret extension                 -> `true`
/// 8. dotfile prefix (`.env*`, `.npmrc*`, ...)  -> `true`
/// 9. secret-ish stem, bare or + config ext     -> `true`
/// 10. anything else                            -> `has_secret_keyword_line`
///
/// All comparisons are ASCII case-insensitive and work on the path bytes
/// directly, so no lowercase copy of the path is allocated; this runs for
/// every chunk during a scan.
fn is_entropy_appropriate_inner(
    path: Option<&str>,
    allow_source_files: bool,
    has_secret_keyword_line: bool,
) -> bool {
    // Package manifests (matched on the whole filename). Their
    // `[package.keywords]` / "keywords" / "categories" arrays look like
    // high-entropy strings but are package metadata, not credentials:
    // `["compression", "encryption", "history"]` was reported as
    // `entropy-api-key`. #15 regression: envseal dogfood, ~10 FPs per
    // Cargo.toml.
    const MANIFEST_FILENAMES: &[&[u8]] = &[
        b"Cargo.toml",
        b"package.json",
        b"pyproject.toml",
        b"composer.json",
        b"Pipfile",
        b"Gemfile",
        b"pom.xml",
        b"build.gradle",
        b"build.gradle.kts",
        b"build.sbt",
        b"mix.exs",
    ];
    // Extensions that mark a config / secret file whatever its stem is.
    const CONFIG_EXTENSIONS: &[&[u8]] = &[
        b".env",
        b".yaml",
        b".yml",
        b".toml",
        b".properties",
        b".cfg",
        b".conf",
        b".ini",
        b".config",
        b".secrets",
        b".pem",
        b".key",
        b".tfvars",
        b".hcl",
    ];
    // Dotfiles where any filename starting with the name counts
    // (`.env-staging`, `.env.production`, `.envfile`).
    const DOTFILE_PREFIXES: &[&[u8]] = &[b".env", b".npmrc", b".pypirc", b".netrc"];
    // Stems that count only as the bare filename or when followed by a
    // config-style extension (`secrets.env`, `credentials.yaml`,
    // `apikeys.toml`, `secrets-prod.toml`). `secrets.rs`, `credentials.py`,
    // `apikeys.go` are source code ABOUT credentials and must stay on the
    // source-code path. #15 regression: envseal/cli/src/tui/secrets.rs fired
    // entropy on every `Style`/`Paragraph::new` call when "secrets" was a
    // plain prefix match.
    const SECRET_STEMS: &[&[u8]] = &[b"credentials", b"secrets", b"apikeys", b"docker-compose"];
    // Extensions accepted after a secret stem in addition to
    // `CONFIG_EXTENSIONS`.
    const STEM_ONLY_EXTENSIONS: &[&[u8]] = &[b".enc", b".vault", b".prod", b".txt"];

    /// ASCII case-insensitive `ends_with` on raw bytes.
    fn ends_with_ci(haystack: &[u8], suffix: &[u8]) -> bool {
        match haystack.len().checked_sub(suffix.len()) {
            Some(start) => haystack[start..].eq_ignore_ascii_case(suffix),
            None => false,
        }
    }

    /// ASCII case-insensitive `starts_with` on raw bytes.
    fn starts_with_ci(haystack: &[u8], prefix: &[u8]) -> bool {
        match haystack.get(..prefix.len()) {
            Some(head) => head.eq_ignore_ascii_case(prefix),
            None => false,
        }
    }

    let raw = match path {
        Some(p) => p.as_bytes(),
        None => return true,
    };
    let path_ends_with = |suffix: &[u8]| ends_with_ci(raw, suffix);

    // `.lock` / `.map` are never credential wrappers, keyword line or not.
    if path_ends_with(b".lock") || path_ends_with(b".map") {
        return false;
    }
    // `.json` is admitted only when a secret-keyword assignment line is
    // present: JSON is the biggest FN wrapper, but only keyword-anchored
    // bodies hold real secrets.
    if !has_secret_keyword_line && path_ends_with(b".json") {
        return false;
    }
    // Minified bundles stay off unconditionally.
    if path_ends_with(b".min.js") || path_ends_with(b".min.css") {
        return false;
    }
    if allow_source_files {
        return true;
    }

    // Final path segment after the last `/` or `\`, as a sub-slice.
    let filename = match raw.iter().rposition(|&b| matches!(b, b'/' | b'\\')) {
        Some(separator) => &raw[separator + 1..],
        None => raw,
    };

    if MANIFEST_FILENAMES
        .iter()
        .any(|&manifest| filename.eq_ignore_ascii_case(manifest))
    {
        return false;
    }

    if CONFIG_EXTENSIONS.iter().any(|&ext| path_ends_with(ext)) {
        return true;
    }

    if DOTFILE_PREFIXES
        .iter()
        .any(|&prefix| starts_with_ci(filename, prefix))
    {
        return true;
    }

    let secret_stem_file = SECRET_STEMS.iter().any(|&stem| {
        if !starts_with_ci(filename, stem) {
            return false;
        }
        // An empty remainder means the filename is exactly the stem.
        let remainder = &filename[stem.len()..];
        remainder.is_empty()
            || CONFIG_EXTENSIONS
                .iter()
                .chain(STEM_ONLY_EXTENSIONS.iter())
                .any(|&ext| ends_with_ci(remainder, ext))
    });
    if secret_stem_file {
        return true;
    }

    // Source-file lift: whatever reaches this point is neither a recognized
    // config/secret file nor a package manifest. The dominant go/rust/js FN
    // shape is `const apiKey = "<base64-40>"`, so scanning is allowed here
    // without `--entropy-source-files` only when the chunk carries a
    // secret-keyword assignment line; the per-candidate emit gates
    // (pure-identifier, prose, kebab, filename-shape, ...) still reject
    // identifier noise. Manifests already returned `false` above, so a
    // `name = "my-secret"` line in `Cargo.toml` cannot re-enable scanning.
    has_secret_keyword_line
}