keyhog_scanner/entropy/mod.rs
1//! Shannon entropy analysis for distinguishing secrets from ordinary text.
2//!
3//! Real secrets have high entropy (4.5+), while hashes, UUIDs, and placeholders
4//! have characteristic entropy profiles that help separate true positives.
5
6pub mod keywords;
7pub(crate) mod scanner;
8
9pub use scanner::{find_entropy_secrets, find_entropy_secrets_with_threshold, is_sensitive_file};
10
/// Threshold for keyword-context entropy detection.
pub const LOW_ENTROPY_THRESHOLD: f64 = 3.0;
/// Entropy level the module docs associate with real secrets ("4.5+").
///
/// NOTE(review): which detection path consumes this constant is not visible
/// in this file - confirm against the scanner before relying on this wording.
pub const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;
/// Threshold for keyword-independent entropy detection.
pub const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.8;
/// Threshold for keyword-independent detection in clearly sensitive files.
pub const SENSITIVE_FILE_VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;
18
19/// Shannon entropy in bits per byte, with thread-local caching for repeat
20/// inputs ≤1KB (typical credential size). Cache evicts wholesale when full
21/// to bound memory under adversarial input.
22pub fn shannon_entropy(data: &[u8]) -> f64 {
23 // Length gate: don't cache entropy for massive buffers (e.g. minified JS)
24 // that won't repeat exactly. Just calculate directly.
25 if data.len() > 1024 {
26 return shannon_entropy_uncached(data);
27 }
28
29 use std::cell::RefCell;
30 use std::collections::HashMap;
31
32 thread_local! {
33 static CACHE: RefCell<HashMap<u64, f64>> = RefCell::new(HashMap::with_capacity(256));
34 }
35
36 // FNV-1a content key, shared seed with every other per-scan cache.
37 let hash = crate::util_hash::hash_fast(data);
38 crate::util_hash::memoize_by_hash(
39 &CACHE,
40 hash,
41 crate::util_hash::DEFAULT_MAX_CACHE_ENTRIES,
42 || shannon_entropy_uncached(data),
43 )
44}
45
/// Computes the entropy of `data` without consulting the thread-local cache.
///
/// Pure delegate to `crate::entropy_fast::shannon_entropy_simd`; it exists so
/// the cached entry point has a single place to call on a miss or bypass.
fn shannon_entropy_uncached(data: &[u8]) -> f64 {
    crate::entropy_fast::shannon_entropy_simd(data)
}
49
50/// Shannon entropy rescaled to `0.0..=1.0` by dividing by `log2(unique_bytes)`.
51pub fn normalized_entropy(data: &[u8]) -> f64 {
52 if data.is_empty() {
53 return 0.0;
54 }
55
56 let unique_chars = {
57 let mut seen = [false; 256];
58 for &byte in data {
59 seen[byte as usize] = true;
60 }
61 seen.iter().filter(|&&value| value).count()
62 };
63
64 if unique_chars <= 1 {
65 return 0.0;
66 }
67
68 let max_entropy = (unique_chars as f64).log2();
69 if max_entropy == 0.0 {
70 return 0.0;
71 }
72
73 shannon_entropy(data) / max_entropy
74}
75
/// Entropy-based candidate match returned by fallback secret detection.
///
/// Plain owned data (`String` / `f64` / `usize` fields), so a match can be
/// cloned and kept after the scanned text is dropped.
#[derive(Debug, Clone)]
pub struct EntropyMatch {
    /// The candidate string that exceeded the entropy threshold.
    pub value: String,
    /// Shannon entropy measured for `value`.
    pub entropy: f64,
    /// The keyword context that caused the candidate to be evaluated.
    ///
    /// NOTE(review): what this holds for keyword-independent detections is
    /// not visible in this file - confirm against the scanner.
    pub keyword: String,
    /// One-based source line number for the match.
    pub line: usize,
    /// Byte offset of the start of the containing line.
    ///
    /// NOTE(review): presumably relative to the scanned text, not to `value`
    /// within the line - confirm against the scanner.
    pub offset: usize,
}
90
/// True if the file at `path` is worth running entropy scanning on.
///
/// Path-only gate: `.json` and all source-code extensions are hard-OFF here.
/// For the keyword-anchored lift of those hard-OFFs (a `.json` body or a
/// source file that carries a secret-keyword assignment line still holds
/// real, unprefixed high-entropy secrets), call
/// [`is_entropy_appropriate_with_content`], which the entropy fallback uses.
///
/// * `path` - file path to classify; `None` means "no path known" and is
///   always accepted.
/// * `allow_source_files` - when true, files that are not on a hard-OFF list
///   are accepted without inspecting the filename any further.
///
/// Delegates to the shared inner gate with the keyword-line flag fixed to
/// `false`.
pub fn is_entropy_appropriate(path: Option<&str>, allow_source_files: bool) -> bool {
    is_entropy_appropriate_inner(path, allow_source_files, false)
}
101
102/// Content-aware variant of [`is_entropy_appropriate`].
103///
104/// `has_secret_keyword_line` is true when the chunk text contains at least one
105/// secret-keyword assignment line (same predicate the entropy scanner uses to
106/// seed keyword contexts, [`keywords::find_keyword_assignment_lines`]). When
107/// set, two path-only hard-OFFs are lifted:
108///
109/// * `.json` files (the single biggest FN wrapper - `{"auth": "<40-char
110/// base64>"}` was scoring 0 while the identical `auth: "<same>"` in
111/// `.yaml` was caught), and
112/// * source-code files when `allow_source_files` is false (the dominant
113/// go/rust/js FN shape `const apiKey = "<base64-40>"` lives in a quoted
114/// RHS of a const/assignment with a secret keyword).
115///
116/// Both lifts are contract-safe: the keyword-assignment anchor confines the
117/// recall expansion to credential-shaped lines, away from prose / identifiers,
118/// and the per-candidate suppression gates on the emit path
119/// (pure-identifier, prose, kebab, filename-shape, ...) still run.
120///
121/// `.lock` / `.map` / minified bundles stay hard-OFF unconditionally - they
122/// are not credential wrappers, only alphabet-coincidence noise.
123pub fn is_entropy_appropriate_with_content(
124 path: Option<&str>,
125 allow_source_files: bool,
126 text: &str,
127 secret_keywords: &[String],
128) -> bool {
129 let has_secret_keyword_line = !keywords::find_keyword_assignment_lines(
130 &text.lines().collect::<Vec<_>>(),
131 secret_keywords,
132 )
133 .is_empty();
134 is_entropy_appropriate_inner(path, allow_source_files, has_secret_keyword_line)
135}
136
/// Shared path gate behind [`is_entropy_appropriate`] and
/// [`is_entropy_appropriate_with_content`].
///
/// All comparisons are ASCII case-insensitive and work on the path bytes
/// directly - no lowercase copy of the path is allocated, since this runs on
/// every chunk during a scan. The first matching rule decides:
///
/// 1. No path: scan.
/// 2. `.lock`, `.map`, `.min.js`, `.min.css`: never scan, even with a
///    keyword line.
/// 3. `.json` without a secret-keyword assignment line: skip.
/// 4. `allow_source_files`: scan.
/// 5. Filename is a package manifest: skip.
/// 6. Path ends in a config/secret extension: scan.
/// 7. Filename starts with `.env`, `.npmrc`, `.pypirc` or `.netrc`: scan.
/// 8. Filename is exactly `credentials` / `secrets` / `apikeys` /
///    `docker-compose`, or starts with one of those and ends in a
///    config-like extension: scan.
/// 9. Anything else (in practice source code): scan only when
///    `has_secret_keyword_line` is set.
fn is_entropy_appropriate_inner(
    path: Option<&str>,
    allow_source_files: bool,
    has_secret_keyword_line: bool,
) -> bool {
    // Never credential wrappers, only alphabet-coincidence noise.
    const NEVER_SCANNED_SUFFIXES: [&[u8]; 4] = [b".lock", b".map", b".min.js", b".min.css"];

    // Package manifests carry keyword/category arrays such as
    // ["compression", "encryption", "history"] that clear the keyword +
    // entropy thresholds but are metadata, not credentials
    // (#15 regression: envseal dogfood, ~10 FPs per Cargo.toml).
    const PACKAGE_MANIFESTS: [&[u8]; 11] = [
        b"Cargo.toml",
        b"package.json",
        b"pyproject.toml",
        b"composer.json",
        b"Pipfile",
        b"Gemfile",
        b"pom.xml",
        b"build.gradle",
        b"build.gradle.kts",
        b"build.sbt",
        b"mix.exs",
    ];

    // Extensions that mark a config/secret file on their own.
    const CONFIG_EXTENSIONS: [&[u8]; 14] = [
        b".env",
        b".yaml",
        b".yml",
        b".toml",
        b".properties",
        b".cfg",
        b".conf",
        b".ini",
        b".config",
        b".secrets",
        b".pem",
        b".key",
        b".tfvars",
        b".hcl",
    ];

    // Extra extensions that only count after a secret-looking stem.
    const STEM_ONLY_EXTENSIONS: [&[u8]; 4] = [b".enc", b".vault", b".prod", b".txt"];

    // Dotfiles with legitimate suffixed variants (`.env-staging`,
    // `.env.production`, `.envfile`): prefix match is enough.
    const DOTFILE_PREFIXES: [&[u8]; 4] = [b".env", b".npmrc", b".pypirc", b".netrc"];

    // Stems that need an exact match or a config-like extension. A bare
    // prefix match is deliberately NOT enough: `secrets.rs`,
    // `credentials.py`, `apikeys.go` are source code ABOUT credentials, and
    // treating them as secret files made every identifier-shaped value fire
    // (#15 regression: envseal/cli/src/tui/secrets.rs). Scanning those needs
    // `--entropy-source-files` or a keyword line.
    const SECRET_STEMS: [&[u8]; 4] = [b"credentials", b"secrets", b"apikeys", b"docker-compose"];

    /// ASCII case-insensitive `ends_with` on raw bytes.
    fn ends_with_ci(haystack: &[u8], suffix: &[u8]) -> bool {
        match haystack.len().checked_sub(suffix.len()) {
            Some(start) => haystack[start..].eq_ignore_ascii_case(suffix),
            None => false,
        }
    }

    /// ASCII case-insensitive `starts_with` on raw bytes.
    fn starts_with_ci(haystack: &[u8], prefix: &[u8]) -> bool {
        match haystack.get(..prefix.len()) {
            Some(head) => head.eq_ignore_ascii_case(prefix),
            None => false,
        }
    }

    let path_bytes = match path {
        Some(known) => known.as_bytes(),
        None => return true,
    };

    // Unconditional hard-OFFs.
    if NEVER_SCANNED_SUFFIXES
        .iter()
        .any(|suffix| ends_with_ci(path_bytes, suffix))
    {
        return false;
    }

    // JSON is the biggest FN wrapper, but only keyword-anchored bodies hold
    // real secrets, so the hard-OFF is lifted only with a keyword line.
    if !has_secret_keyword_line && ends_with_ci(path_bytes, b".json") {
        return false;
    }

    if allow_source_files {
        return true;
    }

    // Last path segment after `/` or `\` - a sub-slice, no allocation.
    let filename = path_bytes
        .rsplit(|&byte| byte == b'/' || byte == b'\\')
        .next()
        .unwrap_or(path_bytes);

    if PACKAGE_MANIFESTS
        .iter()
        .any(|manifest| filename.eq_ignore_ascii_case(manifest))
    {
        return false;
    }

    let has_config_extension = CONFIG_EXTENSIONS
        .iter()
        .any(|extension| ends_with_ci(path_bytes, extension));

    let has_dotfile_prefix = || {
        DOTFILE_PREFIXES
            .iter()
            .any(|prefix| starts_with_ci(filename, prefix))
    };

    // Exact stem (`secrets`) or stem + config-like extension
    // (`secrets.yaml`, `credentials.env`, `secrets-prod.toml`).
    let has_secret_stem = || {
        SECRET_STEMS.iter().any(|stem| {
            if !starts_with_ci(filename, stem) {
                return false;
            }
            let tail = &filename[stem.len()..];
            tail.is_empty()
                || CONFIG_EXTENSIONS
                    .iter()
                    .chain(STEM_ONLY_EXTENSIONS.iter())
                    .any(|extension| ends_with_ci(tail, extension))
        })
    };

    if has_config_extension || has_dotfile_prefix() || has_secret_stem() {
        return true;
    }

    // Source-file lift: what is left is neither a recognised config/secret
    // file nor a package manifest. The dominant go/rust/js FN shape is
    // `const apiKey = "<base64-40>"`, so a secret-keyword assignment line
    // re-enables scanning here; the per-candidate emit gates still reject
    // identifier noise. Manifests already returned `false` above, so a
    // `name = "my-secret"` line in `Cargo.toml` cannot re-enable scanning.
    has_secret_keyword_line
}