Skip to main content

keyhog_scanner/
entropy.rs

1//! Shannon entropy analysis for distinguishing secrets from ordinary text.
2//!
3//! Real secrets have high entropy (4.5+), while hashes, UUIDs, and placeholders
4//! have characteristic entropy profiles that help separate true positives.
5
/// Compute the Shannon entropy of `data` in bits per byte.
///
/// The result ranges from `0.0` (empty input, or a single repeated byte) to
/// `8.0` (all 256 byte values equally frequent).
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::entropy::shannon_entropy;
///
/// assert_eq!(shannon_entropy(b""), 0.0);
/// ```
pub fn shannon_entropy(data: &[u8]) -> f64 {
    const LANES: usize = 4;

    if data.is_empty() {
        return 0.0;
    }

    // Count into four independent histograms, one per position inside each
    // 4-byte group, so consecutive increments never touch the same table.
    let mut lanes = [[0u64; 256]; LANES];
    let mut groups = data.chunks_exact(LANES);
    for group in groups.by_ref() {
        for (lane, &byte) in lanes.iter_mut().zip(group) {
            lane[usize::from(byte)] += 1;
        }
    }

    // Merge the leftover (< 4) tail bytes and the four lanes into one table.
    let mut histogram = [0u64; 256];
    for &byte in groups.remainder() {
        histogram[usize::from(byte)] += 1;
    }
    for lane in &lanes {
        for (total, &partial) in histogram.iter_mut().zip(lane) {
            *total += partial;
        }
    }

    // H = -sum(p * log2(p)) over the byte values that actually occur.
    let total_bytes = data.len() as f64;
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |bits, &occurrences| {
            let probability = occurrences as f64 / total_bytes;
            bits - probability * probability.log2()
        })
}
55
56/// Normalized entropy: Shannon entropy divided by max possible entropy
57/// for the number of unique characters. Range: 0.0 to 1.0.
58/// Better than raw Shannon for comparing strings of different lengths/charsets.
59/// Compute entropy normalized to the range `0.0..=1.0`.
60///
61/// # Examples
62///
63/// ```rust
64/// use keyhog_scanner::entropy::normalized_entropy;
65///
66/// assert_eq!(normalized_entropy(b""), 0.0);
67/// ```
68pub fn normalized_entropy(data: &[u8]) -> f64 {
69    if data.is_empty() {
70        return 0.0;
71    }
72
73    let unique_chars = {
74        let mut seen = [false; 256];
75        for &b in data {
76            seen[b as usize] = true;
77        }
78        seen.iter().filter(|&&v| v).count()
79    };
80
81    if unique_chars <= 1 {
82        return 0.0;
83    }
84
85    let max_entropy = (unique_chars as f64).log2();
86    if max_entropy == 0.0 {
87        return 0.0;
88    }
89
90    shannon_entropy(data) / max_entropy
91}
92
/// Entropy threshold (bits per byte) for candidates found near a secret keyword.
///
/// Derivation: English text ~1.5-2.0 bits/byte, hex hashes ~3.5-4.0,
/// UUIDs ~3.8-4.2, real API keys ~4.5-5.5, random bytes ~7.5-8.0.
/// 4.5 sits above hashes and at the low end of real secrets, which avoids most
/// false positives while still catching real credentials. Validated against
/// 79 adversarial tests + 0 FP on express.js + 196 findings on TruffleHog repo.
pub const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;

/// Entropy threshold (bits per byte) for keyword-INDEPENDENT detection.
///
/// Higher than [`HIGH_ENTROPY_THRESHOLD`] because without keyword context we
/// need stronger statistical evidence: code strings (function names, import
/// paths, constants; typically 3.0-5.0) must not pass, while real API keys
/// (typically 5.0-6.5) should.
///
/// Note that a value can only reach 5.5 bits per byte if it contains at least
/// 46 distinct byte values (`log2(45) < 5.5 < log2(46)`).
pub const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;

/// Entropy threshold used when the keyword line names a credential-style key.
const CREDENTIAL_CONTEXT_THRESHOLD: f64 = 3.5;

/// Minimum candidate length extracted when the keyword line names a
/// credential-style key.
const CREDENTIAL_CONTEXT_MIN_LEN: usize = 16;

/// Minimum candidate length for the keyword-independent scan.
const KEYWORD_FREE_MIN_LEN: usize = 30;

/// Shortest candidate accepted in a credential context.
const MIN_PASSWORD_LEN: usize = 8;

/// Added to a zero-based line index to produce the reported line number.
const FIRST_SOURCE_LINE_NUMBER: usize = 1;

/// Value of `EntropyMatch::keyword` for matches found without any keyword.
const KEYWORD_FREE_LABEL: &str = "none (high-entropy)";
116
/// Keywords that indicate a string near them might be a secret.
///
/// Entries are matched as case-insensitive substrings of a line. Order is
/// significant: the first entry contained in a line becomes the reported
/// keyword label, so an entry that contains an earlier entry (for example
/// `secret_key` after `secret`) still triggers detection but is never the label.
const SECRET_KEYWORDS: &[&str] = &[
    // API keys and API tokens.
    "api_key",
    "apikey",
    "api-key",
    "api_token",
    "api-token",
    // Generic secrets.
    "secret",
    "secret_key",
    "secretkey",
    // Tokens.
    "token",
    "access_token",
    "auth_token",
    "auth-token",
    // Passwords.
    "password",
    "passwd",
    "pwd",
    // Credentials and private keys.
    "credential",
    "credentials",
    "private_key",
    "privatekey",
    // Application and protocol secrets.
    "client_secret",
    "jwt_secret",
    "jwtsecret",
    "session_key",
    "session-key",
    "signing_key",
    "encryption_key",
    "oauth_token",
    "bearer",
    "authorization",
    "webhook_secret",
    // Connection settings that commonly embed credentials.
    "database_url",
    "connection_string",
    "dsn",
];
153
/// Entropy-based candidate match returned by fallback secret detection.
///
/// Each value is a string whose Shannon entropy met the threshold of the
/// context it was evaluated in: either a nearby secret keyword or the
/// keyword-independent scan.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_scanner::entropy::EntropyMatch;
/// let _ = std::mem::size_of::<EntropyMatch>();
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct EntropyMatch {
    /// The candidate string that met the entropy threshold.
    pub value: String,
    /// Shannon entropy measured for `value`, in bits per byte.
    pub entropy: f64,
    /// The keyword context that caused the candidate to be evaluated.
    pub keyword: String,
    /// One-based source line number for the match.
    pub line: usize,
    /// Byte offset of the start of the containing line.
    pub offset: usize,
}
176
/// Decide whether entropy scanning should run for the given path.
///
/// Entropy scanning is useful for config/secret files. Source code files have
/// too many high-entropy strings (function names, imports, constants) for
/// entropy to be reliable without ML, so they are skipped.
///
/// Returns `true` when:
/// - `path` is `None` (input from stdin is always scanned),
/// - the lowercased path ends with a known config extension, or
/// - the lowercased file name starts with a known config/secret file name.
///
/// Both `/` and `\` are treated as path separators, so Windows-style paths
/// are classified by their file name just like Unix-style ones.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::entropy::is_entropy_appropriate;
///
/// assert!(is_entropy_appropriate(Some(".env")));
/// ```
pub fn is_entropy_appropriate(path: Option<&str>) -> bool {
    // Config/secret files: entropy is highly useful.
    const CONFIG_EXTENSIONS: &[&str] = &[
        ".env",
        ".yaml",
        ".yml",
        ".json",
        ".toml",
        ".properties",
        ".cfg",
        ".conf",
        ".ini",
        ".config",
        ".secrets",
        ".pem",
        ".key",
        ".tfvars",
        ".hcl",
    ];
    const CONFIG_FILENAMES: &[&str] = &[
        ".env",
        "credentials",
        "secrets",
        "apikeys",
        "docker-compose",
        ".npmrc",
        ".pypirc",
        ".netrc",
    ];

    let Some(path) = path else { return true }; // stdin = scan
    let lower = path.to_lowercase();

    if CONFIG_EXTENSIONS.iter().any(|ext| lower.ends_with(*ext)) {
        return true;
    }

    // Check the FILENAME (not the full path) for config-like names.
    // "docker_auth_config_test.go" should NOT match just because it contains
    // "config", and a directory called "secrets" must not make every file
    // inside it look like a secret file.
    let filename = lower
        .rsplit(|c: char| c == '/' || c == '\\')
        .next()
        .unwrap_or(&lower);

    // Anything else is treated as source code: too noisy without ML.
    CONFIG_FILENAMES
        .iter()
        .any(|name| filename.starts_with(*name))
}
236
237/// Find high-entropy strings near secret keywords in text.
238/// This catches secrets that have no known pattern — the TruffleHog gap.
239/// Find secret-like tokens using entropy heuristics near likely credential context.
240///
241/// # Examples
242///
243/// ```rust
244/// use keyhog_scanner::entropy::find_entropy_secrets;
245///
246/// let matches = find_entropy_secrets("API_KEY=abcdEFGH12345678", 16, 1);
247/// assert!(!matches.is_empty());
248/// ```
249pub fn find_entropy_secrets(
250    text: &str,
251    min_length: usize,
252    context_lines: usize,
253) -> Vec<EntropyMatch> {
254    let lines: Vec<&str> = text.lines().collect();
255    let line_offsets = cumulative_line_offsets(&lines);
256    let mut matches = Vec::new();
257    let mut seen = std::collections::HashSet::new();
258    let keyword_lines = find_keyword_assignment_lines(&lines);
259
260    scan_keyword_contexts(
261        &lines,
262        &line_offsets,
263        &keyword_lines,
264        min_length,
265        context_lines,
266        &mut seen,
267        &mut matches,
268    );
269    scan_keyword_free_candidates(&lines, &line_offsets, &mut seen, &mut matches);
270    matches
271}
272
273fn find_keyword_assignment_lines<'a>(lines: &'a [&str]) -> Vec<(usize, &'a str)> {
274    lines
275        .iter()
276        .enumerate()
277        .filter_map(|(index, line)| is_keyword_assignment_line(line).then_some((index, *line)))
278        .collect()
279}
280
281fn is_keyword_assignment_line(line: &str) -> bool {
282    let line_bytes = line.as_bytes();
283    let has_keyword = SECRET_KEYWORDS.iter().any(|keyword| {
284        let keyword_bytes = keyword.as_bytes();
285        line_bytes
286            .windows(keyword_bytes.len())
287            .any(|window| window.eq_ignore_ascii_case(keyword_bytes))
288    });
289    let trimmed = line.trim();
290    let is_import = trimmed.starts_with("import")
291        || trimmed.starts_with("package")
292        || trimmed.starts_with("use ")
293        || trimmed.starts_with("from ")
294        || trimmed.starts_with("require(");
295    has_keyword && (line.contains('=') || line.contains(": ")) && !is_import
296}
297
298fn scan_keyword_contexts(
299    lines: &[&str],
300    line_offsets: &[usize],
301    keyword_lines: &[(usize, &str)],
302    min_length: usize,
303    context_lines: usize,
304    seen: &mut std::collections::HashSet<String>,
305    matches: &mut Vec<EntropyMatch>,
306) {
307    for (keyword_line_index, keyword_line) in keyword_lines {
308        let context = keyword_context(keyword_line, min_length);
309        let start = keyword_line_index.saturating_sub(context_lines);
310        let end = (*keyword_line_index + context_lines + 1).min(lines.len());
311        for line_idx in start..end {
312            collect_line_candidates(
313                lines[line_idx],
314                line_idx,
315                line_offsets[line_idx],
316                &context,
317                seen,
318                matches,
319            );
320        }
321    }
322}
323
324fn scan_keyword_free_candidates(
325    lines: &[&str],
326    line_offsets: &[usize],
327    seen: &mut std::collections::HashSet<String>,
328    matches: &mut Vec<EntropyMatch>,
329) {
330    let keyword_free_context = KeywordContext {
331        keyword: KEYWORD_FREE_LABEL.to_string(),
332        threshold: VERY_HIGH_ENTROPY_THRESHOLD,
333        min_len: KEYWORD_FREE_MIN_LEN,
334        is_credential_context: false,
335    };
336    for (line_idx, line) in lines.iter().enumerate() {
337        collect_line_candidates(
338            line,
339            line_idx,
340            line_offsets[line_idx],
341            &keyword_free_context,
342            seen,
343            matches,
344        );
345    }
346}
347
/// Settings under which candidates on a line are evaluated.
struct KeywordContext {
    /// Label copied into `EntropyMatch::keyword` for accepted candidates.
    keyword: String,
    /// Minimum Shannon entropy (bits per byte) a candidate must reach.
    threshold: f64,
    /// Minimum candidate length used when extracting candidates from a line.
    min_len: usize,
    /// Whether the context comes from a credential-style keyword.
    is_credential_context: bool,
}
354
355fn keyword_context(keyword_line: &str, min_length: usize) -> KeywordContext {
356    const CREDENTIAL_KEYWORDS: &[&str] = &[
357        "password",
358        "passwd",
359        "pwd",
360        "db_pass",
361        "db_password",
362        "api_key",
363        "apikey",
364        "api-key",
365        "_key",
366        "-key",
367        "token",
368        "_token",
369        "-token",
370        "secret",
371        "_secret",
372        "-secret",
373    ];
374
375    let lowered = keyword_line.to_lowercase();
376    let keyword = SECRET_KEYWORDS
377        .iter()
378        .find(|keyword| lowered.contains(*keyword))
379        .copied()
380        .unwrap_or("unknown");
381    let is_credential_context = CREDENTIAL_KEYWORDS
382        .iter()
383        .any(|credential_keyword| lowered.contains(credential_keyword));
384    KeywordContext {
385        keyword: keyword.to_string(),
386        threshold: if is_credential_context {
387            CREDENTIAL_CONTEXT_THRESHOLD
388        } else {
389            HIGH_ENTROPY_THRESHOLD
390        },
391        min_len: if is_credential_context {
392            CREDENTIAL_CONTEXT_MIN_LEN
393        } else {
394            min_length
395        },
396        is_credential_context,
397    }
398}
399
400fn collect_line_candidates(
401    line: &str,
402    line_idx: usize,
403    line_offset: usize,
404    context: &KeywordContext,
405    seen: &mut std::collections::HashSet<String>,
406    matches: &mut Vec<EntropyMatch>,
407) {
408    for candidate in extract_candidates(line, context.min_len) {
409        let entropy = shannon_entropy(candidate.as_bytes());
410        if !candidate_is_plausible(&candidate, entropy, context) || !seen.insert(candidate.clone())
411        {
412            continue;
413        }
414        matches.push(EntropyMatch {
415            value: candidate,
416            entropy,
417            keyword: context.keyword.clone(),
418            line: line_idx + FIRST_SOURCE_LINE_NUMBER,
419            offset: line_offset,
420        });
421    }
422}
423
424fn candidate_is_plausible(candidate: &str, entropy: f64, context: &KeywordContext) -> bool {
425    if entropy < context.threshold {
426        return false;
427    }
428    if context.is_credential_context {
429        return candidate.len() >= MIN_PASSWORD_LEN;
430    }
431    candidate.len() >= KEYWORD_FREE_MIN_LEN.min(context.min_len) && is_secret_plausible(candidate)
432}
433
/// Compute the starting byte offset of each line.
///
/// Every line is assumed to be followed by exactly one terminator byte, so
/// entry `i` is the sum of `len + 1` over all lines before `i`. Additions
/// saturate at `usize::MAX` rather than overflowing.
fn cumulative_line_offsets(lines: &[&str]) -> Vec<usize> {
    lines
        .iter()
        .scan(0usize, |next_start, line| {
            let this_start = *next_start;
            *next_start = this_start.saturating_add(line.len().saturating_add(1));
            Some(this_start)
        })
        .collect()
}
445
446/// Extract candidate secret strings from a line.
447/// Looks for values after `=`, `:`, or inside quotes.
448fn extract_candidates(line: &str, min_length: usize) -> Vec<String> {
449    let mut candidates = Vec::new();
450
451    // Skip lines that appear to be part of a string concatenation sequence.
452    // These are lines with just quoted string fragments, not complete secrets.
453    if is_likely_concatenation_fragment(line) {
454        return candidates;
455    }
456
457    // Extract values after assignment operators (common in config files).
458    // Search for `=` first because `:` appears inside secret values (URLs,
459    // base64) and splitting there would extract only the tail fragment.
460    // For `: ` (YAML/JSON mapping), require the trailing space to avoid
461    // matching colons inside values like `postgres://host:5432`.
462    if let Some(eq_pos) = line.find('=').or_else(|| line.find(": ")) {
463        let sep_len = if line.as_bytes().get(eq_pos) == Some(&b'=') {
464            1
465        } else {
466            2 // ": "
467        };
468        let value_part = line[eq_pos + sep_len..].trim();
469        let cleaned = value_part
470            .trim_matches(|c: char| c == '"' || c == '\'' || c == '`' || c == ';' || c == ',');
471        if cleaned.len() >= min_length && is_candidate_plausible(cleaned) {
472            candidates.push(cleaned.to_string());
473        }
474    }
475
476    // Extract quoted strings.
477    for quote in &['"', '\''] {
478        let mut start = None;
479        for (i, ch) in line.char_indices() {
480            if ch == *quote {
481                match start {
482                    None => start = Some(i + 1),
483                    Some(s) => {
484                        let content = &line[s..i];
485                        if content.len() >= min_length && is_secret_plausible(content) {
486                            candidates.push(content.to_string());
487                        }
488                        start = None;
489                    }
490                }
491            }
492        }
493    }
494
495    candidates
496}
497
/// Check whether a line is likely a string-concatenation fragment.
///
/// A fragment is a line that consists of just one quoted string, as found in
/// multi-line concatenations in Python or JavaScript, optionally followed by
/// `+`, `\`, `,` or `)`. Lines ending in `\"` or `-\` (line-continuation
/// patterns) are treated as fragments as well.
fn is_likely_concatenation_fragment(line: &str) -> bool {
    let text = line.trim();

    // Line-continuation endings qualify regardless of how the line starts.
    if text.ends_with("\\\"") || text.ends_with("-\\") {
        return true;
    }

    // Otherwise the line must open with a quote character ...
    if !(text.starts_with('"') || text.starts_with('\'')) {
        return false;
    }

    // ... and contain exactly one pair of one quote kind and none of the other.
    let doubles = text.matches('"').count();
    let singles = text.matches('\'').count();
    let closing = match (doubles, singles) {
        (2, 0) => '"',
        (0, 2) => '\'',
        _ => return false,
    };

    // Whatever follows the closing quote decides: nothing, `\`, `,`, or
    // anything starting with `+` or `)` marks a fragment.
    let tail = text
        .rfind(closing)
        .map_or("", |at| &text[at + 1..])
        .trim();

    tail.is_empty()
        || tail == "\\"
        || tail == ","
        || tail.starts_with('+')
        || tail.starts_with(')')
}
552
/// Controls how strict plausibility filtering is.
///
/// The shared plausibility filter runs in one of two modes:
/// - candidate mode allows hex strings so keyword-guided extraction can
///   inspect them later;
/// - secret mode rejects hex-only strings and requires high entropy.
enum PlausibilityMode {
    /// Lenient: allows hex strings, used for keyword-context candidates.
    Lenient,
    /// Strict: rejects hex, requires high entropy. Used for keyword-independent scan.
    Strict,
}
564fn passes_plausibility_checks(s: &str, mode: PlausibilityMode) -> bool {
565    if matches_universal_rejection(s) {
566        return false;
567    }
568
569    if is_placeholder_ci(s.as_bytes()) || has_low_alnum_ratio(s) {
570        return false;
571    }
572
573    if matches!(mode, PlausibilityMode::Strict) && !passes_strict_secret_checks(s) {
574        return false;
575    }
576
577    true
578}
579
/// Report whether `s` has a shape that is never treated as a secret candidate.
///
/// Rejected shapes are: anything containing `://`, anything starting with one
/// of the fixed prefixes below, three-part strings starting with `eyJ`,
/// strings longer than 40 bytes starting with `Ag`, and Windows drive paths.
fn matches_universal_rejection(s: &str) -> bool {
    // Prefixes rejected unconditionally: filesystem paths, template and
    // variable references, regex fragments, key-format and vault-style
    // markers, and markdown/document fences.
    const REJECTED_PREFIXES: &[&str] = &[
        "/",
        "./",
        "../",
        "${{",
        "{{",
        "${",
        "(?",
        "^",
        "ssh-",
        "ecdsa-",
        "$ANSIBLE_VAULT",
        "ENC[",
        "-----BEGIN",
        "age1",
        "vault:",
        "AQI",
        "CiQ",
        "```",
        "---",
        "===",
    ];

    if s.contains("://") {
        return true;
    }
    if REJECTED_PREFIXES.iter().any(|prefix| s.starts_with(*prefix)) {
        return true;
    }
    // `eyJ` followed by exactly two dots: three dot-separated parts.
    if s.starts_with("eyJ") && s.matches('.').count() == 2 {
        return true;
    }
    if s.starts_with("Ag") && s.len() > 40 {
        return true;
    }

    // Reject Windows drive paths like "C:\..." — single ASCII letter + colon
    // followed by either kind of slash.
    matches!(
        s.as_bytes(),
        [drive, b':', b'\\' | b'/', ..] if drive.is_ascii_alphabetic()
    )
}
610
/// Report whether fewer than half of `s` is alphanumeric.
///
/// The ratio is the number of alphanumeric *characters* divided by the length
/// of `s` in *bytes*, so multi-byte characters lower the ratio. The empty
/// string has ratio `0.0` and therefore counts as low.
fn has_low_alnum_ratio(s: &str) -> bool {
    let alnum_chars = s.chars().filter(|c| c.is_alphanumeric()).count();
    let byte_len = s.len().max(1);
    let ratio = alnum_chars as f64 / byte_len as f64;
    ratio < 0.5
}
615
616fn passes_strict_secret_checks(s: &str) -> bool {
617    if s.chars().all(|c| c.is_ascii_hexdigit()) && s.len() > 10 {
618        return false;
619    }
620    if s.len() > 4
621        && let Some(first) = s.chars().next()
622        && s.chars().all(|c| c == first)
623    {
624        return false;
625    }
626    if s.len() > 16 && unique_char_count(s) < 8 {
627        return false;
628    }
629    if s.len() > 16 && second_half_entropy(s) < 2.5 {
630        return false;
631    }
632
633    shannon_entropy(s.as_bytes()) >= HIGH_ENTROPY_THRESHOLD
634}
635
/// Count the distinct characters (Unicode scalar values) in `s`.
fn unique_char_count(s: &str) -> usize {
    s.chars()
        .collect::<std::collections::HashSet<char>>()
        .len()
}
643
644fn second_half_entropy(s: &str) -> f64 {
645    let mid = s.len() / 2;
646    let half_start = s.floor_char_boundary(mid);
647    shannon_entropy(&s.as_bytes()[half_start..])
648}
649
650/// For extract_candidates: lightweight filter (allows hex for password context).
651fn is_candidate_plausible(s: &str) -> bool {
652    passes_plausibility_checks(s, PlausibilityMode::Lenient)
653}
654
655/// For keyword-independent entropy scan: strict filter (rejects hex, requires entropy).
656fn is_secret_plausible(s: &str) -> bool {
657    passes_plausibility_checks(s, PlausibilityMode::Strict)
658}
659
/// Case-insensitive placeholder check without heap allocation.
///
/// Returns `true` when `bytes`
/// - contains `<` or `>` (as in `<replace-with-real-secret>`),
/// - is, as a whole, one of a few sentinel words such as `null` or
///   `password`, or
/// - contains one of the placeholder fragments anywhere.
///
/// All word and fragment comparisons ignore ASCII case.
fn is_placeholder_ci(bytes: &[u8]) -> bool {
    /// Fragments that mark a value as a placeholder wherever they occur.
    const PLACEHOLDER_FRAGMENTS: &[&[u8]] = &[
        b"example",
        b"placeholder",
        b"change_me",
        b"changeme",
        b"your_",
        b"your-",
        b"xxx",
        b"todo",
        b"fixme",
        b"replace",
        b"insert",
        b"enter_",
        b"enter-",
        b"dummy",
        b"sample",
        b"demo",
        b"fake",
        b"mock",
        b"goes-here",
        b"fill_in",
        b"not-a-real",
        b"not_a_real",
    ];
    /// Words that are placeholders only when they make up the entire value.
    const PLACEHOLDER_WORDS: &[&[u8]] = &[
        b"null",
        b"none",
        b"undefined",
        b"empty",
        b"default",
        b"secret",
        b"password",
    ];

    bytes.contains(&b'<')
        || bytes.contains(&b'>')
        // Compared case-insensitively so `NULL` or `None` are caught too.
        || PLACEHOLDER_WORDS
            .iter()
            .any(|word| bytes.eq_ignore_ascii_case(word))
        || PLACEHOLDER_FRAGMENTS.iter().any(|fragment| {
            bytes
                .windows(fragment.len())
                .any(|window| window.eq_ignore_ascii_case(fragment))
        })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn entropy_constant_string() {
        assert!(shannon_entropy(b"aaaaaaaaaa") < 0.1);
    }

    #[test]
    fn entropy_random_string() {
        // High entropy string (looks like an API key)
        let key = b"aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJ";
        assert!(shannon_entropy(key) > 4.0);
    }

    #[test]
    fn entropy_hex_hash() {
        let hash = b"d41d8cd98f00b204e9800998ecf8427e";
        let e = shannon_entropy(hash);
        // Hex hashes have moderate entropy (only 16 possible chars)
        assert!(e > 3.0);
        assert!(e < 5.0);
    }

    #[test]
    fn find_secrets_near_keywords() {
        let text = r#"
# Config
DATABASE_URL=postgres://localhost/mydb
API_KEY=aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL
DEBUG=true
"#;
        let matches = find_entropy_secrets(text, 16, 2);
        assert!(
            !matches.is_empty(),
            "should find high-entropy string near API_KEY"
        );
        assert_eq!(matches[0].value, "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL");
        // The matched value should be the API key content.
        assert!(
            matches.iter().any(|m| m.entropy > 4.0),
            "should have high entropy match"
        );
    }

    #[test]
    fn skip_placeholders() {
        let text = r#"
API_KEY=YOUR_API_KEY_HERE
SECRET=change_me_placeholder
TOKEN=xxxxxxxxxxxxxxxxxxxx
"#;
        let matches = find_entropy_secrets(text, 16, 2);
        assert!(matches.is_empty());
    }

    #[test]
    fn plausible_secret_filter() {
        assert!(!is_secret_plausible("https://example.com/api"));
        assert!(!is_secret_plausible("/usr/local/bin/python"));
        assert!(!is_secret_plausible("your_api_key_here"));
        assert!(is_secret_plausible("aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJ"));
    }

    #[test]
    fn candidate_mode_skips_strict_secret_checks() {
        assert!(is_candidate_plausible("0123456789abcdef"));
        assert!(!is_secret_plausible("0123456789abcdef"));
    }

    #[test]
    fn placeholder_sentinel_words_match_case_insensitively() {
        assert!(is_placeholder_ci(b"null"));
        assert!(is_placeholder_ci(b"NULL"));
        assert!(is_placeholder_ci(b"Undefined"));
        assert!(is_placeholder_ci(b"PASSWORD"));
        // Sentinel words only count when they are the whole value.
        assert!(!is_placeholder_ci(b"nullable_field"));
    }

    #[test]
    fn detect_db_password_hex() {
        let text = "DB_PASSWORD=8ae31cacf141669ddfb5da\n";
        let matches = find_entropy_secrets(text, 8, 2);
        assert!(
            !matches.is_empty(),
            "Should detect hex password near DB_PASSWORD keyword. Got 0 matches."
        );
        assert!(
            matches[0].value.contains("8ae31cac"),
            "Should extract the password value"
        );
    }

    #[test]
    fn entropy_match_offsets_are_cumulative() {
        let text = "first=line\nAPI_KEY=aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL\n";
        let matches = find_entropy_secrets(text, 16, 2);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].value, "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL");
        assert_eq!(matches[0].offset, "first=line\n".len());
    }

    #[test]
    fn entropy_empty_input_is_zero() {
        assert_eq!(shannon_entropy(b""), 0.0);
    }

    #[test]
    fn entropy_single_unique_byte_is_zero() {
        assert_eq!(shannon_entropy(b"zzzzzzzz"), 0.0);
    }

    #[test]
    fn entropy_all_byte_values_is_near_eight() {
        let all_bytes: Vec<u8> = (0u8..=255).collect();
        let entropy = shannon_entropy(&all_bytes);
        assert!((entropy - 8.0).abs() < 1e-9, "entropy was {}", entropy);
    }

    #[test]
    fn entropy_huge_repeated_input_stays_low() {
        let repeated = vec![b'A'; 100_000];
        assert_eq!(shannon_entropy(&repeated), 0.0);
    }

    #[test]
    fn normalized_entropy_empty_input_is_zero() {
        assert_eq!(normalized_entropy(b""), 0.0);
    }

    #[test]
    fn normalized_entropy_single_unique_byte_is_zero() {
        assert_eq!(normalized_entropy(b"aaaaaaaaaaaaaaaa"), 0.0);
    }

    #[test]
    fn normalized_entropy_binary_pattern_reaches_one() {
        let entropy = normalized_entropy(b"abababababababab");
        assert!((entropy - 1.0).abs() < 1e-9, "entropy was {}", entropy);
    }

    #[test]
    fn normalized_entropy_all_unique_bytes_reaches_one() {
        let all_bytes: Vec<u8> = (0u8..=255).collect();
        let entropy = normalized_entropy(&all_bytes);
        assert!((entropy - 1.0).abs() < 1e-9, "entropy was {}", entropy);
    }

    #[test]
    fn normalized_entropy_stays_bounded_for_large_mixed_input() {
        let mut data = Vec::with_capacity(16_000);
        for _ in 0..500 {
            data.extend_from_slice(b"abc123XYZ!@#$%^&*()");
        }
        let entropy = normalized_entropy(&data);
        assert!((0.0..=1.0).contains(&entropy), "entropy was {}", entropy);
    }

    #[test]
    fn entropy_is_appropriate_for_stdin() {
        assert!(is_entropy_appropriate(None));
    }

    #[test]
    fn entropy_is_appropriate_for_config_extensions_case_insensitively() {
        assert!(is_entropy_appropriate(Some("CONFIG/SETTINGS.YAML")));
        assert!(is_entropy_appropriate(Some("keys/server.PEM")));
        assert!(is_entropy_appropriate(Some("infra/secrets.TFVARS")));
    }

    #[test]
    fn entropy_is_appropriate_for_sensitive_filenames_only() {
        assert!(is_entropy_appropriate(Some("/tmp/.npmrc.backup")));
        assert!(is_entropy_appropriate(Some("nested/docker-compose.prod")));
        assert!(is_entropy_appropriate(Some("config/apikeys.txt")));
    }

    #[test]
    fn entropy_is_not_appropriate_for_source_files_even_with_config_substrings() {
        assert!(!is_entropy_appropriate(Some(
            "src/docker_auth_config_test.go"
        )));
        assert!(!is_entropy_appropriate(Some(
            "lib/application_yaml_parser.rs"
        )));
        assert!(!is_entropy_appropriate(Some("src/main.rs")));
    }

    #[test]
    fn entropy_secret_scan_empty_input_returns_no_matches() {
        assert!(find_entropy_secrets("", 16, 2).is_empty());
    }

    #[test]
    fn keyword_free_scan_detects_long_high_entropy_strings() {
        let secret = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!@";
        let text = format!("prefix\n  value: \"{secret}\"\nsuffix\n");
        let matches = find_entropy_secrets(&text, 16, 0);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].value, secret);
        assert_eq!(matches[0].keyword, "none (high-entropy)");
        assert_eq!(matches[0].line, 2);
    }

    #[test]
    fn keyword_free_scan_rejects_short_high_entropy_strings() {
        let text = "ZxCvBn123!@#AsDfGh456$%^QwErTy789";
        assert!(find_entropy_secrets(text, 16, 0).is_empty());
    }

    #[test]
    fn duplicate_secret_value_is_reported_once() {
        let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
        let text = format!("API_KEY={secret}\nTOKEN={secret}\n");
        let matches = find_entropy_secrets(&text, 16, 1);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].value, secret);
    }

    #[test]
    fn import_statements_with_keywords_are_ignored() {
        let text = "import API_KEY from \"aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL\"\n";
        assert!(find_entropy_secrets(text, 16, 1).is_empty());
    }

    #[test]
    fn url_like_values_are_rejected_even_in_keyword_context() {
        let text = "DATABASE_URL=https://example.com/super/secret/path/value\n";
        assert!(find_entropy_secrets(text, 16, 1).is_empty());
    }

    #[test]
    fn context_lines_zero_limits_scan_to_keyword_line() {
        let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
        let text = format!("API_KEY=placeholder\n\"{secret}\"\n");
        assert!(find_entropy_secrets(&text, 16, 0).is_empty());
    }

    #[test]
    fn context_lines_include_neighboring_lines() {
        let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
        let text = format!("API_KEY=placeholder\n  value: \"{secret}\"\n");
        let matches = find_entropy_secrets(&text, 16, 1);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].value, secret);
        assert_eq!(matches[0].line, 2);
    }

    #[test]
    fn special_character_placeholders_are_rejected() {
        let text = "SECRET=<replace-with-real-secret>\nTOKEN=${{ secrets.API_TOKEN }}\n";
        assert!(find_entropy_secrets(text, 8, 1).is_empty());
    }

    #[test]
    fn large_input_preserves_line_and_offset_for_match() {
        let filler = "abcd1234\n".repeat(2000);
        let secret = "QwErTy123!@#ZxCvBn456$%^AsDfGh789&*(YuIoP0)_+LmNoPqRsTuV";
        let text = format!("{filler}API_KEY={secret}\n");
        let matches = find_entropy_secrets(&text, 16, 0);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].value, secret);
        assert_eq!(matches[0].line, 2001);
        assert_eq!(matches[0].offset, filler.len());
    }
}
952        || matches!(
953            bytes,
954            b"null" | b"none" | b"undefined" | b"empty" | b"default" | b"secret" | b"password"
955        )
956}