/// Computes the Shannon entropy of `data`, in bits per byte.
///
/// Byte frequencies are tallied into four separate histograms (one per
/// position inside each 4-byte chunk) and merged afterwards, together with
/// the bytes left over after the last full chunk.
///
/// Returns `0.0` for empty input. Because the alphabet has 256 symbols the
/// result never exceeds `8.0`.
pub fn shannon_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // One histogram per byte position within a 4-byte chunk.
    let mut lanes = [[0u64; 256]; 4];
    let mut quads = data.chunks_exact(4);
    for quad in quads.by_ref() {
        for (lane, &byte) in lanes.iter_mut().zip(quad) {
            lane[usize::from(byte)] += 1;
        }
    }

    // Start the merged histogram with the 0..=3 trailing bytes, then fold in
    // the per-lane tallies.
    let mut totals = [0u64; 256];
    for &byte in quads.remainder() {
        totals[usize::from(byte)] += 1;
    }
    for (value, total) in totals.iter_mut().enumerate() {
        *total += lanes.iter().map(|lane| lane[value]).sum::<u64>();
    }

    let sample_size = data.len() as f64;
    totals
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |entropy, &occurrences| {
            let probability = occurrences as f64 / sample_size;
            entropy - probability * probability.log2()
        })
}
/// Computes the Shannon entropy of `data` relative to the maximum entropy
/// achievable with the number of distinct byte values it actually contains.
///
/// The divisor is `log2(distinct byte values)`. Returns `0.0` when `data` is
/// empty or consists of a single repeated byte value.
pub fn normalized_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // Presence table: one flag per possible byte value.
    let mut present = [false; 256];
    for &byte in data {
        present[usize::from(byte)] = true;
    }
    let distinct = present.iter().filter(|&&flag| flag).count();

    match distinct {
        0 | 1 => 0.0,
        alphabet => {
            let ceiling = (alphabet as f64).log2();
            if ceiling == 0.0 {
                0.0
            } else {
                shannon_entropy(data) / ceiling
            }
        }
    }
}
/// Minimum Shannon entropy (bits per byte, as returned by `shannon_entropy`)
/// for candidates near a non-credential keyword; also the floor applied by
/// the strict plausibility checks.
pub const HIGH_ENTROPY_THRESHOLD: f64 = 4.5;
/// Minimum entropy required of candidates found by the keyword-free scan.
pub const VERY_HIGH_ENTROPY_THRESHOLD: f64 = 5.5;
/// Minimum entropy required when the keyword line is a credential context.
const CREDENTIAL_CONTEXT_THRESHOLD: f64 = 3.5;
/// Minimum candidate length (bytes) used for extraction in a credential context.
const CREDENTIAL_CONTEXT_MIN_LEN: usize = 16;
/// Minimum candidate length (bytes) used by the keyword-free scan.
const KEYWORD_FREE_MIN_LEN: usize = 30;
/// Minimum length (bytes) a candidate must have to be accepted in a credential context.
const MIN_PASSWORD_LEN: usize = 8;
/// Added to zero-based line indices to produce the reported line number.
const FIRST_SOURCE_LINE_NUMBER: usize = 1;
/// Keyword label attached to matches produced by the keyword-free scan.
const KEYWORD_FREE_LABEL: &str = "none (high-entropy)";
/// Substrings that mark a line as a potential secret assignment.
///
/// `is_keyword_assignment_line` searches for these ASCII case-insensitively;
/// `keyword_context` reports the first entry (in this order) contained in the
/// lowercased line as the match's keyword label.
const SECRET_KEYWORDS: &[&str] = &[
"api_key",
"apikey",
"api-key",
"api_token",
"api-token",
"secret",
"secret_key",
"secretkey",
"token",
"access_token",
"auth_token",
"auth-token",
"password",
"passwd",
"pwd",
"credential",
"credentials",
"private_key",
"privatekey",
"client_secret",
"jwt_secret",
"jwtsecret",
"session_key",
"session-key",
"signing_key",
"encryption_key",
"oauth_token",
"bearer",
"authorization",
"webhook_secret",
"database_url",
"connection_string",
"dsn",
];
/// A single high-entropy candidate reported by `find_entropy_secrets`.
#[derive(Debug, Clone)]
pub struct EntropyMatch {
/// The extracted candidate text.
pub value: String,
/// Shannon entropy of `value`'s bytes, in bits per byte.
pub entropy: f64,
/// Keyword label of the context that produced the match, or the
/// keyword-free label when no keyword was involved.
pub keyword: String,
/// Line number of the line the candidate was found on (first line is 1).
pub line: usize,
/// Offset of the start of that line within the scanned text (not of the
/// value itself).
pub offset: usize,
}
/// Decides whether entropy-based scanning should run for the file at `path`.
///
/// Returns `true` when:
/// * `path` is `None` (no path is known for the input), or
/// * the lowercased path ends with one of the configuration extensions, or
/// * the lowercased final path component starts with one of the well-known
///   sensitive file names (so `.npmrc.backup` and `docker-compose.prod`
///   qualify).
///
/// Both `/` and `\` are accepted as path separators so that Windows-style
/// paths such as `C:\proj\credentials` are recognised by their file name.
pub fn is_entropy_appropriate(path: Option<&str>) -> bool {
    const CONFIG_EXTENSIONS: &[&str] = &[
        ".env",
        ".yaml",
        ".yml",
        ".json",
        ".toml",
        ".properties",
        ".cfg",
        ".conf",
        ".ini",
        ".config",
        ".secrets",
        ".pem",
        ".key",
        ".tfvars",
        ".hcl",
    ];
    const CONFIG_FILENAMES: &[&str] = &[
        ".env",
        "credentials",
        "secrets",
        "apikeys",
        "docker-compose",
        ".npmrc",
        ".pypirc",
        ".netrc",
    ];

    let Some(path) = path else {
        return true;
    };
    let lower = path.to_lowercase();

    if CONFIG_EXTENSIONS.iter().any(|ext| lower.ends_with(*ext)) {
        return true;
    }

    // The final component is whatever follows the last separator of either flavour.
    let filename = lower
        .rsplit(|c: char| c == '/' || c == '\\')
        .next()
        .unwrap_or(&lower);
    CONFIG_FILENAMES
        .iter()
        .any(|name| filename.starts_with(*name))
}
/// Scans `text` for high-entropy strings that look like secrets.
///
/// Two passes are made, sharing one de-duplication set so that each distinct
/// value is reported at most once:
/// 1. lines within `context_lines` of a keyword assignment line are scanned
///    with thresholds derived from that keyword line;
/// 2. every line is scanned with the stricter keyword-free settings.
///
/// `min_length` is the minimum candidate length used for keyword contexts
/// that are not credential contexts; the other contexts use fixed minimums.
///
/// Each match carries its 1-based line number and the byte offset of the
/// start of its line within `text`.
pub fn find_entropy_secrets(
    text: &str,
    min_length: usize,
    context_lines: usize,
) -> Vec<EntropyMatch> {
    let lines: Vec<&str> = text.lines().collect();

    // `str::lines` strips both "\n" and "\r\n", so the stripped lines cannot
    // tell how many terminator bytes followed each of them. Splitting on '\n'
    // alone yields the same number of pieces but keeps any '\r' attached, so
    // "length + 1" per piece is the true byte distance to the next line.
    let newline_split: Vec<&str> = text.split_terminator('\n').collect();
    debug_assert_eq!(lines.len(), newline_split.len());
    let line_offsets = cumulative_line_offsets(&newline_split);

    let mut matches = Vec::new();
    let mut seen = std::collections::HashSet::new();

    let keyword_lines = find_keyword_assignment_lines(&lines);
    scan_keyword_contexts(
        &lines,
        &line_offsets,
        &keyword_lines,
        min_length,
        context_lines,
        &mut seen,
        &mut matches,
    );
    scan_keyword_free_candidates(&lines, &line_offsets, &mut seen, &mut matches);
    matches
}
/// Returns `(zero-based index, line)` for every line that
/// `is_keyword_assignment_line` accepts, in input order.
fn find_keyword_assignment_lines<'a>(lines: &'a [&str]) -> Vec<(usize, &'a str)> {
    let mut hits = Vec::new();
    for (position, &text) in lines.iter().enumerate() {
        if is_keyword_assignment_line(text) {
            hits.push((position, text));
        }
    }
    hits
}
/// Reports whether `line` looks like an assignment involving a secret keyword.
///
/// A line qualifies when it contains `=` or `": "`, contains one of
/// `SECRET_KEYWORDS` (ASCII case-insensitive), and is not an import-like
/// statement.
///
/// `import` and `package` only count as import-like when they stand as a
/// whole word. Otherwise identifiers that merely begin with those letters
/// (`import_api_key = ...`, `package-token: ...`) would be skipped.
fn is_keyword_assignment_line(line: &str) -> bool {
    /// True when `text` begins with `word` and the next character does not
    /// continue an identifier (letter, digit, `_` or `-`).
    fn starts_with_word(text: &str, word: &str) -> bool {
        text.strip_prefix(word).is_some_and(|rest| {
            !rest.starts_with(|c: char| c.is_alphanumeric() || c == '_' || c == '-')
        })
    }

    // Cheapest test first: without a separator there is nothing to assign.
    if !(line.contains('=') || line.contains(": ")) {
        return false;
    }

    let trimmed = line.trim();
    let is_import = starts_with_word(trimmed, "import")
        || starts_with_word(trimmed, "package")
        || trimmed.starts_with("use ")
        || trimmed.starts_with("from ")
        || trimmed.starts_with("require(");
    if is_import {
        return false;
    }

    let line_bytes = line.as_bytes();
    SECRET_KEYWORDS.iter().any(|keyword| {
        let keyword_bytes = keyword.as_bytes();
        line_bytes
            .windows(keyword_bytes.len())
            .any(|window| window.eq_ignore_ascii_case(keyword_bytes))
    })
}
/// Scans the neighbourhood of every keyword line for secret candidates.
///
/// For each `(index, line)` in `keyword_lines`, a context is derived from the
/// keyword line and applied to all lines from `index - context_lines` through
/// `index + context_lines`, clamped to the bounds of `lines`. Windows of
/// nearby keyword lines may overlap; `seen` keeps a value from being pushed
/// to `matches` more than once.
///
/// `line_offsets[i]` must hold the offset reported for `lines[i]`.
fn scan_keyword_contexts(
    lines: &[&str],
    line_offsets: &[usize],
    keyword_lines: &[(usize, &str)],
    min_length: usize,
    context_lines: usize,
    seen: &mut std::collections::HashSet<String>,
    matches: &mut Vec<EntropyMatch>,
) {
    for &(keyword_line_index, keyword_line) in keyword_lines {
        let context = keyword_context(keyword_line, min_length);
        let start = keyword_line_index.saturating_sub(context_lines);
        // Saturating arithmetic: a huge `context_lines` (e.g. `usize::MAX` to
        // mean "the whole file") must clamp to the end, not overflow.
        let end = keyword_line_index
            .saturating_add(context_lines)
            .saturating_add(1)
            .min(lines.len());
        for line_idx in start..end {
            collect_line_candidates(
                lines[line_idx],
                line_idx,
                line_offsets[line_idx],
                &context,
                seen,
                matches,
            );
        }
    }
}
/// Scans every line without relying on keywords.
///
/// Uses the keyword-free label, the very-high entropy threshold and the
/// keyword-free minimum length; values already present in `seen` are skipped.
fn scan_keyword_free_candidates(
    lines: &[&str],
    line_offsets: &[usize],
    seen: &mut std::collections::HashSet<String>,
    matches: &mut Vec<EntropyMatch>,
) {
    let context = KeywordContext {
        is_credential_context: false,
        min_len: KEYWORD_FREE_MIN_LEN,
        threshold: VERY_HIGH_ENTROPY_THRESHOLD,
        keyword: KEYWORD_FREE_LABEL.to_string(),
    };

    for (position, &text) in lines.iter().enumerate() {
        let offset = line_offsets[position];
        collect_line_candidates(text, position, offset, &context, seen, matches);
    }
}
/// Scan settings applied to the lines examined for one context.
struct KeywordContext {
/// Label copied into each resulting match's `keyword` field.
keyword: String,
/// Minimum Shannon entropy a candidate must reach.
threshold: f64,
/// Minimum candidate length (bytes) passed to candidate extraction.
min_len: usize,
/// When true, candidates are accepted on entropy and length alone, without
/// the strict plausibility checks.
is_credential_context: bool,
}
/// Derives the scan settings for the neighbourhood of `keyword_line`.
///
/// The label is the first entry of `SECRET_KEYWORDS` (in list order) found in
/// the lowercased line, or `"unknown"` if none is found. Because the first
/// hit wins, a broader entry listed earlier (e.g. `secret`) is reported in
/// preference to a more specific one listed later (e.g. `secret_key`).
///
/// If the line contains any credential keyword, the credential-context
/// threshold and minimum length apply; otherwise the high-entropy threshold
/// and the caller's `min_length` are used.
fn keyword_context(keyword_line: &str, min_length: usize) -> KeywordContext {
    const CREDENTIAL_KEYWORDS: &[&str] = &[
        "password",
        "passwd",
        "pwd",
        "db_pass",
        "db_password",
        "api_key",
        "apikey",
        "api-key",
        "_key",
        "-key",
        "token",
        "_token",
        "-token",
        "secret",
        "_secret",
        "-secret",
    ];

    let haystack = keyword_line.to_lowercase();

    let mut label = "unknown";
    for candidate in SECRET_KEYWORDS {
        if haystack.contains(*candidate) {
            label = candidate;
            break;
        }
    }

    let is_credential_context = CREDENTIAL_KEYWORDS
        .iter()
        .any(|needle| haystack.contains(*needle));

    let (threshold, min_len) = if is_credential_context {
        (CREDENTIAL_CONTEXT_THRESHOLD, CREDENTIAL_CONTEXT_MIN_LEN)
    } else {
        (HIGH_ENTROPY_THRESHOLD, min_length)
    };

    KeywordContext {
        keyword: label.to_owned(),
        threshold,
        min_len,
        is_credential_context,
    }
}
/// Extracts candidates from one line and records the acceptable ones.
///
/// A candidate is pushed to `matches` only if it passes
/// `candidate_is_plausible` for `context` and has not been recorded in `seen`
/// before. Rejected candidates are never added to `seen`.
///
/// `line_idx` is zero-based; the reported line number is shifted by
/// `FIRST_SOURCE_LINE_NUMBER`. `line_offset` is stored unchanged.
fn collect_line_candidates(
    line: &str,
    line_idx: usize,
    line_offset: usize,
    context: &KeywordContext,
    seen: &mut std::collections::HashSet<String>,
    matches: &mut Vec<EntropyMatch>,
) {
    for value in extract_candidates(line, context.min_len) {
        let entropy = shannon_entropy(value.as_bytes());
        // `insert` is only reached for plausible values and returns false for duplicates.
        let is_new_finding =
            candidate_is_plausible(&value, entropy, context) && seen.insert(value.clone());
        if is_new_finding {
            matches.push(EntropyMatch {
                value,
                entropy,
                keyword: context.keyword.clone(),
                line: line_idx + FIRST_SOURCE_LINE_NUMBER,
                offset: line_offset,
            });
        }
    }
}
/// Applies the acceptance rules of `context` to an extracted candidate.
///
/// The candidate must reach the context's entropy threshold. In a credential
/// context it then only needs `MIN_PASSWORD_LEN` bytes; otherwise it must be
/// at least as long as the smaller of the context's minimum length and
/// `KEYWORD_FREE_MIN_LEN`, and pass the strict plausibility checks.
fn candidate_is_plausible(candidate: &str, entropy: f64, context: &KeywordContext) -> bool {
    let length = candidate.len();
    if entropy < context.threshold {
        false
    } else if context.is_credential_context {
        length >= MIN_PASSWORD_LEN
    } else {
        let required = KEYWORD_FREE_MIN_LEN.min(context.min_len);
        length >= required && is_secret_plausible(candidate)
    }
}
/// Returns, for each line, the running total of the byte lengths of all
/// preceding lines plus one byte per preceding line.
///
/// In other words every line is assumed to be followed by exactly one
/// terminator byte. Arithmetic saturates instead of overflowing.
fn cumulative_line_offsets(lines: &[&str]) -> Vec<usize> {
    let mut next_start = 0usize;
    lines
        .iter()
        .map(|line| {
            let start = next_start;
            next_start = next_start.saturating_add(line.len().saturating_add(1));
            start
        })
        .collect()
}
/// Pulls potential secret strings out of a single line.
///
/// Lines that look like string-concatenation fragments yield nothing.
/// Otherwise two sources are considered, in this order:
/// 1. the text after the first `=` (or, if there is no `=`, after the first
///    `": "`), trimmed of whitespace and of surrounding quote, backtick, `;`
///    and `,` characters, checked with the lenient plausibility filter;
/// 2. the contents of each pair of double quotes, then each pair of single
///    quotes, checked with the strict plausibility filter.
///
/// Only strings of at least `min_length` bytes are returned. The same text
/// may appear more than once in the result.
fn extract_candidates(line: &str, min_length: usize) -> Vec<String> {
    if is_likely_concatenation_fragment(line) {
        return Vec::new();
    }
    let mut found = Vec::new();

    // Assignment form: the value starts right after the separator.
    let value_start = match line.find('=') {
        Some(pos) => Some(pos + 1),
        None => line.find(": ").map(|pos| pos + 2),
    };
    if let Some(value_start) = value_start {
        let value = line[value_start..]
            .trim()
            .trim_matches(|c: char| matches!(c, '"' | '\'' | '`' | ';' | ','));
        if value.len() >= min_length && is_candidate_plausible(value) {
            found.push(value.to_owned());
        }
    }

    // Quoted form: consecutive occurrences of the same quote character are
    // paired up as (opening, closing); an unpaired trailing quote is ignored.
    for &quote in &['"', '\''] {
        let mut positions = line.match_indices(quote).map(|(pos, _)| pos);
        while let (Some(open), Some(close)) = (positions.next(), positions.next()) {
            let inner = &line[open + 1..close];
            if inner.len() >= min_length && is_secret_plausible(inner) {
                found.push(inner.to_owned());
            }
        }
    }

    found
}
/// Heuristically detects a line that is one piece of a multi-line string
/// concatenation rather than a complete assignment.
///
/// After trimming, a line counts as a fragment when either:
/// * it starts with a quote, contains exactly two quotes of one kind and none
///   of the other, and whatever follows the closing quote is empty, a lone
///   `\` or `,`, or begins with `+` or `)`; or
/// * it ends with `\"` or with `-\`.
fn is_likely_concatenation_fragment(line: &str) -> bool {
    let text = line.trim();

    if matches!(text.chars().next(), Some('"' | '\'')) {
        let doubles = text.matches('"').count();
        let singles = text.matches('\'').count();
        // Exactly one quoted literal, using a single quote style.
        let sole_quote = match (doubles, singles) {
            (2, 0) => Some('"'),
            (0, 2) => Some('\''),
            _ => None,
        };
        if let Some(quote) = sole_quote {
            let tail = text.rfind(quote).map_or("", |i| &text[i + 1..]).trim();
            let continues_expression = tail.is_empty()
                || tail == "\\"
                || tail == ","
                || tail.starts_with('+')
                || tail.starts_with(')');
            if continues_expression {
                return true;
            }
        }
    }

    text.ends_with("\\\"") || text.ends_with("-\\")
}
/// Selects how strict `passes_plausibility_checks` is.
enum PlausibilityMode {
/// Only the universal rejection, placeholder and alphanumeric-ratio filters apply.
Lenient,
/// Additionally requires `passes_strict_secret_checks` to succeed.
Strict,
}
/// Runs the shared plausibility filters on `s`.
///
/// A string is rejected if it matches a universal rejection pattern, looks
/// like a placeholder, or has a low alphanumeric ratio. In `Strict` mode it
/// must also pass `passes_strict_secret_checks`.
fn passes_plausibility_checks(s: &str, mode: PlausibilityMode) -> bool {
    let rejected_outright = matches_universal_rejection(s)
        || is_placeholder_ci(s.as_bytes())
        || has_low_alnum_ratio(s);
    if rejected_outright {
        return false;
    }

    match mode {
        PlausibilityMode::Lenient => true,
        PlausibilityMode::Strict => passes_strict_secret_checks(s),
    }
}
/// Reports whether `s` has a shape that is never treated as a secret.
///
/// Rejected shapes: anything containing `://`; strings starting with one of
/// the fixed prefixes below (paths, template/variable syntax, regex openers,
/// `ssh-`/`ecdsa-` keys, PEM headers, various opaque encrypted-blob prefixes,
/// markdown/YAML fences); strings starting with `eyJ` that contain exactly
/// two dots; strings starting with `Ag` that are longer than 40 bytes; and
/// drive-letter paths such as `C:\` or `c:/`.
fn matches_universal_rejection(s: &str) -> bool {
    const REJECTED_PREFIXES: &[&str] = &[
        "/",
        "./",
        "../",
        "${{",
        "{{",
        "${",
        "(?",
        "^",
        "ssh-",
        "ecdsa-",
        "$ANSIBLE_VAULT",
        "ENC[",
        "-----BEGIN",
        "age1",
        "vault:",
        "AQI",
        "CiQ",
        "```",
        "---",
        "===",
    ];

    if s.contains("://") {
        return true;
    }
    if REJECTED_PREFIXES
        .iter()
        .any(|prefix| s.starts_with(*prefix))
    {
        return true;
    }
    if s.starts_with("eyJ") && s.matches('.').count() == 2 {
        return true;
    }
    if s.starts_with("Ag") && s.len() > 40 {
        return true;
    }

    // Drive letter, colon, then a slash of either flavour.
    matches!(
        s.as_bytes(),
        [drive, b':', b'\\' | b'/', ..] if drive.is_ascii_alphabetic()
    )
}
/// Reports whether fewer than half of the characters of `s` are alphanumeric.
///
/// Both the alphanumeric tally and the total are counted in characters, so
/// multi-byte text is not penalised for its encoded length. An empty string
/// is considered low.
fn has_low_alnum_ratio(s: &str) -> bool {
    let (alnum, total) = s.chars().fold((0usize, 0usize), |(alnum, total), c| {
        (alnum + usize::from(c.is_alphanumeric()), total + 1)
    });
    // Integer form of `alnum / total < 0.5`; `max(1)` keeps "" on the low side.
    alnum * 2 < total.max(1)
}
fn passes_strict_secret_checks(s: &str) -> bool {
if s.chars().all(|c| c.is_ascii_hexdigit()) && s.len() > 10 {
return false;
}
if s.len() > 4
&& let Some(first) = s.chars().next()
&& s.chars().all(|c| c == first)
{
return false;
}
if s.len() > 16 && unique_char_count(s) < 8 {
return false;
}
if s.len() > 16 && second_half_entropy(s) < 2.5 {
return false;
}
shannon_entropy(s.as_bytes()) >= HIGH_ENTROPY_THRESHOLD
}
/// Counts the distinct characters (Unicode scalar values) in `s`.
fn unique_char_count(s: &str) -> usize {
    s.chars()
        .collect::<std::collections::HashSet<char>>()
        .len()
}
fn second_half_entropy(s: &str) -> f64 {
let mid = s.len() / 2;
let half_start = s.floor_char_boundary(mid);
shannon_entropy(&s.as_bytes()[half_start..])
}
/// Lenient plausibility filter, used for values taken from an assignment.
fn is_candidate_plausible(s: &str) -> bool {
    let mode = PlausibilityMode::Lenient;
    passes_plausibility_checks(s, mode)
}
/// Strict plausibility filter: the lenient filters plus the strict secret checks.
fn is_secret_plausible(s: &str) -> bool {
    let mode = PlausibilityMode::Strict;
    passes_plausibility_checks(s, mode)
}
// Unit tests for the entropy helpers, the path filter and the secret scanner.
#[cfg(test)]
mod tests {
use super::*;
// --- shannon_entropy basics ---
#[test]
fn entropy_constant_string() {
assert!(shannon_entropy(b"aaaaaaaaaa") < 0.1);
}
#[test]
fn entropy_random_string() {
let key = b"aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJ";
assert!(shannon_entropy(key) > 4.0);
}
#[test]
fn entropy_hex_hash() {
let hash = b"d41d8cd98f00b204e9800998ecf8427e";
let e = shannon_entropy(hash);
assert!(e > 3.0);
assert!(e < 5.0);
}
// --- scanner and plausibility filters ---
#[test]
fn find_secrets_near_keywords() {
let text = r#"
# Config
DATABASE_URL=postgres://localhost/mydb
API_KEY=aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL
DEBUG=true
"#;
let matches = find_entropy_secrets(text, 16, 2);
assert!(
!matches.is_empty(),
"should find high-entropy string near API_KEY"
);
assert_eq!(matches[0].value, "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL");
assert!(
matches.iter().any(|m| m.entropy > 4.0),
"should have high entropy match"
);
}
#[test]
fn skip_placeholders() {
let text = r#"
API_KEY=YOUR_API_KEY_HERE
SECRET=change_me_placeholder
TOKEN=xxxxxxxxxxxxxxxxxxxx
"#;
let matches = find_entropy_secrets(text, 16, 2);
assert!(matches.is_empty());
}
#[test]
fn plausible_secret_filter() {
assert!(!is_secret_plausible("https://example.com/api"));
assert!(!is_secret_plausible("/usr/local/bin/python"));
assert!(!is_secret_plausible("your_api_key_here"));
assert!(is_secret_plausible("aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJ"));
}
// The lenient mode accepts a short hex string that the strict mode rejects.
#[test]
fn candidate_mode_skips_strict_secret_checks() {
assert!(is_candidate_plausible("0123456789abcdef"));
assert!(!is_secret_plausible("0123456789abcdef"));
}
#[test]
fn detect_db_password_hex() {
let text = "DB_PASSWORD=8ae31cacf141669ddfb5da\n";
let matches = find_entropy_secrets(text, 8, 2);
assert!(
!matches.is_empty(),
"Should detect hex password near DB_PASSWORD keyword. Got 0 matches."
);
assert!(
matches[0].value.contains("8ae31cac"),
"Should extract the password value"
);
}
// The reported offset is the start of the line holding the match.
#[test]
fn entropy_match_offsets_are_cumulative() {
let text = "first=line\nAPI_KEY=aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL\n";
let matches = find_entropy_secrets(text, 16, 2);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].value, "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL");
assert_eq!(matches[0].offset, "first=line\n".len());
}
// --- shannon_entropy edge cases ---
#[test]
fn entropy_empty_input_is_zero() {
assert_eq!(shannon_entropy(b""), 0.0);
}
#[test]
fn entropy_single_unique_byte_is_zero() {
assert_eq!(shannon_entropy(b"zzzzzzzz"), 0.0);
}
#[test]
fn entropy_all_byte_values_is_near_eight() {
let all_bytes: Vec<u8> = (0u8..=255).collect();
let entropy = shannon_entropy(&all_bytes);
assert!((entropy - 8.0).abs() < 1e-9, "entropy was {}", entropy);
}
#[test]
fn entropy_huge_repeated_input_stays_low() {
let repeated = vec![b'A'; 100_000];
assert_eq!(shannon_entropy(&repeated), 0.0);
}
// --- normalized_entropy ---
#[test]
fn normalized_entropy_empty_input_is_zero() {
assert_eq!(normalized_entropy(b""), 0.0);
}
#[test]
fn normalized_entropy_single_unique_byte_is_zero() {
assert_eq!(normalized_entropy(b"aaaaaaaaaaaaaaaa"), 0.0);
}
#[test]
fn normalized_entropy_binary_pattern_reaches_one() {
let entropy = normalized_entropy(b"abababababababab");
assert!((entropy - 1.0).abs() < 1e-9, "entropy was {}", entropy);
}
#[test]
fn normalized_entropy_all_unique_bytes_reaches_one() {
let all_bytes: Vec<u8> = (0u8..=255).collect();
let entropy = normalized_entropy(&all_bytes);
assert!((entropy - 1.0).abs() < 1e-9, "entropy was {}", entropy);
}
#[test]
fn normalized_entropy_stays_bounded_for_large_mixed_input() {
let mut data = Vec::with_capacity(16_000);
for _ in 0..500 {
data.extend_from_slice(b"abc123XYZ!@#$%^&*()");
}
let entropy = normalized_entropy(&data);
assert!((0.0..=1.0).contains(&entropy), "entropy was {}", entropy);
}
// --- is_entropy_appropriate ---
#[test]
fn entropy_is_appropriate_for_stdin() {
assert!(is_entropy_appropriate(None));
}
#[test]
fn entropy_is_appropriate_for_config_extensions_case_insensitively() {
assert!(is_entropy_appropriate(Some("CONFIG/SETTINGS.YAML")));
assert!(is_entropy_appropriate(Some("keys/server.PEM")));
assert!(is_entropy_appropriate(Some("infra/secrets.TFVARS")));
}
#[test]
fn entropy_is_appropriate_for_sensitive_filenames_only() {
assert!(is_entropy_appropriate(Some("/tmp/.npmrc.backup")));
assert!(is_entropy_appropriate(Some("nested/docker-compose.prod")));
assert!(is_entropy_appropriate(Some("config/apikeys.txt")));
}
#[test]
fn entropy_is_not_appropriate_for_source_files_even_with_config_substrings() {
assert!(!is_entropy_appropriate(Some(
"src/docker_auth_config_test.go"
)));
assert!(!is_entropy_appropriate(Some(
"lib/application_yaml_parser.rs"
)));
assert!(!is_entropy_appropriate(Some("src/main.rs")));
}
// --- find_entropy_secrets scenarios ---
#[test]
fn entropy_secret_scan_empty_input_returns_no_matches() {
assert!(find_entropy_secrets("", 16, 2).is_empty());
}
#[test]
fn keyword_free_scan_detects_long_high_entropy_strings() {
let secret = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!@";
let text = format!("prefix\n value: \"{secret}\"\nsuffix\n");
let matches = find_entropy_secrets(&text, 16, 0);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].value, secret);
assert_eq!(matches[0].keyword, "none (high-entropy)");
assert_eq!(matches[0].line, 2);
}
#[test]
fn keyword_free_scan_rejects_short_high_entropy_strings() {
let text = "ZxCvBn123!@#AsDfGh456$%^QwErTy789";
assert!(find_entropy_secrets(text, 16, 0).is_empty());
}
#[test]
fn duplicate_secret_value_is_reported_once() {
let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
let text = format!("API_KEY={secret}\nTOKEN={secret}\n");
let matches = find_entropy_secrets(&text, 16, 1);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].value, secret);
}
#[test]
fn import_statements_with_keywords_are_ignored() {
let text = "import API_KEY from \"aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL\"\n";
assert!(find_entropy_secrets(text, 16, 1).is_empty());
}
#[test]
fn url_like_values_are_rejected_even_in_keyword_context() {
let text = "DATABASE_URL=https://example.com/super/secret/path/value\n";
assert!(find_entropy_secrets(text, 16, 1).is_empty());
}
#[test]
fn context_lines_zero_limits_scan_to_keyword_line() {
let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
let text = format!("API_KEY=placeholder\n\"{secret}\"\n");
assert!(find_entropy_secrets(&text, 16, 0).is_empty());
}
#[test]
fn context_lines_include_neighboring_lines() {
let secret = "aK7xP9mQ2wE5rT8yU1iO3pA6sD4fG0hJkL";
let text = format!("API_KEY=placeholder\n value: \"{secret}\"\n");
let matches = find_entropy_secrets(&text, 16, 1);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].value, secret);
assert_eq!(matches[0].line, 2);
}
#[test]
fn special_character_placeholders_are_rejected() {
let text = "SECRET=<replace-with-real-secret>\nTOKEN=${{ secrets.API_TOKEN }}\n";
assert!(find_entropy_secrets(text, 8, 1).is_empty());
}
// 2000 filler lines precede the secret, so it sits on line 2001.
#[test]
fn large_input_preserves_line_and_offset_for_match() {
let filler = "abcd1234\n".repeat(2000);
let secret = "QwErTy123!@#ZxCvBn456$%^AsDfGh789&*(YuIoP0)_+LmNoPqRsTuV";
let text = format!("{filler}API_KEY={secret}\n");
let matches = find_entropy_secrets(&text, 16, 0);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].value, secret);
assert_eq!(matches[0].line, 2001);
assert_eq!(matches[0].offset, filler.len());
}
}
/// Reports whether `bytes` looks like a placeholder rather than a real value.
///
/// All comparisons are ASCII case-insensitive. A value is a placeholder when
/// it contains `<` or `>`, contains any of the placeholder markers as a
/// substring, or is exactly one of a few generic words (`null`, `none`,
/// `undefined`, `empty`, `default`, `secret`, `password`).
fn is_placeholder_ci(bytes: &[u8]) -> bool {
    const PLACEHOLDERS: &[&[u8]] = &[
        b"example",
        b"placeholder",
        b"change_me",
        b"changeme",
        b"your_",
        b"your-",
        b"xxx",
        b"todo",
        b"fixme",
        b"replace",
        b"insert",
        b"enter_",
        b"enter-",
        b"dummy",
        b"sample",
        b"demo",
        b"fake",
        b"mock",
        b"goes-here",
        b"fill_in",
        b"not-a-real",
        b"not_a_real",
    ];
    // Words that only count when they make up the entire value.
    const WHOLE_VALUE_PLACEHOLDERS: &[&[u8]] = &[
        b"null",
        b"none",
        b"undefined",
        b"empty",
        b"default",
        b"secret",
        b"password",
    ];

    if bytes.contains(&b'<') || bytes.contains(&b'>') {
        return true;
    }

    let contains_marker = PLACEHOLDERS.iter().any(|marker| {
        bytes
            .windows(marker.len())
            .any(|window| window.eq_ignore_ascii_case(marker))
    });

    // Compared case-insensitively like the markers above, so `NULL` and
    // `Password` are treated the same as their lowercase spellings.
    contains_marker
        || WHOLE_VALUE_PLACEHOLDERS
            .iter()
            .any(|word| bytes.eq_ignore_ascii_case(word))
}