pub fn shannon_entropy(input: &str) -> f64 {
let bytes = input.as_bytes();
let len = bytes.len();
if len == 0 {
return 0.0;
}
let mut counts = [0u32; 256];
for &b in bytes {
counts[b as usize] += 1;
}
let len_f = len as f64;
let mut entropy = 0.0f64;
for &count in &counts {
if count == 0 {
continue;
}
let p = count as f64 / len_f;
entropy -= p * p.log2();
}
entropy
}
#[derive(Debug, Clone)]
pub struct EntropyConfig {
pub enabled: bool,
pub threshold: f64,
pub min_len: usize,
}
impl Default for EntropyConfig {
fn default() -> Self {
Self {
enabled: true,
threshold: 3.5,
min_len: 20,
}
}
}
pub struct EntropyMatch<'a> {
pub start: usize,
pub end: usize,
pub token: &'a str,
pub entropy: f64,
}
fn is_delimiter(ch: char) -> bool {
matches!(
ch,
' ' | '\t'
| '\n'
| '\r'
| '"'
| '\''
| '`'
| '='
| ':'
| ','
| '{'
| '}'
| '['
| ']'
| '('
| ')'
| ';'
| '<'
| '>'
| '|'
| '\\'
)
}
fn is_all_lowercase_alpha(token: &str) -> bool {
!token.is_empty() && token.bytes().all(|b| b.is_ascii_lowercase())
}
fn check_token<'a>(
input: &'a str,
start: usize,
end: usize,
min_len: usize,
threshold: f64,
matches: &mut Vec<EntropyMatch<'a>>,
) {
let token = &input[start..end];
if token.len() < min_len {
return;
}
if is_all_lowercase_alpha(token) {
return;
}
let entropy = shannon_entropy(token);
if entropy >= threshold {
matches.push(EntropyMatch {
start,
end,
token,
entropy,
});
}
}
pub fn find_high_entropy_tokens<'a>(
input: &'a str,
min_len: usize,
threshold: f64,
) -> Vec<EntropyMatch<'a>> {
let mut matches = Vec::new();
let mut token_start: Option<usize> = None;
for (idx, ch) in input.char_indices() {
if is_delimiter(ch) {
if let Some(start) = token_start.take() {
check_token(input, start, idx, min_len, threshold, &mut matches);
}
} else if token_start.is_none() {
token_start = Some(idx);
}
}
if let Some(start) = token_start {
check_token(input, start, input.len(), min_len, threshold, &mut matches);
}
matches
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn entropy_uniform_distribution_near_eight() {
let alnum: String = ('a'..='z').chain('A'..='Z').chain('0'..='9').collect();
let e = shannon_entropy(&alnum);
assert!((e - 62f64.log2()).abs() < 0.01, "expected ~5.954, got {e}");
}
#[test]
fn entropy_single_repeated_char() {
let input = "aaaaaaaaaa";
assert_eq!(shannon_entropy(input), 0.0);
}
#[test]
fn entropy_empty_string() {
assert_eq!(shannon_entropy(""), 0.0);
}
#[test]
fn finds_base64_like_api_key() {
let input = "token=aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v";
let matches = find_high_entropy_tokens(input, 20, 3.5);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].token, "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v");
}
#[test]
fn skips_english_prose() {
let input = "this is a perfectly normal sentence with only regular words";
let matches = find_high_entropy_tokens(input, 5, 3.5);
assert!(
matches.is_empty(),
"prose should not trigger entropy detection"
);
}
#[test]
fn skips_short_tokens() {
let input = "key=Ab1";
let matches = find_high_entropy_tokens(input, 20, 3.5);
assert!(matches.is_empty());
}
#[test]
fn returns_correct_byte_offsets() {
let key = "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v";
let input = format!("prefix={key}");
let matches = find_high_entropy_tokens(&input, 20, 3.5);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].start, 7);
assert_eq!(matches[0].end, 7 + key.len());
assert_eq!(&input[matches[0].start..matches[0].end], key);
}
#[test]
fn skips_all_lowercase_alpha_tokens() {
let input = "abcdefghijklmnopqrstuvwxyz";
let matches = find_high_entropy_tokens(input, 5, 2.0);
assert!(matches.is_empty(), "all-lowercase tokens should be skipped");
}
}