use std::collections::HashMap;
/// Outcome of scoring a single string with an entropy analyzer.
///
/// When the analyzer rejects the input for being too short or too long, every numeric
/// field except `length` is zero, `char_frequencies` is empty and `is_likely_secret`
/// is `false`.
#[derive(Debug, Clone)]
pub struct EntropyResult {
    /// Shannon entropy of the input, computed with base-2 logarithms.
    pub shannon_entropy: f64,
    /// `shannon_entropy` divided by `log2(charset_size)`; `0.0` when that maximum is not
    /// positive.
    pub normalized_entropy: f64,
    /// Number of distinct characters seen in the input.
    pub charset_size: usize,
    /// Length of the input as reported by `str::len`, i.e. in bytes.
    pub length: usize,
    /// Per-character share of the input, keyed by character.
    pub char_frequencies: HashMap<char, f64>,
    /// Final verdict: `true` when the input passed every threshold and heuristic.
    pub is_likely_secret: bool,
    /// Heuristic score in the range `0.0..=1.0`; higher means more secret-like.
    pub confidence: f64,
}

impl EntropyResult {
    /// Returns the stored verdict (same value as the `is_likely_secret` field).
    pub fn is_likely_secret(&self) -> bool {
        let Self {
            is_likely_secret: verdict,
            ..
        } = self;
        *verdict
    }

    /// Renders the numeric fields as a one-line, human-readable summary.
    ///
    /// Floating-point values are printed with two decimal places.
    pub fn description(&self) -> String {
        let Self {
            shannon_entropy: entropy,
            normalized_entropy: normalized,
            charset_size: charset,
            length: len,
            confidence: score,
            ..
        } = self;
        format!(
            "Entropy: {:.2}, Normalized: {:.2}, Charset: {}, Length: {}, Confidence: {:.2}",
            entropy, normalized, charset, len, score
        )
    }
}
/// Tunable heuristic that scores strings by Shannon entropy to flag likely secrets.
///
/// All thresholds are public fields so callers can adjust them after construction.
/// Built-in presets: [`EntropyAnalyzer::new`], [`EntropyAnalyzer::for_api_keys`] and
/// [`EntropyAnalyzer::for_tokens`].
#[derive(Debug, Clone)]
pub struct EntropyAnalyzer {
    /// Inputs shorter than this many bytes are rejected without being scored.
    pub min_length: usize,
    /// Inputs longer than this many bytes are rejected without being scored.
    pub max_length: usize,
    /// Minimum Shannon entropy (base-2) required for a positive verdict.
    pub min_entropy_threshold: f64,
    /// Minimum entropy relative to `log2(charset_size)` required for a positive verdict.
    pub min_normalized_entropy: f64,
    /// Minimum number of distinct characters required for a positive verdict.
    pub min_charset_size: usize,
}

impl Default for EntropyAnalyzer {
    /// Same configuration as [`EntropyAnalyzer::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl EntropyAnalyzer {
    /// General-purpose preset: 8..=1000 bytes, entropy >= 3.5, normalized >= 0.6,
    /// at least 8 distinct characters.
    pub fn new() -> Self {
        Self {
            min_length: 8,
            max_length: 1000,
            min_entropy_threshold: 3.5,
            min_normalized_entropy: 0.6,
            min_charset_size: 8,
        }
    }

    /// Stricter preset: 10..=200 bytes, entropy >= 4.0, normalized >= 0.7,
    /// at least 10 distinct characters.
    pub fn for_api_keys() -> Self {
        Self {
            min_length: 10,
            max_length: 200,
            min_entropy_threshold: 4.0,
            min_normalized_entropy: 0.7,
            min_charset_size: 10,
        }
    }

    /// Strictest preset: 20..=500 bytes, entropy >= 4.5, normalized >= 0.75,
    /// at least 16 distinct characters.
    pub fn for_tokens() -> Self {
        Self {
            min_length: 20,
            max_length: 500,
            min_entropy_threshold: 4.5,
            min_normalized_entropy: 0.75,
            min_charset_size: 16,
        }
    }

    /// Scores `text` and returns the full set of measurements plus a verdict.
    ///
    /// Inputs whose byte length falls outside `min_length..=max_length` are not scored:
    /// the result then carries zeros, an empty frequency map and a negative verdict, with
    /// only `length` filled in.
    pub fn analyze(&self, text: &str) -> EntropyResult {
        let length = text.len();
        if length < self.min_length || length > self.max_length {
            return EntropyResult {
                shannon_entropy: 0.0,
                normalized_entropy: 0.0,
                charset_size: 0,
                length,
                char_frequencies: HashMap::new(),
                is_likely_secret: false,
                confidence: 0.0,
            };
        }

        let char_frequencies = self.calculate_char_frequencies(text);
        let charset_size = char_frequencies.len();
        let shannon_entropy = self.calculate_shannon_entropy(&char_frequencies, length);

        // The best achievable entropy for the observed alphabet is log2(alphabet size).
        // A one-character alphabet gives log2(1) == 0, hence the guard against dividing by it.
        let max_possible_entropy = (charset_size as f64).log2();
        let normalized_entropy = if max_possible_entropy > 0.0 {
            shannon_entropy / max_possible_entropy
        } else {
            0.0
        };

        let is_likely_secret =
            self.is_likely_secret_by_entropy(shannon_entropy, normalized_entropy, charset_size, text);
        let confidence =
            self.calculate_confidence(shannon_entropy, normalized_entropy, charset_size, text);

        EntropyResult {
            shannon_entropy,
            normalized_entropy,
            charset_size,
            length,
            char_frequencies,
            is_likely_secret,
            confidence,
        }
    }

    /// Analyzes every candidate and keeps only those flagged as likely secrets.
    ///
    /// Each returned pair holds an owned copy of the candidate and its analysis.
    pub fn analyze_candidates(&self, candidates: &[&str]) -> Vec<(String, EntropyResult)> {
        candidates
            .iter()
            .map(|&text| (text.to_string(), self.analyze(text)))
            .filter(|(_, result)| result.is_likely_secret)
            .collect()
    }

    /// Scans every window of `text` that is `min_length..=max_length` bytes long and returns
    /// the distinct windows flagged as likely secrets with confidence above `0.7`.
    ///
    /// Windows containing whitespace or any of `"'(){}[]<>` are skipped. Windows whose byte
    /// offsets would split a multi-byte character are skipped as well, so non-ASCII input
    /// never panics. Results are sorted by descending confidence; each substring appears at
    /// most once even when it occurs several times in `text`.
    ///
    /// The scan is exhaustive (every start offset for every length), so cost grows quickly
    /// with the size of `text`.
    pub fn extract_high_entropy_substrings(
        &self,
        text: &str,
        min_length: usize,
    ) -> Vec<(String, EntropyResult)> {
        let mut results = Vec::new();
        // Substrings already accepted. Only accepted windows are remembered, so the set
        // stays proportional to the result rather than to the number of windows visited.
        let mut accepted: std::collections::HashSet<&str> = std::collections::HashSet::new();
        let longest = std::cmp::min(text.len(), self.max_length);

        for len in min_length..=longest {
            for start in 0..=(text.len().saturating_sub(len)) {
                // `get` yields `None` when either end is not a char boundary, where plain
                // indexing would panic.
                let substring = match text.get(start..start + len) {
                    Some(window) => window,
                    None => continue,
                };
                if accepted.contains(substring) {
                    continue;
                }
                if substring
                    .chars()
                    .any(|c| c.is_whitespace() || "\"'(){}[]<>".contains(c))
                {
                    continue;
                }
                let result = self.analyze(substring);
                if result.is_likely_secret && result.confidence > 0.7 {
                    accepted.insert(substring);
                    results.push((substring.to_string(), result));
                }
            }
        }

        // Stable sort: candidates with equal confidence keep their discovery order
        // (shorter windows first, then by start offset). `total_cmp` cannot panic.
        results.sort_by(|a, b| b.1.confidence.total_cmp(&a.1.confidence));
        results
    }

    /// Returns, for each distinct character of `text`, its occurrence count divided by the
    /// total number of characters, so the values sum to 1 for any non-empty input.
    fn calculate_char_frequencies(&self, text: &str) -> HashMap<char, f64> {
        let mut frequencies: HashMap<char, f64> = HashMap::new();
        // Count characters, not bytes: `text.len()` would over-count multi-byte characters
        // and skew every frequency downwards.
        let mut total_chars = 0.0_f64;
        for ch in text.chars() {
            *frequencies.entry(ch).or_insert(0.0) += 1.0;
            total_chars += 1.0;
        }
        for frequency in frequencies.values_mut() {
            *frequency /= total_chars;
        }
        frequencies
    }

    /// Computes `-sum(p * log2(p))` over the supplied frequencies.
    ///
    /// `_total_length` is unused; it is retained so existing call sites keep compiling.
    fn calculate_shannon_entropy(
        &self,
        frequencies: &HashMap<char, f64>,
        _total_length: usize,
    ) -> f64 {
        frequencies
            .values()
            .filter(|&&freq| freq > 0.0)
            .map(|&freq| -freq * freq.log2())
            .sum()
    }

    /// Applies the three numeric thresholds, then the three textual rejection heuristics.
    ///
    /// Returns `true` only when every check passes.
    fn is_likely_secret_by_entropy(
        &self,
        shannon_entropy: f64,
        normalized_entropy: f64,
        charset_size: usize,
        text: &str,
    ) -> bool {
        if shannon_entropy < self.min_entropy_threshold {
            return false;
        }
        if normalized_entropy < self.min_normalized_entropy {
            return false;
        }
        if charset_size < self.min_charset_size {
            return false;
        }
        if self.looks_like_natural_language(text) {
            return false;
        }
        if self.is_single_character_type(text) {
            return false;
        }
        if self.is_common_non_secret_pattern(text) {
            return false;
        }
        true
    }

    /// Combines the measurements into a score in `0.0..=1.0`.
    ///
    /// Contributions: entropy (capped at 0.4), normalized entropy (x0.3), alphabet size
    /// (capped at 0.2), a length bonus (0.05 from 12 bytes, 0.1 from 20 bytes) and a 0.1
    /// bonus for mixed character classes. Natural-language and known-placeholder matches
    /// subtract 0.3 and 0.4 respectively.
    fn calculate_confidence(
        &self,
        shannon_entropy: f64,
        normalized_entropy: f64,
        charset_size: usize,
        text: &str,
    ) -> f64 {
        let mut confidence = 0.0;
        confidence += (shannon_entropy / 6.0).min(0.4);
        confidence += normalized_entropy * 0.3;
        confidence += (charset_size as f64 / 62.0).min(0.2);
        if text.len() >= 20 {
            confidence += 0.1;
        } else if text.len() >= 12 {
            confidence += 0.05;
        }
        if self.has_secret_like_patterns(text) {
            confidence += 0.1;
        }
        if self.looks_like_natural_language(text) {
            confidence -= 0.3;
        }
        if self.is_common_non_secret_pattern(text) {
            confidence -= 0.4;
        }
        // `max` then `min` (rather than `clamp`) so that a NaN intermediate collapses to 0.0
        // instead of propagating.
        confidence.max(0.0).min(1.0)
    }

    /// Returns `true` when the lowercased text contains at least two of a fixed list of
    /// common three-letter English words, matched as plain substrings.
    fn looks_like_natural_language(&self, text: &str) -> bool {
        let lowercase_text = text.to_lowercase();
        let common_words = [
            "the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "was", "one",
            "our", "out", "day", "get", "has", "him", "his", "how", "its", "may", "new", "now",
            "old", "see", "two", "way", "who", "boy", "did", "man", "car", "dog", "cat", "run",
        ];
        let word_count = common_words
            .iter()
            .filter(|&&word| lowercase_text.contains(word))
            .count();
        word_count >= 2
    }

    /// Returns `true` when the text consists solely of ASCII digits, solely of ASCII
    /// uppercase letters, or solely of ASCII lowercase letters.
    fn is_single_character_type(&self, text: &str) -> bool {
        text.chars().all(|c| c.is_ascii_digit())
            || text.chars().all(|c| c.is_ascii_uppercase())
            || text.chars().all(|c| c.is_ascii_lowercase())
    }

    /// Returns `true` when the lowercased text contains a well-known placeholder, sample
    /// host name or keyboard/alphabet run.
    fn is_common_non_secret_pattern(&self, text: &str) -> bool {
        let lowercase_text = text.to_lowercase();
        let non_secret_patterns = [
            "localhost", "127.0.0.1", "example.com", "test.com",
            "placeholder", "your_key_here", "insert_key_here",
            "todo", "fixme", "changeme", "password123",
            "abcdefgh", "12345678", "qwertyui",
        ];
        non_secret_patterns
            .iter()
            .any(|&pattern| lowercase_text.contains(pattern))
    }

    /// Returns `true` when the text mixes at least three of: ASCII lowercase, ASCII
    /// uppercase, ASCII digits, and anything that is not ASCII alphanumeric.
    fn has_secret_like_patterns(&self, text: &str) -> bool {
        let has_lowercase = text.chars().any(|c| c.is_ascii_lowercase());
        let has_uppercase = text.chars().any(|c| c.is_ascii_uppercase());
        let has_digits = text.chars().any(|c| c.is_ascii_digit());
        let has_special = text.chars().any(|c| !c.is_ascii_alphanumeric());
        let variety_count = [has_lowercase, has_uppercase, has_digits, has_special]
            .iter()
            .filter(|&&x| x)
            .count();
        variety_count >= 3
    }
}
/// Convenience helpers that run a default-configured [`EntropyAnalyzer`].
pub mod utils {
    use super::*;

    /// Returns `true` when `EntropyAnalyzer::new().analyze(text)` flags `text` as a likely
    /// secret.
    ///
    /// The analyzer enforces its own length limits, so inputs that are too short or too long
    /// simply yield `false`.
    pub fn has_high_entropy(text: &str) -> bool {
        EntropyAnalyzer::new().analyze(text).is_likely_secret
    }

    /// Computes the Shannon entropy (base-2) of `text` from its character frequencies.
    ///
    /// Unlike [`EntropyAnalyzer::analyze`], no length limits are applied, so short strings
    /// are scored too.
    pub fn shannon_entropy(text: &str) -> f64 {
        let analyzer = EntropyAnalyzer::new();
        let frequencies = analyzer.calculate_char_frequencies(text);
        analyzer.calculate_shannon_entropy(&frequencies, text.len())
    }

    /// Returns the candidate with the greatest Shannon entropy among those produced by
    /// [`EntropyAnalyzer::extract_high_entropy_substrings`], or `None` when no substring of
    /// at least `min_length` bytes qualifies.
    pub fn highest_entropy_substring(text: &str, min_length: usize) -> Option<String> {
        EntropyAnalyzer::new()
            .extract_high_entropy_substrings(text, min_length)
            .into_iter()
            // `total_cmp` is a total order over f64, so the comparison cannot panic.
            .max_by(|a, b| a.1.shannon_entropy.total_cmp(&b.1.shannon_entropy))
            .map(|(substring, _)| substring)
    }

    /// Returns `(shannon_entropy, normalized_entropy, charset_size)` for `text` as measured
    /// by the default analyzer.
    ///
    /// Inputs rejected by the analyzer's length limits yield `(0.0, 0.0, 0)`.
    pub fn entropy_stats(text: &str) -> (f64, f64, usize) {
        let result = EntropyAnalyzer::new().analyze(text);
        (
            result.shannon_entropy,
            result.normalized_entropy,
            result.charset_size,
        )
    }
}
// Unit tests for the entropy analyzer and the `utils` helpers.
//
// NOTE(review): several assertions below look inconsistent with the implementation when
// evaluated by hand (details on the individual tests) — run the suite to confirm before
// relying on it.
#[cfg(test)]
mod tests {
use super::*;
// Raw measurements: a mixed-character key should score above 3.0 with a normalized value
// above 0.5, while a single repeated character should score close to zero on both.
#[test]
fn test_entropy_analysis() {
let analyzer = EntropyAnalyzer::new();
let high_entropy = analyzer.analyze("AKIAIOSFODNN7EXAMPLE");
assert!(high_entropy.shannon_entropy > 3.0);
assert!(high_entropy.normalized_entropy > 0.5);
let low_entropy = analyzer.analyze("aaaaaaaaaaaaa");
assert!(low_entropy.shannon_entropy < 1.0);
assert!(low_entropy.normalized_entropy < 0.3);
}
// End-to-end verdicts for secret-looking and ordinary strings under the default preset.
//
// NOTE(review): `sk_test_FAKE1234567890ABCDEF` contains `12345678`, which is on the
// non-secret pattern list, so it appears to be rejected rather than flagged — confirm.
// NOTE(review): `configuration_value` works out to roughly 3.7 bits over 14 distinct
// characters and does not seem to trip any rejection heuristic, so it appears to be
// flagged as a secret — confirm.
#[test]
fn test_secret_detection() {
let analyzer = EntropyAnalyzer::new();
assert!(analyzer.analyze("wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY").is_likely_secret);
assert!(analyzer.analyze("sk_test_FAKE1234567890ABCDEF").is_likely_secret);
assert!(!analyzer.analyze("hello_world_example").is_likely_secret);
assert!(!analyzer.analyze("12345678901234567890").is_likely_secret);
assert!(!analyzer.analyze("configuration_value").is_likely_secret);
}
// The natural-language heuristic needs at least two common words to match.
#[test]
fn test_natural_language_detection() {
let analyzer = EntropyAnalyzer::new();
assert!(analyzer.looks_like_natural_language("the quick brown fox"));
assert!(analyzer.looks_like_natural_language("you can see the dog"));
assert!(!analyzer.looks_like_natural_language("xk2j9mL4nQ8pR7vS"));
}
// The mixed-character-class heuristic needs at least three of: lowercase, uppercase,
// digits, non-alphanumeric.
#[test]
fn test_pattern_recognition() {
let analyzer = EntropyAnalyzer::new();
assert!(analyzer.has_secret_like_patterns("Aa1@"));
assert!(analyzer.has_secret_like_patterns("MyS3cr3t!"));
assert!(!analyzer.has_secret_like_patterns("hello"));
assert!(!analyzer.has_secret_like_patterns("12345"));
}
// Smoke test for the `utils` convenience wrappers.
//
// NOTE(review): the highest-entropy window is chosen by raw Shannon entropy; dropping the
// leading `A` and trailing `E` from the key removes repeated letters and appears to score
// higher than the full key, so the winning substring may not contain `AKIA` — confirm.
#[test]
fn test_utility_functions() {
use super::utils::*;
assert!(has_high_entropy("AKIAIOSFODNN7EXAMPLE"));
assert!(!has_high_entropy("hello"));
let entropy = shannon_entropy("AKIAIOSFODNN7EXAMPLE");
assert!(entropy > 3.0);
let highest = highest_entropy_substring("hello AKIAIOSFODNN7EXAMPLE world", 8);
assert!(highest.is_some());
assert!(highest.unwrap().contains("AKIA"));
}
// Confidence should be high for a key-like string and low for an ordinary identifier.
//
// NOTE(review): `hello_world` is 11 bytes with 8 distinct characters, so it is scored
// rather than rejected; the entropy, normalized-entropy and alphabet terms alone appear to
// add up to about 0.81, well above the 0.3 bound asserted here — confirm.
#[test]
fn test_confidence_scoring() {
let analyzer = EntropyAnalyzer::new();
let aws_key = analyzer.analyze("AKIAIOSFODNN7EXAMPLE");
assert!(aws_key.confidence > 0.7);
let simple = analyzer.analyze("hello_world");
assert!(simple.confidence < 0.3);
}
}