use super::{HashType, Token};
use regex::Regex;
use std::sync::LazyLock;
// Hash patterns used by `HashDetector::detect_and_replace`.
//
// Every pattern is wrapped in `\b … \b`, so it only matches a run of hex digits of
// exactly the stated length that is delimited by word boundaries; a longer hex run
// never yields a shorter partial match. Each regex is compiled lazily on first use.
// The patterns are constant literals, so `unwrap` only guards against a typo here.

/// Exactly 32 hex digits (reported as `HashType::MD5`).
static MD5_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{32}\b").unwrap());
/// Exactly 40 hex digits (reported as `HashType::SHA1`).
static SHA1_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{40}\b").unwrap());
/// Exactly 64 hex digits (reported as `HashType::SHA256`).
static SHA256_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{64}\b").unwrap());
/// Exactly 128 hex digits (reported as `HashType::SHA512`).
static SHA512_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{128}\b").unwrap());
/// Catch-all for shorter hex runs of 7 to 39 digits, e.g. abbreviated commit ids
/// (reported as `HashType::Generic(len)`).
///
/// NOTE: the character class includes `0-9`, so a purely decimal number with 7–39
/// digits also matches this pattern.
static GIT_HASH_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{7,39}\b").unwrap());
/// Exactly 16 hex digits (reported as `HashType::Generic(16)`).
static HEX_16_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{16}\b").unwrap());
/// Exactly 24 hex digits (reported as `HashType::Generic(24)`).
static HEX_24_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{24}\b").unwrap());
/// Exactly 48 hex digits (reported as `HashType::Generic(48)`).
static HEX_48_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{48}\b").unwrap());
/// Exactly 56 hex digits (reported as `HashType::Generic(56)`).
static HEX_56_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{56}\b").unwrap());
/// Stateless namespace for hash detection: its associated functions find hex-encoded
/// hashes in text and replace them with the `<HASH>` placeholder.
pub struct HashDetector;
impl HashDetector {
fn has_hex_run(text: &str) -> bool {
let mut run = 0u32;
for b in text.bytes() {
if b.is_ascii_hexdigit() {
run += 1;
if run >= 7 {
return true;
}
} else {
run = 0;
}
}
false
}
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
if !Self::has_hex_run(text) {
return (text.to_string(), Vec::new());
}
let mut result = text.to_string();
let mut tokens = Vec::new();
for cap in SHA512_REGEX.find_iter(text) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA512, hash_str.to_string()));
}
result = SHA512_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in SHA256_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA256, hash_str.to_string()));
}
result = SHA256_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_56_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(56), hash_str.to_string()));
}
result = HEX_56_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_48_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(48), hash_str.to_string()));
}
result = HEX_48_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in SHA1_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA1, hash_str.to_string()));
}
result = SHA1_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in MD5_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::MD5, hash_str.to_string()));
}
result = MD5_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_24_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(24), hash_str.to_string()));
}
result = HEX_24_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_16_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(16), hash_str.to_string()));
}
result = HEX_16_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in GIT_HASH_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(
HashType::Generic(hash_str.len()),
hash_str.to_string(),
));
}
result = GIT_HASH_REGEX.replace_all(&result, "<HASH>").to_string();
(result, tokens)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the detector over `"hash: "` followed by `len` repetitions of `a`.
    fn detect_repeated_hex(len: usize) -> (String, Vec<Token>) {
        let text = format!("hash: {}", "a".repeat(len));
        HashDetector::detect_and_replace(&text)
    }

    #[test]
    fn test_md5_detection() {
        let text = "File hash: 5d41402abc4b2a76b9719d911017c592";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "File hash: <HASH>");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::MD5, _)));
    }

    #[test]
    fn test_sha256_detection() {
        let text = "SHA256: 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "SHA256: <HASH>");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::SHA256, _)));
    }

    /// An abbreviated commit id (7 hex digits) is caught by the catch-all pattern
    /// and tagged with its actual length.
    #[test]
    fn test_git_short_hash_detection() {
        let text = "commit a1b2c3d merged";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "commit <HASH> merged");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            &tokens[0],
            Token::Hash(HashType::Generic(7), hash) if hash.as_str() == "a1b2c3d"
        ));
    }

    /// A ULID-style identifier contains non-hex letters and no run of seven hex
    /// digits, so it must be left untouched.
    #[test]
    fn test_non_hex_identifier_is_not_a_hash() {
        let text = "commit 01K5HWDZG06WAPM00HHKC1MYZ4 merged";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "commit 01K5HWDZG06WAPM00HHKC1MYZ4 merged");
        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn test_multiple_hashes() {
        let text =
            "MD5: 5d41402abc4b2a76b9719d911017c592 SHA1: 356a192b7913b04c54574d18c28d46e6395428ab";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "MD5: <HASH> SHA1: <HASH>");
        assert_eq!(tokens.len(), 2);
        // Tokens follow pattern order (SHA-1 is tried before MD5), not text order.
        assert!(matches!(tokens[0], Token::Hash(HashType::SHA1, _)));
        assert!(matches!(tokens[1], Token::Hash(HashType::MD5, _)));
    }

    #[test]
    fn test_not_a_hash() {
        let text = "Port 8080 is open";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "Port 8080 is open");
        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn detect_sha512_128_chars() {
        let (result, tokens) = detect_repeated_hex(128);
        assert_eq!(result, "hash: <HASH>", "SHA512 should be detected: {result}");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0], Token::Hash(HashType::SHA512, _)));
    }

    #[test]
    fn detect_generic_56_chars() {
        let (result, tokens) = detect_repeated_hex(56);
        assert_eq!(result, "hash: <HASH>", "56-char hash: {result}");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0], Token::Hash(HashType::Generic(56), _)));
    }

    #[test]
    fn detect_generic_48_chars() {
        let (result, tokens) = detect_repeated_hex(48);
        assert_eq!(result, "hash: <HASH>", "48-char hash: {result}");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0], Token::Hash(HashType::Generic(48), _)));
    }

    #[test]
    fn detect_generic_24_chars() {
        let (result, tokens) = detect_repeated_hex(24);
        assert_eq!(result, "hash: <HASH>", "24-char hash: {result}");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0], Token::Hash(HashType::Generic(24), _)));
    }

    #[test]
    fn detect_generic_16_chars() {
        let (result, tokens) = detect_repeated_hex(16);
        assert_eq!(result, "hash: <HASH>", "16-char hash: {result}");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0], Token::Hash(HashType::Generic(16), _)));
    }

    #[test]
    fn hex_run_6_chars_false() {
        assert!(!HashDetector::has_hex_run("abcdef"));
    }

    #[test]
    fn hex_run_7_chars_true() {
        assert!(HashDetector::has_hex_run("abcdef0"));
    }

    #[test]
    fn hex_run_broken_by_non_hex() {
        assert!(!HashDetector::has_hex_run("abc_def"));
    }

    /// A long enough run is found even when it follows shorter, broken runs.
    #[test]
    fn hex_run_after_break() {
        assert!(HashDetector::has_hex_run("abc_def abcdef0"));
    }

    #[test]
    fn hex_run_empty() {
        assert!(!HashDetector::has_hex_run(""));
    }
}