use super::{HashType, Token};
use regex::Regex;
use std::sync::LazyLock;
// Detector patterns. Each one is compiled on first use through `LazyLock`.
// Every pattern is a run of ASCII hex digits (either case) of a fixed length
// or length range, anchored with `\b` on both sides so that a shorter pattern
// cannot match inside a longer hex run.

/// Word-bounded run of exactly 32 hex digits; matches are tagged `HashType::MD5`.
static MD5_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{32}\b").unwrap());
/// Word-bounded run of exactly 40 hex digits; matches are tagged `HashType::SHA1`.
static SHA1_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{40}\b").unwrap());
/// Word-bounded run of exactly 64 hex digits; matches are tagged `HashType::SHA256`.
static SHA256_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{64}\b").unwrap());
/// Word-bounded run of exactly 128 hex digits; matches are tagged `HashType::SHA512`.
static SHA512_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{128}\b").unwrap());
/// Word-bounded run of 7 to 39 hex digits; matches are tagged
/// `HashType::Generic(len)`. Presumably aimed at abbreviated git object ids,
/// but it accepts any hex run in that length range (including all-digit runs).
static GIT_HASH_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{7,39}\b").unwrap());
/// Word-bounded run of exactly 16 hex digits; matches are tagged `HashType::Generic(16)`.
static HEX_16_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{16}\b").unwrap());
/// Word-bounded run of exactly 24 hex digits; matches are tagged `HashType::Generic(24)`.
static HEX_24_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{24}\b").unwrap());
/// Word-bounded run of exactly 48 hex digits; matches are tagged `HashType::Generic(48)`.
static HEX_48_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{48}\b").unwrap());
/// Word-bounded run of exactly 56 hex digits; matches are tagged `HashType::Generic(56)`.
static HEX_56_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b[a-fA-F0-9]{56}\b").unwrap());
/// Stateless namespace for hexadecimal hash detection.
///
/// The type has no fields; all functionality is exposed as associated
/// functions that take the text to inspect as an argument.
pub struct HashDetector;
impl HashDetector {
/// Cheap pre-filter: reports whether `text` contains at least seven
/// consecutive ASCII hexadecimal digits anywhere.
///
/// Seven is the shortest run any detector pattern accepts, so a `false`
/// result means none of the regexes can match and they need not be run.
fn has_hex_run(text: &str) -> bool {
    const MIN_RUN: usize = 7;

    // Cut the input at every non-hex byte; what remains between cuts are the
    // maximal hex runs, so it suffices to find one that is long enough.
    text.as_bytes()
        .split(|byte| !byte.is_ascii_hexdigit())
        .any(|run| run.len() >= MIN_RUN)
}
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
if !Self::has_hex_run(text) {
return (text.to_string(), Vec::new());
}
let mut result = text.to_string();
let mut tokens = Vec::new();
for cap in SHA512_REGEX.find_iter(text) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA512, hash_str.to_string()));
}
result = SHA512_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in SHA256_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA256, hash_str.to_string()));
}
result = SHA256_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_56_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(56), hash_str.to_string()));
}
result = HEX_56_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_48_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(48), hash_str.to_string()));
}
result = HEX_48_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in SHA1_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::SHA1, hash_str.to_string()));
}
result = SHA1_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in MD5_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::MD5, hash_str.to_string()));
}
result = MD5_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_24_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(24), hash_str.to_string()));
}
result = HEX_24_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in HEX_16_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(HashType::Generic(16), hash_str.to_string()));
}
result = HEX_16_REGEX.replace_all(&result, "<HASH>").to_string();
for cap in GIT_HASH_REGEX.find_iter(&result) {
let hash_str = cap.as_str();
tokens.push(Token::Hash(
HashType::Generic(hash_str.len()),
hash_str.to_string(),
));
}
result = GIT_HASH_REGEX.replace_all(&result, "<HASH>").to_string();
(result, tokens)
}
/// Returns `true` when `text` is at least seven bytes long and consists
/// solely of ASCII hexadecimal digits (either case).
///
/// The check is purely syntactic: any sufficiently long hex string passes,
/// including all-decimal strings.
// `allow(dead_code)` is kept because no caller is visible in this file.
#[allow(dead_code)]
pub fn is_likely_hash(text: &str) -> bool {
    const MIN_HASH_LEN: usize = 7;

    // A non-ASCII character contributes only bytes >= 0x80, none of which is
    // a hex digit, so testing bytes rejects exactly the same inputs.
    text.len() >= MIN_HASH_LEN && text.bytes().all(|byte| byte.is_ascii_hexdigit())
}
/// Maps a hex-digit count to the hash type conventionally associated with it.
///
/// 32, 40, 64 and 128 digits map to MD5, SHA-1, SHA-256 and SHA-512
/// respectively; every other length yields `HashType::Generic(length)`.
// `allow(dead_code)` is kept because no caller is visible in this file.
#[allow(dead_code)]
pub fn classify_hash_type(length: usize) -> HashType {
    const MD5_HEX_LEN: usize = 32;
    const SHA1_HEX_LEN: usize = 40;
    const SHA256_HEX_LEN: usize = 64;
    const SHA512_HEX_LEN: usize = 128;

    match length {
        MD5_HEX_LEN => HashType::MD5,
        SHA1_HEX_LEN => HashType::SHA1,
        SHA256_HEX_LEN => HashType::SHA256,
        SHA512_HEX_LEN => HashType::SHA512,
        other => HashType::Generic(other),
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone 32-digit value is replaced and tagged as MD5.
    #[test]
    fn test_md5_detection() {
        let text = "File hash: 5d41402abc4b2a76b9719d911017c592";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "File hash: <HASH>");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::MD5, _)));
    }

    /// A lone 64-digit value is replaced and tagged as SHA-256.
    #[test]
    fn test_sha256_detection() {
        let text = "SHA256: 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "SHA256: <HASH>");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::SHA256, _)));
    }

    /// A 128-digit value is replaced and tagged as SHA-512.
    #[test]
    fn test_sha512_detection() {
        let text = format!("digest {}", "ab".repeat(64));
        let (result, tokens) = HashDetector::detect_and_replace(&text);
        assert_eq!(result, "digest <HASH>");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::SHA512, _)));
    }

    /// Negative case: the identifier after "commit" contains non-hex letters
    /// and no run of seven hex digits, so the text must come back untouched.
    #[test]
    fn test_git_commit_detection() {
        let text = "commit 01K5HWDZG06WAPM00HHKC1MYZ4 merged";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "commit 01K5HWDZG06WAPM00HHKC1MYZ4 merged");
        assert_eq!(tokens.len(), 0);
    }

    /// Positive case for the 7..=39 digit pass: an abbreviated commit id is
    /// replaced and tagged as a generic hash of its own length.
    #[test]
    fn test_short_git_hash_detection() {
        let text = "commit a1b2c3d merged";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "commit <HASH> merged");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0], Token::Hash(HashType::Generic(7), _)));
    }

    /// Two hashes of different kinds in one line are both replaced.
    #[test]
    fn test_multiple_hashes() {
        let text =
            "MD5: 5d41402abc4b2a76b9719d911017c592 SHA1: 356a192b7913b04c54574d18c28d46e6395428ab";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "MD5: <HASH> SHA1: <HASH>");
        assert_eq!(tokens.len(), 2);
        // Tokens follow pass order (SHA-1 runs before MD5), not text order.
        assert!(matches!(tokens[0], Token::Hash(HashType::SHA1, _)));
        assert!(matches!(tokens[1], Token::Hash(HashType::MD5, _)));
    }

    /// Short decimal numbers are below the seven-digit minimum and are kept.
    #[test]
    fn test_not_a_hash() {
        let text = "Port 8080 is open";
        let (result, tokens) = HashDetector::detect_and_replace(text);
        assert_eq!(result, "Port 8080 is open");
        assert_eq!(tokens.len(), 0);
    }
}