1use memchr::memchr;
46use regex::RegexSet;
47use std::sync::LazyLock;
48use std::time::{Duration, Instant};
49use tracing::{debug, instrument, trace, warn};
50
/// Regex sources for the trigger set compiled into `HEREDOC_TRIGGERS`.
///
/// A match on any single pattern is enough for `check_triggers` to report
/// `TriggerResult::Triggered`. The array length is part of the type, so the
/// `14` must be updated whenever a pattern is added or removed.
const HEREDOC_TRIGGER_PATTERNS: [&str; 14] = [
    // Here-string operator, anywhere in the command.
    r"<<<",
    // python / pythonN[.M] (optional `.exe`), any number of `--long` or
    // `-short` flags, then a short-flag cluster containing `c` or `e`,
    // followed by whitespace, a quote, or end of input.
    r#"\bpython[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ce][A-Za-z]*(?:\s|['"]|$)"#,
    // ruby with a flag cluster containing `e`.
    r#"\bruby[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // irb with a flag cluster containing `e`.
    r#"\birb[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // perl with a flag cluster containing `e` or `E`.
    r#"\bperl[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[eE][A-Za-z]*(?:\s|['"]|$)"#,
    // node / nodejs with a flag cluster containing `e` or `p`.
    r#"\bnode(?:js)?[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ep][A-Za-z]*(?:\s|['"]|$)"#,
    // php with a flag cluster containing `r`.
    r#"\bphp[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*r[A-Za-z]*(?:\s|['"]|$)"#,
    // lua with a flag cluster containing `e`.
    r#"\blua[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // sh / bash / zsh / fish with a flag cluster containing `c`.
    r#"\b(?:sh|bash|zsh|fish)(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*c[A-Za-z]*(?:\s|['"]|$)"#,
    // powershell / pwsh (case-insensitive) with a `-c...` flag directly
    // followed by a quoted argument.
    r#"(?i)\b(?:powershell|pwsh)(?:\.exe)?["']?(?:\s+(?:-\S+))*\s+-c[a-z]*\s*['"]"#,
    // Pipe into one of the listed interpreters.
    r"\|\s*(?:python[0-9.]*|ruby[0-9.]*|perl[0-9.]*|node(?:js)?[0-9.]*|php[0-9.]*|lua[0-9.]*|sh|bash)(?:\.exe)?\b",
    // Pipe into xargs.
    r"\|\s*xargs\s",
    // `eval` followed by a quoted string.
    r#"\beval\s+['"]"#,
    // `exec` followed by a quoted string.
    r#"\bexec\s+['"]"#,
];
113
/// Index that `matched_triggers` appends when the hand-written `<<` scanner
/// fires. It equals the number of regex patterns, i.e. one past the largest
/// index the `RegexSet` itself can report, so the two cannot collide.
const MANUAL_HEREDOC_TRIGGER_INDEX: usize = HEREDOC_TRIGGER_PATTERNS.len();
115
/// `RegexSet` built from `HEREDOC_TRIGGER_PATTERNS`, compiled on first use.
///
/// Panics on first access if any pattern fails to compile.
static HEREDOC_TRIGGERS: LazyLock<RegexSet> = LazyLock::new(|| {
    RegexSet::new(HEREDOC_TRIGGER_PATTERNS).expect("heredoc trigger patterns should compile")
});
119
120#[inline]
121#[must_use]
122fn contains_active_heredoc_operator(command: &str) -> bool {
123 if memchr(b'<', command.as_bytes()).is_none() {
124 return false;
125 }
126 contains_active_heredoc_operator_recursive(command, 0, 0)
127}
128
129#[must_use]
130fn contains_active_heredoc_operator_recursive(
131 command: &str,
132 start: usize,
133 recursion_depth: usize,
134) -> bool {
135 if recursion_depth > 500 {
140 return true;
141 }
142
143 let bytes = command.as_bytes();
144 let len = bytes.len();
145 let mut i = start.min(len);
146
147 while i < len {
148 match bytes[i] {
149 b'<' if i + 1 < len && bytes[i + 1] == b'<' => {
150 return true;
152 }
153 b'\\' => {
154 if i + 2 < len && bytes[i + 1] == b'\r' && bytes[i + 2] == b'\n' {
156 i += 3;
157 } else {
158 i = (i + 2).min(len);
160 }
161 }
162 b'\'' => {
163 i += 1;
165 while i < len && bytes[i] != b'\'' {
166 i += 1;
167 }
168 if i < len {
169 i += 1;
170 }
171 }
172 b'"' => {
173 let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
175 if found {
176 return true;
177 }
178 i = next;
179 }
180 b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
181 let (found, next) =
182 scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
183 if found {
184 return true;
185 }
186 i = next;
187 }
188 b'`' => {
189 let (found, next) =
190 scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
191 if found {
192 return true;
193 }
194 i = next;
195 }
196 _ => {
197 i += 1;
198 }
199 }
200 }
201
202 false
203}
204
205#[must_use]
206fn scan_double_quotes_for_heredoc(
207 command: &str,
208 start: usize,
209 recursion_depth: usize,
210) -> (bool, usize) {
211 if recursion_depth > 500 {
212 return (true, command.len());
213 }
214
215 let bytes = command.as_bytes();
216 let len = bytes.len();
217 let mut i = start.min(len);
218
219 while i < len {
220 match bytes[i] {
221 b'"' => return (false, i + 1),
222 b'\\' => {
223 i = (i + 2).min(len);
224 }
225 b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
226 let (found, next) =
227 scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
228 if found {
229 return (true, next);
230 }
231 i = next;
232 }
233 b'`' => {
234 let (found, next) =
235 scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
236 if found {
237 return (true, next);
238 }
239 i = next;
240 }
241 _ => {
242 i += 1;
243 }
244 }
245 }
246
247 (false, len)
248}
249
250#[must_use]
251fn scan_dollar_paren_for_heredoc_recursive(
252 command: &str,
253 start: usize,
254 recursion_depth: usize,
255) -> (bool, usize) {
256 if recursion_depth > 500 {
258 return (true, command.len());
259 }
260
261 let bytes = command.as_bytes();
262 let len = bytes.len();
263
264 debug_assert_eq!(bytes.get(start), Some(&b'$'));
265 debug_assert_eq!(bytes.get(start + 1), Some(&b'('));
266
267 let mut i = start + 2;
268 let mut depth: u32 = 1;
269
270 while i < len {
271 match bytes[i] {
272 b'<' if i + 1 < len && bytes[i + 1] == b'<' => {
273 return (true, i + 2);
274 }
275 b'(' => {
276 depth += 1;
277 i += 1;
278 }
279 b')' => {
280 if depth == 1 {
281 return (false, i + 1);
283 }
284 depth = depth.saturating_sub(1);
285 i += 1;
286 }
287 b'\\' => {
288 i = (i + 2).min(len);
289 }
290 b'\'' => {
291 i += 1;
293 while i < len && bytes[i] != b'\'' {
294 i += 1;
295 }
296 if i < len {
297 i += 1;
298 }
299 }
300 b'"' => {
301 let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
302 if found {
303 return (true, next);
304 }
305 i = next;
306 }
307 b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
308 let (found, next) =
309 scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
310 if found {
311 return (true, next);
312 }
313 i = next;
314 }
315 b'`' => {
316 let (found, next) =
317 scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
318 if found {
319 return (true, next);
320 }
321 i = next;
322 }
323 _ => {
324 i += 1;
325 }
326 }
327 }
328
329 (false, len)
330}
331
332#[must_use]
333fn scan_backticks_for_heredoc_recursive(
334 command: &str,
335 start: usize,
336 recursion_depth: usize,
337) -> (bool, usize) {
338 if recursion_depth > 500 {
339 return (true, command.len());
340 }
341
342 let bytes = command.as_bytes();
343 let len = bytes.len();
344
345 debug_assert_eq!(bytes.get(start), Some(&b'`'));
346
347 let mut i = start + 1;
348 while i < len {
349 match bytes[i] {
350 b'<' if i + 1 < len && bytes[i + 1] == b'<' => {
351 return (true, i + 2);
352 }
353 b'\\' => {
354 i = (i + 2).min(len);
355 }
356 b'\'' => {
357 i += 1;
358 while i < len && bytes[i] != b'\'' {
359 i += 1;
360 }
361 if i < len {
362 i += 1;
363 }
364 }
365 b'"' => {
366 let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
367 if found {
368 return (true, next);
369 }
370 i = next;
371 }
372 b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
373 let (found, next) =
374 scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
375 if found {
376 return (true, next);
377 }
378 i = next;
379 }
380 b'`' => {
381 return (false, i + 1);
382 }
383 _ => {
384 i += 1;
385 }
386 }
387 }
388
389 (false, len)
390}
391
/// Outcome of the trigger check performed by `check_triggers`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TriggerResult {
    /// Neither the `<<` scanner nor any trigger regex matched.
    NoTrigger,
    /// The `<<` scanner or at least one trigger regex matched.
    Triggered,
}
400
401#[inline]
426#[must_use]
427#[instrument(skip(command), fields(cmd_len = command.len()))]
428pub fn check_triggers(command: &str) -> TriggerResult {
429 if contains_active_heredoc_operator(command) || HEREDOC_TRIGGERS.is_match(command) {
430 debug!("tier1_trigger: heredoc/inline script indicator detected");
431 TriggerResult::Triggered
432 } else {
433 trace!("tier1_no_trigger: fast path allow");
434 TriggerResult::NoTrigger
435 }
436}
437
438#[must_use]
442pub fn matched_triggers(command: &str) -> Vec<usize> {
443 let mut matches: Vec<usize> = HEREDOC_TRIGGERS.matches(command).into_iter().collect();
444 if contains_active_heredoc_operator(command) {
445 matches.push(MANUAL_HEREDOC_TRIGGER_INDEX);
446 }
447 matches
448}
449
450use regex::Regex;
455
/// Resource limits applied while extracting embedded script content.
#[derive(Debug, Clone, Copy)]
pub struct ExtractionLimits {
    /// Maximum size in bytes; `extract_content` compares it against the whole
    /// command and the extractors compare it against individual bodies.
    pub max_body_bytes: usize,
    /// Maximum number of lines in a body.
    /// NOTE(review): not read by the code visible in this part of the file;
    /// presumably enforced by the heredoc body reader. Confirm.
    pub max_body_lines: usize,
    /// Maximum number of extracted items across all extractors.
    pub max_heredocs: usize,
    /// Time budget for one extraction run, in milliseconds.
    pub timeout_ms: u64,
}

impl Default for ExtractionLimits {
    /// Defaults: 1 MiB, 10 000 lines, 10 items, 50 ms.
    fn default() -> Self {
        const ONE_MIB: usize = 1024 * 1024;

        Self {
            timeout_ms: 50,
            max_heredocs: 10,
            max_body_lines: 10_000,
            max_body_bytes: ONE_MIB,
        }
    }
}
479
480#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
482pub enum ScriptLanguage {
483 Bash,
484 Go,
485 Php,
486 Python,
487 Ruby,
488 Perl,
489 JavaScript,
490 TypeScript,
491 Unknown,
492}
493
494impl ScriptLanguage {
495 #[must_use]
501 pub fn from_command(cmd: &str) -> Self {
502 let cmd_lower = cmd.to_lowercase();
503 let cmd_base = cmd_lower.strip_suffix(".exe").unwrap_or(&cmd_lower);
505
506 let matches_interpreter = |base: &str| -> bool {
510 if cmd_base == base {
511 return true;
512 }
513 cmd_base.strip_prefix(base).is_some_and(|suffix| {
515 !suffix.is_empty()
516 && suffix.chars().all(|c| c.is_ascii_digit() || c == '.')
517 && suffix.chars().next().is_some_and(|c| c.is_ascii_digit())
518 })
519 };
520
521 if matches_interpreter("python") {
522 Self::Python
523 } else if matches_interpreter("ruby") || matches_interpreter("irb") {
524 Self::Ruby
525 } else if matches_interpreter("perl") {
526 Self::Perl
527 } else if matches_interpreter("node") || matches_interpreter("nodejs") {
528 Self::JavaScript
529 } else if matches_interpreter("deno") || matches_interpreter("bun") {
530 Self::TypeScript
531 } else if matches_interpreter("php") {
532 Self::Php
533 } else if matches_interpreter("go") {
534 Self::Go
536 } else if matches_interpreter("sh")
537 || matches_interpreter("bash")
538 || matches_interpreter("zsh")
539 || matches_interpreter("fish")
540 || matches_interpreter("powershell")
548 || matches_interpreter("pwsh")
549 {
550 Self::Bash
551 } else {
552 Self::Unknown
553 }
554 }
555
556 #[must_use]
563 pub fn from_shebang(content: &str) -> Option<Self> {
564 let first_line = content.lines().next()?;
565
566 let shebang = first_line.strip_prefix("#!")?;
568 let shebang = shebang.trim();
569
570 if shebang.is_empty() {
571 return None;
572 }
573
574 let mut parts = shebang.split_whitespace();
583 let first = parts.next()?;
584 let basename = first.rsplit('/').next().unwrap_or(first);
585
586 let interpreter = if basename == "env" {
588 loop {
590 let next = parts.next()?;
591 if !next.starts_with('-') {
592 break next.rsplit('/').next().unwrap_or(next);
593 }
594 }
595 } else {
596 basename
597 };
598
599 let lang = Self::from_command(interpreter);
601 if lang == Self::Unknown {
602 None
603 } else {
604 Some(lang)
605 }
606 }
607
608 #[must_use]
618 pub fn from_content(content: &str) -> Option<Self> {
619 let lines: Vec<&str> = content.lines().take(20).collect();
621
622 let has_python_import = lines.iter().any(|l| {
624 let trimmed = l.trim();
625 trimmed.starts_with("import ") || trimmed.starts_with("from ")
626 });
627 if has_python_import {
628 return Some(Self::Python);
629 }
630
631 let has_typescript_patterns = lines.iter().any(|l| {
634 let trimmed = l.trim();
635 trimmed.contains(": string")
636 || trimmed.contains(": number")
637 || trimmed.contains(": boolean")
638 || trimmed.contains("interface ")
639 || trimmed.starts_with("type ")
640 });
641 if has_typescript_patterns {
642 return Some(Self::TypeScript);
643 }
644
645 let has_js_patterns = lines.iter().any(|l| {
647 let trimmed = l.trim();
648 trimmed.contains("require(")
649 || trimmed.starts_with("const ")
650 || trimmed.starts_with("let ")
651 || trimmed.starts_with("var ")
652 || trimmed.contains("module.exports")
653 });
654 if has_js_patterns {
655 return Some(Self::JavaScript);
656 }
657
658 let has_ruby_patterns = lines.iter().any(|l| {
660 let trimmed = l.trim();
661 trimmed.starts_with("def ")
662 || trimmed.starts_with("class ")
663 || trimmed.starts_with("require ")
664 || trimmed.starts_with("require_relative ")
665 || trimmed.contains(".each do")
666 || trimmed.contains(" do |")
667 });
668 let has_end = content.contains("\nend") || content.ends_with("end");
670 if has_ruby_patterns && has_end {
671 return Some(Self::Ruby);
672 }
673
674 let has_go_patterns = lines.iter().any(|l| {
677 let trimmed = l.trim();
678 trimmed.starts_with("package ")
679 || trimmed.starts_with("func ")
680 || trimmed.contains(":=")
681 || (trimmed.starts_with("import ") && trimmed.contains('"'))
682 || trimmed == "import ("
683 });
684 if has_go_patterns {
685 return Some(Self::Go);
686 }
687
688 let has_perl_patterns = lines.iter().any(|l| {
690 let trimmed = l.trim();
691 trimmed.starts_with("use strict")
692 || trimmed.starts_with("use warnings")
693 || trimmed.starts_with("my $")
694 || trimmed.starts_with("my @")
695 || trimmed.starts_with("my %")
696 || trimmed.contains("=~ /")
697 || trimmed.contains("=~ s/")
698 });
699 if has_perl_patterns {
700 return Some(Self::Perl);
701 }
702
703 let has_bash_patterns = lines.iter().any(|l| {
705 let trimmed = l.trim();
706 trimmed.starts_with("if [")
707 || trimmed.starts_with("for ")
708 || trimmed.starts_with("while ")
709 || trimmed.starts_with("case ")
710 || trimmed.contains("$((")
711 || trimmed.contains("${")
712 || trimmed.starts_with("function ")
713 || (trimmed.contains("()") && trimmed.contains('{'))
714 });
715 if has_bash_patterns {
716 return Some(Self::Bash);
717 }
718
719 None
720 }
721
722 #[must_use]
732 pub fn detect(cmd: &str, content: &str) -> (Self, DetectionConfidence) {
733 if let Some(interpreter) = Self::extract_head_interpreter(cmd) {
735 let lang = Self::from_command(&interpreter);
736 if lang != Self::Unknown {
737 return (lang, DetectionConfidence::CommandPrefix);
738 }
739 }
740
741 if cmd.contains('|') {
744 for segment in cmd.split('|') {
745 let segment = segment.trim();
746 if segment.is_empty() {
747 continue;
748 }
749 if let Some(interpreter) = Self::extract_head_interpreter(segment) {
750 let lang = Self::from_command(&interpreter);
751 if lang != Self::Unknown {
752 return (lang, DetectionConfidence::CommandPrefix);
753 }
754 }
755 }
756 }
757
758 if let Some(lang) = Self::from_shebang(content) {
760 return (lang, DetectionConfidence::Shebang);
761 }
762
763 if let Some(lang) = Self::from_content(content) {
765 return (lang, DetectionConfidence::ContentHeuristics);
766 }
767
768 (Self::Unknown, DetectionConfidence::Unknown)
770 }
771
772 fn extract_head_interpreter(cmd: &str) -> Option<String> {
782 let normalized = crate::normalize::strip_wrapper_prefixes(cmd);
784 let cmd_to_check = normalized.normalized;
785
786 let mut parts = cmd_to_check.split_whitespace();
787 let first = parts.next()?;
788
789 let basename = first.rsplit('/').next().unwrap_or(first);
791 Some(basename.to_string())
792 }
793}
794
/// Which evidence a language detection is based on, strongest first.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DetectionConfidence {
    /// The interpreter named in the command line.
    CommandPrefix,

    /// A `#!` line at the top of the content.
    Shebang,

    /// Textual patterns inside the content.
    ContentHeuristics,

    /// Nothing matched.
    Unknown,
}

impl DetectionConfidence {
    /// Short machine-friendly identifier for this confidence level.
    #[must_use]
    pub const fn label(&self) -> &'static str {
        match *self {
            Self::Unknown => "unknown",
            Self::ContentHeuristics => "content-heuristics",
            Self::Shebang => "shebang",
            Self::CommandPrefix => "command-prefix",
        }
    }

    /// Human-readable explanation of how the language was determined.
    #[must_use]
    pub const fn reason(&self) -> &'static str {
        match *self {
            Self::Unknown => "could not determine language",
            Self::ContentHeuristics => "inferred from content patterns (lower confidence)",
            Self::Shebang => "detected from shebang line (high confidence)",
            Self::CommandPrefix => "detected from command interpreter (highest confidence)",
        }
    }
}
840
/// Which here-document form produced an extracted item.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeredocType {
    /// Plain `<<DELIM`.
    Standard,
    /// `<<-DELIM`.
    TabStripped,
    /// `<<<` here-string.
    HereString,
    /// `<<~DELIM`.
    IndentStripped,
}
853
/// One piece of embedded script content pulled out of a command.
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    /// The extracted text itself.
    pub content: String,
    /// Language attributed to `content`.
    pub language: ScriptLanguage,
    /// Heredoc delimiter; `None` for inline scripts and here-strings.
    pub delimiter: Option<String>,
    /// Byte range within the command covering the whole construct.
    pub byte_range: std::ops::Range<usize>,
    /// Byte range within the command covering only the content, when known.
    pub content_range: Option<std::ops::Range<usize>>,
    /// For heredocs: whether the delimiter was quoted. For here-strings:
    /// whether the argument was quoted. Always `true` for inline scripts.
    pub quoted: bool,
    /// Heredoc form; `None` for inline scripts.
    pub heredoc_type: Option<HeredocType>,
    /// Command the content is fed to, when it could be determined.
    pub target_command: Option<String>,
}
879
/// Why a piece of content, or the whole input, was not extracted.
#[derive(Debug, Clone, PartialEq)]
pub enum SkipReason {
    /// The input was larger than the byte limit.
    ExceededSizeLimit { actual: usize, limit: usize },
    /// A body had more lines than the line limit.
    ExceededLineLimit { actual: usize, limit: usize },
    /// More items were present than the configured maximum.
    ExceededHeredocLimit { limit: usize },
    /// The input looks binary.
    BinaryContent {
        /// Number of NUL bytes found.
        null_bytes: usize,
        /// Ratio in `0.0..=1.0`; rendered as a percentage by `Display`.
        non_printable_ratio: f32,
    },
    /// The time budget ran out.
    Timeout { elapsed_ms: u64, budget_ms: u64 },
    /// No terminating delimiter line was found.
    UnterminatedHeredoc { delimiter: String },
    /// The input could not be processed for the stated reason.
    MalformedInput { reason: String },
}

impl std::fmt::Display for SkipReason {
    /// Renders a one-line, human-readable description of the reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MalformedInput { reason } => write!(f, "malformed input: {reason}"),
            Self::UnterminatedHeredoc { delimiter } => {
                write!(f, "unterminated heredoc: delimiter '{delimiter}' not found")
            }
            Self::Timeout {
                elapsed_ms,
                budget_ms,
            } => write!(
                f,
                "extraction timeout: {elapsed_ms}ms > {budget_ms}ms budget"
            ),
            Self::BinaryContent {
                null_bytes,
                non_printable_ratio,
            } => {
                let percent = non_printable_ratio * 100.0;
                write!(
                    f,
                    "binary content detected: {null_bytes} null bytes, {:.1}% non-printable",
                    percent
                )
            }
            Self::ExceededHeredocLimit { limit } => {
                write!(f, "exceeded heredoc limit: max {limit} heredocs")
            }
            Self::ExceededLineLimit { actual, limit } => {
                write!(f, "exceeded line limit: {actual} lines > {limit} lines")
            }
            Self::ExceededSizeLimit { actual, limit } => {
                write!(f, "exceeded size limit: {actual} bytes > {limit} bytes")
            }
        }
    }
}
940
/// Overall outcome of `extract_content`.
#[derive(Debug)]
pub enum ExtractionResult {
    /// Nothing was extracted and nothing was skipped.
    NoContent,
    /// At least one item was extracted.
    Extracted(Vec<ExtractedContent>),
    /// Nothing was extracted; the reasons say why.
    Skipped(Vec<SkipReason>),
    /// Some items were extracted and others were skipped.
    /// NOTE(review): not constructed by the code visible in this part of the
    /// file. Confirm where it is produced.
    Partial {
        extracted: Vec<ExtractedContent>,
        skipped: Vec<SkipReason>,
    },
    /// Extraction failed with the given message.
    /// NOTE(review): not constructed by the code visible in this part of the
    /// file. Confirm where it is produced.
    Failed(String),
}
957
/// Matches a heredoc introducer: `<<`, an optional `-` or `~`, optional
/// whitespace, then the delimiter.
///
/// Capture groups: 1 = operator variant (`-` or `~`), 2 = single-quoted
/// delimiter, 3 = double-quoted delimiter, 4 = bare delimiter made of word
/// characters, `.` and `-`. Exactly one of groups 2 to 4 participates in a
/// match.
static HEREDOC_EXTRACTOR: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"<<([-~])?\s*(?:'([^']*)'|"([^"]*)"|([\w.-]+))"#).expect("heredoc regex compiles")
});
968
/// Matches a here-string with a single-quoted argument (`<<< '...'`).
/// Group 1 is the text between the quotes.
static HERESTRING_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"<<<\s*'([^']*)'").expect("herestring single-quote regex compiles")
});
975
/// Matches a here-string with a double-quoted argument (`<<< "..."`).
/// Group 1 is the text between the quotes; the pattern does not allow an
/// escaped `"` inside it.
static HERESTRING_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"<<<\s*"([^"]*)""#).expect("herestring double-quote regex compiles")
});
982
/// Matches a here-string with an unquoted argument (`<<< word`).
/// Group 1 is a run of non-whitespace whose first character is not a single
/// or double quote (`\x22` is `"`).
static HERESTRING_UNQUOTED: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"<<<\s*([^'\x22\s]\S*)").expect("herestring unquoted regex compiles")
});
990
/// Matches `<interpreter> [flags...] -<flag> '<code>'` with single-quoted
/// code.
///
/// Capture groups: 1 = interpreter name (with any version suffix and
/// `.exe`), 2 = the optional `js` of `nodejs`, 3 = the flag cluster that
/// contains one of `c e E C p r`, 4 = the code between the quotes.
/// `extract_inline_scripts` reads groups 1, 3 and 4.
static INLINE_SCRIPT_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?|(?i:powershell|pwsh)(?:\.exe)?)\b["']?(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceECpr][A-Za-z]*)\s*'([^']*)'"#)
        .expect("inline script single-quote regex compiles")
});
1003
/// Matches `<interpreter> [flags...] -<flag> "<code>"` with double-quoted
/// code.
///
/// Capture groups are numbered as in `INLINE_SCRIPT_SINGLE_QUOTE`:
/// 1 = interpreter name, 2 = the optional `js` of `nodejs`, 3 = the flag
/// cluster, 4 = the code between the quotes. The pattern does not allow an
/// escaped `"` inside the code.
static INLINE_SCRIPT_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?|(?i:powershell|pwsh)(?:\.exe)?)\b['"]?(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceECpr][A-Za-z]*)\s*"([^"]*)""#)
        .expect("inline script double-quote regex compiles")
});
1015
1016const BINARY_THRESHOLD: f32 = 0.30; #[must_use]
1029#[allow(clippy::cast_precision_loss)] #[allow(clippy::naive_bytecount)] pub fn check_binary_content(content: &str) -> Option<SkipReason> {
1032 let bytes = content.as_bytes();
1033 if bytes.is_empty() {
1034 return None;
1035 }
1036
1037 let null_bytes = bytes.iter().filter(|&&b| b == 0).count();
1039 if null_bytes > 0 {
1040 return Some(SkipReason::BinaryContent {
1041 null_bytes,
1042 non_printable_ratio: null_bytes as f32 / bytes.len() as f32,
1043 });
1044 }
1045
1046 let mut suspect_chars = 0;
1049 let mut total_chars = 0;
1050
1051 for c in content.chars() {
1052 total_chars += 1;
1053 if (c.is_control() && c != '\n' && c != '\r' && c != '\t')
1054 || c == std::char::REPLACEMENT_CHARACTER
1055 {
1056 suspect_chars += 1;
1057 }
1058 }
1059
1060 let ratio = suspect_chars as f32 / total_chars.max(1) as f32;
1061 if ratio > BINARY_THRESHOLD {
1062 return Some(SkipReason::BinaryContent {
1063 null_bytes: 0,
1064 non_printable_ratio: ratio,
1065 });
1066 }
1067
1068 None
1069}
1070
1071#[inline]
1072fn record_timeout_if_needed(
1073 start_time: Instant,
1074 timeout: Duration,
1075 budget_ms: u64,
1076 skip_reasons: &mut Vec<SkipReason>,
1077) -> bool {
1078 let elapsed = start_time.elapsed();
1079 if elapsed < timeout {
1080 return false;
1081 }
1082
1083 if !skip_reasons
1084 .iter()
1085 .any(|r| matches!(r, SkipReason::Timeout { .. }))
1086 {
1087 let elapsed_ms = u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX);
1088 skip_reasons.push(SkipReason::Timeout {
1089 elapsed_ms,
1090 budget_ms,
1091 });
1092 }
1093
1094 true
1095}
1096
1097#[must_use]
1122#[instrument(skip(command, limits), fields(cmd_len = command.len(), timeout_ms = limits.timeout_ms))]
1123pub fn extract_content(command: &str, limits: &ExtractionLimits) -> ExtractionResult {
1124 let start_time = Instant::now();
1125 let timeout = Duration::from_millis(limits.timeout_ms);
1126 let mut skip_reasons: Vec<SkipReason> = Vec::new();
1127
1128 if command.len() > limits.max_body_bytes {
1130 warn!(
1131 actual = command.len(),
1132 limit = limits.max_body_bytes,
1133 "tier2_skip: input exceeds size limit"
1134 );
1135 skip_reasons.push(SkipReason::ExceededSizeLimit {
1136 actual: command.len(),
1137 limit: limits.max_body_bytes,
1138 });
1139 return ExtractionResult::Skipped(skip_reasons);
1140 }
1141
1142 if let Some(reason) = check_binary_content(command) {
1144 warn!(?reason, "tier2_skip: binary content detected");
1145 skip_reasons.push(reason);
1146 return ExtractionResult::Skipped(skip_reasons);
1147 }
1148
1149 let mut extracted: Vec<ExtractedContent> = Vec::new();
1150
1151 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1153 return ExtractionResult::Skipped(skip_reasons);
1154 }
1155
1156 extract_inline_scripts(
1158 command,
1159 limits,
1160 start_time,
1161 timeout,
1162 &mut extracted,
1163 &mut skip_reasons,
1164 );
1165 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1166 return if extracted.is_empty() {
1167 ExtractionResult::Skipped(skip_reasons)
1168 } else {
1169 ExtractionResult::Extracted(extracted)
1170 };
1171 }
1172
1173 extract_herestrings(
1175 command,
1176 limits,
1177 start_time,
1178 timeout,
1179 &mut extracted,
1180 &mut skip_reasons,
1181 );
1182 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1183 return if extracted.is_empty() {
1184 ExtractionResult::Skipped(skip_reasons)
1185 } else {
1186 ExtractionResult::Extracted(extracted)
1187 };
1188 }
1189
1190 extract_heredocs(
1192 command,
1193 limits,
1194 start_time,
1195 timeout,
1196 &mut extracted,
1197 &mut skip_reasons,
1198 );
1199
1200 let elapsed_us = start_time.elapsed().as_micros();
1202 match (extracted.is_empty(), skip_reasons.is_empty()) {
1203 (true, true) => {
1204 trace!(elapsed_us, "tier2_complete: no content found");
1205 ExtractionResult::NoContent
1206 }
1207 (true, false) => {
1208 warn!(
1209 elapsed_us,
1210 skip_count = skip_reasons.len(),
1211 "tier2_complete: skipped"
1212 );
1213 ExtractionResult::Skipped(skip_reasons)
1214 }
1215 (false, true) => {
1216 debug!(
1217 elapsed_us,
1218 count = extracted.len(),
1219 "tier2_complete: content extracted"
1220 );
1221 ExtractionResult::Extracted(extracted)
1222 }
1223 (false, false) => {
1224 debug!(
1226 elapsed_us,
1227 count = extracted.len(),
1228 skip_count = skip_reasons.len(),
1229 "tier2_complete: partial extraction with skips"
1230 );
1231 ExtractionResult::Extracted(extracted)
1232 }
1233 }
1234}
1235
1236fn extract_inline_scripts(
1238 command: &str,
1239 limits: &ExtractionLimits,
1240 start_time: Instant,
1241 timeout: Duration,
1242 extracted: &mut Vec<ExtractedContent>,
1243 skip_reasons: &mut Vec<SkipReason>,
1244) {
1245 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1246 return;
1247 }
1248 if extracted.len() >= limits.max_heredocs {
1249 skip_reasons.push(SkipReason::ExceededHeredocLimit {
1250 limit: limits.max_heredocs,
1251 });
1252 return;
1253 }
1254
1255 let mut hit_limit = false;
1257 let mut extract_from_pattern = |pattern: &Regex| {
1258 for cap in pattern.captures_iter(command) {
1259 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1260 return;
1261 }
1262 if extracted.len() >= limits.max_heredocs {
1263 hit_limit = true;
1264 break;
1265 }
1266
1267 let cmd_name = cap.get(1).map_or("", |m| m.as_str());
1268 let flag = cap.get(3).map_or("", |m| m.as_str());
1269 let content_match = cap.get(4);
1271 let content = content_match.map_or("", |m| m.as_str());
1272
1273 let is_inline_flag = if cmd_name.starts_with("python") {
1276 flag.contains('c') || flag.contains('e')
1277 } else if cmd_name.starts_with("ruby") || cmd_name.starts_with("irb") {
1278 flag.contains('e')
1279 } else if cmd_name.starts_with("perl") {
1280 flag.contains('e') || flag.contains('E')
1281 } else if cmd_name.starts_with("node") {
1282 flag.contains('e') || flag.contains('p')
1283 } else if cmd_name.starts_with("php") {
1284 flag.contains('r')
1285 } else if cmd_name.starts_with("lua") {
1286 flag.contains('e')
1287 } else if {
1288 let lower = cmd_name.to_ascii_lowercase();
1293 lower.starts_with("powershell") || lower.starts_with("pwsh")
1294 } {
1295 let f = flag.to_ascii_lowercase();
1297 f.starts_with("-c")
1298 } else {
1299 flag.contains('c')
1301 };
1302
1303 if !is_inline_flag {
1304 continue;
1305 }
1306
1307 if content.len() > limits.max_body_bytes {
1309 continue;
1311 }
1312
1313 let full_match = cap.get(0).unwrap();
1314 extracted.push(ExtractedContent {
1315 content: content.to_string(),
1316 language: ScriptLanguage::from_command(cmd_name),
1317 delimiter: None,
1318 byte_range: full_match.start()..full_match.end(),
1319 content_range: content_match.map(|m| m.start()..m.end()),
1320 quoted: true, heredoc_type: None,
1322 target_command: Some(cmd_name.to_string()), });
1324 }
1325 };
1326
1327 extract_from_pattern(&INLINE_SCRIPT_SINGLE_QUOTE);
1329 extract_from_pattern(&INLINE_SCRIPT_DOUBLE_QUOTE);
1330
1331 if hit_limit {
1332 skip_reasons.push(SkipReason::ExceededHeredocLimit {
1333 limit: limits.max_heredocs,
1334 });
1335 }
1336}
1337
1338fn extract_herestrings(
1340 command: &str,
1341 limits: &ExtractionLimits,
1342 start_time: Instant,
1343 timeout: Duration,
1344 extracted: &mut Vec<ExtractedContent>,
1345 skip_reasons: &mut Vec<SkipReason>,
1346) {
1347 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1348 return;
1349 }
1350 if extracted.len() >= limits.max_heredocs {
1351 return; }
1353
1354 let mut hit_limit = false;
1355
1356 let mut extract_quoted = |pattern: &Regex, is_quoted: bool| {
1358 for cap in pattern.captures_iter(command) {
1359 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1360 return;
1361 }
1362 if extracted.len() >= limits.max_heredocs {
1363 hit_limit = true;
1364 break;
1365 }
1366
1367 let content_match = cap.get(1);
1369 let content = content_match.map_or("", |m| m.as_str());
1370
1371 if content.len() > limits.max_body_bytes {
1372 continue;
1373 }
1374
1375 let full_match = cap.get(0).unwrap();
1376
1377 let target_cmd = extract_heredoc_target_command(command, full_match.start());
1379
1380 extracted.push(ExtractedContent {
1381 content: content.to_string(),
1382 language: ScriptLanguage::Bash, delimiter: None,
1384 byte_range: full_match.start()..full_match.end(),
1385 content_range: content_match.map(|m| m.start()..m.end()),
1386 quoted: is_quoted,
1387 heredoc_type: Some(HeredocType::HereString),
1388 target_command: target_cmd,
1389 });
1390 }
1391 };
1392
1393 extract_quoted(&HERESTRING_SINGLE_QUOTE, true);
1396 extract_quoted(&HERESTRING_DOUBLE_QUOTE, true);
1397 extract_quoted(&HERESTRING_UNQUOTED, false);
1398
1399 if hit_limit {
1400 skip_reasons.push(SkipReason::ExceededHeredocLimit {
1401 limit: limits.max_heredocs,
1402 });
1403 }
1404}
1405
1406fn extract_heredocs(
1408 command: &str,
1409 limits: &ExtractionLimits,
1410 start_time: Instant,
1411 timeout: Duration,
1412 extracted: &mut Vec<ExtractedContent>,
1413 skip_reasons: &mut Vec<SkipReason>,
1414) {
1415 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1416 return;
1417 }
1418 if extracted.len() >= limits.max_heredocs {
1419 return; }
1421
1422 let mut hit_limit = false;
1423 for cap in HEREDOC_EXTRACTOR.captures_iter(command) {
1424 if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1425 return;
1426 }
1427 if extracted.len() >= limits.max_heredocs {
1428 hit_limit = true;
1429 break;
1430 }
1431
1432 let operator_variant = cap.get(1).map(|m| m.as_str());
1433
1434 let (delimiter, quoted) = if let Some(m) = cap.get(2) {
1435 (m.as_str(), true)
1436 } else if let Some(m) = cap.get(3) {
1437 (m.as_str(), true)
1438 } else if let Some(m) = cap.get(4) {
1439 (m.as_str(), false)
1440 } else {
1441 continue;
1443 };
1444
1445 let heredoc_type = match operator_variant {
1447 Some("-") => HeredocType::TabStripped,
1448 Some("~") => HeredocType::IndentStripped,
1449 _ => HeredocType::Standard,
1450 };
1451
1452 let full_match = cap.get(0).unwrap();
1453 let mut start_pos = full_match.end();
1454
1455 start_pos = command[start_pos..]
1459 .find('\n')
1460 .map_or(command.len(), |rel| start_pos.saturating_add(rel));
1461
1462 match extract_heredoc_body(
1464 command,
1465 start_pos,
1466 delimiter,
1467 heredoc_type,
1468 limits,
1469 start_time,
1470 timeout,
1471 ) {
1472 Ok((content, end_pos, body_start_abs, body_end_abs)) => {
1473 let (language, _confidence) = ScriptLanguage::detect(command, &content);
1474 let target_cmd = extract_heredoc_target_command(command, full_match.start());
1476 extracted.push(ExtractedContent {
1477 content,
1478 language,
1479 delimiter: Some(delimiter.to_string()),
1480 byte_range: full_match.start()..end_pos.min(command.len()),
1481 content_range: Some(body_start_abs..body_end_abs),
1482 quoted,
1483 heredoc_type: Some(heredoc_type),
1484 target_command: target_cmd,
1485 });
1486 }
1487 Err(reason) => {
1488 skip_reasons.push(reason);
1489 if matches!(skip_reasons.last(), Some(SkipReason::Timeout { .. })) {
1490 return;
1491 }
1492 }
1493 }
1494 }
1495
1496 if hit_limit {
1497 skip_reasons.push(SkipReason::ExceededHeredocLimit {
1498 limit: limits.max_heredocs,
1499 });
1500 }
1501}
1502
1503fn extract_heredoc_target_command(command: &str, heredoc_start: usize) -> Option<String> {
1514 if heredoc_start == 0 {
1515 return None;
1516 }
1517
1518 let before = &command[..heredoc_start];
1519
1520 let trimmed = before.trim_end();
1522 if trimmed.is_empty() {
1523 return None;
1524 }
1525
1526 let tokens = tokenize_backwards(trimmed);
1530
1531 for token in tokens.iter().rev() {
1532 if is_shell_env_assignment(token) {
1533 continue;
1534 }
1535
1536 if token.starts_with('-') {
1538 continue;
1539 }
1540
1541 if SHELL_WRAPPER_COMMANDS.contains(&token.as_str()) {
1543 continue;
1544 }
1545
1546 if (token.starts_with('\'') && token.ends_with('\''))
1548 || (token.starts_with('"') && token.ends_with('"'))
1549 {
1550 continue;
1551 }
1552
1553 if token.contains('/') {
1555 let basename = token.rsplit('/').next().unwrap_or(token);
1556
1557 let is_known_command = NON_EXECUTING_HEREDOC_COMMANDS.contains(&basename)
1560 || [
1561 "bash", "sh", "zsh", "fish", "ksh", "dash", "python", "perl", "ruby", "node",
1562 ]
1563 .contains(&basename);
1564
1565 let looks_like_command_path = token.starts_with("/bin/")
1567 || token.starts_with("/usr/bin/")
1568 || token.starts_with("/usr/local/bin/")
1569 || token.starts_with("/sbin/")
1570 || token.starts_with("/usr/sbin/")
1571 || is_known_command;
1572
1573 if !looks_like_command_path {
1574 continue;
1576 }
1577
1578 return Some(basename.to_string());
1579 }
1580
1581 let has_extension = token.contains('.') && !token.starts_with('.');
1583 let is_known_command = NON_EXECUTING_HEREDOC_COMMANDS.contains(&token.as_str())
1584 || [
1585 "bash", "sh", "zsh", "fish", "ksh", "dash", "python", "perl", "ruby", "node",
1586 ]
1587 .contains(&token.as_str());
1588 if has_extension && !is_known_command {
1589 continue;
1590 }
1591
1592 return Some(token.clone());
1593 }
1594
1595 None
1596}
1597
/// Returns `true` when `token` has the form `NAME=value`, where `NAME` is a
/// non-empty identifier made of ASCII letters, digits and `_` that does not
/// start with a digit. Only the text before the first `=` is inspected.
fn is_shell_env_assignment(token: &str) -> bool {
    let name = match token.split_once('=') {
        Some((name, _)) => name,
        None => return false,
    };

    let mut rest = name.bytes();
    match rest.next() {
        // First character: letter or underscore only.
        Some(first) if first.is_ascii_alphabetic() || first == b'_' => {
            rest.all(|byte| byte.is_ascii_alphanumeric() || byte == b'_')
        }
        _ => false,
    }
}
1610
/// Splits the end of `s` into words, scanning from right to left.
///
/// * Words are separated by ASCII whitespace.
/// * A word that ends in `'` or `"` extends back to the previous occurrence of
///   the same quote character (or to the start of `s` if there is none) and is
///   returned with its quotes.
/// * Scanning stops at the first shell separator (`|`, `;`, `&`, `$`, `(`,
///   `)`) that is reached, so only the last simple command is tokenized.
///
/// The returned vector is in scan order, i.e. the right-most word comes first.
fn tokenize_backwards(s: &str) -> Vec<String> {
    fn is_separator(byte: u8) -> bool {
        matches!(byte, b'|' | b';' | b'&' | b'$' | b'(' | b')')
    }

    let bytes = s.as_bytes();
    let mut words = Vec::new();
    let mut cursor = bytes.len();

    loop {
        // Right-most non-blank byte left of the cursor; none means we are done.
        let Some(last) = bytes[..cursor]
            .iter()
            .rposition(|byte| !byte.is_ascii_whitespace())
        else {
            break;
        };
        let end = last + 1;
        let tail = bytes[last];

        if tail == b'\'' || tail == b'"' {
            // Quoted word: extend back to the matching opening quote.
            cursor = bytes[..last]
                .iter()
                .rposition(|&byte| byte == tail)
                .unwrap_or(0);
            words.push(s[cursor..end].to_string());
            continue;
        }

        if is_separator(tail) {
            break;
        }

        // Bare word: extend back to the nearest blank or separator.
        let begin = bytes[..end]
            .iter()
            .rposition(|&byte| byte.is_ascii_whitespace() || is_separator(byte))
            .map_or(0, |idx| idx + 1);
        words.push(s[begin..end].to_string());
        cursor = begin;
    }

    words
}
1669
/// Command names treated as "non-executing" heredoc targets.
///
/// [`is_non_executing_heredoc_command`] matches the basename of a command
/// against this list, and [`mask_non_executing_heredocs`] blanks out heredoc
/// bodies / here-string payloads that are fed to one of these commands.
const NON_EXECUTING_HEREDOC_COMMANDS: &[&str] = &[
    // Plain output / copy.
    "cat",
    "tee",
    "echo",
    "printf",
    "dd",
    // Text filters and formatters.
    "head",
    "tail",
    "grep",
    "egrep",
    "fgrep",
    "sed",
    "awk",
    "cut",
    "sort",
    "uniq",
    "tr",
    "wc",
    "rev",
    "nl",
    "fold",
    "fmt",
    "expand",
    "unexpand",
    "column",
    "paste",
    "join",
    // Encoders and dump tools.
    "base64",
    "xxd",
    "od",
    "hexdump",
    // Compression tools.
    "gzip",
    "gunzip",
    "bzip2",
    "bunzip2",
    "xz",
    "lzma",
    "zcat",
    "bzcat",
    "xzcat",
    // Network clients.
    "nc",
    "netcat",
    "curl",
    "wget",
    // Checksums.
    "md5sum",
    "sha1sum",
    "sha256sum",
    "sha512sum",
    "cksum",
    // Comparison tools.
    "diff",
    "cmp",
    "comm",
    // Mail senders.
    "mail",
    "sendmail",
    // Shell builtin that reads input.
    "read",
];
1737
/// Prefix commands that `extract_heredoc_target_command` skips over when
/// looking for the command that actually receives a heredoc.
const SHELL_WRAPPER_COMMANDS: &[&str] = &["sudo", "env", "command", "builtin", "nohup"];
1739
1740#[must_use]
1745pub fn is_non_executing_heredoc_command(cmd: &str) -> bool {
1746 let cmd_name = cmd.rsplit('/').next().unwrap_or(cmd);
1748 NON_EXECUTING_HEREDOC_COMMANDS.contains(&cmd_name)
1749}
1750
1751#[must_use]
1760pub fn mask_non_executing_heredocs(command: &str) -> std::borrow::Cow<'_, str> {
1761 use std::borrow::Cow;
1762
1763 if !command.contains("<<") {
1765 return Cow::Borrowed(command);
1766 }
1767
1768 let mut result = String::new();
1769 let mut pos = 0;
1770 let bytes = command.as_bytes();
1771
1772 while pos < command.len() {
1773 if let Some(offset) = command[pos..].find("<<") {
1775 let heredoc_start = pos + offset;
1776
1777 if heredoc_start + 3 <= command.len() && bytes.get(heredoc_start + 2) == Some(&b'<') {
1779 let target_cmd = extract_heredoc_target_command(command, heredoc_start);
1781 let should_mask_herestring = target_cmd
1782 .as_ref()
1783 .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
1784
1785 if should_mask_herestring {
1786 if let Some((content_start, content_end)) =
1788 find_herestring_content_bounds(command, heredoc_start + 3)
1789 {
1790 if result.is_empty() {
1792 result = command[..content_start].to_string();
1793 } else {
1794 result.push_str(&command[pos..content_start]);
1795 }
1796 result.push_str("'MASKED'");
1798 pos = content_end;
1799 continue;
1800 }
1801 }
1802
1803 if !result.is_empty() {
1805 result.push_str(&command[pos..heredoc_start + 3]);
1806 }
1807 pos = heredoc_start + 3;
1808 continue;
1809 }
1810
1811 let target_cmd = extract_heredoc_target_command(command, heredoc_start);
1813
1814 let should_mask = target_cmd
1816 .as_ref()
1817 .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
1818
1819 if should_mask {
1820 let after_op = &command[heredoc_start + 2..];
1822 if let Some((delimiter, body_start_offset, heredoc_type)) =
1823 parse_heredoc_delimiter(after_op)
1824 {
1825 let body_start = heredoc_start + 2 + body_start_offset;
1827 if let Some(body_end) =
1828 find_heredoc_terminator(command, body_start, &delimiter, heredoc_type)
1829 {
1830 if result.is_empty() {
1832 result = command[..body_start].to_string();
1833 } else {
1834 result.push_str(&command[pos..body_start]);
1835 }
1836
1837 let body_slice = &command[body_start..body_end];
1839 let terminator_rel = body_slice.rfind('\n').map_or(0, |idx| idx + 1);
1840 let terminator_abs = body_start + terminator_rel;
1841
1842 let masked_body =
1843 mask_preserve_newlines(&command[body_start..terminator_abs]);
1844 result.push_str(&masked_body);
1845 result.push_str(&command[terminator_abs..body_end]);
1846
1847 pos = body_end;
1848 continue;
1849 }
1850 }
1851 }
1852
1853 if result.is_empty() {
1855 } else {
1857 result.push_str(&command[pos..heredoc_start + 2]);
1858 }
1859 pos = heredoc_start + 2;
1860 } else {
1861 if result.is_empty() {
1863 return Cow::Borrowed(command);
1864 }
1865 result.push_str(&command[pos..]);
1866 break;
1867 }
1868 }
1869
1870 if result.is_empty() {
1871 Cow::Borrowed(command)
1872 } else {
1873 Cow::Owned(result)
1874 }
1875}
1876
/// Produces a string with the same byte length as `input` in which every byte
/// other than `\n` and `\r` is replaced by an ASCII space.
///
/// Because the replacement is per byte, a multi-byte UTF-8 character becomes
/// as many spaces as it has bytes.
fn mask_preserve_newlines(input: &str) -> String {
    input
        .bytes()
        .map(|byte| match byte {
            b'\n' => '\n',
            b'\r' => '\r',
            _ => ' ',
        })
        .collect()
}
1887
1888fn parse_heredoc_delimiter(after_op: &str) -> Option<(String, usize, HeredocType)> {
1891 let trimmed = after_op.trim_start_matches([' ', '\t']);
1892 let skip_whitespace = after_op.len() - trimmed.len();
1893
1894 if trimmed.is_empty() {
1895 return None;
1896 }
1897
1898 let (heredoc_type, marker_len) = if skip_whitespace == 0 {
1922 match trimmed.as_bytes().first() {
1923 Some(b'-') => (HeredocType::TabStripped, 1),
1924 Some(b'~') => (HeredocType::IndentStripped, 1),
1925 _ => (HeredocType::Standard, 0),
1926 }
1927 } else {
1928 (HeredocType::Standard, 0)
1929 };
1930
1931 let after_marker = &trimmed[marker_len..];
1932 let after_marker_trimmed = after_marker.trim_start_matches([' ', '\t']);
1933 let inter_whitespace = after_marker.len() - after_marker_trimmed.len();
1934 let delim_chars = after_marker_trimmed;
1935
1936 let (delimiter, delim_len) = if let Some(stripped) = delim_chars.strip_prefix('"') {
1938 let end = stripped.find('"')?;
1940 let (body, _) = stripped.split_at(end);
1941 (body.to_string(), end + 2)
1942 } else if let Some(stripped) = delim_chars.strip_prefix('\'') {
1943 let end = stripped.find('\'')?;
1945 let (body, _) = stripped.split_at(end);
1946 (body.to_string(), end + 2)
1947 } else {
1948 let end = delim_chars
1950 .find(|c: char| c.is_whitespace() || c == '\n' || c == ';' || c == '&' || c == '|')
1951 .unwrap_or(delim_chars.len());
1952 if end == 0 {
1953 return None;
1954 }
1955 (delim_chars[..end].to_string(), end)
1956 };
1957
1958 let total_delim_offset = skip_whitespace + marker_len + inter_whitespace + delim_len;
1960 let remaining = &after_op[total_delim_offset..];
1961
1962 let newline_offset = remaining.find('\n').map_or(remaining.len(), |i| i + 1);
1964
1965 Some((delimiter, total_delim_offset + newline_offset, heredoc_type))
1966}
1967
1968fn find_heredoc_terminator(
1970 command: &str,
1971 body_start: usize,
1972 delimiter: &str,
1973 heredoc_type: HeredocType,
1974) -> Option<usize> {
1975 if body_start >= command.len() {
1976 return None;
1977 }
1978
1979 let body = &command[body_start..];
1980 let mut line_start = 0;
1981
1982 for line in body.split_inclusive('\n') {
1983 let trimmed = match heredoc_type {
1984 HeredocType::TabStripped => line.trim_start_matches('\t'),
1985 HeredocType::IndentStripped => line.trim_start(),
1986 HeredocType::Standard | HeredocType::HereString => line,
1987 };
1988
1989 let line_content = trimmed.trim_end_matches(['\n', '\r']);
1990
1991 if line_content == delimiter {
1992 return Some(body_start + line_start + line.len());
1994 }
1995
1996 line_start += line.len();
1997 }
1998
1999 None
2000}
2001
/// Locates the payload of a here-string.
///
/// `after_operator` is the byte offset in `command` just past `<<<`. Blanks on
/// the same line are skipped, then the payload is either
///
/// * a quoted string — the returned range includes both quote characters;
///   inside double quotes a backslash skips the following byte; or
/// * a bare word that ends at ASCII whitespace or one of `;`, `&`, `|`, `)`.
///
/// Returns absolute `(start, end)` byte offsets with `end` exclusive, or
/// `None` when nothing follows the operator on that line, when a quote is
/// never closed, or when the bare word is empty.
fn find_herestring_content_bounds(command: &str, after_operator: usize) -> Option<(usize, usize)> {
    if after_operator >= command.len() {
        return None;
    }

    let rest = command[after_operator..].as_bytes();

    // First byte that is not a same-line blank.
    let first = rest
        .iter()
        .position(|&byte| !byte.is_ascii_whitespace() || byte == b'\n')?;
    if rest[first] == b'\n' {
        return None;
    }

    let mut scan = first;
    let opener = rest[first];
    if opener == b'\'' || opener == b'"' {
        scan = first + 1;
        while scan < rest.len() && rest[scan] != opener {
            let escaped = opener == b'"' && rest[scan] == b'\\' && scan + 1 < rest.len();
            scan += if escaped { 2 } else { 1 };
        }
        if scan < rest.len() {
            return Some((after_operator + first, after_operator + scan + 1));
        }
        // Unterminated quote: `scan` is now at the end of input, so the
        // bare-word scan below finds nothing and `None` is returned.
    }

    let word_start = scan;
    let word_len = rest[word_start..]
        .iter()
        .take_while(|&&byte| {
            !(byte.is_ascii_whitespace() || matches!(byte, b';' | b'&' | b'|' | b')'))
        })
        .count();

    (word_len > 0).then(|| {
        (
            after_operator + word_start,
            after_operator + word_start + word_len,
        )
    })
}
2063
2064fn extract_heredoc_body(
2066 command: &str,
2067 start: usize,
2068 delimiter: &str,
2069 heredoc_type: HeredocType,
2070 limits: &ExtractionLimits,
2071 start_time: Instant,
2072 timeout: Duration,
2073) -> Result<(String, usize, usize, usize), SkipReason> {
2074 if start > command.len() {
2075 return Err(SkipReason::MalformedInput {
2076 reason: "heredoc start offset out of bounds".to_string(),
2077 });
2078 }
2079
2080 let remaining = &command[start..];
2081
2082 let body_start_offset = usize::from(remaining.starts_with('\n'));
2084 let body_start = &remaining[body_start_offset..];
2085 let body_start_abs = start + body_start_offset;
2086
2087 let mut body_lines: Vec<&str> = Vec::new();
2088 let mut total_bytes: usize = 0;
2089 let mut cursor: usize = 0; for part in body_start.split_inclusive('\n') {
2092 if start_time.elapsed() >= timeout {
2094 let elapsed_ms = u64::try_from(start_time.elapsed().as_millis()).unwrap_or(u64::MAX);
2095 return Err(SkipReason::Timeout {
2096 elapsed_ms,
2097 budget_ms: limits.timeout_ms,
2098 });
2099 }
2100
2101 let line = part.strip_suffix('\n').unwrap_or(part);
2102 let line = line.strip_suffix('\r').unwrap_or(line);
2105
2106 let trimmed = match heredoc_type {
2108 HeredocType::TabStripped => line.trim_start_matches('\t'),
2109 HeredocType::IndentStripped => line.trim_start(),
2110 HeredocType::Standard | HeredocType::HereString => line,
2111 };
2112
2113 if trimmed == delimiter {
2114 let terminator_start = body_start_abs + cursor;
2117 let terminator_end = terminator_start + line.len();
2118 let mut body_end_abs = terminator_start;
2119 if body_end_abs > body_start_abs {
2120 let bytes = command.as_bytes();
2121 if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\n') {
2122 body_end_abs = body_end_abs.saturating_sub(1);
2123 if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\r') {
2124 body_end_abs = body_end_abs.saturating_sub(1);
2125 }
2126 }
2127 }
2128
2129 let content = match heredoc_type {
2130 HeredocType::TabStripped => body_lines
2131 .iter()
2132 .map(|l| l.trim_start_matches('\t'))
2133 .collect::<Vec<_>>()
2134 .join("\n"),
2135 HeredocType::IndentStripped => {
2136 let min_indent = body_lines
2152 .iter()
2153 .filter(|l| !l.trim().is_empty())
2154 .map(|l| l.len() - l.trim_start().len())
2155 .min()
2156 .unwrap_or(0);
2157
2158 body_lines
2159 .iter()
2160 .map(|l| {
2161 if l.len() >= min_indent && l.is_char_boundary(min_indent) {
2162 &l[min_indent..]
2163 } else {
2164 l.trim_start()
2165 }
2166 })
2167 .collect::<Vec<_>>()
2168 .join("\n")
2169 }
2170 HeredocType::Standard | HeredocType::HereString => body_lines.join("\n"),
2171 };
2172
2173 return Ok((content, terminator_end, body_start_abs, body_end_abs));
2174 }
2175
2176 total_bytes = total_bytes.saturating_add(part.len());
2178 if total_bytes > limits.max_body_bytes {
2179 return Err(SkipReason::ExceededSizeLimit {
2180 actual: total_bytes,
2181 limit: limits.max_body_bytes,
2182 });
2183 }
2184
2185 if body_lines.len() >= limits.max_body_lines {
2186 return Err(SkipReason::ExceededLineLimit {
2187 actual: body_lines.len() + 1,
2188 limit: limits.max_body_lines,
2189 });
2190 }
2191
2192 body_lines.push(line);
2193 cursor = cursor.saturating_add(part.len());
2194 }
2195
2196 Err(SkipReason::UnterminatedHeredoc {
2197 delimiter: delimiter.to_string(),
2198 })
2199}
2200
2201use ast_grep_core::AstGrep;
2206use ast_grep_language::SupportLang;
2207
/// One `command` node found by [`extract_shell_commands`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedShellCommand {
    /// Source text of the command node.
    pub text: String,
    /// Start of the node's range in the analysed content.
    pub start: usize,
    /// End of the node's range in the analysed content.
    pub end: usize,
    /// 1-based line on which the command starts (newlines before `start`, plus one).
    pub line_number: usize,
}
2223
2224#[must_use]
2268#[instrument(skip(content), fields(content_len = content.len()))]
2269pub fn extract_shell_commands(content: &str) -> Vec<ExtractedShellCommand> {
2270 if content.trim().is_empty() {
2271 trace!("extract_shell_commands: empty content");
2272 return Vec::new();
2273 }
2274
2275 let start = Instant::now();
2276 let ast = AstGrep::new(content, SupportLang::Bash);
2277 let root = ast.root();
2278
2279 let mut commands = Vec::new();
2280
2281 collect_commands_recursive(root, content, &mut commands);
2284
2285 debug!(
2286 elapsed_us = start.elapsed().as_micros(),
2287 count = commands.len(),
2288 "extract_shell_commands: AST analysis complete"
2289 );
2290 commands
2291}
2292
2293#[allow(clippy::needless_pass_by_value)]
2301fn collect_commands_recursive<D: ast_grep_core::Doc>(
2302 node: ast_grep_core::Node<'_, D>,
2303 content: &str,
2304 commands: &mut Vec<ExtractedShellCommand>,
2305) {
2306 let kind = node.kind();
2307
2308 if kind == "command" {
2310 let range = node.range();
2311 let text = node.text().to_string();
2312
2313 if !text.trim().is_empty() {
2315 let line_number = content[..range.start].matches('\n').count() + 1;
2316
2317 commands.push(ExtractedShellCommand {
2318 text,
2319 start: range.start,
2320 end: range.end,
2321 line_number,
2322 });
2323 }
2324 }
2325
2326 for child in node.children() {
2333 collect_commands_recursive(child, content, commands);
2334 }
2335}
2336
2337#[cfg(test)]
2342mod tests {
2343 use super::*;
2344 #[allow(unused_imports)]
2345 use proptest::prelude::*;
2346
2347 mod tier1_triggers {
2352 use super::*;
2353
2354 #[test]
2355 fn no_trigger_on_safe_commands() {
2356 let safe_commands = [
2358 "git status",
2359 "ls -la",
2360 "cargo build",
2361 "npm install",
2362 "docker ps",
2363 "kubectl get pods",
2364 "cat file.txt",
2365 "echo hello",
2366 "grep pattern file",
2367 "find . -name '*.rs'",
2368 ];
2369
2370 for cmd in safe_commands {
2371 assert_eq!(
2372 check_triggers(cmd),
2373 TriggerResult::NoTrigger,
2374 "should not trigger on: {cmd}"
2375 );
2376 }
2377 }
2378
2379 #[test]
2380 fn triggers_on_heredoc_basic() {
2381 let heredocs = [
2383 "cat << EOF",
2384 "cat <<EOF",
2385 "cat << 'EOF'",
2386 r#"cat << "EOF""#,
2387 "cat <<- EOF", "mysql <<< 'query'", ];
2390
2391 for cmd in heredocs {
2392 assert_eq!(
2393 check_triggers(cmd),
2394 TriggerResult::Triggered,
2395 "should trigger on heredoc: {cmd}"
2396 );
2397 }
2398 }
2399
2400 #[test]
2401 fn triggers_on_python_inline() {
2402 let python_commands = [
2403 "python -c 'import os'",
2404 "python3 -c 'import os'",
2405 "python -I -c 'import os'",
2406 "python3 -I -c 'import os'",
2407 "python -e 'print(1)'",
2408 "python3 -e 'print(1)'",
2409 ];
2410
2411 for cmd in python_commands {
2412 assert_eq!(
2413 check_triggers(cmd),
2414 TriggerResult::Triggered,
2415 "should trigger on python inline: {cmd}"
2416 );
2417 }
2418 }
2419
2420 #[test]
2421 fn triggers_on_versioned_interpreters() {
2422 let versioned_commands = [
2424 "python3.11 -c 'import os'",
2426 "python3.12.1 -c 'import os'",
2427 "python3.9 -e 'print(1)'",
2428 "ruby3.0 -e 'puts 1'",
2430 "ruby3.2.1 -e 'exit'",
2431 "perl5.36 -e 'print 1'",
2433 "perl5.38.2 -E 'say 1'",
2434 "node18 -e 'console.log(1)'",
2436 "node20.1 -e 'console.log(1)'",
2437 "nodejs18 -e 'console.log(1)'",
2438 "nodejs20.10.0 -e 'test'",
2439 ];
2440
2441 for cmd in versioned_commands {
2442 assert_eq!(
2443 check_triggers(cmd),
2444 TriggerResult::Triggered,
2445 "should trigger on versioned interpreter: {cmd}"
2446 );
2447 }
2448 }
2449
2450 #[test]
2451 fn triggers_on_ruby_inline() {
2452 let ruby_commands = ["ruby -e 'puts 1'", "ruby -w -e 'puts 1'", "irb -e 'exit'"];
2453
2454 for cmd in ruby_commands {
2455 assert_eq!(
2456 check_triggers(cmd),
2457 TriggerResult::Triggered,
2458 "should trigger on ruby inline: {cmd}"
2459 );
2460 }
2461 }
2462
2463 #[test]
2464 fn triggers_on_perl_inline() {
2465 let perl_commands = [
2466 "perl -e 'print 1'",
2467 "perl -E 'say 1'", "perl -pi -e 'print 1'",
2469 ];
2470
2471 for cmd in perl_commands {
2472 assert_eq!(
2473 check_triggers(cmd),
2474 TriggerResult::Triggered,
2475 "should trigger on perl inline: {cmd}"
2476 );
2477 }
2478 }
2479
2480 #[test]
2481 fn triggers_on_node_inline() {
2482 let node_commands = [
2483 "node -e 'console.log(1)'",
2484 "node -p 'process.version'",
2485 "node -pe 'process.version'",
2486 ];
2487
2488 for cmd in node_commands {
2489 assert_eq!(
2490 check_triggers(cmd),
2491 TriggerResult::Triggered,
2492 "should trigger on node inline: {cmd}"
2493 );
2494 }
2495 }
2496
2497 #[test]
2498 fn triggers_on_shell_inline() {
2499 let shell_commands = [
2500 "bash -c 'echo hello'",
2501 "bash -l -c 'echo hello'",
2502 "bash -lc 'echo hello'",
2503 "bash --noprofile --norc -c 'echo hello'",
2504 "sh -c 'ls'",
2505 "zsh -c 'pwd'",
2506 "fish -c 'echo hello'",
2507 ];
2508
2509 for cmd in shell_commands {
2510 assert_eq!(
2511 check_triggers(cmd),
2512 TriggerResult::Triggered,
2513 "should trigger on shell inline: {cmd}"
2514 );
2515 }
2516 }
2517
2518 #[test]
2519 fn triggers_on_xargs() {
2520 let xargs_commands = [
2521 "find . -name '*.bak' | xargs rm",
2522 "ls | xargs -I {} echo {}",
2523 "cat files.txt | xargs -n1 process",
2524 ];
2525
2526 for cmd in xargs_commands {
2527 assert_eq!(
2528 check_triggers(cmd),
2529 TriggerResult::Triggered,
2530 "should trigger on xargs: {cmd}"
2531 );
2532 }
2533 }
2534
2535 #[test]
2536 fn triggers_on_piped_execution() {
2537 let piped_commands = [
2538 "echo 'print(1)' | python",
2539 "cat script.py | python3",
2540 "echo 'puts 1' | ruby",
2541 "echo 'print 1' | perl",
2542 "echo 'console.log(1)' | node",
2543 "echo 'echo hello' | bash",
2544 "echo 'ls' | sh",
2545 ];
2546
2547 for cmd in piped_commands {
2548 assert_eq!(
2549 check_triggers(cmd),
2550 TriggerResult::Triggered,
2551 "should trigger on piped execution: {cmd}"
2552 );
2553 }
2554 }
2555
2556 #[test]
2557 fn triggers_on_eval_exec() {
2558 let eval_commands = [
2559 r#"eval "dangerous code""#,
2560 "eval 'dangerous code'",
2561 r#"exec "command""#,
2562 "exec 'command'",
2563 ];
2564
2565 for cmd in eval_commands {
2566 assert_eq!(
2567 check_triggers(cmd),
2568 TriggerResult::Triggered,
2569 "should trigger on eval/exec: {cmd}"
2570 );
2571 }
2572 }
2573
2574 #[test]
2575 fn matched_triggers_returns_indices() {
2576 let matches = matched_triggers("python -c 'test'");
2578 assert!(!matches.is_empty(), "should have matches for python -c");
2579
2580 let no_matches = matched_triggers("git status");
2581 assert!(
2582 no_matches.is_empty(),
2583 "should have no matches for git status"
2584 );
2585 }
2586
2587 #[test]
2588 fn heredoc_syntax_inside_quoted_literals_does_not_trigger() {
2589 let commands = [
2591 r#"git commit -m "docs: example heredoc: cat <<EOF rm -rf / EOF""#,
2592 r#"rg "<<EOF" README.md"#,
2593 "echo 'cat <<EOF (docs only)'",
2594 ];
2595
2596 for cmd in commands {
2597 assert_eq!(
2598 check_triggers(cmd),
2599 TriggerResult::NoTrigger,
2600 "should not trigger on quoted literal heredoc syntax: {cmd}"
2601 );
2602 }
2603 }
2604
2605 #[test]
2606 fn heredoc_inside_command_substitution_with_outer_quotes_still_triggers() {
2607 let cmd = "echo \"$(cat <<EOF\nrm -rf /\nEOF)\"";
2609 assert_eq!(check_triggers(cmd), TriggerResult::Triggered);
2610 }
2611
2612 }
2617
2618 mod tier2_extraction {
2623 use super::*;
2624
2625 #[test]
2626 fn extraction_limits_default() {
2627 let limits = ExtractionLimits::default();
2628 assert_eq!(limits.max_body_bytes, 1024 * 1024);
2629 assert_eq!(limits.max_body_lines, 10_000);
2630 assert_eq!(limits.max_heredocs, 10);
2631 assert_eq!(limits.timeout_ms, 50);
2632 }
2633
2634 #[test]
2635 fn extracts_inline_script_single_quotes() {
2636 let result = extract_content("python -c 'import os'", &ExtractionLimits::default());
2637 if let ExtractionResult::Extracted(contents) = result {
2638 assert_eq!(contents.len(), 1);
2639 assert_eq!(contents[0].content, "import os");
2640 assert_eq!(contents[0].language, ScriptLanguage::Python);
2641 assert!(contents[0].quoted);
2642 } else {
2643 panic!("Expected Extracted result");
2644 }
2645 }
2646
2647 #[test]
2648 fn extracts_inline_script_double_quotes() {
2649 let result = extract_content(r#"bash -c "echo hello""#, &ExtractionLimits::default());
2650 if let ExtractionResult::Extracted(contents) = result {
2651 assert_eq!(contents.len(), 1);
2652 assert_eq!(contents[0].content, "echo hello");
2653 assert_eq!(contents[0].language, ScriptLanguage::Bash);
2654 } else {
2655 panic!("Expected Extracted result");
2656 }
2657 }
2658
2659 #[test]
2660 fn extracts_inline_script_with_intervening_flags() {
2661 let result = extract_content("python -I -c 'import os'", &ExtractionLimits::default());
2662 if let ExtractionResult::Extracted(contents) = result {
2663 assert_eq!(contents.len(), 1);
2664 assert_eq!(contents[0].content, "import os");
2665 assert_eq!(contents[0].language, ScriptLanguage::Python);
2666 assert!(contents[0].quoted);
2667 } else {
2668 panic!("Expected Extracted result");
2669 }
2670 }
2671
2672 #[test]
2673 fn extracts_inline_script_with_combined_shell_flags() {
2674 let result = extract_content("bash -lc 'echo hello'", &ExtractionLimits::default());
2675 if let ExtractionResult::Extracted(contents) = result {
2676 assert_eq!(contents.len(), 1);
2677 assert_eq!(contents[0].content, "echo hello");
2678 assert_eq!(contents[0].language, ScriptLanguage::Bash);
2679 } else {
2680 panic!("Expected Extracted result");
2681 }
2682 }
2683
2684 #[test]
2685 fn extracts_inline_script_with_combined_node_flags() {
2686 let result =
2687 extract_content("node -pe 'process.version'", &ExtractionLimits::default());
2688 if let ExtractionResult::Extracted(contents) = result {
2689 assert_eq!(contents.len(), 1);
2690 assert_eq!(contents[0].content, "process.version");
2691 assert_eq!(contents[0].language, ScriptLanguage::JavaScript);
2692 } else {
2693 panic!("Expected Extracted result");
2694 }
2695 }
2696
2697 #[test]
2698 fn extracts_inline_script_with_interleaved_perl_flags() {
2699 let result = extract_content("perl -pi -e 'print 1'", &ExtractionLimits::default());
2700 if let ExtractionResult::Extracted(contents) = result {
2701 assert_eq!(contents.len(), 1);
2702 assert_eq!(contents[0].content, "print 1");
2703 assert_eq!(contents[0].language, ScriptLanguage::Perl);
2704 } else {
2705 panic!("Expected Extracted result");
2706 }
2707 }
2708
2709 #[test]
2714 fn extracts_powershell_command_body() {
2715 let result = extract_content(
2717 "powershell -Command 'echo hi'",
2718 &ExtractionLimits::default(),
2719 );
2720 if let ExtractionResult::Extracted(contents) = result {
2721 assert_eq!(contents.len(), 1);
2722 assert_eq!(contents[0].content, "echo hi");
2723 assert_eq!(contents[0].language, ScriptLanguage::Bash);
2724 } else {
2725 panic!("Expected Extracted result for `powershell -Command '...'`");
2726 }
2727 }
2728
2729 #[test]
2730 fn extracts_powershell_exe_command_body_double_quotes() {
2731 let result = extract_content(
2732 r#"powershell.exe -Command "echo hi""#,
2733 &ExtractionLimits::default(),
2734 );
2735 if let ExtractionResult::Extracted(contents) = result {
2736 assert_eq!(contents.len(), 1);
2737 assert_eq!(contents[0].content, "echo hi");
2738 assert_eq!(contents[0].language, ScriptLanguage::Bash);
2739 } else {
2740 panic!("Expected Extracted result for `powershell.exe -Command \"...\"`");
2741 }
2742 }
2743
2744 #[test]
2745 fn extracts_pwsh_short_flag_body() {
2746 let result = extract_content("pwsh -c 'echo hi'", &ExtractionLimits::default());
2748 if let ExtractionResult::Extracted(contents) = result {
2749 assert_eq!(contents.len(), 1);
2750 assert_eq!(contents[0].content, "echo hi");
2751 assert_eq!(contents[0].language, ScriptLanguage::Bash);
2752 } else {
2753 panic!("Expected Extracted result for `pwsh -c '...'`");
2754 }
2755 }
2756
2757 #[test]
2758 fn extracts_powershell_quoted_full_path_body() {
2759 let cmd = "\"C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\powershell.exe\" -Command 'echo hi'";
2762 let result = extract_content(cmd, &ExtractionLimits::default());
2763 if let ExtractionResult::Extracted(contents) = result {
2764 assert!(
2765 contents
2766 .iter()
2767 .any(|c| c.content == "echo hi" && c.language == ScriptLanguage::Bash),
2768 "expected to extract the -Command body from a quoted powershell.exe path; got {contents:?}"
2769 );
2770 } else {
2771 panic!("Expected Extracted result for quoted-full-path powershell.exe -Command");
2772 }
2773 }
2774
2775 #[test]
2776 fn extracts_here_string() {
2777 let result = extract_content("cat <<< 'hello world'", &ExtractionLimits::default());
2778 if let ExtractionResult::Extracted(contents) = result {
2779 assert_eq!(contents.len(), 1);
2780 assert_eq!(contents[0].content, "hello world");
2781 assert_eq!(contents[0].heredoc_type, Some(HeredocType::HereString));
2782 } else {
2783 panic!("Expected Extracted result");
2784 }
2785 }
2786
2787 #[test]
2788 fn extracts_heredoc_basic() {
2789 let cmd = "cat << EOF\nline1\nline2\nEOF";
2790 let result = extract_content(cmd, &ExtractionLimits::default());
2791 if let ExtractionResult::Extracted(contents) = result {
2792 assert_eq!(contents.len(), 1);
2793 assert_eq!(contents[0].content, "line1\nline2");
2794 assert_eq!(contents[0].delimiter, Some("EOF".to_string()));
2795 assert_eq!(contents[0].heredoc_type, Some(HeredocType::Standard));
2796 } else {
2797 panic!("Expected Extracted result, got {result:?}");
2798 }
2799 }
2800
2801 #[test]
2802 fn extracts_heredoc_ignores_trailing_tokens_on_delimiter_line() {
2803 let cmd = "python3 <<EOF | cat\nimport shutil\nshutil.rmtree('/tmp/test')\nEOF";
2804 let result = extract_content(cmd, &ExtractionLimits::default());
2805 if let ExtractionResult::Extracted(contents) = result {
2806 assert_eq!(contents.len(), 1);
2807 assert_eq!(contents[0].language, ScriptLanguage::Python);
2808 assert_eq!(
2809 contents[0].content,
2810 "import shutil\nshutil.rmtree('/tmp/test')"
2811 );
2812 } else {
2813 panic!("Expected Extracted result, got {result:?}");
2814 }
2815 }
2816
2817 #[test]
2818 fn extracts_heredoc_with_crlf_line_endings() {
2819 let cmd = "cat <<EOF\r\nline1\r\nEOF\r\n";
2820 let result = extract_content(cmd, &ExtractionLimits::default());
2821 if let ExtractionResult::Extracted(contents) = result {
2822 assert_eq!(contents.len(), 1);
2823 assert_eq!(contents[0].content, "line1");
2824 assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
2825 } else {
2826 panic!("Expected Extracted result, got {result:?}");
2827 }
2828 }
2829
2830 #[test]
2831 fn extracts_heredoc_tab_stripped() {
2832 let cmd = "cat <<- EOF\n\tline1\n\tline2\nEOF";
2833 let result = extract_content(cmd, &ExtractionLimits::default());
2834 if let ExtractionResult::Extracted(contents) = result {
2835 assert_eq!(contents.len(), 1);
2836 assert_eq!(contents[0].content, "line1\nline2");
2838 assert_eq!(contents[0].heredoc_type, Some(HeredocType::TabStripped));
2839 } else {
2840 panic!("Expected Extracted result");
2841 }
2842 }
2843
2844 #[test]
2845 fn extracts_heredoc_indent_stripped() {
2846 let cmd = "cat <<~ EOF\n line1\n line2\n EOF";
2850 let result = extract_content(cmd, &ExtractionLimits::default());
2851 if let ExtractionResult::Extracted(contents) = result {
2852 assert_eq!(contents.len(), 1);
2853 assert_eq!(contents[0].content, "line1\nline2");
2854 assert_eq!(contents[0].heredoc_type, Some(HeredocType::IndentStripped));
2855 } else {
2856 panic!("Expected Extracted result, got {result:?}");
2857 }
2858 }
2859
2860 #[test]
2861 fn indent_stripped_heredoc_does_not_panic_on_multibyte_whitespace() {
2862 let cases: &[&str] = &[
2875 "cat <<~ EOF\n line1\n\u{00A0}line2\n EOF",
2880 "cat <<~ EOF\n line1\n\u{3000}foo\n EOF",
2884 "cat <<~ EOF\n\u{00A0}line1\n\u{3000}line2\nEOF",
2888 ];
2889 for cmd in cases {
2890 let result = extract_content(cmd, &ExtractionLimits::default());
2891 let _ = format!("{result:?}");
2896 }
2897 }
2898
2899 #[test]
2900 fn extracts_heredoc_quoted_delimiter_sets_quoted_flag() {
2901 let cmd = "cat << 'EOF'\nline1\nEOF";
2903 let result = extract_content(cmd, &ExtractionLimits::default());
2904 if let ExtractionResult::Extracted(contents) = result {
2905 assert_eq!(contents.len(), 1);
2906 assert_eq!(contents[0].content, "line1");
2907 assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
2908 assert!(contents[0].quoted, "quoted delimiter must set quoted=true");
2909 } else {
2910 panic!("Expected Extracted result, got {result:?}");
2911 }
2912
2913 let cmd = "cat << EOF\nline1\nEOF";
2914 let result = extract_content(cmd, &ExtractionLimits::default());
2915 if let ExtractionResult::Extracted(contents) = result {
2916 assert_eq!(contents.len(), 1);
2917 assert!(
2918 !contents[0].quoted,
2919 "unquoted delimiter must set quoted=false"
2920 );
2921 } else {
2922 panic!("Expected Extracted result, got {result:?}");
2923 }
2924 }
2925
2926 #[test]
2934 fn extracts_heredoc_tab_stripped_quoted_with_space_after_dash() {
2935 for (form, cmd) in [
2936 ("<<-'EOF'", "cat <<-'EOF'\n\tgh repo delete\n\tEOF"),
2937 ("<<- 'EOF'", "cat <<- 'EOF'\n\tgh repo delete\n\tEOF"),
2938 ("<<-\"EOF\"", "cat <<-\"EOF\"\n\tgh repo delete\n\tEOF"),
2939 ("<<- \"EOF\"", "cat <<- \"EOF\"\n\tgh repo delete\n\tEOF"),
2940 ("<<~ 'EOF'", "cat <<~ 'EOF'\n\tgh repo delete\n\tEOF"),
2941 ] {
2942 let result = extract_content(cmd, &ExtractionLimits::default());
2943 let ExtractionResult::Extracted(contents) = result else {
2944 panic!("Expected extraction for {form}, got {result:?}");
2945 };
2946 assert_eq!(
2947 contents.len(),
2948 1,
2949 "{form}: expected single heredoc extraction"
2950 );
2951 assert_eq!(
2952 contents[0].delimiter.as_deref(),
2953 Some("EOF"),
2954 "{form}: delimiter must parse to EOF"
2955 );
2956 assert!(
2957 contents[0].quoted,
2958 "{form}: quoted delimiter must set quoted=true"
2959 );
2960 }
2961 }
2962
/// Checks that for `<< -EOF` (whitespace between the operator and the dash) the
/// extracted delimiter is `-EOF` and the heredoc is reported as unquoted.
#[test]
fn parses_dash_after_space_as_part_of_unquoted_delimiter() {
    let contents = match extract_content(
        "cat << -EOF\nbody line\n-EOF",
        &ExtractionLimits::default(),
    ) {
        ExtractionResult::Extracted(contents) => contents,
        other => panic!("Expected extraction, got {other:?}"),
    };

    assert_eq!(contents.len(), 1, "expected single heredoc extraction");

    let heredoc = &contents[0];
    assert_eq!(
        heredoc.delimiter.as_deref(),
        Some("-EOF"),
        "delimiter must include the leading dash when there is whitespace before it"
    );
    assert!(
        !heredoc.quoted,
        "unquoted delimiter must set quoted=false"
    );
}
2992
/// Checks that `mask_non_executing_heredocs` rewrites (returns `Cow::Owned`) a
/// `<<~` heredoc whose body and terminator are space-indented, for both the
/// unquoted and the quoted-with-space delimiter forms, and that the body text
/// no longer appears in the masked output.
#[test]
fn masks_indent_stripped_heredoc_body_with_space_indented_terminator() {
    let commands = [
        "cat <<~EOF\n rm -rf /\n EOF",
        "cat <<~ 'EOF'\n rm -rf /\n EOF",
    ];

    for cmd in commands {
        let masked = mask_non_executing_heredocs(cmd);
        assert!(
            matches!(masked, std::borrow::Cow::Owned(_)),
            "expected the body to be masked (Cow::Owned), got Borrowed: {masked:?}"
        );
        assert!(
            !masked.contains("rm -rf /"),
            "masked output still contains body: {masked:?}"
        );
    }
}
3026
/// Checks that a heredoc fed to a known interpreter command is extracted once
/// and tagged with that interpreter's `ScriptLanguage`.
#[test]
fn heredoc_language_detects_interpreter_prefixes() {
    let cases = [
        ("python3 <<EOF\nprint('hello')\nEOF", ScriptLanguage::Python),
        (
            "node <<EOF\nconsole.log('hello');\nEOF",
            ScriptLanguage::JavaScript,
        ),
        ("ruby <<EOF\nputs 'hello'\nEOF", ScriptLanguage::Ruby),
        ("perl <<EOF\nprint \"hello\";\nEOF", ScriptLanguage::Perl),
        ("bash <<EOF\necho hello\nEOF", ScriptLanguage::Bash),
    ];

    for (cmd, expected) in cases {
        let result = extract_content(cmd, &ExtractionLimits::default());
        let ExtractionResult::Extracted(contents) = result else {
            panic!("Expected Extracted result for heredoc: {cmd}, got {result:?}");
        };

        assert_eq!(
            contents.len(),
            1,
            "expected one heredoc extraction for: {cmd}"
        );
        assert_eq!(
            contents[0].language, expected,
            "expected language {expected:?} for heredoc: {cmd}"
        );
    }
}
3058
/// Checks that a heredoc passed to `cat` whose body starts with a
/// `#!/usr/bin/env python3` line is extracted with language `Python`.
#[test]
fn heredoc_language_detects_shebang_when_command_unknown() {
    let result = extract_content(
        "cat <<EOF\n#!/usr/bin/env python3\nimport os\nprint('hi')\nEOF",
        &ExtractionLimits::default(),
    );
    let ExtractionResult::Extracted(contents) = result else {
        panic!("Expected Extracted result, got {result:?}");
    };

    assert_eq!(contents.len(), 1);
    assert_eq!(contents[0].language, ScriptLanguage::Python);
}
3070
/// Checks that a heredoc with no body lines is still extracted, with empty
/// content and the delimiter `EOF`.
#[test]
fn extracts_empty_heredoc() {
    let result = extract_content("cat << EOF\nEOF", &ExtractionLimits::default());
    let ExtractionResult::Extracted(contents) = result else {
        panic!("Expected Extracted result for empty heredoc, got {result:?}");
    };

    assert_eq!(contents.len(), 1);
    let heredoc = &contents[0];
    assert_eq!(heredoc.content, "");
    assert_eq!(heredoc.delimiter, Some("EOF".to_string()));
}
3084
/// Checks that `byte_range` of an extracted heredoc, when used to slice the
/// original command, covers the text from the `<<` operator through the
/// terminating delimiter, for a Python heredoc, an empty heredoc and a
/// multi-line heredoc.
#[test]
fn heredoc_byte_range_is_correct() {
    let limits = ExtractionLimits::default();

    // Interpreter heredoc: also confirm the detected language.
    let python_cmd = "python << END\nprint(1)\nEND";
    let ExtractionResult::Extracted(contents) = extract_content(python_cmd, &limits) else {
        panic!("Expected Extracted result");
    };
    assert_eq!(contents.len(), 1);
    assert_eq!(contents[0].language, ScriptLanguage::Python);
    let span = &python_cmd[contents[0].byte_range.clone()];
    assert_eq!(span, "<< END\nprint(1)\nEND");

    // Empty body, then a multi-line body: only the span is checked.
    let cat_cases = [
        ("cat << EOF\nEOF", "<< EOF\nEOF"),
        ("cat << EOF\nline1\nline2\nEOF", "<< EOF\nline1\nline2\nEOF"),
    ];
    for (cmd, expected_span) in cat_cases {
        let ExtractionResult::Extracted(contents) = extract_content(cmd, &limits) else {
            panic!("Expected Extracted result");
        };
        assert_eq!(contents.len(), 1);
        let span = &cmd[contents[0].byte_range.clone()];
        assert_eq!(span, expected_span);
    }
}
3125
/// Checks that a quoted here-string containing the other quote character is
/// extracted with the outer quotes removed, the inner quotes preserved, and
/// `quoted == true`.
#[test]
fn extracts_here_string_with_nested_quotes() {
    let cases = [
        (r#"cat <<< 'hello "world" test'"#, r#"hello "world" test"#),
        (r#"cat <<< "hello 'world' test""#, "hello 'world' test"),
    ];

    for (cmd, expected_content) in cases {
        let ExtractionResult::Extracted(contents) =
            extract_content(cmd, &ExtractionLimits::default())
        else {
            panic!("Expected Extracted result");
        };

        assert_eq!(contents.len(), 1);
        assert_eq!(contents[0].content, expected_content);
        assert!(contents[0].quoted);
    }
}
3154
/// Checks that command names which merely start with an interpreter name
/// map to `ScriptLanguage::Unknown`.
#[test]
fn from_command_does_not_false_positive() {
    let lookalikes = [
        "shebang", "shell", "pythonic", "nodemon", "perldoc", "bashful",
    ];

    for name in lookalikes {
        assert_eq!(
            ScriptLanguage::from_command(name),
            ScriptLanguage::Unknown,
            "{name} must not be treated as an interpreter"
        );
    }
}
3183
/// Checks that interpreter names carrying a version suffix are still mapped
/// to their language by `ScriptLanguage::from_command`.
#[test]
fn from_command_matches_versioned_interpreters() {
    let cases = [
        ("python3", ScriptLanguage::Python),
        ("python3.11", ScriptLanguage::Python),
        ("python3.11.4", ScriptLanguage::Python),
        ("node18", ScriptLanguage::JavaScript),
        ("perl5", ScriptLanguage::Perl),
    ];

    for (name, expected) in cases {
        assert_eq!(
            ScriptLanguage::from_command(name),
            expected,
            "unexpected language for versioned interpreter {name}"
        );
    }
}
3205
/// Checks that `git status` produces `ExtractionResult::NoContent`.
#[test]
fn no_content_on_safe_command() {
    let limits = ExtractionLimits::default();
    assert!(matches!(
        extract_content("git status", &limits),
        ExtractionResult::NoContent
    ));
}
3211
/// Checks the basic interpreter-name to language mapping of
/// `ScriptLanguage::from_command`, including the `Unknown` fallback.
#[test]
fn script_language_from_command() {
    let cases = [
        ("python3", ScriptLanguage::Python),
        ("ruby", ScriptLanguage::Ruby),
        ("perl", ScriptLanguage::Perl),
        ("node", ScriptLanguage::JavaScript),
        ("bash", ScriptLanguage::Bash),
        ("unknown", ScriptLanguage::Unknown),
    ];

    for (name, expected) in cases {
        assert_eq!(
            ScriptLanguage::from_command(name),
            expected,
            "unexpected language for command {name}"
        );
    }
}
3230
/// Checks that shebang lines naming the interpreter by absolute path are
/// recognised by `ScriptLanguage::from_shebang`.
#[test]
fn from_shebang_detects_direct_path() {
    let cases = [
        ("#!/bin/bash\necho hello", ScriptLanguage::Bash),
        ("#!/usr/bin/python\nimport os", ScriptLanguage::Python),
        ("#!/usr/bin/ruby\nputs 'hi'", ScriptLanguage::Ruby),
    ];

    for (script, expected) in cases {
        assert_eq!(
            ScriptLanguage::from_shebang(script),
            Some(expected),
            "unexpected shebang detection for {script:?}"
        );
    }
}
3250
/// Checks that `#!/usr/bin/env <interpreter>` shebang lines are recognised by
/// `ScriptLanguage::from_shebang`.
#[test]
fn from_shebang_detects_env_path() {
    let cases = [
        ("#!/usr/bin/env python3\nimport sys", ScriptLanguage::Python),
        (
            "#!/usr/bin/env node\nconsole.log('hi')",
            ScriptLanguage::JavaScript,
        ),
        ("#!/usr/bin/env perl\nprint 'hello'", ScriptLanguage::Perl),
    ];

    for (script, expected) in cases {
        assert_eq!(
            ScriptLanguage::from_shebang(script),
            Some(expected),
            "unexpected shebang detection for {script:?}"
        );
    }
}
3266
/// Checks that `ScriptLanguage::from_shebang` returns `None` for content with
/// no shebang, a bare `#!` line, and a shebang naming an unrecognised program.
#[test]
fn from_shebang_returns_none_for_invalid() {
    let inputs = ["import os", "#!\ncode", "#!/usr/bin/unknown\ncode"];

    for script in inputs {
        assert_eq!(
            ScriptLanguage::from_shebang(script),
            None,
            "expected no shebang language for {script:?}"
        );
    }
}
3279
/// Checks that flags following the interpreter on a shebang line do not
/// prevent detection, both for direct paths and for `/usr/bin/env` forms.
#[test]
fn from_shebang_ignores_interpreter_flags() {
    let direct_path_cases = [
        ("#!/bin/bash -e\nset -x", ScriptLanguage::Bash),
        ("#!/bin/bash -ex\necho hello", ScriptLanguage::Bash),
        ("#!/usr/bin/python3 -u\nimport sys", ScriptLanguage::Python),
    ];
    let env_cases = [
        (
            "#!/usr/bin/env python3 -u\nimport sys",
            ScriptLanguage::Python,
        ),
        ("#!/usr/bin/env bash -e\necho hi", ScriptLanguage::Bash),
        ("#!/usr/bin/env ruby -w\nputs 'hi'", ScriptLanguage::Ruby),
    ];

    for (script, expected) in direct_path_cases.into_iter().chain(env_cases) {
        assert_eq!(
            ScriptLanguage::from_shebang(script),
            Some(expected),
            "unexpected shebang detection for {script:?}"
        );
    }
}
3310
/// Checks that flags passed to `env` itself (`-S`, `-i`, and both together)
/// are skipped when `ScriptLanguage::from_shebang` looks for the interpreter.
#[test]
fn from_shebang_handles_env_flags() {
    let cases = [
        (
            "#!/usr/bin/env -S python3 -u\nimport sys",
            ScriptLanguage::Python,
        ),
        ("#!/usr/bin/env -S bash -e\necho hi", ScriptLanguage::Bash),
        (
            "#!/usr/bin/env -i python3\nimport os",
            ScriptLanguage::Python,
        ),
        (
            "#!/usr/bin/env -i -S perl -w\nuse strict;",
            ScriptLanguage::Perl,
        ),
    ];

    for (script, expected) in cases {
        assert_eq!(
            ScriptLanguage::from_shebang(script),
            Some(expected),
            "unexpected shebang detection for {script:?}"
        );
    }
}
3335
/// Checks that `ScriptLanguage::from_content` classifies two Python snippets
/// (`import ...` and `from ... import ...`) as Python.
#[test]
fn from_content_detects_python() {
    let snippets = [
        "import os\nos.remove('file')",
        "from pathlib import Path\nPath('x').unlink()",
    ];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::Python),
            "snippet not detected as Python: {snippet:?}"
        );
    }
}
3347
/// Checks that `ScriptLanguage::from_content` classifies two JavaScript
/// snippets as JavaScript.
#[test]
fn from_content_detects_javascript() {
    let snippets = [
        "const fs = require('fs');\nfs.rm('x');",
        "let x = 5;\nconsole.log(x);",
    ];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::JavaScript),
            "snippet not detected as JavaScript: {snippet:?}"
        );
    }
}
3359
/// Checks that `ScriptLanguage::from_content` classifies a type-annotated
/// declaration and an `interface` declaration as TypeScript.
#[test]
fn from_content_detects_typescript() {
    let snippets = [
        "const x: string = 'hello';",
        "interface User { name: string }",
    ];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::TypeScript),
            "snippet not detected as TypeScript: {snippet:?}"
        );
    }
}
3371
/// Checks that `ScriptLanguage::from_content` classifies two Ruby snippets as Ruby.
#[test]
fn from_content_detects_ruby() {
    let snippets = [
        "def hello\n puts 'hi'\nend",
        "require 'fileutils'\nFileUtils.rm_rf('x')\nend",
    ];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::Ruby),
            "snippet not detected as Ruby: {snippet:?}"
        );
    }
}
3384
/// Checks that `ScriptLanguage::from_content` classifies two Perl snippets as Perl.
#[test]
fn from_content_detects_perl() {
    let snippets = ["use strict;\nmy $x = 5;", "my @arr = (1,2,3);"];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::Perl),
            "snippet not detected as Perl: {snippet:?}"
        );
    }
}
3396
/// Checks that `ScriptLanguage::from_content` classifies two shell snippets as Bash.
#[test]
fn from_content_detects_bash() {
    let snippets = [
        "if [ -f file ]; then\n echo 'exists'\nfi",
        "x=$((1+2))\necho ${x}",
    ];

    for snippet in snippets {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            Some(ScriptLanguage::Bash),
            "snippet not detected as Bash: {snippet:?}"
        );
    }
}
3408
/// Checks that plain text and the empty string yield no language from
/// `ScriptLanguage::from_content`.
#[test]
fn from_content_returns_none_for_unknown() {
    for snippet in ["hello world", ""] {
        assert_eq!(
            ScriptLanguage::from_content(snippet),
            None,
            "expected no language for {snippet:?}"
        );
    }
}
3414
/// Checks that when the command names Ruby but the content carries a Python
/// shebang, `ScriptLanguage::detect` reports Ruby with `CommandPrefix` confidence.
#[test]
fn detect_uses_command_prefix_first() {
    let command = "ruby -e 'code'";
    let content = "#!/usr/bin/python\nimport os";

    let (detected, how) = ScriptLanguage::detect(command, content);

    assert_eq!(detected, ScriptLanguage::Ruby);
    assert_eq!(how, DetectionConfidence::CommandPrefix);
}
3423
/// Checks that with a non-interpreter command (`cat`) and a bash shebang in
/// the content, `ScriptLanguage::detect` reports Bash with `Shebang` confidence.
#[test]
fn detect_uses_shebang_second() {
    let command = "cat script.sh";
    let content = "#!/bin/bash\necho hello";

    let (detected, how) = ScriptLanguage::detect(command, content);

    assert_eq!(detected, ScriptLanguage::Bash);
    assert_eq!(how, DetectionConfidence::Shebang);
}
3432
/// Checks that with a non-interpreter command and no shebang, Python-looking
/// content is reported as Python with `ContentHeuristics` confidence.
#[test]
fn detect_uses_content_heuristics_third() {
    let command = "cat script";
    let content = "import os\nos.remove('x')";

    let (detected, how) = ScriptLanguage::detect(command, content);

    assert_eq!(detected, ScriptLanguage::Python);
    assert_eq!(how, DetectionConfidence::ContentHeuristics);
}
3441
/// Checks that unrecognisable command and content yield `Unknown` for both
/// the language and the confidence.
#[test]
fn detect_returns_unknown_for_unrecognized() {
    let outcome = ScriptLanguage::detect("cat file.txt", "hello world");

    assert_eq!(outcome.0, ScriptLanguage::Unknown);
    assert_eq!(outcome.1, DetectionConfidence::Unknown);
}
3448
/// Checks that an `env` wrapper in front of `python3` still yields Python
/// with `CommandPrefix` confidence.
#[test]
fn detect_handles_env_prefix() {
    let outcome = ScriptLanguage::detect("env python3 -c 'code'", "");

    assert_eq!(outcome.0, ScriptLanguage::Python);
    assert_eq!(outcome.1, DetectionConfidence::CommandPrefix);
}
3455
/// Checks that an interpreter given by absolute path (`/usr/bin/python3`)
/// yields Python with `CommandPrefix` confidence.
#[test]
fn detect_handles_absolute_path() {
    let outcome = ScriptLanguage::detect("/usr/bin/python3 -c 'code'", "");

    assert_eq!(outcome.0, ScriptLanguage::Python);
    assert_eq!(outcome.1, DetectionConfidence::CommandPrefix);
}
3462
/// Pins the exact label string returned for every `DetectionConfidence` variant.
#[test]
fn detection_confidence_labels() {
    let expected_labels = [
        (DetectionConfidence::CommandPrefix, "command-prefix"),
        (DetectionConfidence::Shebang, "shebang"),
        (DetectionConfidence::ContentHeuristics, "content-heuristics"),
        (DetectionConfidence::Unknown, "unknown"),
    ];

    for (confidence, label) in expected_labels {
        assert_eq!(confidence.label(), label);
    }
}
3473
/// Checks that each `DetectionConfidence` reason text contains a keyword
/// characteristic of that confidence level.
#[test]
fn detection_confidence_reasons() {
    let expected_keywords = [
        (DetectionConfidence::CommandPrefix, "highest"),
        (DetectionConfidence::Shebang, "high"),
        (DetectionConfidence::ContentHeuristics, "lower"),
        (DetectionConfidence::Unknown, "could not"),
    ];

    for (confidence, keyword) in expected_keywords {
        assert!(
            confidence.reason().contains(keyword),
            "reason text should contain {keyword:?}"
        );
    }
}
3489
/// Feeds a 2,000,000-byte inline script to `extract_content` with a
/// 1,000,000-byte body limit. If the result is `Skipped`, it must carry an
/// `ExceededSizeLimit` reason; if it is `Extracted`, every item must respect
/// the limit. `NoContent`, `Failed` and `Partial` are accepted without checks.
#[test]
fn enforces_max_body_bytes() {
    let limits = ExtractionLimits {
        max_body_bytes: 1_000_000,
        ..Default::default()
    };
    let oversized_body = "x".repeat(2_000_000);
    let cmd = format!("python -c '{oversized_body}'");

    match extract_content(&cmd, &limits) {
        ExtractionResult::Extracted(contents) => {
            for item in contents {
                assert!(item.content.len() <= limits.max_body_bytes);
            }
        }
        ExtractionResult::Skipped(reasons) => {
            let hit_size_limit = reasons
                .iter()
                .any(|r| matches!(r, SkipReason::ExceededSizeLimit { .. }));
            assert!(hit_size_limit);
        }
        ExtractionResult::NoContent
        | ExtractionResult::Failed(_)
        | ExtractionResult::Partial { .. } => {}
    }
}
3519
/// Checks that two inline scripts joined with `&&` are both extracted, in
/// command order.
#[test]
fn extracts_multiple_inline_scripts() {
    let ExtractionResult::Extracted(contents) = extract_content(
        "python -c 'code1' && ruby -e 'code2'",
        &ExtractionLimits::default(),
    ) else {
        panic!("Expected Extracted result");
    };

    assert_eq!(contents.len(), 2);
    assert_eq!(contents[0].content, "code1");
    assert_eq!(contents[1].content, "code2");
}
3532
/// Checks that inline scripts run through version-suffixed interpreters
/// (`python3.11`, `nodejs18`) are extracted with the right content and language.
#[test]
fn extracts_versioned_interpreter_scripts() {
    let cmd = "python3.11 -c 'import os' && nodejs18 -e 'console.log(1)'";
    let result = extract_content(cmd, &ExtractionLimits::default());
    let ExtractionResult::Extracted(contents) = result else {
        panic!("Expected Extracted result for versioned interpreters, got {result:?}");
    };

    assert_eq!(contents.len(), 2, "should extract both scripts");

    let (python, node) = (&contents[0], &contents[1]);
    assert_eq!(python.content, "import os");
    assert_eq!(python.language, ScriptLanguage::Python);
    assert_eq!(node.content, "console.log(1)");
    assert_eq!(node.language, ScriptLanguage::JavaScript);
}
3548
/// Checks that `check_binary_content` flags text containing NUL bytes as
/// `BinaryContent` with a non-zero `null_bytes` count.
#[test]
fn skips_binary_content_with_null_bytes() {
    let Some(reason) = check_binary_content("python -c '\x00binary\x00content'") else {
        panic!("Expected binary content detection");
    };

    assert!(
        matches!(reason, SkipReason::BinaryContent { null_bytes, .. } if null_bytes > 0)
    );
}
3565
/// Builds a string (lossily decoded) from bytes 0..50 and 200..255 and checks
/// that `check_binary_content` reports it as `BinaryContent`.
#[test]
fn skips_binary_content_high_non_printable() {
    let raw: Vec<u8> = (0u8..50).chain(200u8..255).collect();
    let text = String::from_utf8_lossy(&raw);

    let Some(reason) = check_binary_content(&text) else {
        panic!("Expected binary content detection for high non-printable ratio");
    };
    assert!(matches!(reason, SkipReason::BinaryContent { .. }));
}
3577
/// Checks that ordinary source text is not flagged by `check_binary_content`.
#[test]
fn allows_normal_text_content() {
    let verdict =
        check_binary_content("import os\nprint('hello world')\nfor i in range(10): pass");
    assert!(verdict.is_none());
}
3583
/// Checks that a heredoc with no closing delimiter produces a `Skipped`
/// result carrying an `UnterminatedHeredoc` reason.
///
/// On an unexpected result variant the panic message includes the actual
/// result, matching the neighbouring limit and timeout tests.
#[test]
fn tracks_unterminated_heredoc() {
    let cmd = "cat << EOF\nunterminated content without closing delimiter";
    let result = extract_content(cmd, &ExtractionLimits::default());
    let ExtractionResult::Skipped(reasons) = result else {
        panic!("Expected Skipped result for unterminated heredoc, got {result:?}");
    };

    assert!(
        reasons
            .iter()
            .any(|r| matches!(r, SkipReason::UnterminatedHeredoc { .. })),
        "should report UnterminatedHeredoc, not ExceededSizeLimit"
    );
}
3600
/// Checks that a three-line heredoc body with `max_body_lines` set to 2 is
/// `Skipped` with an `ExceededLineLimit` reason.
#[test]
fn heredoc_body_line_limit_reports_exceeded_line_limit() {
    let limits = ExtractionLimits {
        max_body_lines: 2,
        ..Default::default()
    };

    let result = extract_content("cat << EOF\nline1\nline2\nline3\nEOF", &limits);
    let ExtractionResult::Skipped(reasons) = result else {
        panic!("Expected Skipped result for line-limited heredoc, got {result:?}");
    };

    assert!(
        reasons
            .iter()
            .any(|r| matches!(r, SkipReason::ExceededLineLimit { .. })),
        "should report ExceededLineLimit, not UnterminatedHeredoc"
    );
}
3622
/// Checks that a zero-millisecond budget (`timeout_ms: 0`) makes extraction
/// return `Skipped` with a `Timeout` reason.
#[test]
fn extraction_timeout_is_enforced() {
    let limits = ExtractionLimits {
        timeout_ms: 0,
        ..Default::default()
    };

    let result = extract_content("cat << EOF\nline1\nEOF", &limits);
    let ExtractionResult::Skipped(reasons) = result else {
        panic!("Expected Skipped(timeout) result, got {result:?}");
    };

    assert!(
        reasons
            .iter()
            .any(|r| matches!(r, SkipReason::Timeout { .. })),
        "should include a Timeout skip reason"
    );
}
3644
/// Runs a command containing three heredocs with `max_heredocs` set to 2.
/// When the result is `Extracted`, the number of items must not exceed the
/// limit; other result variants are accepted without further checks.
#[test]
fn enforces_heredoc_limit() {
    let limits = ExtractionLimits {
        max_heredocs: 2,
        ..Default::default()
    };
    let three_heredocs = "cmd1 << A\na\nA && cmd2 << B\nb\nB && cmd3 << C\nc\nC";

    if let ExtractionResult::Extracted(contents) = extract_content(three_heredocs, &limits) {
        assert!(contents.len() <= limits.max_heredocs);
    }
}
3659
/// Formats one instance of each listed `SkipReason` variant via `Display` and
/// checks that the rendered text is non-empty.
#[test]
fn skip_reason_display() {
    let samples = [
        SkipReason::ExceededSizeLimit {
            actual: 2000,
            limit: 1000,
        },
        SkipReason::ExceededLineLimit {
            actual: 200,
            limit: 100,
        },
        SkipReason::ExceededHeredocLimit { limit: 10 },
        SkipReason::BinaryContent {
            null_bytes: 5,
            non_printable_ratio: 0.5,
        },
        SkipReason::Timeout {
            elapsed_ms: 60,
            budget_ms: 50,
        },
        SkipReason::UnterminatedHeredoc {
            delimiter: "EOF".to_string(),
        },
        SkipReason::MalformedInput {
            reason: "test".to_string(),
        },
    ];

    for sample in samples {
        let rendered = sample.to_string();
        assert!(!rendered.is_empty(), "Display should produce output");
    }
}
3694
/// Checks that an empty command string produces `ExtractionResult::NoContent`.
#[test]
fn empty_command_returns_no_content() {
    let limits = ExtractionLimits::default();
    assert!(matches!(
        extract_content("", &limits),
        ExtractionResult::NoContent
    ));
}
3700
/// Checks that a command made only of whitespace produces
/// `ExtractionResult::NoContent`.
#[test]
fn whitespace_only_returns_no_content() {
    let limits = ExtractionLimits::default();
    assert!(matches!(
        extract_content(" \t\n ", &limits),
        ExtractionResult::NoContent
    ));
}
3706 }
3707
/// Tests for `extract_shell_commands`: which commands are pulled out of shell
/// text, and the `text`, `line_number`, `start` and `end` recorded for each.
mod shell_extraction {
    use super::*;

    /// A single simple command is returned verbatim and reported on line 1.
    #[test]
    fn extracts_simple_command() {
        let found = extract_shell_commands("ls -la");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "ls -la");
        assert_eq!(found[0].line_number, 1);
    }

    /// `rm -rf <path>` is returned as one command with its full text.
    #[test]
    fn extracts_rm_rf() {
        let found = extract_shell_commands("rm -rf /tmp/test");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "rm -rf /tmp/test");
    }

    /// `git reset --hard` is returned as one command with its full text.
    #[test]
    fn extracts_git_reset_hard() {
        let found = extract_shell_commands("git reset --hard");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "git reset --hard");
    }

    /// `git clean -fd` is returned as one command with its full text.
    #[test]
    fn extracts_git_clean_fd() {
        let found = extract_shell_commands("git clean -fd");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "git clean -fd");
    }

    /// Both sides of a two-stage pipeline are extracted, in order.
    #[test]
    fn extracts_pipeline_both_sides() {
        let found = extract_shell_commands("find . -name '*.bak' | xargs rm");
        assert_eq!(found.len(), 2, "pipeline should extract both commands");
        assert!(found[0].text.starts_with("find"));
        assert!(found[1].text.contains("xargs"));
    }

    /// Both commands of an `&&` list are extracted.
    #[test]
    fn extracts_command_list() {
        let found = extract_shell_commands("cd /tmp && rm -rf test");
        assert_eq!(found.len(), 2, "command list should extract both");
    }

    /// A command inside `$( ... )` is extracted in addition to the outer command.
    #[test]
    fn extracts_command_substitution() {
        let found = extract_shell_commands("echo $(rm -rf /tmp/test)");
        assert!(
            found.len() >= 2,
            "should extract command inside substitution"
        );
        assert!(
            found.iter().any(|c| c.text.contains("rm")),
            "should extract rm from command substitution"
        );
    }

    /// Commands inside a parenthesised subshell are extracted.
    #[test]
    fn extracts_subshell_commands() {
        let found = extract_shell_commands("(cd /tmp && rm -rf test)");
        assert!(found.len() >= 2, "should extract commands from subshell");
    }

    /// Every command line of a multi-line script is extracted, including `rm`.
    #[test]
    fn extracts_multiline_script() {
        let script = r#"#!/bin/bash
set -e
cd /tmp
rm -rf test
echo "done""#;
        let found = extract_shell_commands(script);
        assert!(
            found.len() >= 4,
            "should extract all commands from multiline script"
        );
        assert!(
            found.iter().any(|c| c.text.contains("rm")),
            "should extract rm"
        );
    }

    /// `docker system prune -af` is returned as one command with its full text.
    #[test]
    fn extracts_docker_system_prune() {
        let found = extract_shell_commands("docker system prune -af");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "docker system prune -af");
    }

    /// The `rm` command on the second line of a three-line script reports
    /// `line_number == 2`.
    #[test]
    fn line_numbers_are_correct() {
        let found = extract_shell_commands("echo first\nrm -rf /tmp\necho last");
        assert!(found.len() >= 3);

        let rm_cmd = found
            .iter()
            .find(|c| c.text.contains("rm"))
            .expect("rm command should be among the extracted commands");
        assert_eq!(rm_cmd.line_number, 2, "rm should be on line 2");
    }

    /// A line that is only a comment yields no commands, even when the
    /// comment text looks like a command.
    #[test]
    fn skips_comments() {
        let commands = extract_shell_commands("# rm -rf / would be bad");
        assert!(
            commands.is_empty(),
            "comment-only content should produce zero commands, got: {commands:?}"
        );
    }

    /// The quoted argument of `echo` is data: only the `echo` command itself
    /// is extracted.
    #[test]
    fn echo_string_is_data_not_execution() {
        let found = extract_shell_commands("echo 'rm -rf /'");
        // assert_eq! (rather than assert!(len == 1)) prints the actual count on failure.
        assert_eq!(
            found.len(),
            1,
            "should only extract echo, not the string content"
        );
        assert!(
            found[0].text.starts_with("echo"),
            "extracted command should be echo"
        );
    }

    /// The format string of `printf` is data: only the `printf` command itself
    /// is extracted.
    #[test]
    fn printf_string_is_data_not_execution() {
        let found = extract_shell_commands(r#"printf "rm -rf %s" /tmp"#);
        // assert_eq! (rather than assert!(len == 1)) prints the actual count on failure.
        assert_eq!(
            found.len(),
            1,
            "should only extract printf, not the format string content"
        );
        assert!(found[0].text.starts_with("printf"));
    }

    /// Empty input yields no commands.
    #[test]
    fn empty_content_returns_no_commands() {
        assert!(extract_shell_commands("").is_empty());
    }

    /// Whitespace-only input yields no commands.
    #[test]
    fn whitespace_only_returns_no_commands() {
        assert!(extract_shell_commands(" \n\t ").is_empty());
    }

    /// Input consisting of a single comment yields no commands.
    #[test]
    fn comment_only_returns_no_commands() {
        let commands = extract_shell_commands("# This is just a comment");
        assert!(
            commands.is_empty(),
            "comment-only content should produce zero commands, got: {commands:?}"
        );
    }

    /// The `cat` that consumes a heredoc is extracted, but text inside the
    /// heredoc body is not turned into commands.
    #[test]
    fn heredoc_delimiter_is_not_command() {
        let script = r"cat << EOF
some content
rm -rf / mentioned in text
EOF";
        let found = extract_shell_commands(script);

        assert!(
            found.iter().any(|c| c.text.starts_with("cat")),
            "should extract cat command"
        );

        // Anything mentioning `rm` that is not the `cat` command came from the body.
        let rm_commands: Vec<_> = found
            .iter()
            .filter(|c| c.text.contains("rm") && !c.text.contains("cat"))
            .collect();
        assert!(
            rm_commands.is_empty(),
            "heredoc body content must NOT be extracted as commands, but found: {rm_commands:?}"
        );
    }

    /// An `rm -rf` under `/tmp` is still extracted as a command.
    #[test]
    fn safe_tmp_cleanup_is_extracted() {
        let found = extract_shell_commands("rm -rf /tmp/build_cache");
        assert_eq!(found.len(), 1);
    }

    /// Every stage of a three-stage pipeline is extracted.
    #[test]
    fn handles_complex_pipeline() {
        let found = extract_shell_commands("cat file | grep pattern | wc -l");
        assert_eq!(found.len(), 3, "should extract all pipeline stages");
    }

    /// A backgrounded command is extracted without the trailing `&`.
    #[test]
    fn handles_background_command() {
        let found = extract_shell_commands("long_process &");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "long_process");
    }

    /// Redirections do not split a command or hide it.
    #[test]
    fn handles_redirections() {
        let found = extract_shell_commands("rm -rf /tmp/test > /dev/null 2>&1");
        assert_eq!(found.len(), 1);
        assert!(found[0].text.contains("rm"));
    }

    /// A command with a variable expansion as its argument is extracted once.
    #[test]
    fn handles_variable_expansion_in_command() {
        let found = extract_shell_commands("rm -rf $DIR");
        assert_eq!(found.len(), 1);
        assert!(found[0].text.contains("rm"));
    }

    /// Commands in both the `then` and the `else` branch are extracted.
    #[test]
    fn handles_if_then_else() {
        let script = r#"if [ -f /tmp/test ]; then
 rm -rf /tmp/test
else
 echo "not found"
fi"#;
        let found = extract_shell_commands(script);
        assert!(
            found.iter().any(|c| c.text.contains("rm")),
            "should extract rm from if body"
        );
        assert!(
            found.iter().any(|c| c.text.contains("echo")),
            "should extract echo from else body"
        );
    }

    /// A command in a `for` loop body is extracted.
    #[test]
    fn handles_for_loop() {
        let found = extract_shell_commands("for f in *.txt; do rm -f \"$f\"; done");
        assert!(
            found.iter().any(|c| c.text.contains("rm")),
            "should extract rm from for loop body"
        );
    }

    /// `start`/`end` of a lone command span the whole input, and slicing the
    /// input with them gives back the command text.
    #[test]
    fn byte_ranges_are_correct() {
        let script = "echo hello";
        let found = extract_shell_commands(script);
        assert_eq!(found.len(), 1);

        let only = &found[0];
        assert_eq!(only.start, 0);
        assert_eq!(only.end, script.len());
        assert_eq!(&script[only.start..only.end], "echo hello");
    }
}
3998
3999 proptest! {
4000 #[test]
4003 fn tier1_is_superset_of_tier2_extraction(cmd in prop_oneof![
4004 "\\PC{0,2000}",
4006 "\\PC{0,400}".prop_map(|body| format!("cat <<EOF\n{body}\nEOF")),
4008 "\\PC{0,400}".prop_map(|body| format!("cat <<'EOF'\n{body}\nEOF")),
4009 "\\PC{0,400}".prop_map(|body| format!("python -c \"{}\"", body.replace('\"', ""))),
4011 "\\PC{0,400}".prop_map(|body| format!("bash -c \"{}\"", body.replace('\"', ""))),
4012 "\\PC{0,400}".prop_map(|body| format!("node -e \"{}\"", body.replace('\"', ""))),
4013 ]) {
4014 let limits = ExtractionLimits {
4015 max_body_bytes: 10_000,
4016 max_body_lines: 1_000,
4017 max_heredocs: 5,
4018 timeout_ms: 50,
4019 };
4020
4021 let extracted = extract_content(&cmd, &limits);
4022 if let ExtractionResult::Extracted(contents) = extracted {
4023 if !contents.is_empty() {
4024 prop_assert_eq!(
4025 check_triggers(&cmd),
4026 TriggerResult::Triggered,
4027 "Tier 2 extracted but Tier 1 did not trigger for: {:?}",
4028 cmd
4029 );
4030 }
4031 }
4032 }
4033 }
4034
/// Checks that `ScriptLanguage::detect` reports Python when a heredoc is
/// piped into `python` (`cat <<EOF | python`).
#[test]
fn detects_language_in_pipeline() {
    let (detected, _confidence) =
        ScriptLanguage::detect("cat <<EOF | python", "print('hello')");
    assert_eq!(detected, ScriptLanguage::Python);
}
4043
/// Checks that `extract_heredoc_target_command` returns the command word
/// (`cat`, `grep`) rather than one of its arguments, even when an argument is
/// itself an interpreter name such as `bash`.
#[test]
fn extract_heredoc_target_command_prefers_command_over_arguments() {
    // (command line, message if `<<` is missing, expected target command)
    let cases = [
        ("cat bash <<EOF\nrm -rf /\nEOF", "cat heredoc", "cat"),
        ("grep pattern . <<EOF\nrm -rf /\nEOF", "grep heredoc", "grep"),
    ];

    for (cmd, missing_operator_msg, expected) in cases {
        let heredoc_start = cmd.find("<<").expect(missing_operator_msg);
        let target = extract_heredoc_target_command(cmd, heredoc_start);
        assert_eq!(target.as_deref(), Some(expected));
    }
}
4060
/// Checks that `extract_heredoc_target_command` looks past a leading variable
/// assignment, `env -i` and `sudo`, and returns the wrapped command's base
/// name (`/bin/cat` is reported as `cat`).
#[test]
fn extract_heredoc_target_command_skips_assignments_and_wrappers() {
    // (command line, message if `<<` is missing, expected target command)
    let cases = [
        (
            "FOO=1 env -i /bin/cat <<EOF\npayload\nEOF",
            "env heredoc",
            "cat",
        ),
        ("sudo bash <<EOF\necho hi\nEOF", "sudo heredoc", "bash"),
    ];

    for (cmd, missing_operator_msg, expected) in cases {
        let heredoc_start = cmd.find("<<").expect(missing_operator_msg);
        let target = extract_heredoc_target_command(cmd, heredoc_start);
        assert_eq!(target.as_deref(), Some(expected));
    }
}
4077}