use memchr::memchr;
use regex::RegexSet;
use std::sync::LazyLock;
use std::time::{Duration, Instant};
use tracing::{debug, instrument, trace, warn};
/// Regex sources compiled into [`HEREDOC_TRIGGERS`].
///
/// The position of each entry is the index reported by [`matched_triggers`],
/// so entries must not be reordered without updating consumers of those indices.
const HEREDOC_TRIGGER_PATTERNS: [&str; 13] = [
// Literal `<<<` (here-string operator).
r"<<<",
// `python` (optional version digits / `.exe`), optional flags, then a short-flag cluster containing `c` or `e`.
r#"\bpython[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ce][A-Za-z]*(?:\s|['"]|$)"#,
// `ruby` with a short-flag cluster containing `e`.
r#"\bruby[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
// `irb` with a short-flag cluster containing `e`.
r#"\birb[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
// `perl` with a short-flag cluster containing `e` or `E`.
r#"\bperl[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[eE][A-Za-z]*(?:\s|['"]|$)"#,
// `node` / `nodejs` with a short-flag cluster containing `e` or `p`.
r#"\bnode(?:js)?[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ep][A-Za-z]*(?:\s|['"]|$)"#,
// `php` with a short-flag cluster containing `r`.
r#"\bphp[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*r[A-Za-z]*(?:\s|['"]|$)"#,
// `lua` with a short-flag cluster containing `e`.
r#"\blua[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
// `sh` / `bash` / `zsh` / `fish` with a short-flag cluster containing `c`.
r#"\b(?:sh|bash|zsh|fish)(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*c[A-Za-z]*(?:\s|['"]|$)"#,
// A pipe directly into one of the listed interpreters.
r"\|\s*(?:python[0-9.]*|ruby[0-9.]*|perl[0-9.]*|node(?:js)?[0-9.]*|php[0-9.]*|lua[0-9.]*|sh|bash)(?:\.exe)?\b",
// A pipe into `xargs`.
r"\|\s*xargs\s",
// `eval` followed by a quoted argument.
r#"\beval\s+['"]"#,
// `exec` followed by a quoted argument.
r#"\bexec\s+['"]"#,
];
/// Index that [`matched_triggers`] reports for the hand-written `<<` scanner.
///
/// It equals the number of regex patterns, i.e. one past the last regex index,
/// so it never collides with an index produced by [`HEREDOC_TRIGGERS`].
const MANUAL_HEREDOC_TRIGGER_INDEX: usize = HEREDOC_TRIGGER_PATTERNS.len();
/// Lazily compiled [`RegexSet`] built from [`HEREDOC_TRIGGER_PATTERNS`].
///
/// Compilation happens on first access; it panics there if any pattern fails to compile.
static HEREDOC_TRIGGERS: LazyLock<RegexSet> = LazyLock::new(|| {
RegexSet::new(HEREDOC_TRIGGER_PATTERNS).expect("heredoc trigger patterns should compile")
});
/// Reports whether `command` contains a `<<` sequence that the quote-aware
/// scanner considers active (not escaped or inside a quoted string).
///
/// A cheap `memchr` pre-check rejects inputs without any `<` byte before the
/// full scan runs.
#[inline]
#[must_use]
fn contains_active_heredoc_operator(command: &str) -> bool {
    match memchr(b'<', command.as_bytes()) {
        // At least one `<` exists: run the quote-aware scan from the start.
        Some(_) => contains_active_heredoc_operator_recursive(command, 0, 0),
        // No `<` byte at all, so `<<` cannot occur.
        None => false,
    }
}
/// Top-level scan of `command` (from byte offset `start`) for an active `<<`.
///
/// Backslash escapes and single-quoted spans are skipped. Double-quoted
/// spans, `$(...)` substitutions and backtick substitutions are delegated to
/// their dedicated scanners. If `recursion_depth` exceeds 500 the function
/// returns `true` instead of scanning further.
#[must_use]
fn contains_active_heredoc_operator_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> bool {
    if recursion_depth > 500 {
        return true;
    }
    let input = command.as_bytes();
    let end = input.len();
    let mut cursor = start.min(end);
    while let Some(&current) = input.get(cursor) {
        let following = input.get(cursor + 1).copied();
        cursor = match (current, following) {
            (b'<', Some(b'<')) => return true,
            (b'\\', _) => {
                // A backslash followed by CRLF consumes three bytes; any other
                // escape consumes the backslash plus one byte.
                if input[cursor + 1..].starts_with(b"\r\n") {
                    cursor + 3
                } else {
                    (cursor + 2).min(end)
                }
            }
            (b'\'', _) => {
                // Jump past the closing single quote, or to the end if unterminated.
                input[cursor + 1..]
                    .iter()
                    .position(|&byte| byte == b'\'')
                    .map_or(end, |offset| cursor + offset + 2)
            }
            (b'"', _) => {
                let (found, resume) =
                    scan_double_quotes_for_heredoc(command, cursor + 1, recursion_depth);
                if found {
                    return true;
                }
                resume
            }
            (b'$', Some(b'(')) => {
                let (found, resume) =
                    scan_dollar_paren_for_heredoc_recursive(command, cursor, recursion_depth + 1);
                if found {
                    return true;
                }
                resume
            }
            (b'`', _) => {
                let (found, resume) =
                    scan_backticks_for_heredoc_recursive(command, cursor, recursion_depth + 1);
                if found {
                    return true;
                }
                resume
            }
            _ => cursor + 1,
        };
    }
    false
}
/// Scans the inside of a double-quoted string, starting at `start` (the byte
/// just after the opening quote).
///
/// A plain `<<` inside the quotes is ignored; only `$(...)` and backtick
/// substitutions nested in the string are searched. Returns `(found, next)`
/// where `next` is the offset at which the caller should resume. When the
/// depth limit of 500 is exceeded the result is `(true, command.len())`.
#[must_use]
fn scan_double_quotes_for_heredoc(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    if recursion_depth > 500 {
        return (true, command.len());
    }
    let input = command.as_bytes();
    let end = input.len();
    let mut cursor = start.min(end);
    while let Some(&current) = input.get(cursor) {
        let opens_dollar_paren = current == b'$' && input.get(cursor + 1) == Some(&b'(');
        if current == b'"' {
            // Closing quote: resume right after it.
            return (false, cursor + 1);
        } else if current == b'\\' {
            cursor = (cursor + 2).min(end);
        } else if opens_dollar_paren || current == b'`' {
            let (found, resume) = if opens_dollar_paren {
                scan_dollar_paren_for_heredoc_recursive(command, cursor, recursion_depth + 1)
            } else {
                scan_backticks_for_heredoc_recursive(command, cursor, recursion_depth + 1)
            };
            if found {
                return (true, resume);
            }
            cursor = resume;
        } else {
            cursor += 1;
        }
    }
    (false, end)
}
/// Scans a `$(...)` substitution whose `$` sits at byte offset `start`.
///
/// Parentheses are counted so the scan ends at the matching `)`. Escapes and
/// single-quoted spans are skipped; double quotes, nested `$(...)` and
/// backticks are delegated to their scanners. Returns `(found, next)`: `found`
/// is `true` as soon as a `<<` is seen (or the depth limit of 500 is
/// exceeded), and `next` is the offset at which the caller should resume.
#[must_use]
fn scan_dollar_paren_for_heredoc_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    if recursion_depth > 500 {
        return (true, command.len());
    }
    let input = command.as_bytes();
    let end = input.len();
    debug_assert!(input.get(start) == Some(&b'$'));
    debug_assert!(input.get(start + 1) == Some(&b'('));
    // Number of currently open parentheses, counting the `$(` itself.
    let mut open_parens: u32 = 1;
    let mut cursor = start + 2;
    while let Some(&current) = input.get(cursor) {
        let following = input.get(cursor + 1).copied();
        match (current, following) {
            (b'<', Some(b'<')) => return (true, cursor + 2),
            (b'(', _) => {
                open_parens += 1;
                cursor += 1;
            }
            // The `)` that closes the substitution itself.
            (b')', _) if open_parens == 1 => return (false, cursor + 1),
            (b')', _) => {
                open_parens = open_parens.saturating_sub(1);
                cursor += 1;
            }
            (b'\\', _) => cursor = (cursor + 2).min(end),
            (b'\'', _) => {
                // Jump past the closing single quote, or to the end if unterminated.
                cursor = input[cursor + 1..]
                    .iter()
                    .position(|&byte| byte == b'\'')
                    .map_or(end, |offset| cursor + offset + 2);
            }
            (b'"', _) => {
                let (found, resume) =
                    scan_double_quotes_for_heredoc(command, cursor + 1, recursion_depth);
                if found {
                    return (true, resume);
                }
                cursor = resume;
            }
            (b'$', Some(b'(')) => {
                let (found, resume) =
                    scan_dollar_paren_for_heredoc_recursive(command, cursor, recursion_depth + 1);
                if found {
                    return (true, resume);
                }
                cursor = resume;
            }
            (b'`', _) => {
                let (found, resume) =
                    scan_backticks_for_heredoc_recursive(command, cursor, recursion_depth + 1);
                if found {
                    return (true, resume);
                }
                cursor = resume;
            }
            _ => cursor += 1,
        }
    }
    (false, end)
}
/// Scans a backtick substitution whose opening backtick sits at `start`.
///
/// Escapes and single-quoted spans are skipped; double quotes and `$(...)`
/// are delegated to their scanners. The next unescaped backtick ends the
/// substitution. Returns `(found, next)`: `found` is `true` as soon as a `<<`
/// is seen (or the depth limit of 500 is exceeded), and `next` is the offset
/// at which the caller should resume.
#[must_use]
fn scan_backticks_for_heredoc_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    if recursion_depth > 500 {
        return (true, command.len());
    }
    let input = command.as_bytes();
    let end = input.len();
    debug_assert!(input.get(start) == Some(&b'`'));
    let mut cursor = start + 1;
    while let Some(&current) = input.get(cursor) {
        let following = input.get(cursor + 1).copied();
        match (current, following) {
            (b'<', Some(b'<')) => return (true, cursor + 2),
            // Closing backtick: resume right after it.
            (b'`', _) => return (false, cursor + 1),
            (b'\\', _) => cursor = (cursor + 2).min(end),
            (b'\'', _) => {
                // Jump past the closing single quote, or to the end if unterminated.
                cursor = input[cursor + 1..]
                    .iter()
                    .position(|&byte| byte == b'\'')
                    .map_or(end, |offset| cursor + offset + 2);
            }
            (b'"', _) => {
                let (found, resume) =
                    scan_double_quotes_for_heredoc(command, cursor + 1, recursion_depth);
                if found {
                    return (true, resume);
                }
                cursor = resume;
            }
            (b'$', Some(b'(')) => {
                let (found, resume) =
                    scan_dollar_paren_for_heredoc_recursive(command, cursor, recursion_depth + 1);
                if found {
                    return (true, resume);
                }
                cursor = resume;
            }
            _ => cursor += 1,
        }
    }
    (false, end)
}
/// Outcome of the tier-1 trigger check performed by [`check_triggers`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TriggerResult {
/// Neither the `<<` scanner nor any trigger regex matched.
NoTrigger,
/// The `<<` scanner or at least one trigger regex matched.
Triggered,
}
/// Tier-1 check: reports whether `command` shows any heredoc or inline-script
/// indicator.
///
/// The hand-written `<<` scanner runs first; the regex set is only consulted
/// when the scanner finds nothing.
#[inline]
#[must_use]
#[instrument(skip(command), fields(cmd_len = command.len()))]
pub fn check_triggers(command: &str) -> TriggerResult {
    let triggered =
        contains_active_heredoc_operator(command) || HEREDOC_TRIGGERS.is_match(command);
    if !triggered {
        trace!("tier1_no_trigger: fast path allow");
        return TriggerResult::NoTrigger;
    }
    debug!("tier1_trigger: heredoc/inline script indicator detected");
    TriggerResult::Triggered
}
/// Lists the indices of every trigger that matches `command`.
///
/// Regex matches come first, as indices into `HEREDOC_TRIGGER_PATTERNS`.
/// `MANUAL_HEREDOC_TRIGGER_INDEX` is appended last when the hand-written
/// `<<` scanner also fires.
#[must_use]
pub fn matched_triggers(command: &str) -> Vec<usize> {
    let regex_hits = HEREDOC_TRIGGERS.matches(command);
    let manual_hit =
        contains_active_heredoc_operator(command).then_some(MANUAL_HEREDOC_TRIGGER_INDEX);
    regex_hits.into_iter().chain(manual_hit).collect()
}
use regex::Regex;
/// Resource limits applied while extracting embedded content from a command.
#[derive(Debug, Clone, Copy)]
pub struct ExtractionLimits {
/// Maximum size in bytes; `extract_content` skips any command longer than this.
pub max_body_bytes: usize,
/// Maximum number of body lines.
// NOTE(review): not read anywhere in the visible part of this file — presumably enforced by `extract_heredoc_body`; confirm.
pub max_body_lines: usize,
/// Maximum number of extracted items (inline scripts, here-strings and heredocs combined).
pub max_heredocs: usize,
/// Time budget for one extraction run, in milliseconds.
pub timeout_ms: u64,
}
impl Default for ExtractionLimits {
    /// Default limits: 1 MiB of input, 10 000 body lines, 10 extracted items
    /// and a 50 ms time budget.
    fn default() -> Self {
        const ONE_MIB: usize = 1 << 20;
        Self {
            timeout_ms: 50,
            max_heredocs: 10,
            max_body_lines: 10_000,
            max_body_bytes: ONE_MIB,
        }
    }
}
/// Script language attributed to a piece of extracted content.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ScriptLanguage {
/// Shell script; `from_command` maps `sh`, `bash`, `zsh` and `fish` here.
Bash,
/// Go; `from_command` maps `go` here.
Go,
/// PHP; `from_command` maps `php` here.
Php,
/// Python; `from_command` maps `python` (with optional version suffix) here.
Python,
/// Ruby; `from_command` maps `ruby` and `irb` here.
Ruby,
/// Perl; `from_command` maps `perl` here.
Perl,
/// JavaScript; `from_command` maps `node` and `nodejs` here.
JavaScript,
/// TypeScript; `from_command` maps `deno` and `bun` here.
TypeScript,
/// No language could be determined.
Unknown,
}
impl ScriptLanguage {
    /// Maps an interpreter name (for example `python3.11` or `bash.exe`) to a
    /// language.
    ///
    /// The comparison is case-insensitive and ignores a trailing `.exe`. A
    /// name matches an interpreter when it equals the base name, or is the
    /// base name followed by a version made of digits and dots that starts
    /// with a digit. Anything else yields [`ScriptLanguage::Unknown`].
    #[must_use]
    pub fn from_command(cmd: &str) -> Self {
        let lowered = cmd.to_lowercase();
        let name = lowered.strip_suffix(".exe").unwrap_or(&lowered);
        let is_interpreter = |base: &str| match name.strip_prefix(base) {
            None => false,
            // Exact match on the base name.
            Some("") => true,
            Some(version) => {
                version.starts_with(|c: char| c.is_ascii_digit())
                    && version.chars().all(|c| c == '.' || c.is_ascii_digit())
            }
        };
        // Checked in order; the first matching base name wins.
        let known = [
            ("python", Self::Python),
            ("ruby", Self::Ruby),
            ("irb", Self::Ruby),
            ("perl", Self::Perl),
            ("node", Self::JavaScript),
            ("nodejs", Self::JavaScript),
            ("deno", Self::TypeScript),
            ("bun", Self::TypeScript),
            ("php", Self::Php),
            ("go", Self::Go),
            ("sh", Self::Bash),
            ("bash", Self::Bash),
            ("zsh", Self::Bash),
            ("fish", Self::Bash),
        ];
        known
            .into_iter()
            .find(|&(base, _)| is_interpreter(base))
            .map_or(Self::Unknown, |(_, language)| language)
    }

    /// Detects the language from a `#!` line at the start of `content`.
    ///
    /// When the program is `env`, leading `-` options are skipped and the
    /// first remaining word is used as the interpreter. Returns `None` when
    /// there is no shebang or the interpreter is not recognised.
    #[must_use]
    pub fn from_shebang(content: &str) -> Option<Self> {
        /// Last `/`-separated component of `path`.
        fn basename(path: &str) -> &str {
            path.rsplit('/').next().unwrap_or(path)
        }

        let shebang = content.lines().next()?.strip_prefix("#!")?.trim();
        if shebang.is_empty() {
            return None;
        }
        let mut words = shebang.split_whitespace();
        let program = basename(words.next()?);
        let interpreter = if program == "env" {
            basename(words.find(|word| !word.starts_with('-'))?)
        } else {
            program
        };
        match Self::from_command(interpreter) {
            Self::Unknown => None,
            language => Some(language),
        }
    }

    /// Guesses the language from textual patterns in the first 20 lines of
    /// `content`.
    ///
    /// Heuristics are tried in a fixed order (Python, TypeScript, JavaScript,
    /// Ruby, Go, Perl, Bash) and the first one that fires wins. The Ruby
    /// heuristic additionally requires an `end` at the start of a line or at
    /// the very end of the whole content.
    #[must_use]
    pub fn from_content(content: &str) -> Option<Self> {
        /// True when `pred` holds for at least one line.
        fn any_line(lines: &[&str], pred: impl Fn(&str) -> bool) -> bool {
            lines.iter().any(|line| pred(line))
        }
        /// True when `line` contains at least one of `needles`.
        fn contains_any(line: &str, needles: &[&str]) -> bool {
            needles.iter().any(|needle| line.contains(*needle))
        }
        /// True when `line` starts with at least one of `prefixes`.
        fn starts_with_any(line: &str, prefixes: &[&str]) -> bool {
            prefixes.iter().any(|prefix| line.starts_with(*prefix))
        }

        let head: Vec<&str> = content.lines().take(20).map(str::trim).collect();

        if any_line(&head, |l| starts_with_any(l, &["import ", "from "])) {
            return Some(Self::Python);
        }

        if any_line(&head, |l| {
            contains_any(l, &[": string", ": number", ": boolean", "interface "])
                || l.starts_with("type ")
        }) {
            return Some(Self::TypeScript);
        }

        if any_line(&head, |l| {
            contains_any(l, &["require(", "module.exports"])
                || starts_with_any(l, &["const ", "let ", "var "])
        }) {
            return Some(Self::JavaScript);
        }

        let ruby_like = any_line(&head, |l| {
            starts_with_any(l, &["def ", "class ", "require ", "require_relative "])
                || contains_any(l, &[".each do", " do |"])
        });
        let has_end = content.contains("\nend") || content.ends_with("end");
        if ruby_like && has_end {
            return Some(Self::Ruby);
        }

        if any_line(&head, |l| {
            starts_with_any(l, &["package ", "func "])
                || l.contains(":=")
                || (l.starts_with("import ") && l.contains('"'))
                || l == "import ("
        }) {
            return Some(Self::Go);
        }

        if any_line(&head, |l| {
            starts_with_any(l, &["use strict", "use warnings", "my $", "my @", "my %"])
                || contains_any(l, &["=~ /", "=~ s/"])
        }) {
            return Some(Self::Perl);
        }

        if any_line(&head, |l| {
            starts_with_any(l, &["if [", "for ", "while ", "case ", "function "])
                || contains_any(l, &["$((", "${"])
                || (l.contains("()") && l.contains('{'))
        }) {
            return Some(Self::Bash);
        }

        None
    }

    /// Determines the language of `content` given the command line `cmd` it
    /// belongs to, together with how the decision was reached.
    ///
    /// Sources are tried in decreasing order of confidence: the head of the
    /// command, the head of each pipeline segment, the shebang line, and
    /// finally content heuristics.
    #[must_use]
    pub fn detect(cmd: &str, content: &str) -> (Self, DetectionConfidence) {
        let language_of_head = |text: &str| -> Option<Self> {
            let interpreter = Self::extract_head_interpreter(text)?;
            Some(Self::from_command(&interpreter)).filter(|language| *language != Self::Unknown)
        };

        let from_command_line = language_of_head(cmd).or_else(|| {
            if !cmd.contains('|') {
                return None;
            }
            cmd.split('|')
                .map(str::trim)
                .filter(|segment| !segment.is_empty())
                .find_map(|segment| language_of_head(segment))
        });
        if let Some(language) = from_command_line {
            return (language, DetectionConfidence::CommandPrefix);
        }

        Self::from_shebang(content)
            .map(|language| (language, DetectionConfidence::Shebang))
            .or_else(|| {
                Self::from_content(content)
                    .map(|language| (language, DetectionConfidence::ContentHeuristics))
            })
            .unwrap_or((Self::Unknown, DetectionConfidence::Unknown))
    }

    /// Returns the basename of the first word of `cmd` after wrapper prefixes
    /// have been stripped by `crate::normalize::strip_wrapper_prefixes`.
    fn extract_head_interpreter(cmd: &str) -> Option<String> {
        let stripped = crate::normalize::strip_wrapper_prefixes(cmd);
        let effective = stripped.normalized;
        let head = effective.split_whitespace().next()?;
        let basename = head.rsplit('/').next().unwrap_or(head);
        Some(basename.to_string())
    }
}
/// How a [`ScriptLanguage`] was determined by `ScriptLanguage::detect`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DetectionConfidence {
/// Taken from the interpreter named on the command line.
CommandPrefix,
/// Taken from the `#!` line of the content.
Shebang,
/// Inferred from textual patterns in the content.
ContentHeuristics,
/// No source produced a language.
Unknown,
}
impl DetectionConfidence {
    /// Short machine-friendly identifier for this confidence level.
    #[must_use]
    pub const fn label(&self) -> &'static str {
        match *self {
            Self::Unknown => "unknown",
            Self::ContentHeuristics => "content-heuristics",
            Self::Shebang => "shebang",
            Self::CommandPrefix => "command-prefix",
        }
    }

    /// Human-readable explanation of how the language was determined.
    #[must_use]
    pub const fn reason(&self) -> &'static str {
        match *self {
            Self::Unknown => "could not determine language",
            Self::ContentHeuristics => "inferred from content patterns (lower confidence)",
            Self::Shebang => "detected from shebang line (high confidence)",
            Self::CommandPrefix => "detected from command interpreter (highest confidence)",
        }
    }
}
/// Flavour of heredoc-like construct an extracted item came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeredocType {
/// Plain `<<` heredoc; terminator lines are compared without stripping.
Standard,
/// `<<-` heredoc; leading tabs are stripped before comparing the terminator.
TabStripped,
/// `<<<` here-string.
HereString,
/// `<<~` heredoc; leading whitespace is stripped before comparing the terminator.
IndentStripped,
}
/// One piece of embedded content extracted from a command.
#[derive(Debug, Clone)]
pub struct ExtractedContent {
/// The extracted text itself.
pub content: String,
/// Language attributed to `content`.
pub language: ScriptLanguage,
/// Heredoc delimiter word; `None` for inline scripts and here-strings.
pub delimiter: Option<String>,
/// Byte range within the command covering the whole construct.
pub byte_range: std::ops::Range<usize>,
/// Byte range within the command covering only the content, when known.
pub content_range: Option<std::ops::Range<usize>>,
/// Whether the content (or the heredoc delimiter) was written in quotes.
pub quoted: bool,
/// Kind of heredoc construct; `None` for inline scripts.
pub heredoc_type: Option<HeredocType>,
/// Name of the command that receives the content, when it could be determined.
pub target_command: Option<String>,
}
/// Reason why content was skipped instead of extracted.
#[derive(Debug, Clone, PartialEq)]
pub enum SkipReason {
/// Input was `actual` bytes, more than the `limit` allowed.
ExceededSizeLimit { actual: usize, limit: usize },
/// Input had `actual` lines, more than the `limit` allowed.
ExceededLineLimit { actual: usize, limit: usize },
/// More than `limit` items would have been extracted.
ExceededHeredocLimit { limit: usize },
/// Input looks binary: it has null bytes or too many non-printable characters.
BinaryContent {
null_bytes: usize,
// Fraction in the range 0.0..=1.0; `Display` multiplies it by 100 for the percentage.
non_printable_ratio: f32,
},
/// Extraction ran for `elapsed_ms`, reaching or exceeding the `budget_ms` budget.
Timeout { elapsed_ms: u64, budget_ms: u64 },
/// No terminator line was found for `delimiter`.
UnterminatedHeredoc { delimiter: String },
/// Input could not be parsed; `reason` holds a free-form explanation.
MalformedInput { reason: String },
}
impl std::fmt::Display for SkipReason {
    /// Writes a one-line human-readable description of the skip reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::ExceededSizeLimit { actual, limit } => {
                format!("exceeded size limit: {actual} bytes > {limit} bytes")
            }
            Self::ExceededLineLimit { actual, limit } => {
                format!("exceeded line limit: {actual} lines > {limit} lines")
            }
            Self::ExceededHeredocLimit { limit } => {
                format!("exceeded heredoc limit: max {limit} heredocs")
            }
            Self::BinaryContent {
                null_bytes,
                non_printable_ratio,
            } => {
                // The ratio is stored as a fraction; show it as a percentage.
                let percent = non_printable_ratio * 100.0;
                format!(
                    "binary content detected: {null_bytes} null bytes, {percent:.1}% non-printable"
                )
            }
            Self::Timeout {
                elapsed_ms,
                budget_ms,
            } => format!("extraction timeout: {elapsed_ms}ms > {budget_ms}ms budget"),
            Self::UnterminatedHeredoc { delimiter } => {
                format!("unterminated heredoc: delimiter '{delimiter}' not found")
            }
            Self::MalformedInput { reason } => format!("malformed input: {reason}"),
        };
        f.write_str(&message)
    }
}
/// Overall outcome of `extract_content`.
#[derive(Debug)]
pub enum ExtractionResult {
/// Nothing was extracted and nothing was skipped.
NoContent,
/// Content was extracted.
Extracted(Vec<ExtractedContent>),
/// Nothing was extracted; the reasons explain why.
Skipped(Vec<SkipReason>),
/// Some content was extracted while other content was skipped.
Partial {
extracted: Vec<ExtractedContent>,
skipped: Vec<SkipReason>,
},
/// Extraction failed; the string carries a message.
// NOTE(review): not constructed anywhere in the visible part of this file.
Failed(String),
}
/// Matches a heredoc operator together with its delimiter.
///
/// Capture groups: 1 = optional `-` or `~` variant marker, 2 = single-quoted
/// delimiter, 3 = double-quoted delimiter, 4 = bare delimiter word.
static HEREDOC_EXTRACTOR: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"<<([-~])?\s*(?:'([^']*)'|"([^"]*)"|([\w.-]+))"#).expect("heredoc regex compiles")
});
/// Matches `<<<` followed by a single-quoted string; group 1 is the text between the quotes.
static HERESTRING_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"<<<\s*'([^']*)'").expect("herestring single-quote regex compiles")
});
/// Matches `<<<` followed by a double-quoted string; group 1 is the text between the quotes.
static HERESTRING_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"<<<\s*"([^"]*)""#).expect("herestring double-quote regex compiles")
});
/// Matches `<<<` followed by an unquoted word; group 1 is the word.
///
/// The word's first character must not be a quote (`'` or `"`, written as `\x22`) or whitespace.
static HERESTRING_UNQUOTED: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"<<<\s*([^'\x22\s]\S*)").expect("herestring unquoted regex compiles")
});
/// Matches an interpreter invocation that passes a single-quoted inline script.
///
/// Capture groups: 1 = interpreter name, 2 = optional `js` suffix of `node`,
/// 3 = the flag cluster containing one of `c`, `e`, `E`, `p`, `r`,
/// 4 = the script text between the quotes.
static INLINE_SCRIPT_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?)\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceEpr][A-Za-z]*)\s*'([^']*)'")
.expect("inline script single-quote regex compiles")
});
/// Matches an interpreter invocation that passes a double-quoted inline script.
///
/// Capture groups: 1 = interpreter name, 2 = optional `js` suffix of `node`,
/// 3 = the flag cluster containing one of `c`, `e`, `E`, `p`, `r`,
/// 4 = the script text between the quotes.
static INLINE_SCRIPT_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?)\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceEpr][A-Za-z]*)\s*"([^"]*)""#)
.expect("inline script double-quote regex compiles")
});
/// Fraction of suspect characters above which `check_binary_content` treats input as binary.
const BINARY_THRESHOLD: f32 = 0.30;
/// Decides whether `content` looks like binary data.
///
/// Returns `Some(SkipReason::BinaryContent { .. })` when the content contains
/// any null byte, or when the share of suspect characters (control characters
/// other than `\n`, `\r`, `\t`, plus the Unicode replacement character)
/// exceeds `BINARY_THRESHOLD`. Empty content is never binary.
#[must_use]
#[allow(clippy::cast_precision_loss)]
#[allow(clippy::naive_bytecount)]
pub fn check_binary_content(content: &str) -> Option<SkipReason> {
    let raw = content.as_bytes();
    if raw.is_empty() {
        return None;
    }

    // Any null byte is decisive; the reported ratio then counts null bytes only.
    let null_bytes = raw.iter().filter(|&&byte| byte == 0).count();
    if null_bytes > 0 {
        return Some(SkipReason::BinaryContent {
            null_bytes,
            non_printable_ratio: null_bytes as f32 / raw.len() as f32,
        });
    }

    let is_suspect = |c: char| {
        c == std::char::REPLACEMENT_CHARACTER
            || (c.is_control() && !matches!(c, '\n' | '\r' | '\t'))
    };
    let (suspect, total) = content
        .chars()
        .fold((0_i32, 0_i32), |(suspect, total), c| {
            (suspect + i32::from(is_suspect(c)), total + 1)
        });
    let ratio = suspect as f32 / total.max(1) as f32;
    (ratio > BINARY_THRESHOLD).then_some(SkipReason::BinaryContent {
        null_bytes: 0,
        non_printable_ratio: ratio,
    })
}
/// Checks the time budget and records a timeout at most once.
///
/// Returns `true` when the time elapsed since `start_time` has reached
/// `timeout`. In that case a `SkipReason::Timeout` is appended to
/// `skip_reasons` unless one is already present.
#[inline]
fn record_timeout_if_needed(
    start_time: Instant,
    timeout: Duration,
    budget_ms: u64,
    skip_reasons: &mut Vec<SkipReason>,
) -> bool {
    let elapsed = start_time.elapsed();
    let timed_out = elapsed >= timeout;
    if timed_out {
        let already_recorded = skip_reasons
            .iter()
            .any(|reason| matches!(reason, SkipReason::Timeout { .. }));
        if !already_recorded {
            skip_reasons.push(SkipReason::Timeout {
                // Saturate rather than fail if the millisecond count does not fit in u64.
                elapsed_ms: u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
                budget_ms,
            });
        }
    }
    timed_out
}
/// Tier-2 extraction: pulls inline scripts, here-strings and heredocs out of
/// `command`, subject to `limits`.
///
/// The command is rejected up front (`Skipped`) when it is larger than
/// `limits.max_body_bytes` or looks like binary data. Otherwise the three
/// extractors run in order (inline scripts, here-strings, heredocs), with the
/// time budget checked between them.
///
/// Result mapping:
/// - nothing extracted, nothing skipped → [`ExtractionResult::NoContent`]
/// - nothing extracted, something skipped → [`ExtractionResult::Skipped`]
/// - something extracted, nothing skipped → [`ExtractionResult::Extracted`]
/// - something extracted and something skipped (including a timeout hit
///   after partial progress) → [`ExtractionResult::Partial`], so the skip
///   reasons are not lost
#[must_use]
#[instrument(skip(command, limits), fields(cmd_len = command.len(), timeout_ms = limits.timeout_ms))]
pub fn extract_content(command: &str, limits: &ExtractionLimits) -> ExtractionResult {
    let start_time = Instant::now();
    let timeout = Duration::from_millis(limits.timeout_ms);
    let mut skip_reasons: Vec<SkipReason> = Vec::new();

    if command.len() > limits.max_body_bytes {
        warn!(
            actual = command.len(),
            limit = limits.max_body_bytes,
            "tier2_skip: input exceeds size limit"
        );
        skip_reasons.push(SkipReason::ExceededSizeLimit {
            actual: command.len(),
            limit: limits.max_body_bytes,
        });
        return ExtractionResult::Skipped(skip_reasons);
    }

    if let Some(reason) = check_binary_content(command) {
        warn!(?reason, "tier2_skip: binary content detected");
        skip_reasons.push(reason);
        return ExtractionResult::Skipped(skip_reasons);
    }

    let mut extracted: Vec<ExtractedContent> = Vec::new();

    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
        return ExtractionResult::Skipped(skip_reasons);
    }

    extract_inline_scripts(
        command,
        limits,
        start_time,
        timeout,
        &mut extracted,
        &mut skip_reasons,
    );
    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
        // A timeout was recorded, so report it alongside whatever was extracted.
        return if extracted.is_empty() {
            ExtractionResult::Skipped(skip_reasons)
        } else {
            ExtractionResult::Partial {
                extracted,
                skipped: skip_reasons,
            }
        };
    }

    extract_herestrings(
        command,
        limits,
        start_time,
        timeout,
        &mut extracted,
        &mut skip_reasons,
    );
    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
        // A timeout was recorded, so report it alongside whatever was extracted.
        return if extracted.is_empty() {
            ExtractionResult::Skipped(skip_reasons)
        } else {
            ExtractionResult::Partial {
                extracted,
                skipped: skip_reasons,
            }
        };
    }

    extract_heredocs(
        command,
        limits,
        start_time,
        timeout,
        &mut extracted,
        &mut skip_reasons,
    );

    let elapsed_us = start_time.elapsed().as_micros();
    match (extracted.is_empty(), skip_reasons.is_empty()) {
        (true, true) => {
            trace!(elapsed_us, "tier2_complete: no content found");
            ExtractionResult::NoContent
        }
        (true, false) => {
            warn!(
                elapsed_us,
                skip_count = skip_reasons.len(),
                "tier2_complete: skipped"
            );
            ExtractionResult::Skipped(skip_reasons)
        }
        (false, true) => {
            debug!(
                elapsed_us,
                count = extracted.len(),
                "tier2_complete: content extracted"
            );
            ExtractionResult::Extracted(extracted)
        }
        (false, false) => {
            debug!(
                elapsed_us,
                count = extracted.len(),
                skip_count = skip_reasons.len(),
                "tier2_complete: partial extraction with skips"
            );
            // Keep the skip reasons instead of dropping them.
            ExtractionResult::Partial {
                extracted,
                skipped: skip_reasons,
            }
        }
    }
}
/// Extracts quoted inline scripts (`python -c '...'`, `bash -c "..."`, ...)
/// from `command` into `extracted`.
///
/// Single-quoted matches are processed before double-quoted ones. Each match
/// re-checks the time budget and the `limits.max_heredocs` cap. Reaching the
/// cap records one `ExceededHeredocLimit` in `skip_reasons`.
fn extract_inline_scripts(
    command: &str,
    limits: &ExtractionLimits,
    start_time: Instant,
    timeout: Duration,
    extracted: &mut Vec<ExtractedContent>,
    skip_reasons: &mut Vec<SkipReason>,
) {
    // Interpreter-name prefix paired with the flag letters that introduce inline code.
    // Names matching none of these are the shells, whose inline flag is `c`.
    const INLINE_FLAG_LETTERS: [(&str, &str); 7] = [
        ("python", "ce"),
        ("ruby", "e"),
        ("irb", "e"),
        ("perl", "eE"),
        ("node", "ep"),
        ("php", "r"),
        ("lua", "e"),
    ];

    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
        return;
    }
    if extracted.len() >= limits.max_heredocs {
        skip_reasons.push(SkipReason::ExceededHeredocLimit {
            limit: limits.max_heredocs,
        });
        return;
    }

    let mut limit_reached = false;
    for pattern in [&*INLINE_SCRIPT_SINGLE_QUOTE, &*INLINE_SCRIPT_DOUBLE_QUOTE] {
        for captures in pattern.captures_iter(command) {
            // A timeout or a full result list ends this pattern's pass only.
            if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
                break;
            }
            if extracted.len() >= limits.max_heredocs {
                limit_reached = true;
                break;
            }

            let interpreter = captures.get(1).map_or("", |m| m.as_str());
            let flag = captures.get(3).map_or("", |m| m.as_str());
            let script_match = captures.get(4);
            let script = script_match.map_or("", |m| m.as_str());

            let accepted_letters = INLINE_FLAG_LETTERS
                .iter()
                .copied()
                .find(|&(prefix, _)| interpreter.starts_with(prefix))
                .map_or("c", |(_, letters)| letters);
            let is_inline_flag = flag.chars().any(|letter| accepted_letters.contains(letter));
            if !is_inline_flag || script.len() > limits.max_body_bytes {
                continue;
            }

            let whole = captures.get(0).expect("capture group 0 always exists");
            extracted.push(ExtractedContent {
                content: script.to_string(),
                language: ScriptLanguage::from_command(interpreter),
                delimiter: None,
                byte_range: whole.start()..whole.end(),
                content_range: script_match.map(|m| m.start()..m.end()),
                quoted: true,
                heredoc_type: None,
                target_command: Some(interpreter.to_string()),
            });
        }
    }

    if limit_reached {
        skip_reasons.push(SkipReason::ExceededHeredocLimit {
            limit: limits.max_heredocs,
        });
    }
}
/// Extracts here-strings (`<<< ...`) from `command` into `extracted`.
///
/// Single-quoted, double-quoted and unquoted forms are processed in that
/// order. Every here-string is labelled as Bash content. Reaching the
/// `limits.max_heredocs` cap while iterating records one
/// `ExceededHeredocLimit` in `skip_reasons`.
fn extract_herestrings(
    command: &str,
    limits: &ExtractionLimits,
    start_time: Instant,
    timeout: Duration,
    extracted: &mut Vec<ExtractedContent>,
    skip_reasons: &mut Vec<SkipReason>,
) {
    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
        return;
    }
    if extracted.len() >= limits.max_heredocs {
        // Already at the cap before looking at any here-string.
        return;
    }

    let passes = [
        (&*HERESTRING_SINGLE_QUOTE, true),
        (&*HERESTRING_DOUBLE_QUOTE, true),
        (&*HERESTRING_UNQUOTED, false),
    ];
    let mut limit_reached = false;
    for (pattern, is_quoted) in passes {
        for captures in pattern.captures_iter(command) {
            // A timeout or a full result list ends this pattern's pass only.
            if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
                break;
            }
            if extracted.len() >= limits.max_heredocs {
                limit_reached = true;
                break;
            }

            let body_match = captures.get(1);
            let body = body_match.map_or("", |m| m.as_str());
            if body.len() > limits.max_body_bytes {
                continue;
            }

            let whole = captures.get(0).expect("capture group 0 always exists");
            let target_command = extract_heredoc_target_command(command, whole.start());
            extracted.push(ExtractedContent {
                content: body.to_string(),
                language: ScriptLanguage::Bash,
                delimiter: None,
                byte_range: whole.start()..whole.end(),
                content_range: body_match.map(|m| m.start()..m.end()),
                quoted: is_quoted,
                heredoc_type: Some(HeredocType::HereString),
                target_command,
            });
        }
    }

    if limit_reached {
        skip_reasons.push(SkipReason::ExceededHeredocLimit {
            limit: limits.max_heredocs,
        });
    }
}
/// Extracts heredocs (`<<DELIM`, `<<-DELIM`, `<<~DELIM`) from `command` into
/// `extracted`.
///
/// For every operator match the body is obtained via `extract_heredoc_body`.
/// A failure is recorded in `skip_reasons`, and the function returns
/// immediately when that failure is a timeout. Reaching the
/// `limits.max_heredocs` cap while iterating records one
/// `ExceededHeredocLimit`.
fn extract_heredocs(
    command: &str,
    limits: &ExtractionLimits,
    start_time: Instant,
    timeout: Duration,
    extracted: &mut Vec<ExtractedContent>,
    skip_reasons: &mut Vec<SkipReason>,
) {
    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
        return;
    }
    if extracted.len() >= limits.max_heredocs {
        // Already at the cap before looking at any heredoc.
        return;
    }

    let mut limit_reached = false;
    for captures in HEREDOC_EXTRACTOR.captures_iter(command) {
        if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
            return;
        }
        if extracted.len() >= limits.max_heredocs {
            limit_reached = true;
            break;
        }

        // Groups 2 and 3 are quoted delimiters, group 4 is a bare word.
        let quoted_delimiter = captures.get(2).or_else(|| captures.get(3));
        let (delimiter, quoted) = match (quoted_delimiter, captures.get(4)) {
            (Some(m), _) => (m.as_str(), true),
            (None, Some(m)) => (m.as_str(), false),
            (None, None) => continue,
        };
        let heredoc_type = match captures.get(1).map(|m| m.as_str()) {
            Some("-") => HeredocType::TabStripped,
            Some("~") => HeredocType::IndentStripped,
            _ => HeredocType::Standard,
        };

        let operator = captures.get(0).expect("capture group 0 always exists");
        let header_end = operator.end();
        // Offset of the first newline after the operator, or the end of the command.
        let body_search_start = match command[header_end..].find('\n') {
            Some(rel) => header_end.saturating_add(rel),
            None => command.len(),
        };

        let outcome = extract_heredoc_body(
            command,
            body_search_start,
            delimiter,
            heredoc_type,
            limits,
            start_time,
            timeout,
        );
        let (content, end_pos, body_start_abs, body_end_abs) = match outcome {
            Ok(parts) => parts,
            Err(reason) => {
                let timed_out = matches!(reason, SkipReason::Timeout { .. });
                skip_reasons.push(reason);
                if timed_out {
                    return;
                }
                continue;
            }
        };

        let (language, _confidence) = ScriptLanguage::detect(command, &content);
        let target_command = extract_heredoc_target_command(command, operator.start());
        extracted.push(ExtractedContent {
            content,
            language,
            delimiter: Some(delimiter.to_string()),
            byte_range: operator.start()..end_pos.min(command.len()),
            content_range: Some(body_start_abs..body_end_abs),
            quoted,
            heredoc_type: Some(heredoc_type),
            target_command,
        });
    }

    if limit_reached {
        skip_reasons.push(SkipReason::ExceededHeredocLimit {
            limit: limits.max_heredocs,
        });
    }
}
/// Finds the name of the command that receives the heredoc or here-string
/// whose operator starts at byte offset `heredoc_start`.
///
/// The text before the operator is tokenised backwards up to the nearest
/// shell separator, then walked left to right. Environment assignments,
/// options, wrapper commands and fully quoted tokens are skipped. Paths are
/// accepted only when they live in a standard bin directory or end in a known
/// command. Bare words with a file extension are skipped unless they are a
/// known command. Returns `None` when no candidate is left.
fn extract_heredoc_target_command(command: &str, heredoc_start: usize) -> Option<String> {
    const INTERPRETERS: [&str; 10] = [
        "bash", "sh", "zsh", "fish", "ksh", "dash", "python", "perl", "ruby", "node",
    ];
    const BIN_DIRS: [&str; 5] = [
        "/bin/",
        "/usr/bin/",
        "/usr/local/bin/",
        "/sbin/",
        "/usr/sbin/",
    ];
    /// True for names in the non-executing list or the interpreter list.
    fn is_known_command(name: &str) -> bool {
        NON_EXECUTING_HEREDOC_COMMANDS.contains(&name) || INTERPRETERS.contains(&name)
    }
    /// True when the token both starts and ends with the same quote character.
    fn is_fully_quoted(token: &str) -> bool {
        ['\'', '"']
            .into_iter()
            .any(|quote| token.starts_with(quote) && token.ends_with(quote))
    }

    if heredoc_start == 0 {
        return None;
    }
    let preceding = command[..heredoc_start].trim_end();
    if preceding.is_empty() {
        return None;
    }

    // `tokenize_backwards` yields the last token first; reverse to read left to right.
    for token in tokenize_backwards(preceding).into_iter().rev() {
        let skippable = is_shell_env_assignment(&token)
            || token.starts_with('-')
            || SHELL_WRAPPER_COMMANDS.contains(&token.as_str())
            || is_fully_quoted(&token);
        if skippable {
            continue;
        }

        if let Some(last_slash) = token.rfind('/') {
            let basename = &token[last_slash + 1..];
            let in_bin_dir = BIN_DIRS.iter().any(|dir| token.starts_with(*dir));
            if in_bin_dir || is_known_command(basename) {
                return Some(basename.to_string());
            }
            // Some other path, most likely a file argument.
            continue;
        }

        let has_extension = token.contains('.') && !token.starts_with('.');
        if has_extension && !is_known_command(&token) {
            continue;
        }
        return Some(token);
    }
    None
}
/// Returns `true` when `token` has the form `NAME=value`, where `NAME` is a
/// non-empty run of ASCII letters, digits and underscores that does not start
/// with a digit. The value part may be anything, including empty.
fn is_shell_env_assignment(token: &str) -> bool {
    let Some(equals_at) = token.find('=') else {
        return false;
    };
    match token.as_bytes()[..equals_at].split_first() {
        // Nothing before the `=`.
        None => false,
        Some((first, rest)) => {
            (first.is_ascii_alphabetic() || *first == b'_')
                && rest
                    .iter()
                    .all(|byte| byte.is_ascii_alphanumeric() || *byte == b'_')
        }
    }
}
/// Splits `s` into whitespace-separated tokens, walking from the end towards
/// the start, and returns them in that reverse order (last token first).
///
/// A token that ends in `'` or `"` extends back to the previous occurrence of
/// the same quote (or to the start of `s` if there is none), quotes included.
/// Tokenising stops at the first shell separator met at a token boundary:
/// `|`, `;`, `&`, `$`, `(` or `)`.
fn tokenize_backwards(s: &str) -> Vec<String> {
    /// Bytes that end the current command segment.
    const fn is_separator(byte: u8) -> bool {
        matches!(byte, b'|' | b';' | b'&' | b'$' | b'(' | b')')
    }

    let raw = s.as_bytes();
    let mut collected = Vec::new();
    let mut cursor = raw.len();
    loop {
        // Drop whitespace to the left of the cursor.
        cursor = raw[..cursor]
            .iter()
            .rposition(|byte| !byte.is_ascii_whitespace())
            .map_or(0, |idx| idx + 1);
        let Some(&last) = raw[..cursor].last() else {
            break;
        };
        let token_end = cursor;

        if last == b'\'' || last == b'"' {
            // Quoted token: start at the matching opening quote, or at 0 if unterminated.
            cursor = raw[..token_end - 1]
                .iter()
                .rposition(|&byte| byte == last)
                .unwrap_or(0);
            collected.push(s[cursor..token_end].to_string());
            continue;
        }
        if is_separator(last) {
            break;
        }

        // Plain word: starts just after the previous whitespace or separator byte.
        cursor = raw[..token_end]
            .iter()
            .rposition(|&byte| byte.is_ascii_whitespace() || is_separator(byte))
            .map_or(0, |idx| idx + 1);
        collected.push(s[cursor..token_end].to_string());
    }
    collected
}
/// Command names for which `mask_non_executing_heredocs` blanks out heredoc
/// and here-string content.
///
/// Matching is by exact name; `is_non_executing_heredoc_command` strips any
/// directory prefix before looking the name up here.
// NOTE(review): presumably these are commands that treat the heredoc purely as data rather than running it — confirm before adding entries.
const NON_EXECUTING_HEREDOC_COMMANDS: &[&str] = &[
"cat",
"tee",
"echo",
"printf",
"dd",
"head",
"tail",
"grep",
"egrep",
"fgrep",
"sed",
"awk",
"cut",
"sort",
"uniq",
"tr",
"wc",
"rev",
"nl",
"fold",
"fmt",
"expand",
"unexpand",
"column",
"paste",
"join",
"base64",
"xxd",
"od",
"hexdump",
"gzip",
"gunzip",
"bzip2",
"bunzip2",
"xz",
"lzma",
"zcat",
"bzcat",
"xzcat",
"nc",
"netcat",
"curl",
"wget",
"md5sum",
"sha1sum",
"sha256sum",
"sha512sum",
"cksum",
"diff",
"cmp",
"comm",
"mail",
"sendmail",
"read",
];
/// Wrapper words that `extract_heredoc_target_command` skips when looking for the target command.
const SHELL_WRAPPER_COMMANDS: &[&str] = &["sudo", "env", "command", "builtin", "nohup"];
/// Returns `true` when `cmd`, reduced to the part after its last `/`, is
/// listed in `NON_EXECUTING_HEREDOC_COMMANDS`.
#[must_use]
pub fn is_non_executing_heredoc_command(cmd: &str) -> bool {
    let basename = cmd.rfind('/').map_or(cmd, |slash| &cmd[slash + 1..]);
    NON_EXECUTING_HEREDOC_COMMANDS
        .iter()
        .any(|known| *known == basename)
}
/// Blanks out heredoc bodies and here-string contents that are fed to
/// commands listed in `NON_EXECUTING_HEREDOC_COMMANDS`.
///
/// - Here-strings (`<<<`) have their content replaced by the literal
///   `'MASKED'`.
/// - Heredocs (`<<`, `<<-`) have every body byte replaced by a space, while
///   `\n` and `\r` are kept so line structure and byte offsets are unchanged.
///   The terminator line is preserved verbatim.
///
/// Constructs whose target command is unknown or not in the list, and
/// heredocs whose delimiter or terminator cannot be found, are copied through
/// untouched. Returns `Cow::Borrowed(command)` when nothing was masked.
#[must_use]
pub fn mask_non_executing_heredocs(command: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;
    if !command.contains("<<") {
        return Cow::Borrowed(command);
    }
    // `result` stays empty until the first mask is applied. From then on it
    // holds everything up to `pos`, and unmasked spans are appended as they
    // are passed over.
    let mut result = String::new();
    let mut pos = 0;
    let bytes = command.as_bytes();
    while pos < command.len() {
        let Some(offset) = command[pos..].find("<<") else {
            if result.is_empty() {
                return Cow::Borrowed(command);
            }
            result.push_str(&command[pos..]);
            break;
        };
        let heredoc_start = pos + offset;

        // Here-string: `<<<`.
        if heredoc_start + 3 <= command.len() && bytes.get(heredoc_start + 2) == Some(&b'<') {
            let target_cmd = extract_heredoc_target_command(command, heredoc_start);
            let should_mask_herestring = target_cmd
                .as_ref()
                .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
            if should_mask_herestring {
                if let Some((content_start, content_end)) =
                    find_herestring_content_bounds(command, heredoc_start + 3)
                {
                    if result.is_empty() {
                        result = command[..content_start].to_string();
                    } else {
                        result.push_str(&command[pos..content_start]);
                    }
                    result.push_str("'MASKED'");
                    pos = content_end;
                    continue;
                }
            }
            if !result.is_empty() {
                result.push_str(&command[pos..heredoc_start + 3]);
            }
            pos = heredoc_start + 3;
            continue;
        }

        // Heredoc: `<<` or `<<-`.
        let target_cmd = extract_heredoc_target_command(command, heredoc_start);
        let should_mask = target_cmd
            .as_ref()
            .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
        if should_mask {
            let after_op = &command[heredoc_start + 2..];
            if let Some((delimiter, body_start_offset, heredoc_type)) =
                parse_heredoc_delimiter(after_op)
            {
                let body_start = heredoc_start + 2 + body_start_offset;
                if let Some(body_end) =
                    find_heredoc_terminator(command, body_start, &delimiter, heredoc_type)
                {
                    if result.is_empty() {
                        result = command[..body_start].to_string();
                    } else {
                        result.push_str(&command[pos..body_start]);
                    }
                    let body_slice = &command[body_start..body_end];
                    // `body_end` lies after the terminator line's own trailing
                    // newline (when it has one), so drop that newline before
                    // searching for the terminator line's start. Otherwise the
                    // terminator itself would be masked.
                    let without_final_newline =
                        body_slice.strip_suffix('\n').unwrap_or(body_slice);
                    let terminator_rel = without_final_newline
                        .rfind('\n')
                        .map_or(0, |idx| idx + 1);
                    let terminator_abs = body_start + terminator_rel;
                    let masked_body = mask_preserve_newlines(&command[body_start..terminator_abs]);
                    result.push_str(&masked_body);
                    result.push_str(&command[terminator_abs..body_end]);
                    pos = body_end;
                    continue;
                }
            }
        }

        // Not masked: copy the span through only once a result is being built.
        if !result.is_empty() {
            result.push_str(&command[pos..heredoc_start + 2]);
        }
        pos = heredoc_start + 2;
    }
    if result.is_empty() {
        Cow::Borrowed(command)
    } else {
        Cow::Owned(result)
    }
}
/// Returns a string of the same byte length as `input` in which every byte
/// except `\n` and `\r` has been replaced by an ASCII space.
///
/// Multi-byte characters therefore become one space per byte.
fn mask_preserve_newlines(input: &str) -> String {
    input
        .bytes()
        .map(|byte| match byte {
            b'\n' => '\n',
            b'\r' => '\r',
            _ => ' ',
        })
        .collect()
}
/// Parses the delimiter that follows a `<<` operator.
///
/// `after_op` is the text immediately after the two `<` characters. Leading
/// spaces and tabs are skipped, an optional `-` selects
/// `HeredocType::TabStripped`, and the delimiter may be double-quoted,
/// single-quoted or a bare word ending at whitespace, `;`, `&` or `|`.
///
/// Returns `(delimiter, body_offset, heredoc_type)`. `body_offset` is
/// relative to `after_op` and points just past the first newline after the
/// delimiter, or to the end of `after_op` if there is no newline. Returns
/// `None` when no delimiter is present or a quoted delimiter is unterminated.
fn parse_heredoc_delimiter(after_op: &str) -> Option<(String, usize, HeredocType)> {
    let rest = after_op.trim_start_matches([' ', '\t']);
    if rest.is_empty() {
        return None;
    }
    let leading_blank = after_op.len() - rest.len();

    let (heredoc_type, spec) = match rest.strip_prefix('-') {
        Some(tail) => (HeredocType::TabStripped, tail),
        None => (HeredocType::Standard, rest),
    };
    let dash_len = rest.len() - spec.len();

    // `consumed` counts the bytes of `spec` taken by the delimiter, quotes included.
    let (word, consumed) = match spec.chars().next() {
        Some(quote @ ('"' | '\'')) => {
            let inner = &spec[1..];
            let close = inner.find(quote)?;
            (&inner[..close], close + 2)
        }
        _ => {
            let stop = spec
                .find(|c: char| c.is_whitespace() || matches!(c, ';' | '&' | '|'))
                .unwrap_or(spec.len());
            if stop == 0 {
                return None;
            }
            (&spec[..stop], stop)
        }
    };

    let header_len = leading_blank + dash_len + consumed;
    let body_offset = match after_op[header_len..].find('\n') {
        Some(newline) => header_len + newline + 1,
        None => after_op.len(),
    };
    Some((word.to_string(), body_offset, heredoc_type))
}
/// Locates the end of the heredoc whose body begins at `body_start`.
///
/// Scans `command[body_start..]` line by line and returns the absolute byte
/// offset just past the first line that equals `delimiter` (the line's own
/// trailing newline, if any, is included in the offset). Leading tabs are
/// ignored for tab-stripped heredocs and all leading whitespace for
/// indent-stripped ones; trailing `\r`/`\n` never take part in the comparison.
///
/// Returns `None` when `body_start` is at or beyond the end of `command` or no
/// terminator line exists.
fn find_heredoc_terminator(
    command: &str,
    body_start: usize,
    delimiter: &str,
    heredoc_type: HeredocType,
) -> Option<usize> {
    if command.len() <= body_start {
        return None;
    }

    // Absolute offset of the end of the line currently being inspected.
    let mut consumed = body_start;
    command[body_start..]
        .split_inclusive('\n')
        .find_map(|raw_line| {
            consumed += raw_line.len();
            let candidate = match heredoc_type {
                HeredocType::TabStripped => raw_line.trim_start_matches('\t'),
                HeredocType::IndentStripped => raw_line.trim_start(),
                HeredocType::Standard | HeredocType::HereString => raw_line,
            };
            (candidate.trim_end_matches(['\n', '\r']) == delimiter).then_some(consumed)
        })
}
/// Finds the byte span of the word that follows a here-string operator.
///
/// `after_operator` is the offset in `command` just past the operator. Blanks
/// (ASCII whitespace other than `\n`) are skipped first. The result is an
/// absolute half-open range `(start, end)`:
///
/// * for a quoted word the range includes both quote characters; inside
///   double quotes a backslash skips the byte that follows it;
/// * for a bare word the range ends at ASCII whitespace or one of
///   `;`, `&`, `|`, `)`.
///
/// Returns `None` when nothing follows the operator on the same line, when a
/// quote is never closed, or when the bare word is empty.
fn find_herestring_content_bounds(command: &str, after_operator: usize) -> Option<(usize, usize)> {
    if after_operator >= command.len() {
        return None;
    }
    let tail = command[after_operator..].as_bytes();

    // Number of blank bytes before the word starts.
    let lead = tail
        .iter()
        .take_while(|&&b| b != b'\n' && b.is_ascii_whitespace())
        .count();
    let opener = *tail.get(lead)?;
    if opener == b'\n' {
        return None;
    }

    if opener == b'\'' || opener == b'"' {
        let honours_escapes = opener == b'"';
        let mut cursor = lead + 1;
        while let Some(&byte) = tail.get(cursor) {
            if byte == opener {
                return Some((after_operator + lead, after_operator + cursor + 1));
            }
            let escaped_pair = honours_escapes && byte == b'\\' && cursor + 1 < tail.len();
            cursor += if escaped_pair { 2 } else { 1 };
        }
        // The scan ran off the end without meeting a closing quote.
        return None;
    }

    let word_len = tail[lead..]
        .iter()
        .take_while(|&&b| !b.is_ascii_whitespace() && !matches!(b, b';' | b'&' | b'|' | b')'))
        .count();
    (word_len > 0).then(|| (after_operator + lead, after_operator + lead + word_len))
}
/// Extracts the body of a heredoc and the offsets that delimit it.
///
/// `start` is the offset in `command` at which scanning begins; one leading
/// `\n` at that position is skipped. Lines are read until one equals
/// `delimiter` (after removing a trailing `\r`, leading tabs for
/// [`HeredocType::TabStripped`], or all leading whitespace for
/// [`HeredocType::IndentStripped`]).
///
/// On success returns `(content, terminator_end, body_start, body_end)` where
/// all offsets are absolute byte offsets into `command`:
///
/// * `content` – body lines joined with `\n`, with leading tabs removed for
///   tab-stripped heredocs and the smallest indentation of any non-blank line
///   removed for indent-stripped ones;
/// * `terminator_end` – end of the terminator line, excluding its line break;
/// * `body_start` – first byte of the body;
/// * `body_end` – end of the body, excluding the line break (`\n` or `\r\n`)
///   that precedes the terminator line.
///
/// # Errors
///
/// * [`SkipReason::MalformedInput`] – `start` is past the end of `command` or
///   not on a character boundary;
/// * [`SkipReason::Timeout`] – `start_time.elapsed()` reached `timeout`;
/// * [`SkipReason::ExceededSizeLimit`] / [`SkipReason::ExceededLineLimit`] –
///   the body outgrew `limits`;
/// * [`SkipReason::UnterminatedHeredoc`] – no terminator line was found.
fn extract_heredoc_body(
    command: &str,
    start: usize,
    delimiter: &str,
    heredoc_type: HeredocType,
    limits: &ExtractionLimits,
    start_time: Instant,
    timeout: Duration,
) -> Result<(String, usize, usize, usize), SkipReason> {
    if start > command.len() {
        return Err(SkipReason::MalformedInput {
            reason: "heredoc start offset out of bounds".to_string(),
        });
    }
    // `get` instead of indexing: an offset inside a multi-byte character is
    // reported as malformed input rather than aborting the process.
    let Some(remaining) = command.get(start..) else {
        return Err(SkipReason::MalformedInput {
            reason: "heredoc start offset is not on a character boundary".to_string(),
        });
    };
    let body_start_offset = usize::from(remaining.starts_with('\n'));
    let body_start = &remaining[body_start_offset..];
    let body_start_abs = start + body_start_offset;

    let mut body_lines: Vec<&str> = Vec::new();
    let mut total_bytes: usize = 0;
    // Bytes of `body_start` consumed by the lines accepted so far.
    let mut cursor: usize = 0;

    for part in body_start.split_inclusive('\n') {
        if start_time.elapsed() >= timeout {
            let elapsed_ms = u64::try_from(start_time.elapsed().as_millis()).unwrap_or(u64::MAX);
            return Err(SkipReason::Timeout {
                elapsed_ms,
                budget_ms: limits.timeout_ms,
            });
        }

        // Drop the line break (LF or CRLF) before comparing with the delimiter.
        let line = part.strip_suffix('\n').unwrap_or(part);
        let line = line.strip_suffix('\r').unwrap_or(line);
        let trimmed = match heredoc_type {
            HeredocType::TabStripped => line.trim_start_matches('\t'),
            HeredocType::IndentStripped => line.trim_start(),
            HeredocType::Standard | HeredocType::HereString => line,
        };

        if trimmed == delimiter {
            let terminator_start = body_start_abs + cursor;
            let terminator_end = terminator_start + line.len();

            // Exclude the line break that separates the last body line from
            // the terminator; an empty body has nothing to exclude.
            let mut body_end_abs = terminator_start;
            if body_end_abs > body_start_abs {
                let bytes = command.as_bytes();
                if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\n') {
                    body_end_abs = body_end_abs.saturating_sub(1);
                    if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\r') {
                        body_end_abs = body_end_abs.saturating_sub(1);
                    }
                }
            }

            let content = match heredoc_type {
                HeredocType::TabStripped => body_lines
                    .iter()
                    .map(|l| l.trim_start_matches('\t'))
                    .collect::<Vec<_>>()
                    .join("\n"),
                HeredocType::IndentStripped => {
                    // Smallest indentation (in bytes) among non-blank lines.
                    let min_indent = body_lines
                        .iter()
                        .filter(|l| !l.trim().is_empty())
                        .map(|l| l.len() - l.trim_start().len())
                        .min()
                        .unwrap_or(0);
                    body_lines
                        .iter()
                        .map(|l| {
                            // `min_indent` is a byte count taken from another
                            // line, so it may exceed this line's length or
                            // fall inside a multi-byte whitespace character
                            // (e.g. NBSP). `get` yields `None` in both cases
                            // instead of panicking; then strip all leading
                            // whitespace.
                            l.get(min_indent..).unwrap_or_else(|| l.trim_start())
                        })
                        .collect::<Vec<_>>()
                        .join("\n")
                }
                HeredocType::Standard | HeredocType::HereString => body_lines.join("\n"),
            };
            return Ok((content, terminator_end, body_start_abs, body_end_abs));
        }

        // Limits are enforced on body lines only, never on the terminator.
        total_bytes = total_bytes.saturating_add(part.len());
        if total_bytes > limits.max_body_bytes {
            return Err(SkipReason::ExceededSizeLimit {
                actual: total_bytes,
                limit: limits.max_body_bytes,
            });
        }
        if body_lines.len() >= limits.max_body_lines {
            return Err(SkipReason::ExceededLineLimit {
                actual: body_lines.len() + 1,
                limit: limits.max_body_lines,
            });
        }
        body_lines.push(line);
        cursor = cursor.saturating_add(part.len());
    }

    Err(SkipReason::UnterminatedHeredoc {
        delimiter: delimiter.to_string(),
    })
}
use ast_grep_core::AstGrep;
use ast_grep_language::SupportLang;
/// A single `command` node found while walking the Bash syntax tree of a
/// piece of shell content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedShellCommand {
/// Source text of the command node.
pub text: String,
/// Start of the node's range; used as a byte index into the analyzed content.
pub start: usize,
/// End of the node's range.
pub end: usize,
/// 1-based line on which the command starts (newlines before `start`, plus one).
pub line_number: usize,
}
/// Parses `content` as Bash and returns every `command` node it contains.
///
/// Commands are reported in the order `collect_commands_recursive` visits
/// them. Content that is empty or whitespace-only yields an empty vector
/// without building a syntax tree. The elapsed time and the number of
/// commands found are emitted as a `debug!` event.
#[must_use]
#[instrument(skip(content), fields(content_len = content.len()))]
pub fn extract_shell_commands(content: &str) -> Vec<ExtractedShellCommand> {
    let mut found = Vec::new();

    if content.trim().is_empty() {
        trace!("extract_shell_commands: empty content");
        return found;
    }

    let timer = Instant::now();
    let tree = AstGrep::new(content, SupportLang::Bash);
    collect_commands_recursive(tree.root(), content, &mut found);

    debug!(
        elapsed_us = timer.elapsed().as_micros(),
        count = found.len(),
        "extract_shell_commands: AST analysis complete"
    );
    found
}
/// Appends every `command` node in the subtree rooted at `node` to `commands`.
///
/// The walk is pre-order: a node is recorded before its children are visited,
/// so nested commands follow the command that contains them. Nodes whose text
/// is empty or whitespace-only are not recorded. `content` is the text the
/// tree was built from; it is only used to derive the 1-based line number of
/// each command from the newlines preceding the node's start offset.
#[allow(clippy::needless_pass_by_value)]
fn collect_commands_recursive<D: ast_grep_core::Doc>(
    node: ast_grep_core::Node<'_, D>,
    content: &str,
    commands: &mut Vec<ExtractedShellCommand>,
) {
    if node.kind() == "command" {
        let span = node.range();
        let text = node.text().to_string();
        if !text.trim().is_empty() {
            let preceding_newlines = content[..span.start].matches('\n').count();
            commands.push(ExtractedShellCommand {
                text,
                start: span.start,
                end: span.end,
                line_number: preceding_newlines + 1,
            });
        }
    }

    node.children()
        .for_each(|child| collect_commands_recursive(child, content, commands));
}
#[cfg(test)]
mod tests {
use super::*;
#[allow(unused_imports)]
use proptest::prelude::*;
mod tier1_triggers {
use super::*;
#[test]
fn no_trigger_on_safe_commands() {
let safe_commands = [
"git status",
"ls -la",
"cargo build",
"npm install",
"docker ps",
"kubectl get pods",
"cat file.txt",
"echo hello",
"grep pattern file",
"find . -name '*.rs'",
];
for cmd in safe_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::NoTrigger,
"should not trigger on: {cmd}"
);
}
}
#[test]
fn triggers_on_heredoc_basic() {
let heredocs = [
"cat << EOF",
"cat <<EOF",
"cat << 'EOF'",
r#"cat << "EOF""#,
"cat <<- EOF", "mysql <<< 'query'", ];
for cmd in heredocs {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on heredoc: {cmd}"
);
}
}
#[test]
fn triggers_on_python_inline() {
let python_commands = [
"python -c 'import os'",
"python3 -c 'import os'",
"python -I -c 'import os'",
"python3 -I -c 'import os'",
"python -e 'print(1)'",
"python3 -e 'print(1)'",
];
for cmd in python_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on python inline: {cmd}"
);
}
}
#[test]
fn triggers_on_versioned_interpreters() {
let versioned_commands = [
"python3.11 -c 'import os'",
"python3.12.1 -c 'import os'",
"python3.9 -e 'print(1)'",
"ruby3.0 -e 'puts 1'",
"ruby3.2.1 -e 'exit'",
"perl5.36 -e 'print 1'",
"perl5.38.2 -E 'say 1'",
"node18 -e 'console.log(1)'",
"node20.1 -e 'console.log(1)'",
"nodejs18 -e 'console.log(1)'",
"nodejs20.10.0 -e 'test'",
];
for cmd in versioned_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on versioned interpreter: {cmd}"
);
}
}
#[test]
fn triggers_on_ruby_inline() {
let ruby_commands = ["ruby -e 'puts 1'", "ruby -w -e 'puts 1'", "irb -e 'exit'"];
for cmd in ruby_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on ruby inline: {cmd}"
);
}
}
#[test]
fn triggers_on_perl_inline() {
let perl_commands = [
"perl -e 'print 1'",
"perl -E 'say 1'", "perl -pi -e 'print 1'",
];
for cmd in perl_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on perl inline: {cmd}"
);
}
}
#[test]
fn triggers_on_node_inline() {
let node_commands = [
"node -e 'console.log(1)'",
"node -p 'process.version'",
"node -pe 'process.version'",
];
for cmd in node_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on node inline: {cmd}"
);
}
}
#[test]
fn triggers_on_shell_inline() {
let shell_commands = [
"bash -c 'echo hello'",
"bash -l -c 'echo hello'",
"bash -lc 'echo hello'",
"bash --noprofile --norc -c 'echo hello'",
"sh -c 'ls'",
"zsh -c 'pwd'",
"fish -c 'echo hello'",
];
for cmd in shell_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on shell inline: {cmd}"
);
}
}
#[test]
fn triggers_on_xargs() {
let xargs_commands = [
"find . -name '*.bak' | xargs rm",
"ls | xargs -I {} echo {}",
"cat files.txt | xargs -n1 process",
];
for cmd in xargs_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on xargs: {cmd}"
);
}
}
#[test]
fn triggers_on_piped_execution() {
let piped_commands = [
"echo 'print(1)' | python",
"cat script.py | python3",
"echo 'puts 1' | ruby",
"echo 'print 1' | perl",
"echo 'console.log(1)' | node",
"echo 'echo hello' | bash",
"echo 'ls' | sh",
];
for cmd in piped_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on piped execution: {cmd}"
);
}
}
#[test]
fn triggers_on_eval_exec() {
let eval_commands = [
r#"eval "dangerous code""#,
"eval 'dangerous code'",
r#"exec "command""#,
"exec 'command'",
];
for cmd in eval_commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::Triggered,
"should trigger on eval/exec: {cmd}"
);
}
}
#[test]
fn matched_triggers_returns_indices() {
let matches = matched_triggers("python -c 'test'");
assert!(!matches.is_empty(), "should have matches for python -c");
let no_matches = matched_triggers("git status");
assert!(
no_matches.is_empty(),
"should have no matches for git status"
);
}
#[test]
fn heredoc_syntax_inside_quoted_literals_does_not_trigger() {
let commands = [
r#"git commit -m "docs: example heredoc: cat <<EOF rm -rf / EOF""#,
r#"rg "<<EOF" README.md"#,
"echo 'cat <<EOF (docs only)'",
];
for cmd in commands {
assert_eq!(
check_triggers(cmd),
TriggerResult::NoTrigger,
"should not trigger on quoted literal heredoc syntax: {cmd}"
);
}
}
#[test]
fn heredoc_inside_command_substitution_with_outer_quotes_still_triggers() {
let cmd = "echo \"$(cat <<EOF\nrm -rf /\nEOF)\"";
assert_eq!(check_triggers(cmd), TriggerResult::Triggered);
}
}
mod tier2_extraction {
use super::*;
#[test]
fn extraction_limits_default() {
let limits = ExtractionLimits::default();
assert_eq!(limits.max_body_bytes, 1024 * 1024);
assert_eq!(limits.max_body_lines, 10_000);
assert_eq!(limits.max_heredocs, 10);
assert_eq!(limits.timeout_ms, 50);
}
#[test]
fn extracts_inline_script_single_quotes() {
let result = extract_content("python -c 'import os'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "import os");
assert_eq!(contents[0].language, ScriptLanguage::Python);
assert!(contents[0].quoted);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_inline_script_double_quotes() {
let result = extract_content(r#"bash -c "echo hello""#, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "echo hello");
assert_eq!(contents[0].language, ScriptLanguage::Bash);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_inline_script_with_intervening_flags() {
let result = extract_content("python -I -c 'import os'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "import os");
assert_eq!(contents[0].language, ScriptLanguage::Python);
assert!(contents[0].quoted);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_inline_script_with_combined_shell_flags() {
let result = extract_content("bash -lc 'echo hello'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "echo hello");
assert_eq!(contents[0].language, ScriptLanguage::Bash);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_inline_script_with_combined_node_flags() {
let result =
extract_content("node -pe 'process.version'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "process.version");
assert_eq!(contents[0].language, ScriptLanguage::JavaScript);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_inline_script_with_interleaved_perl_flags() {
let result = extract_content("perl -pi -e 'print 1'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "print 1");
assert_eq!(contents[0].language, ScriptLanguage::Perl);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_here_string() {
let result = extract_content("cat <<< 'hello world'", &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "hello world");
assert_eq!(contents[0].heredoc_type, Some(HeredocType::HereString));
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_heredoc_basic() {
let cmd = "cat << EOF\nline1\nline2\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "line1\nline2");
assert_eq!(contents[0].delimiter, Some("EOF".to_string()));
assert_eq!(contents[0].heredoc_type, Some(HeredocType::Standard));
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn extracts_heredoc_ignores_trailing_tokens_on_delimiter_line() {
let cmd = "python3 <<EOF | cat\nimport shutil\nshutil.rmtree('/tmp/test')\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].language, ScriptLanguage::Python);
assert_eq!(
contents[0].content,
"import shutil\nshutil.rmtree('/tmp/test')"
);
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn extracts_heredoc_with_crlf_line_endings() {
let cmd = "cat <<EOF\r\nline1\r\nEOF\r\n";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "line1");
assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn extracts_heredoc_tab_stripped() {
let cmd = "cat <<- EOF\n\tline1\n\tline2\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "line1\nline2");
assert_eq!(contents[0].heredoc_type, Some(HeredocType::TabStripped));
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_heredoc_indent_stripped() {
let cmd = "cat <<~ EOF\n line1\n line2\n EOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "line1\nline2");
assert_eq!(contents[0].heredoc_type, Some(HeredocType::IndentStripped));
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn extracts_heredoc_quoted_delimiter_sets_quoted_flag() {
let cmd = "cat << 'EOF'\nline1\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "line1");
assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
assert!(contents[0].quoted, "quoted delimiter must set quoted=true");
} else {
panic!("Expected Extracted result, got {result:?}");
}
let cmd = "cat << EOF\nline1\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert!(
!contents[0].quoted,
"unquoted delimiter must set quoted=false"
);
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn heredoc_language_detects_interpreter_prefixes() {
let cases = [
("python3 <<EOF\nprint('hello')\nEOF", ScriptLanguage::Python),
(
"node <<EOF\nconsole.log('hello');\nEOF",
ScriptLanguage::JavaScript,
),
("ruby <<EOF\nputs 'hello'\nEOF", ScriptLanguage::Ruby),
("perl <<EOF\nprint \"hello\";\nEOF", ScriptLanguage::Perl),
("bash <<EOF\necho hello\nEOF", ScriptLanguage::Bash),
];
for (cmd, expected) in cases {
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(
contents.len(),
1,
"expected one heredoc extraction for: {cmd}"
);
assert_eq!(
contents[0].language, expected,
"expected language {expected:?} for heredoc: {cmd}"
);
} else {
panic!("Expected Extracted result for heredoc: {cmd}, got {result:?}");
}
}
}
#[test]
fn heredoc_language_detects_shebang_when_command_unknown() {
let cmd = "cat <<EOF\n#!/usr/bin/env python3\nimport os\nprint('hi')\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].language, ScriptLanguage::Python);
} else {
panic!("Expected Extracted result, got {result:?}");
}
}
#[test]
fn extracts_empty_heredoc() {
let cmd = "cat << EOF\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "");
assert_eq!(contents[0].delimiter, Some("EOF".to_string()));
} else {
panic!("Expected Extracted result for empty heredoc, got {result:?}");
}
}
#[test]
fn heredoc_byte_range_is_correct() {
let cmd = "python << END\nprint(1)\nEND";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].language, ScriptLanguage::Python);
let range = &contents[0].byte_range;
let extracted_span = &cmd[range.clone()];
assert_eq!(extracted_span, "<< END\nprint(1)\nEND");
} else {
panic!("Expected Extracted result");
}
let cmd = "cat << EOF\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
let range = &contents[0].byte_range;
let extracted_span = &cmd[range.clone()];
assert_eq!(extracted_span, "<< EOF\nEOF");
} else {
panic!("Expected Extracted result");
}
let cmd = "cat << EOF\nline1\nline2\nEOF";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
let range = &contents[0].byte_range;
let extracted_span = &cmd[range.clone()];
assert_eq!(extracted_span, "<< EOF\nline1\nline2\nEOF");
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_here_string_with_nested_quotes() {
let result = extract_content(
r#"cat <<< 'hello "world" test'"#,
&ExtractionLimits::default(),
);
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, r#"hello "world" test"#);
assert!(contents[0].quoted);
} else {
panic!("Expected Extracted result");
}
let result = extract_content(
r#"cat <<< "hello 'world' test""#,
&ExtractionLimits::default(),
);
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 1);
assert_eq!(contents[0].content, "hello 'world' test");
assert!(contents[0].quoted);
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn from_command_does_not_false_positive() {
assert_eq!(
ScriptLanguage::from_command("shebang"),
ScriptLanguage::Unknown
);
assert_eq!(
ScriptLanguage::from_command("shell"),
ScriptLanguage::Unknown
);
assert_eq!(
ScriptLanguage::from_command("pythonic"),
ScriptLanguage::Unknown
);
assert_eq!(
ScriptLanguage::from_command("nodemon"),
ScriptLanguage::Unknown
);
assert_eq!(
ScriptLanguage::from_command("perldoc"),
ScriptLanguage::Unknown
);
assert_eq!(
ScriptLanguage::from_command("bashful"),
ScriptLanguage::Unknown
);
}
#[test]
fn from_command_matches_versioned_interpreters() {
assert_eq!(
ScriptLanguage::from_command("python3"),
ScriptLanguage::Python
);
assert_eq!(
ScriptLanguage::from_command("python3.11"),
ScriptLanguage::Python
);
assert_eq!(
ScriptLanguage::from_command("python3.11.4"),
ScriptLanguage::Python
);
assert_eq!(
ScriptLanguage::from_command("node18"),
ScriptLanguage::JavaScript
);
assert_eq!(ScriptLanguage::from_command("perl5"), ScriptLanguage::Perl);
}
#[test]
fn no_content_on_safe_command() {
let result = extract_content("git status", &ExtractionLimits::default());
assert!(matches!(result, ExtractionResult::NoContent));
}
#[test]
fn script_language_from_command() {
assert_eq!(
ScriptLanguage::from_command("python3"),
ScriptLanguage::Python
);
assert_eq!(ScriptLanguage::from_command("ruby"), ScriptLanguage::Ruby);
assert_eq!(ScriptLanguage::from_command("perl"), ScriptLanguage::Perl);
assert_eq!(
ScriptLanguage::from_command("node"),
ScriptLanguage::JavaScript
);
assert_eq!(ScriptLanguage::from_command("bash"), ScriptLanguage::Bash);
assert_eq!(
ScriptLanguage::from_command("unknown"),
ScriptLanguage::Unknown
);
}
#[test]
fn from_shebang_detects_direct_path() {
assert_eq!(
ScriptLanguage::from_shebang("#!/bin/bash\necho hello"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/python\nimport os"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/ruby\nputs 'hi'"),
Some(ScriptLanguage::Ruby)
);
}
#[test]
fn from_shebang_detects_env_path() {
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env python3\nimport sys"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env node\nconsole.log('hi')"),
Some(ScriptLanguage::JavaScript)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env perl\nprint 'hello'"),
Some(ScriptLanguage::Perl)
);
}
#[test]
fn from_shebang_returns_none_for_invalid() {
assert_eq!(ScriptLanguage::from_shebang("import os"), None);
assert_eq!(ScriptLanguage::from_shebang("#!\ncode"), None);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/unknown\ncode"),
None
);
}
#[test]
fn from_shebang_ignores_interpreter_flags() {
assert_eq!(
ScriptLanguage::from_shebang("#!/bin/bash -e\nset -x"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/bin/bash -ex\necho hello"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/python3 -u\nimport sys"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env python3 -u\nimport sys"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env bash -e\necho hi"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env ruby -w\nputs 'hi'"),
Some(ScriptLanguage::Ruby)
);
}
#[test]
fn from_shebang_handles_env_flags() {
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env -S python3 -u\nimport sys"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env -S bash -e\necho hi"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env -i python3\nimport os"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_shebang("#!/usr/bin/env -i -S perl -w\nuse strict;"),
Some(ScriptLanguage::Perl)
);
}
#[test]
fn from_content_detects_python() {
assert_eq!(
ScriptLanguage::from_content("import os\nos.remove('file')"),
Some(ScriptLanguage::Python)
);
assert_eq!(
ScriptLanguage::from_content("from pathlib import Path\nPath('x').unlink()"),
Some(ScriptLanguage::Python)
);
}
#[test]
fn from_content_detects_javascript() {
assert_eq!(
ScriptLanguage::from_content("const fs = require('fs');\nfs.rm('x');"),
Some(ScriptLanguage::JavaScript)
);
assert_eq!(
ScriptLanguage::from_content("let x = 5;\nconsole.log(x);"),
Some(ScriptLanguage::JavaScript)
);
}
#[test]
fn from_content_detects_typescript() {
assert_eq!(
ScriptLanguage::from_content("const x: string = 'hello';"),
Some(ScriptLanguage::TypeScript)
);
assert_eq!(
ScriptLanguage::from_content("interface User { name: string }"),
Some(ScriptLanguage::TypeScript)
);
}
#[test]
fn from_content_detects_ruby() {
assert_eq!(
ScriptLanguage::from_content("def hello\n puts 'hi'\nend"),
Some(ScriptLanguage::Ruby)
);
assert_eq!(
ScriptLanguage::from_content("require 'fileutils'\nFileUtils.rm_rf('x')\nend"),
Some(ScriptLanguage::Ruby)
);
}
#[test]
fn from_content_detects_perl() {
assert_eq!(
ScriptLanguage::from_content("use strict;\nmy $x = 5;"),
Some(ScriptLanguage::Perl)
);
assert_eq!(
ScriptLanguage::from_content("my @arr = (1,2,3);"),
Some(ScriptLanguage::Perl)
);
}
#[test]
fn from_content_detects_bash() {
assert_eq!(
ScriptLanguage::from_content("if [ -f file ]; then\n echo 'exists'\nfi"),
Some(ScriptLanguage::Bash)
);
assert_eq!(
ScriptLanguage::from_content("x=$((1+2))\necho ${x}"),
Some(ScriptLanguage::Bash)
);
}
#[test]
fn from_content_returns_none_for_unknown() {
assert_eq!(ScriptLanguage::from_content("hello world"), None);
assert_eq!(ScriptLanguage::from_content(""), None);
}
#[test]
fn detect_uses_command_prefix_first() {
let (lang, confidence) =
ScriptLanguage::detect("ruby -e 'code'", "#!/usr/bin/python\nimport os");
assert_eq!(lang, ScriptLanguage::Ruby);
assert_eq!(confidence, DetectionConfidence::CommandPrefix);
}
#[test]
fn detect_uses_shebang_second() {
let (lang, confidence) =
ScriptLanguage::detect("cat script.sh", "#!/bin/bash\necho hello");
assert_eq!(lang, ScriptLanguage::Bash);
assert_eq!(confidence, DetectionConfidence::Shebang);
}
#[test]
fn detect_uses_content_heuristics_third() {
let (lang, confidence) =
ScriptLanguage::detect("cat script", "import os\nos.remove('x')");
assert_eq!(lang, ScriptLanguage::Python);
assert_eq!(confidence, DetectionConfidence::ContentHeuristics);
}
#[test]
fn detect_returns_unknown_for_unrecognized() {
let (lang, confidence) = ScriptLanguage::detect("cat file.txt", "hello world");
assert_eq!(lang, ScriptLanguage::Unknown);
assert_eq!(confidence, DetectionConfidence::Unknown);
}
#[test]
fn detect_handles_env_prefix() {
let (lang, confidence) = ScriptLanguage::detect("env python3 -c 'code'", "");
assert_eq!(lang, ScriptLanguage::Python);
assert_eq!(confidence, DetectionConfidence::CommandPrefix);
}
#[test]
fn detect_handles_absolute_path() {
let (lang, confidence) = ScriptLanguage::detect("/usr/bin/python3 -c 'code'", "");
assert_eq!(lang, ScriptLanguage::Python);
assert_eq!(confidence, DetectionConfidence::CommandPrefix);
}
#[test]
fn detection_confidence_labels() {
assert_eq!(DetectionConfidence::CommandPrefix.label(), "command-prefix");
assert_eq!(DetectionConfidence::Shebang.label(), "shebang");
assert_eq!(
DetectionConfidence::ContentHeuristics.label(),
"content-heuristics"
);
assert_eq!(DetectionConfidence::Unknown.label(), "unknown");
}
#[test]
fn detection_confidence_reasons() {
assert!(
DetectionConfidence::CommandPrefix
.reason()
.contains("highest")
);
assert!(DetectionConfidence::Shebang.reason().contains("high"));
assert!(
DetectionConfidence::ContentHeuristics
.reason()
.contains("lower")
);
assert!(DetectionConfidence::Unknown.reason().contains("could not"));
}
#[test]
fn enforces_max_body_bytes() {
let large_content = "x".repeat(2_000_000); let cmd = format!("python -c '{large_content}'");
let limits = ExtractionLimits {
max_body_bytes: 1_000_000, ..Default::default()
};
let result = extract_content(&cmd, &limits);
match result {
ExtractionResult::Skipped(reasons) => {
assert!(
reasons
.iter()
.any(|r| matches!(r, SkipReason::ExceededSizeLimit { .. }))
);
}
ExtractionResult::NoContent
| ExtractionResult::Failed(_)
| ExtractionResult::Partial { .. } => {}
ExtractionResult::Extracted(contents) => {
for c in contents {
assert!(c.content.len() <= limits.max_body_bytes);
}
}
}
}
#[test]
fn extracts_multiple_inline_scripts() {
let cmd = "python -c 'code1' && ruby -e 'code2'";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 2);
assert_eq!(contents[0].content, "code1");
assert_eq!(contents[1].content, "code2");
} else {
panic!("Expected Extracted result");
}
}
#[test]
fn extracts_versioned_interpreter_scripts() {
let cmd = "python3.11 -c 'import os' && nodejs18 -e 'console.log(1)'";
let result = extract_content(cmd, &ExtractionLimits::default());
if let ExtractionResult::Extracted(contents) = result {
assert_eq!(contents.len(), 2, "should extract both scripts");
assert_eq!(contents[0].content, "import os");
assert_eq!(contents[0].language, ScriptLanguage::Python);
assert_eq!(contents[1].content, "console.log(1)");
assert_eq!(contents[1].language, ScriptLanguage::JavaScript);
} else {
panic!("Expected Extracted result for versioned interpreters, got {result:?}");
}
}
#[test]
fn skips_binary_content_with_null_bytes() {
let cmd = "python -c '\x00binary\x00content'";
if let Some(reason) = check_binary_content(cmd) {
assert!(
matches!(reason, SkipReason::BinaryContent { null_bytes, .. } if null_bytes > 0)
);
} else {
panic!("Expected binary content detection");
}
}
#[test]
fn skips_binary_content_high_non_printable() {
let binary_bytes: Vec<u8> = (0u8..50).chain(200u8..255).collect();
let binary_str = String::from_utf8_lossy(&binary_bytes);
if let Some(reason) = check_binary_content(&binary_str) {
assert!(matches!(reason, SkipReason::BinaryContent { .. }));
} else {
panic!("Expected binary content detection for high non-printable ratio");
}
}
#[test]
fn allows_normal_text_content() {
let normal_content = "import os\nprint('hello world')\nfor i in range(10): pass";
assert!(check_binary_content(normal_content).is_none());
}
#[test]
fn tracks_unterminated_heredoc() {
let cmd = "cat << EOF\nunterminated content without closing delimiter";
let result = extract_content(cmd, &ExtractionLimits::default());
match result {
ExtractionResult::Skipped(reasons) => {
assert!(
reasons
.iter()
.any(|r| matches!(r, SkipReason::UnterminatedHeredoc { .. })),
"should report UnterminatedHeredoc, not ExceededSizeLimit"
);
}
_ => panic!("Expected Skipped result for unterminated heredoc"),
}
}
#[test]
fn heredoc_body_line_limit_reports_exceeded_line_limit() {
let cmd = "cat << EOF\nline1\nline2\nline3\nEOF";
let limits = ExtractionLimits {
max_body_lines: 2,
..Default::default()
};
let result = extract_content(cmd, &limits);
match result {
ExtractionResult::Skipped(reasons) => {
assert!(
reasons
.iter()
.any(|r| matches!(r, SkipReason::ExceededLineLimit { .. })),
"should report ExceededLineLimit, not UnterminatedHeredoc"
);
}
_ => panic!("Expected Skipped result for line-limited heredoc, got {result:?}"),
}
}
/// With `timeout_ms: 0`, extraction must be skipped and the skip reasons must
/// include `SkipReason::Timeout`.
#[test]
fn extraction_timeout_is_enforced() {
    let cmd = "cat << EOF\nline1\nEOF";
    let limits = ExtractionLimits {
        timeout_ms: 0,
        ..Default::default()
    };
    let result = extract_content(cmd, &limits);

    // Guard clause: anything other than `Skipped` is an immediate failure.
    let ExtractionResult::Skipped(reasons) = &result else {
        panic!("Expected Skipped(timeout) result, got {result:?}");
    };
    let timed_out = reasons
        .iter()
        .any(|r| matches!(r, SkipReason::Timeout { .. }));
    assert!(timed_out, "should include a Timeout skip reason");
}
/// With three heredocs in the command and `max_heredocs: 2`, an `Extracted`
/// result must not contain more items than the limit.
///
/// Only the `Extracted` outcome is checked; any other outcome passes without
/// running an assertion.
#[test]
fn enforces_heredoc_limit() {
    let cmd = "cmd1 << A\na\nA && cmd2 << B\nb\nB && cmd3 << C\nc\nC";
    let limits = ExtractionLimits {
        max_heredocs: 2,
        ..Default::default()
    };
    if let ExtractionResult::Extracted(contents) = extract_content(cmd, &limits) {
        assert!(contents.len() <= limits.max_heredocs);
    }
}
/// Every `SkipReason` variant listed here must render to a non-empty string
/// through its `Display` implementation.
#[test]
fn skip_reason_display() {
    // A fixed-size array is enough: the list is only iterated, never grown.
    let reasons = [
        SkipReason::ExceededSizeLimit {
            actual: 2000,
            limit: 1000,
        },
        SkipReason::ExceededLineLimit {
            actual: 200,
            limit: 100,
        },
        SkipReason::ExceededHeredocLimit { limit: 10 },
        SkipReason::BinaryContent {
            null_bytes: 5,
            non_printable_ratio: 0.5,
        },
        SkipReason::Timeout {
            elapsed_ms: 60,
            budget_ms: 50,
        },
        SkipReason::UnterminatedHeredoc {
            delimiter: "EOF".to_string(),
        },
        SkipReason::MalformedInput {
            reason: "test".to_string(),
        },
    ];
    for reason in &reasons {
        let display = reason.to_string();
        // Name the offending variant so a failure points at the broken impl.
        assert!(
            !display.is_empty(),
            "Display should produce output for {reason:?}"
        );
    }
}
/// An empty command string must produce `ExtractionResult::NoContent`.
#[test]
fn empty_command_returns_no_content() {
    let limits = ExtractionLimits::default();
    let result = extract_content("", &limits);
    assert!(matches!(result, ExtractionResult::NoContent));
}
/// A command made only of spaces, tabs and newlines must produce
/// `ExtractionResult::NoContent`.
#[test]
fn whitespace_only_returns_no_content() {
    let limits = ExtractionLimits::default();
    let result = extract_content(" \t\n ", &limits);
    assert!(matches!(result, ExtractionResult::NoContent));
}
}
/// Tests for `extract_shell_commands`: which commands are extracted from a
/// shell snippet, and the text, line number and byte range reported for them.
mod shell_extraction {
    use super::*;

    /// A single simple command is returned verbatim and attributed to line 1.
    #[test]
    fn extracts_simple_command() {
        let commands = extract_shell_commands("ls -la");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "ls -la");
        assert_eq!(commands[0].line_number, 1);
    }

    /// `rm -rf` with a path is extracted as one command with its full text.
    #[test]
    fn extracts_rm_rf() {
        let commands = extract_shell_commands("rm -rf /tmp/test");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "rm -rf /tmp/test");
    }

    /// `git reset --hard` is extracted as one command with its full text.
    #[test]
    fn extracts_git_reset_hard() {
        let commands = extract_shell_commands("git reset --hard");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "git reset --hard");
    }

    /// `git clean -fd` is extracted as one command with its full text.
    #[test]
    fn extracts_git_clean_fd() {
        let commands = extract_shell_commands("git clean -fd");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "git clean -fd");
    }

    /// Both sides of a two-stage pipeline are extracted, in order.
    #[test]
    fn extracts_pipeline_both_sides() {
        let commands = extract_shell_commands("find . -name '*.bak' | xargs rm");
        assert_eq!(commands.len(), 2, "pipeline should extract both commands");
        assert!(commands[0].text.starts_with("find"));
        assert!(commands[1].text.contains("xargs"));
    }

    /// Both commands of an `&&` list are extracted.
    #[test]
    fn extracts_command_list() {
        let commands = extract_shell_commands("cd /tmp && rm -rf test");
        assert_eq!(commands.len(), 2, "command list should extract both");
    }

    /// A command nested in `$(...)` is extracted in addition to the outer one.
    #[test]
    fn extracts_command_substitution() {
        let commands = extract_shell_commands("echo $(rm -rf /tmp/test)");
        assert!(
            commands.len() >= 2,
            "should extract command inside substitution"
        );
        assert!(
            commands.iter().any(|c| c.text.contains("rm")),
            "should extract rm from command substitution"
        );
    }

    /// Commands inside a `( ... )` subshell are extracted.
    #[test]
    fn extracts_subshell_commands() {
        let commands = extract_shell_commands("(cd /tmp && rm -rf test)");
        assert!(commands.len() >= 2, "should extract commands from subshell");
    }

    /// Each command line of a multi-line script is extracted, including `rm`.
    #[test]
    fn extracts_multiline_script() {
        let script = r#"#!/bin/bash
set -e
cd /tmp
rm -rf test
echo "done""#;
        let commands = extract_shell_commands(script);
        assert!(
            commands.len() >= 4,
            "should extract all commands from multiline script"
        );
        assert!(
            commands.iter().any(|c| c.text.contains("rm")),
            "should extract rm"
        );
    }

    /// A multi-word command with combined flags is extracted as one command.
    #[test]
    fn extracts_docker_system_prune() {
        let commands = extract_shell_commands("docker system prune -af");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "docker system prune -af");
    }

    /// Line numbers are 1-based: the `rm` on the second line reports line 2.
    #[test]
    fn line_numbers_are_correct() {
        let script = "echo first\nrm -rf /tmp\necho last";
        let commands = extract_shell_commands(script);
        assert!(commands.len() >= 3);
        let rm_cmd = commands
            .iter()
            .find(|c| c.text.contains("rm"))
            .expect("rm command should be extracted");
        assert_eq!(rm_cmd.line_number, 2, "rm should be on line 2");
    }

    /// A line that is only a comment yields no commands, even if the comment
    /// text mentions a command.
    #[test]
    fn skips_comments() {
        let commands = extract_shell_commands("# rm -rf / would be bad");
        assert!(
            commands.is_empty(),
            "comment-only content should produce zero commands, got: {commands:?}"
        );
    }

    /// A quoted string passed to `echo` is data: only `echo` itself is extracted.
    #[test]
    fn echo_string_is_data_not_execution() {
        let commands = extract_shell_commands("echo 'rm -rf /'");
        // `assert_eq!` reports the actual count when this fails.
        assert_eq!(
            commands.len(),
            1,
            "should only extract echo, not the string content"
        );
        assert!(
            commands[0].text.starts_with("echo"),
            "extracted command should be echo"
        );
    }

    /// A `printf` format string is data: only `printf` itself is extracted.
    #[test]
    fn printf_string_is_data_not_execution() {
        let commands = extract_shell_commands(r#"printf "rm -rf %s" /tmp"#);
        // `assert_eq!` reports the actual count when this fails.
        assert_eq!(
            commands.len(),
            1,
            "should only extract printf, not the format string content"
        );
        assert!(commands[0].text.starts_with("printf"));
    }

    /// Empty input yields no commands.
    #[test]
    fn empty_content_returns_no_commands() {
        let commands = extract_shell_commands("");
        assert!(commands.is_empty());
    }

    /// Input made only of whitespace yields no commands.
    #[test]
    fn whitespace_only_returns_no_commands() {
        let commands = extract_shell_commands(" \n\t ");
        assert!(commands.is_empty());
    }

    /// Input consisting of a single comment yields no commands.
    #[test]
    fn comment_only_returns_no_commands() {
        let commands = extract_shell_commands("# This is just a comment");
        assert!(
            commands.is_empty(),
            "comment-only content should produce zero commands, got: {commands:?}"
        );
    }

    /// The command owning a heredoc is extracted, but lines of the heredoc body
    /// are not treated as commands.
    #[test]
    fn heredoc_delimiter_is_not_command() {
        let script = r"cat << EOF
some content
rm -rf / mentioned in text
EOF";
        let commands = extract_shell_commands(script);
        assert!(
            commands.iter().any(|c| c.text.starts_with("cat")),
            "should extract cat command"
        );
        // Entries that mention `rm` without also containing `cat` would have
        // come from the heredoc body rather than from the `cat` command.
        let rm_commands: Vec<_> = commands
            .iter()
            .filter(|c| c.text.contains("rm") && !c.text.contains("cat"))
            .collect();
        assert!(
            rm_commands.is_empty(),
            "heredoc body content must NOT be extracted as commands, but found: {rm_commands:?}"
        );
    }

    /// An `rm -rf` under `/tmp` is still extracted as a command.
    #[test]
    fn safe_tmp_cleanup_is_extracted() {
        let commands = extract_shell_commands("rm -rf /tmp/build_cache");
        assert_eq!(commands.len(), 1);
    }

    /// Every stage of a three-stage pipeline is extracted.
    #[test]
    fn handles_complex_pipeline() {
        let commands = extract_shell_commands("cat file | grep pattern | wc -l");
        assert_eq!(commands.len(), 3, "should extract all pipeline stages");
    }

    /// The trailing `&` of a background command is not part of its text.
    #[test]
    fn handles_background_command() {
        let commands = extract_shell_commands("long_process &");
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].text, "long_process");
    }

    /// Redirections do not split a command into several entries.
    #[test]
    fn handles_redirections() {
        let commands = extract_shell_commands("rm -rf /tmp/test > /dev/null 2>&1");
        assert_eq!(commands.len(), 1);
        assert!(commands[0].text.contains("rm"));
    }

    /// A command with a variable-expansion argument is extracted once.
    #[test]
    fn handles_variable_expansion_in_command() {
        let commands = extract_shell_commands("rm -rf $DIR");
        assert_eq!(commands.len(), 1);
        assert!(commands[0].text.contains("rm"));
    }

    /// Commands in both the `then` and `else` branches of an `if` are extracted.
    #[test]
    fn handles_if_then_else() {
        let script = r#"if [ -f /tmp/test ]; then
rm -rf /tmp/test
else
echo "not found"
fi"#;
        let commands = extract_shell_commands(script);
        assert!(
            commands.iter().any(|c| c.text.contains("rm")),
            "should extract rm from if body"
        );
        assert!(
            commands.iter().any(|c| c.text.contains("echo")),
            "should extract echo from else body"
        );
    }

    /// Commands in the body of a `for` loop are extracted.
    #[test]
    fn handles_for_loop() {
        let script = "for f in *.txt; do rm -f \"$f\"; done";
        let commands = extract_shell_commands(script);
        assert!(
            commands.iter().any(|c| c.text.contains("rm")),
            "should extract rm from for loop body"
        );
    }

    /// `start..end` is a byte range into the input that slices out exactly the
    /// command text.
    #[test]
    fn byte_ranges_are_correct() {
        let script = "echo hello";
        let commands = extract_shell_commands(script);
        assert_eq!(commands.len(), 1);
        assert_eq!(commands[0].start, 0);
        assert_eq!(commands[0].end, script.len());
        let extracted = &script[commands[0].start..commands[0].end];
        assert_eq!(extracted, "echo hello");
    }
}
proptest! {
    // Property: whenever Tier 2 (`extract_content`) extracts non-empty content
    // from a command, Tier 1 (`check_triggers`) must have triggered on that
    // same command. Inputs are arbitrary printable strings, heredocs with
    // unquoted and quoted delimiters, and `python -c` / `bash -c` / `node -e`
    // invocations whose body has its double quotes removed.
    #[test]
    fn tier1_is_superset_of_tier2_extraction(cmd in prop_oneof![
        "\\PC{0,2000}",
        "\\PC{0,400}".prop_map(|body| format!("cat <<EOF\n{body}\nEOF")),
        "\\PC{0,400}".prop_map(|body| format!("cat <<'EOF'\n{body}\nEOF")),
        "\\PC{0,400}".prop_map(|body| format!("python -c \"{}\"", body.replace('\"', ""))),
        "\\PC{0,400}".prop_map(|body| format!("bash -c \"{}\"", body.replace('\"', ""))),
        "\\PC{0,400}".prop_map(|body| format!("node -e \"{}\"", body.replace('\"', ""))),
    ]) {
        let limits = ExtractionLimits {
            max_body_bytes: 10_000,
            max_body_lines: 1_000,
            max_heredocs: 5,
            timeout_ms: 50,
        };
        // Only a non-empty `Extracted` result obliges Tier 1 to have triggered.
        let tier2_found_content = matches!(
            extract_content(&cmd, &limits),
            ExtractionResult::Extracted(ref contents) if !contents.is_empty()
        );
        if tier2_found_content {
            prop_assert_eq!(
                check_triggers(&cmd),
                TriggerResult::Triggered,
                "Tier 2 extracted but Tier 1 did not trigger for: {:?}",
                cmd
            );
        }
    }
}
/// A heredoc piped into `python` must be detected as `ScriptLanguage::Python`.
#[test]
fn detects_language_in_pipeline() {
    let cmd = "cat <<EOF | python";
    let content = "print('hello')";
    // Only the first element of the returned pair (the language) is checked.
    let detected = ScriptLanguage::detect(cmd, content).0;
    assert_eq!(detected, ScriptLanguage::Python);
}
/// The heredoc target is the command word, not one of its arguments, even when
/// an argument is itself an interpreter name such as `bash`.
#[test]
fn extract_heredoc_target_command_prefers_command_over_arguments() {
    // (command line, message if `<<` is missing, expected target command)
    let cases = [
        ("cat bash <<EOF\nrm -rf /\nEOF", "cat heredoc", "cat"),
        ("grep pattern . <<EOF\nrm -rf /\nEOF", "grep heredoc", "grep"),
    ];
    for (command, missing_operator_msg, expected_target) in cases {
        let operator_start = command.find("<<").expect(missing_operator_msg);
        let target = extract_heredoc_target_command(command, operator_start);
        assert_eq!(target.as_deref(), Some(expected_target));
    }
}
/// Leading `VAR=value` assignments and wrapper commands (`env -i`, `sudo`) are
/// skipped, and the target is reported without its directory (`/bin/cat`
/// yields `cat`).
#[test]
fn extract_heredoc_target_command_skips_assignments_and_wrappers() {
    let env_cmd = "FOO=1 env -i /bin/cat <<EOF\npayload\nEOF";
    let sudo_cmd = "sudo bash <<EOF\necho hi\nEOF";

    let env_start = env_cmd.find("<<").expect("env heredoc");
    let env_target = extract_heredoc_target_command(env_cmd, env_start);
    assert_eq!(env_target.as_deref(), Some("cat"));

    let sudo_start = sudo_cmd.find("<<").expect("sudo heredoc");
    let sudo_target = extract_heredoc_target_command(sudo_cmd, sudo_start);
    assert_eq!(sudo_target.as_deref(), Some("bash"));
}
}