destructive_command_guard/
heredoc.rs

1//! Two-tier heredoc and inline script detection.
2//!
3//! This module implements a tiered detection architecture for heredoc and inline
4//! script analysis, balancing performance with detection accuracy.
5//!
6//! # Architecture
7//!
8//! ```text
9//! Command Input
10//!      │
11//!      ▼
12//! ┌─────────────────┐
13//! │ Tier 1: Trigger │ ─── No match ──► ALLOW (fast path)
14//! │   (<100μs)      │
15//! └────────┬────────┘
16//!          │ Match
17//!          ▼
18//! ┌─────────────────┐
19//! │ Tier 2: Extract │ ─── Error/Timeout ──► ALLOW + warn
20//! │   (<1ms)        │
21//! └────────┬────────┘
22//!          │ Success
23//!          ▼
24//! ┌─────────────────┐
25//! │ Tier 3: AST     │ ─── No match ──► ALLOW
26//! │   (<5ms)        │ ─── Match ──► BLOCK
27//! └─────────────────┘
28//! ```
29//!
30//! # Tier 1: Trigger Detection
31//!
32//! Ultra-fast detection using [`RegexSet`] for parallel matching.
33//! Zero allocations on non-match path. MUST have zero false negatives.
34//!
35//! # Tier 2: Content Extraction
36//!
37//! Extracts heredoc/inline script content with bounded memory and time.
38//! Graceful degradation on malformed input.
39//!
40//! # Tier 3: AST Pattern Matching (future)
41//!
42//! Uses ast-grep-core for structural pattern matching.
43//! Language-specific patterns for destructive operations.
44
45use memchr::memchr;
46use regex::RegexSet;
47use std::sync::LazyLock;
48use std::time::{Duration, Instant};
49use tracing::{debug, instrument, trace, warn};
50
/// Tier 1 trigger patterns for heredoc and inline script detection.
///
/// These patterns are designed for maximum recall (zero false negatives).
/// False positives are acceptable - they just trigger Tier 2 analysis.
///
/// # Performance
///
/// Uses [`RegexSet`] for parallel matching in a single pass over the input.
/// Target latency: <10μs for non-matching, <100μs for matching.
///
/// # Relationship to the manual scanner
///
/// Heredoc operators written with `<<` (e.g. `<<EOF`) are not listed here; they
/// are detected by a small, quote-aware scanner so that obvious false positives
/// inside quoted literals (commit messages, search patterns, etc.) can be
/// suppressed without introducing false negatives for real shell syntax
/// (including `$()`/backtick substitutions). The here-string operator `<<<` is
/// the exception: it is listed below as a plain pattern and therefore triggers
/// on every occurrence, quoted or not.
///
/// The array length (14) is part of the type; adding or removing a pattern
/// requires updating it.
const HEREDOC_TRIGGER_PATTERNS: [&str; 14] = [
    // Tier 1 MUST have zero false negatives for Tier 2 extraction.
    //
    // Here-string operator (<<<).
    // Tier 2 extracts here-strings via context-free regex, so Tier 1 must
    // trigger on any occurrence of <<< (even inside quotes) to maintain the
    // superset invariant.  False positives are acceptable for Tier 1.
    r"<<<",
    // Inline interpreter execution. The interpreter patterns below
    // intentionally allow:
    // - interleaved flags (python -I -c, bash --norc -c)
    // - combined short-flag clusters (bash -lc, node -pe, perl -pi -e)
    // - Windows .exe extensions (python.exe, python3.11.exe, etc.)
    // - Attached quotes (python -c"...", bash -c'...')
    //
    // Python inline execution (matches python, python3, python3.11, python.exe, python3.11.exe, etc.)
    r#"\bpython[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ce][A-Za-z]*(?:\s|['"]|$)"#,
    // Ruby inline execution (matches ruby, ruby3, ruby3.0, ruby.exe, etc.)
    r#"\bruby[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // Interactive Ruby (irb) with an inline `-e`-style flag; same shape as the Ruby pattern.
    r#"\birb[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // Perl inline execution (matches perl, perl5, perl5.36, perl.exe, etc.)
    r#"\bperl[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[eE][A-Za-z]*(?:\s|['"]|$)"#,
    // Node.js inline execution (matches node, node18, nodejs, node.exe, etc.)
    r#"\bnode(?:js)?[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*[ep][A-Za-z]*(?:\s|['"]|$)"#,
    // PHP inline execution
    r#"\bphp[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*r[A-Za-z]*(?:\s|['"]|$)"#,
    // Lua inline execution
    r#"\blua[0-9.]*(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*e[A-Za-z]*(?:\s|['"]|$)"#,
    // Shell inline execution (sh -c, bash -c, zsh -c, fish -c, bash -lc, etc.)
    r#"\b(?:sh|bash|zsh|fish)(?:\.exe)?\b(?:\s+(?:--\S+|-[A-Za-z]+))*\s+-[A-Za-z]*c[A-Za-z]*(?:\s|['"]|$)"#,
    // PowerShell inline execution (powershell -Command '...', pwsh -c "...",
    // and Windows full-path forms like
    //   "C:\WINDOWS\System32\WindowsPowerShell\v1.0\powershell.exe" -Command '...'
    // which Codex emits as its Windows command_execution shape (#125)). The
    // `-Command` parameter (PowerShell abbreviates it to any prefix, e.g. `-c`,
    // `-com`, case-insensitively) runs an arbitrary inner shell command, so we
    // must descend into its body. `(?i)` makes the interpreter + flag
    // case-insensitive (Windows paths are case-insensitive). A possible closing
    // `"` of a quoted interpreter path is allowed before the flag. Tier 1 may
    // over-trigger; Tier 2 validates the actual flag.
    // Note that this pattern requires a quote character after the flag.
    r#"(?i)\b(?:powershell|pwsh)(?:\.exe)?["']?(?:\s+(?:-\S+))*\s+-c[a-z]*\s*['"]"#,
    // Piped execution to interpreters (versioned, with optional .exe)
    r"\|\s*(?:python[0-9.]*|ruby[0-9.]*|perl[0-9.]*|node(?:js)?[0-9.]*|php[0-9.]*|lua[0-9.]*|sh|bash)(?:\.exe)?\b",
    // Piped to xargs (can execute arbitrary commands)
    r"\|\s*xargs\s",
    // exec/eval followed by a quoted argument
    r#"\beval\s+['"]"#,
    r#"\bexec\s+['"]"#,
];
113
/// Pseudo-index used by [`matched_triggers`] to report a hit from the manual,
/// quote-aware heredoc scanner.
///
/// It equals the length of [`HEREDOC_TRIGGER_PATTERNS`], i.e. one past the last
/// valid regex pattern index, so it cannot collide with a real pattern index.
/// It must not be used to index into the pattern array.
const MANUAL_HEREDOC_TRIGGER_INDEX: usize = HEREDOC_TRIGGER_PATTERNS.len();

/// Compiled [`RegexSet`] over [`HEREDOC_TRIGGER_PATTERNS`], built on first access.
///
/// # Panics
///
/// Panics on first access if any pattern fails to compile.
static HEREDOC_TRIGGERS: LazyLock<RegexSet> = LazyLock::new(|| {
    RegexSet::new(HEREDOC_TRIGGER_PATTERNS).expect("heredoc trigger patterns should compile")
});
119
120#[inline]
121#[must_use]
122fn contains_active_heredoc_operator(command: &str) -> bool {
123    if memchr(b'<', command.as_bytes()).is_none() {
124        return false;
125    }
126    contains_active_heredoc_operator_recursive(command, 0, 0)
127}
128
/// Maximum substitution nesting the heredoc scanners follow before giving up.
///
/// Once exceeded, every scanner reports a match: Tier 1 must never produce a
/// false negative, and a spurious trigger only costs a Tier 2 pass.
const HEREDOC_SCAN_MAX_DEPTH: usize = 500;

/// Returns `true` when `bytes` contains `<<` anywhere, ignoring all quoting.
fn heredoc_scan_has_raw_operator(bytes: &[u8]) -> bool {
    bytes.windows(2).any(|pair| pair[0] == b'<' && pair[1] == b'<')
}

/// Skips a plain single-quoted segment whose opening quote is at `open`.
///
/// Returns the index just past the closing quote, or `bytes.len()` when the
/// quote is never closed. No escapes are honoured inside the segment.
fn heredoc_scan_skip_single_quoted(bytes: &[u8], open: usize) -> usize {
    let body = (open + 1).min(bytes.len());
    bytes[body..]
        .iter()
        .position(|&b| b == b'\'')
        .map_or(bytes.len(), |offset| body + offset + 1)
}

/// Locates the end of a `$'...'` segment whose `$` is at `dollar`.
///
/// Shells disagree about this syntax: bash/zsh treat it as ANSI-C quoting, where
/// `\'` is an escaped quote, while a strictly POSIX shell sees a literal `$`
/// followed by an ordinary single-quoted string that ends at the first `'`.
///
/// Returns `Some(next)` when both readings end the segment at the same place
/// (`next` is the index just past the closing quote, or `bytes.len()` if the
/// segment is unterminated). Returns `None` when the segment contains `\'`, i.e.
/// the two readings diverge and everything after it is ambiguous.
fn heredoc_scan_ansi_c_quote_end(bytes: &[u8], dollar: usize) -> Option<usize> {
    let len = bytes.len();
    let mut j = dollar + 2;
    while j < len {
        match bytes[j] {
            b'\'' => return Some(j + 1),
            b'\\' if bytes.get(j + 1) == Some(&b'\'') => return None,
            b'\\' => j += 2,
            _ => j += 1,
        }
    }
    Some(len)
}

/// Scans `command` from byte offset `start` for an active `<<` operator.
///
/// "Active" means outside single quotes and outside the literal part of double
/// quotes; command substitutions (`$(...)` and backticks) are descended into,
/// even when they appear inside double quotes. Backslash escapes skip the
/// escaped byte.
///
/// `$'...'` segments are handled conservatively: if the segment contains `\'`
/// (where ANSI-C and POSIX quoting disagree about where the string ends), the
/// function triggers whenever any `<<` appears in the remainder of the command.
///
/// Returns `true` on a match, and also when `recursion_depth` exceeds
/// [`HEREDOC_SCAN_MAX_DEPTH`] (conservative trigger on pathological nesting).
#[must_use]
fn contains_active_heredoc_operator_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> bool {
    // Prevent stack overflow on pathological input; over-triggering is acceptable.
    if recursion_depth > HEREDOC_SCAN_MAX_DEPTH {
        return true;
    }

    let bytes = command.as_bytes();
    let len = bytes.len();
    let mut i = start.min(len);

    while i < len {
        match bytes[i] {
            // Active shell heredoc/here-string operator.
            b'<' if i + 1 < len && bytes[i + 1] == b'<' => return true,
            b'\\' => {
                if i + 2 < len && bytes[i + 1] == b'\r' && bytes[i + 2] == b'\n' {
                    // Escaped CRLF line continuation consumes 3 bytes: \, \r, \n.
                    i += 3;
                } else {
                    // Skip the escaped byte. For a multi-byte UTF-8 character only
                    // its first byte is skipped; the continuation bytes never match
                    // any ASCII arm below, so this is harmless.
                    i = (i + 2).min(len);
                }
            }
            // Single-quoted segment (no escapes, no substitutions).
            b'\'' => i = heredoc_scan_skip_single_quoted(bytes, i),
            b'"' => {
                // Double-quoted segment: literal `<<` inside is ignored, but nested
                // `$()`/backticks are still scanned.
                let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
                if found {
                    return true;
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
                let (found, next) =
                    scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return true;
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'\'' => {
                match heredoc_scan_ansi_c_quote_end(bytes, i) {
                    Some(next) => i = next,
                    // Quote boundaries are ambiguous from here on: trigger on any `<<`.
                    None => return heredoc_scan_has_raw_operator(&bytes[i..]),
                }
            }
            b'`' => {
                let (found, next) =
                    scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return true;
                }
                i = next;
            }
            _ => i += 1,
        }
    }

    false
}

/// Scans the inside of a double-quoted string starting at `start` (the byte
/// just after the opening `"`).
///
/// Literal `<<` and single quotes inside the string are ignored; `$(...)` and
/// backtick substitutions are descended into.
///
/// Returns `(found, next)`: `found` is `true` when a nested substitution
/// contains an active `<<` (or the depth limit was exceeded); `next` is the
/// index at which the caller should resume, i.e. just past the closing `"`, or
/// `command.len()` when the string is unterminated.
#[must_use]
fn scan_double_quotes_for_heredoc(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    if recursion_depth > HEREDOC_SCAN_MAX_DEPTH {
        return (true, command.len());
    }

    let bytes = command.as_bytes();
    let len = bytes.len();
    let mut i = start.min(len);

    while i < len {
        match bytes[i] {
            b'"' => return (false, i + 1),
            b'\\' => i = (i + 2).min(len),
            b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
                let (found, next) =
                    scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return (true, next);
                }
                i = next;
            }
            b'`' => {
                let (found, next) =
                    scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return (true, next);
                }
                i = next;
            }
            _ => i += 1,
        }
    }

    (false, len)
}

/// Scans a `$(...)` command substitution whose `$` is at `start`.
///
/// Parentheses are balanced with a depth counter so nested groups do not end
/// the substitution early. Quotes, escapes and nested substitutions are handled
/// as at top level, including the conservative `$'...'` rule.
///
/// Returns `(found, next)`: `found` is `true` when an active `<<` was seen (or
/// the depth limit was exceeded); `next` is the resume index, i.e. just past
/// the matching `)`, or `command.len()` when the substitution is unterminated.
#[must_use]
fn scan_dollar_paren_for_heredoc_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    // Prevent stack overflow on pathological input.
    if recursion_depth > HEREDOC_SCAN_MAX_DEPTH {
        return (true, command.len());
    }

    let bytes = command.as_bytes();
    let len = bytes.len();

    debug_assert_eq!(bytes.get(start), Some(&b'$'));
    debug_assert_eq!(bytes.get(start + 1), Some(&b'('));

    let mut i = start + 2;
    // Number of currently open parentheses, counting the `$(` itself.
    let mut depth: u32 = 1;

    while i < len {
        match bytes[i] {
            b'<' if i + 1 < len && bytes[i + 1] == b'<' => return (true, i + 2),
            b'(' => {
                depth += 1;
                i += 1;
            }
            b')' => {
                if depth == 1 {
                    // End of command substitution.
                    return (false, i + 1);
                }
                depth = depth.saturating_sub(1);
                i += 1;
            }
            b'\\' => i = (i + 2).min(len),
            b'\'' => i = heredoc_scan_skip_single_quoted(bytes, i),
            b'"' => {
                let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
                if found {
                    return (true, next);
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
                let (found, next) =
                    scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return (true, next);
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'\'' => {
                match heredoc_scan_ansi_c_quote_end(bytes, i) {
                    Some(next) => i = next,
                    // Ambiguous quoting: trigger on any `<<` in the remainder.
                    None => return (heredoc_scan_has_raw_operator(&bytes[i..]), len),
                }
            }
            b'`' => {
                let (found, next) =
                    scan_backticks_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return (true, next);
                }
                i = next;
            }
            _ => i += 1,
        }
    }

    (false, len)
}

/// Scans a backtick command substitution whose opening backtick is at `start`.
///
/// Quotes, escapes and `$(...)` are handled as at top level, including the
/// conservative `$'...'` rule. An unescaped backtick ends the substitution.
///
/// Returns `(found, next)`: `found` is `true` when an active `<<` was seen (or
/// the depth limit was exceeded); `next` is the resume index, i.e. just past
/// the closing backtick, or `command.len()` when it is unterminated.
#[must_use]
fn scan_backticks_for_heredoc_recursive(
    command: &str,
    start: usize,
    recursion_depth: usize,
) -> (bool, usize) {
    if recursion_depth > HEREDOC_SCAN_MAX_DEPTH {
        return (true, command.len());
    }

    let bytes = command.as_bytes();
    let len = bytes.len();

    debug_assert_eq!(bytes.get(start), Some(&b'`'));

    let mut i = start + 1;
    while i < len {
        match bytes[i] {
            b'<' if i + 1 < len && bytes[i + 1] == b'<' => return (true, i + 2),
            b'\\' => i = (i + 2).min(len),
            b'\'' => i = heredoc_scan_skip_single_quoted(bytes, i),
            b'"' => {
                let (found, next) = scan_double_quotes_for_heredoc(command, i + 1, recursion_depth);
                if found {
                    return (true, next);
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'(' => {
                let (found, next) =
                    scan_dollar_paren_for_heredoc_recursive(command, i, recursion_depth + 1);
                if found {
                    return (true, next);
                }
                i = next;
            }
            b'$' if i + 1 < len && bytes[i + 1] == b'\'' => {
                match heredoc_scan_ansi_c_quote_end(bytes, i) {
                    Some(next) => i = next,
                    // Ambiguous quoting: trigger on any `<<` in the remainder.
                    None => return (heredoc_scan_has_raw_operator(&bytes[i..]), len),
                }
            }
            // Closing backtick.
            b'`' => return (false, i + 1),
            _ => i += 1,
        }
    }

    (false, len)
}
391
/// Result of Tier 1 trigger detection, as returned by [`check_triggers`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TriggerResult {
    /// No heredoc/inline script indicators found - fast path to ALLOW.
    ///
    /// Neither the quote-aware `<<` scanner nor any regex trigger pattern matched.
    NoTrigger,
    /// Trigger detected - proceed to Tier 2 extraction.
    ///
    /// This only means an indicator was seen; it is not a verdict on the command.
    Triggered,
}
400
401/// Check if a command contains heredoc or inline script indicators.
402///
403/// This is Tier 1 of the detection pipeline - ultra-fast screening.
404///
405/// # Guarantees
406///
407/// - Zero false negatives: if Tier 2 would find a heredoc, this MUST trigger
408/// - Zero allocations on non-match path
409/// - Target latency: <10μs for non-matching commands
410///
411/// # Examples
412///
413/// ```ignore
414/// use destructive_command_guard::heredoc::{check_triggers, TriggerResult};
415///
416/// // No trigger - fast path
417/// assert_eq!(check_triggers("git status"), TriggerResult::NoTrigger);
418///
419/// // Heredoc trigger
420/// assert_eq!(check_triggers("cat << EOF"), TriggerResult::Triggered);
421///
422/// // Python inline execution
423/// assert_eq!(check_triggers("python -c 'import os'"), TriggerResult::Triggered);
424/// ```
425#[inline]
426#[must_use]
427#[instrument(skip(command), fields(cmd_len = command.len()))]
428pub fn check_triggers(command: &str) -> TriggerResult {
429    if contains_active_heredoc_operator(command) || HEREDOC_TRIGGERS.is_match(command) {
430        debug!("tier1_trigger: heredoc/inline script indicator detected");
431        TriggerResult::Triggered
432    } else {
433        trace!("tier1_no_trigger: fast path allow");
434        TriggerResult::NoTrigger
435    }
436}
437
438/// Returns the list of trigger pattern indices that matched.
439///
440/// Useful for debugging and logging which patterns triggered.
441#[must_use]
442pub fn matched_triggers(command: &str) -> Vec<usize> {
443    let mut matches: Vec<usize> = HEREDOC_TRIGGERS.matches(command).into_iter().collect();
444    if contains_active_heredoc_operator(command) {
445        matches.push(MANUAL_HEREDOC_TRIGGER_INDEX);
446    }
447    matches
448}
449
450// ============================================================================
451// Tier 2: Content Extraction
452// ============================================================================
453
454use regex::Regex;
455
/// Resource bounds applied during content extraction so that hostile or
/// malformed input cannot exhaust memory or time.
///
/// [`Default`] yields: 1 MiB of body, 10,000 body lines, 10 heredocs per
/// command, and a 50 ms timeout.
#[derive(Debug, Clone, Copy)]
pub struct ExtractionLimits {
    /// Upper bound on bytes extracted from one heredoc body (default: 1MB).
    pub max_body_bytes: usize,
    /// Upper bound on lines extracted from one heredoc body (default: 10,000).
    pub max_body_lines: usize,
    /// Upper bound on heredocs processed per command (default: 10).
    pub max_heredocs: usize,
    /// Extraction timeout in milliseconds (default: 50ms).
    pub timeout_ms: u64,
}

impl Default for ExtractionLimits {
    /// Builds the default limits described on the struct.
    fn default() -> Self {
        const ONE_MIB: usize = 1 << 20;
        const DEFAULT_MAX_LINES: usize = 10_000;
        const DEFAULT_MAX_HEREDOCS: usize = 10;
        const DEFAULT_TIMEOUT_MS: u64 = 50;

        Self {
            max_body_bytes: ONE_MIB,
            max_body_lines: DEFAULT_MAX_LINES,
            max_heredocs: DEFAULT_MAX_HEREDOCS,
            timeout_ms: DEFAULT_TIMEOUT_MS,
        }
    }
}
479
/// Detected language for embedded script content.
///
/// The interpreter names listed per variant are the ones `from_command` maps to
/// that variant (each optionally followed by a version suffix or `.exe`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ScriptLanguage {
    /// Shell code: `sh`, `bash`, `zsh`, `fish`, and also `powershell`/`pwsh`,
    /// whose inner commands are re-checked as shell commands.
    Bash,
    /// Go: the `go` command.
    Go,
    /// PHP: the `php` command.
    Php,
    /// Python: the `python` command.
    Python,
    /// Ruby: the `ruby` and `irb` commands.
    Ruby,
    /// Perl: the `perl` command.
    Perl,
    /// JavaScript: the `node` and `nodejs` commands.
    JavaScript,
    /// TypeScript: the `deno` and `bun` commands.
    TypeScript,
    /// No language could be determined.
    Unknown,
}
493
494impl ScriptLanguage {
495    /// Infer language from a command prefix (e.g., "python", "python3", "python3.11").
496    ///
497    /// Matches exact command names or names with version suffixes (e.g., "python3.11").
498    /// Also handles Windows .exe extensions (e.g., "python.exe", "python3.11.exe").
499    /// Does NOT match arbitrary words that start with a command name (e.g., "shebang" ≠ "sh").
500    #[must_use]
501    pub fn from_command(cmd: &str) -> Self {
502        let cmd_lower = cmd.to_lowercase();
503        // Strip Windows .exe extension if present
504        let cmd_base = cmd_lower.strip_suffix(".exe").unwrap_or(&cmd_lower);
505
506        // Helper: check if cmd matches base name, optionally followed by version digits/dots
507        // e.g., "python" matches "python", "python3", "python3.11"
508        // but "python" does NOT match "pythonic" or "python_helper"
509        let matches_interpreter = |base: &str| -> bool {
510            if cmd_base == base {
511                return true;
512            }
513            // Allow version suffixes: digits and dots (e.g., "3", "3.11", "3.11.4")
514            cmd_base.strip_prefix(base).is_some_and(|suffix| {
515                !suffix.is_empty()
516                    && suffix.chars().all(|c| c.is_ascii_digit() || c == '.')
517                    && suffix.chars().next().is_some_and(|c| c.is_ascii_digit())
518            })
519        };
520
521        if matches_interpreter("python") {
522            Self::Python
523        } else if matches_interpreter("ruby") || matches_interpreter("irb") {
524            Self::Ruby
525        } else if matches_interpreter("perl") {
526            Self::Perl
527        } else if matches_interpreter("node") || matches_interpreter("nodejs") {
528            Self::JavaScript
529        } else if matches_interpreter("deno") || matches_interpreter("bun") {
530            Self::TypeScript
531        } else if matches_interpreter("php") {
532            Self::Php
533        } else if matches_interpreter("go") {
534            // Note: Go doesn't typically use version suffixes in command names
535            Self::Go
536        } else if matches_interpreter("sh")
537            || matches_interpreter("bash")
538            || matches_interpreter("zsh")
539            || matches_interpreter("fish")
540            // PowerShell (`powershell`, `powershell.exe`, `pwsh`) running an
541            // inner command via `-Command`/`-c`. We re-check the body as a
542            // shell command: destructive command names (git, rm, etc.) are
543            // identical across PowerShell and POSIX shells, so Bash-style
544            // re-evaluation surfaces the same rules. This is what lets dcg
545            // descend into Codex's Windows `powershell.exe -Command '...'`
546            // command shape (#125).
547            || matches_interpreter("powershell")
548            || matches_interpreter("pwsh")
549        {
550            Self::Bash
551        } else {
552            Self::Unknown
553        }
554    }
555
556    /// Infer language from a shebang line (e.g., `#!/usr/bin/env python3`).
557    ///
558    /// Parses both direct interpreter paths (`#!/bin/bash`) and env-based shebangs
559    /// (`#!/usr/bin/env python3`).
560    ///
561    /// Returns `None` if no valid shebang is found.
562    #[must_use]
563    pub fn from_shebang(content: &str) -> Option<Self> {
564        let first_line = content.lines().next()?;
565
566        // Shebang must start with #!
567        let shebang = first_line.strip_prefix("#!")?;
568        let shebang = shebang.trim();
569
570        if shebang.is_empty() {
571            return None;
572        }
573
574        // Extract interpreter: handle both direct paths and env-style shebangs
575        // Examples:
576        //   #!/bin/bash              -> bash
577        //   #!/bin/bash -e           -> bash (ignores flags)
578        //   #!/usr/bin/env python3   -> python3
579        //   #!/usr/bin/env python3 -u -> python3 (ignores flags)
580        //   #!/usr/bin/env -S python3 -u -> python3 (skips env flags)
581        //   #!/usr/bin/python        -> python
582        let mut parts = shebang.split_whitespace();
583        let first = parts.next()?;
584        let basename = first.rsplit('/').next().unwrap_or(first);
585
586        // If it's "env", skip any flags (starting with -) to find the interpreter
587        let interpreter = if basename == "env" {
588            // Skip env flags like -S, -i, -u, etc.
589            loop {
590                let next = parts.next()?;
591                if !next.starts_with('-') {
592                    break next.rsplit('/').next().unwrap_or(next);
593                }
594            }
595        } else {
596            basename
597        };
598
599        // Use existing from_command logic to map interpreter to language
600        let lang = Self::from_command(interpreter);
601        if lang == Self::Unknown {
602            None
603        } else {
604            Some(lang)
605        }
606    }
607
608    /// Infer language from content heuristics (fallback detection).
609    ///
610    /// Examines the first few lines for language-specific patterns like
611    /// import statements, requires, or function definitions.
612    ///
613    /// This is a low-confidence detection method used only when command
614    /// prefix and shebang detection fail.
615    ///
616    /// Returns `None` if no recognizable patterns are found.
617    #[must_use]
618    pub fn from_content(content: &str) -> Option<Self> {
619        // Only examine first 20 lines to bound heuristic cost
620        let lines: Vec<&str> = content.lines().take(20).collect();
621
622        // Python indicators (high confidence)
623        let has_python_import = lines.iter().any(|l| {
624            let trimmed = l.trim();
625            trimmed.starts_with("import ") || trimmed.starts_with("from ")
626        });
627        if has_python_import {
628            return Some(Self::Python);
629        }
630
631        // TypeScript indicators (check BEFORE JavaScript since TS is a superset)
632        // TypeScript-specific patterns that distinguish it from plain JS
633        let has_typescript_patterns = lines.iter().any(|l| {
634            let trimmed = l.trim();
635            trimmed.contains(": string")
636                || trimmed.contains(": number")
637                || trimmed.contains(": boolean")
638                || trimmed.contains("interface ")
639                || trimmed.starts_with("type ")
640        });
641        if has_typescript_patterns {
642            return Some(Self::TypeScript);
643        }
644
645        // JavaScript/Node indicators
646        let has_js_patterns = lines.iter().any(|l| {
647            let trimmed = l.trim();
648            trimmed.contains("require(")
649                || trimmed.starts_with("const ")
650                || trimmed.starts_with("let ")
651                || trimmed.starts_with("var ")
652                || trimmed.contains("module.exports")
653        });
654        if has_js_patterns {
655            return Some(Self::JavaScript);
656        }
657
658        // Ruby indicators
659        let has_ruby_patterns = lines.iter().any(|l| {
660            let trimmed = l.trim();
661            trimmed.starts_with("def ")
662                || trimmed.starts_with("class ")
663                || trimmed.starts_with("require ")
664                || trimmed.starts_with("require_relative ")
665                || trimmed.contains(".each do")
666                || trimmed.contains(" do |")
667        });
668        // Ruby also needs "end" somewhere to reduce false positives
669        let has_end = content.contains("\nend") || content.ends_with("end");
670        if has_ruby_patterns && has_end {
671            return Some(Self::Ruby);
672        }
673
674        // Go indicators (high confidence)
675        // Go has distinctive patterns: package declaration, func, :=, import with quotes
676        let has_go_patterns = lines.iter().any(|l| {
677            let trimmed = l.trim();
678            trimmed.starts_with("package ")
679                || trimmed.starts_with("func ")
680                || trimmed.contains(":=")
681                || (trimmed.starts_with("import ") && trimmed.contains('"'))
682                || trimmed == "import ("
683        });
684        if has_go_patterns {
685            return Some(Self::Go);
686        }
687
688        // Perl indicators
689        let has_perl_patterns = lines.iter().any(|l| {
690            let trimmed = l.trim();
691            trimmed.starts_with("use strict")
692                || trimmed.starts_with("use warnings")
693                || trimmed.starts_with("my $")
694                || trimmed.starts_with("my @")
695                || trimmed.starts_with("my %")
696                || trimmed.contains("=~ /")
697                || trimmed.contains("=~ s/")
698        });
699        if has_perl_patterns {
700            return Some(Self::Perl);
701        }
702
703        // Bash indicators (low priority - many scripts look like bash)
704        let has_bash_patterns = lines.iter().any(|l| {
705            let trimmed = l.trim();
706            trimmed.starts_with("if [")
707                || trimmed.starts_with("for ")
708                || trimmed.starts_with("while ")
709                || trimmed.starts_with("case ")
710                || trimmed.contains("$((")
711                || trimmed.contains("${")
712                || trimmed.starts_with("function ")
713                || (trimmed.contains("()") && trimmed.contains('{'))
714        });
715        if has_bash_patterns {
716            return Some(Self::Bash);
717        }
718
719        None
720    }
721
722    /// Detect language using all available signals with priority order.
723    ///
724    /// Priority:
725    /// 1. Command prefix (highest confidence - e.g., `python -c`)
726    /// 2. Shebang line (high confidence - e.g., `#!/usr/bin/env python3`)
727    /// 3. Content heuristics (lower confidence - imports, patterns)
728    /// 4. Unknown (fallback)
729    ///
730    /// Returns a tuple of (language, confidence) for explainability.
731    #[must_use]
732    pub fn detect(cmd: &str, content: &str) -> (Self, DetectionConfidence) {
733        // Priority 1: Extract interpreter from command prefix
734        if let Some(interpreter) = Self::extract_head_interpreter(cmd) {
735            let lang = Self::from_command(&interpreter);
736            if lang != Self::Unknown {
737                return (lang, DetectionConfidence::CommandPrefix);
738            }
739        }
740
741        // Priority 1b: Check pipe destinations (e.g. "cat <<EOF | python")
742        // This handles cases where the heredoc consumer is later in the pipeline
743        if cmd.contains('|') {
744            for segment in cmd.split('|') {
745                let segment = segment.trim();
746                if segment.is_empty() {
747                    continue;
748                }
749                if let Some(interpreter) = Self::extract_head_interpreter(segment) {
750                    let lang = Self::from_command(&interpreter);
751                    if lang != Self::Unknown {
752                        return (lang, DetectionConfidence::CommandPrefix);
753                    }
754                }
755            }
756        }
757
758        // Priority 2: Shebang detection
759        if let Some(lang) = Self::from_shebang(content) {
760            return (lang, DetectionConfidence::Shebang);
761        }
762
763        // Priority 3: Content heuristics
764        if let Some(lang) = Self::from_content(content) {
765            return (lang, DetectionConfidence::ContentHeuristics);
766        }
767
768        // Priority 4: Unknown
769        (Self::Unknown, DetectionConfidence::Unknown)
770    }
771
772    /// Extract the interpreter name from the head of a command string.
773    ///
774    /// Handles various formats:
775    /// - `python3 -c "code"` → "python3"
776    /// - `/usr/bin/python -c "code"` → "python"
777    /// - `env python3 -c "code"` → "python3"
778    /// - `env -S python3 -c "code"` → "python3" (skips env flags)
779    /// - `env VAR=val python3 -c "code"` → "python3" (skips env vars)
780    /// - `bash -c "code"` → "bash"
781    fn extract_head_interpreter(cmd: &str) -> Option<String> {
782        // Use robust wrapper stripping to handle env flags (e.g. -u, -C) correctly.
783        let normalized = crate::normalize::strip_wrapper_prefixes(cmd);
784        let cmd_to_check = normalized.normalized;
785
786        let mut parts = cmd_to_check.split_whitespace();
787        let first = parts.next()?;
788
789        // Get basename (strip path)
790        let basename = first.rsplit('/').next().unwrap_or(first);
791        Some(basename.to_string())
792    }
793}
794
/// How a script language was determined, ordered from strongest to weakest signal.
///
/// Used by `dcg explain` to show why a particular language was detected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DetectionConfidence {
    /// The command explicitly names the interpreter (e.g., `python -c`).
    /// Highest confidence.
    CommandPrefix,

    /// The script declares its interpreter in a shebang line
    /// (e.g., `#!/usr/bin/env python3`). High confidence.
    Shebang,

    /// Inferred from content patterns (imports, syntax patterns).
    /// Lower confidence - heuristic-based detection.
    ContentHeuristics,

    /// The language could not be determined; effectively no detection.
    Unknown,
}

impl DetectionConfidence {
    /// Single source of truth for the `(label, reason)` text of each level.
    const fn label_and_reason(self) -> (&'static str, &'static str) {
        match self {
            Self::CommandPrefix => (
                "command-prefix",
                "detected from command interpreter (highest confidence)",
            ),
            Self::Shebang => ("shebang", "detected from shebang line (high confidence)"),
            Self::ContentHeuristics => (
                "content-heuristics",
                "inferred from content patterns (lower confidence)",
            ),
            Self::Unknown => ("unknown", "could not determine language"),
        }
    }

    /// Short, machine-friendly label for this confidence level.
    #[must_use]
    pub const fn label(&self) -> &'static str {
        (*self).label_and_reason().0
    }

    /// One-line human-readable explanation of this confidence level.
    #[must_use]
    pub const fn reason(&self) -> &'static str {
        (*self).label_and_reason().1
    }
}
840
/// Which heredoc-style operator introduced a piece of extracted content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeredocType {
    /// Standard heredoc (`<<`).
    Standard,
    /// Tab-stripping heredoc (`<<-`).
    TabStripped,
    /// Here-string (`<<<`).
    HereString,
    /// Indentation-stripping heredoc (`<<~`, Ruby-style).
    IndentStripped,
}
853
/// One piece of script content pulled out of a command by Tier 2:
/// a heredoc body, a here-string, or an inline `-c`/`-e` style argument.
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    /// The script content (body of heredoc or inline argument).
    pub content: String,
    /// Detected or inferred language.
    pub language: ScriptLanguage,
    /// Heredoc delimiter (e.g., "EOF"), if applicable.
    /// `None` for inline scripts and here-strings.
    pub delimiter: Option<String>,
    /// Byte range in the original command covered by the whole construct
    /// (the regex match for inline scripts/here-strings; operator through
    /// terminator for heredocs).
    pub byte_range: std::ops::Range<usize>,
    /// Byte range of the extracted content inside the original command, if known.
    ///
    /// For inline scripts and here-strings this is the exact content span.
    /// For heredoc bodies, this represents the raw body range (may not map
    /// cleanly if indentation or CRLF normalization occurred).
    pub content_range: Option<std::ops::Range<usize>>,
    /// Whether the delimiter was quoted (suppresses expansion).
    /// Inline scripts always set this to `true`; here-strings set it
    /// according to whether the quoted or unquoted pattern matched.
    pub quoted: bool,
    /// Type of heredoc (if applicable); `None` for inline scripts.
    pub heredoc_type: Option<HeredocType>,
    /// The command that receives this heredoc (e.g., "cat", "bash").
    /// Used to determine if content should be evaluated as executable.
    pub target_command: Option<String>,
}
879
/// Why Tier 2 declined to extract (part of) a command.
///
/// Carried in extraction results purely for observability/logging; the
/// `Display` impl renders a one-line human-readable description.
#[derive(Debug, Clone, PartialEq)]
pub enum SkipReason {
    /// The input was larger than the configured byte limit.
    ExceededSizeLimit { actual: usize, limit: usize },
    /// The input had more lines than the configured line limit.
    ExceededLineLimit { actual: usize, limit: usize },
    /// The configured maximum number of extracted items was reached.
    ExceededHeredocLimit { limit: usize },
    /// The content looks binary (null bytes or a high non-printable ratio).
    BinaryContent {
        null_bytes: usize,
        non_printable_ratio: f32,
    },
    /// Tier 2 extraction ran past its time budget (fail-open).
    Timeout { elapsed_ms: u64, budget_ms: u64 },
    /// A heredoc's terminating delimiter was never found.
    UnterminatedHeredoc { delimiter: String },
    /// The input could not be parsed.
    MalformedInput { reason: String },
}

impl std::fmt::Display for SkipReason {
    /// Writes a single-line description of the skip reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MalformedInput { reason } => write!(f, "malformed input: {reason}"),
            Self::UnterminatedHeredoc { delimiter } => {
                write!(f, "unterminated heredoc: delimiter '{delimiter}' not found")
            }
            Self::Timeout {
                elapsed_ms,
                budget_ms,
            } => write!(
                f,
                "extraction timeout: {elapsed_ms}ms > {budget_ms}ms budget"
            ),
            // The stored ratio is a fraction in 0..=1; show it as a percentage.
            Self::BinaryContent {
                null_bytes,
                non_printable_ratio,
            } => write!(
                f,
                "binary content detected: {null_bytes} null bytes, {:.1}% non-printable",
                non_printable_ratio * 100.0
            ),
            Self::ExceededHeredocLimit { limit } => {
                write!(f, "exceeded heredoc limit: max {limit} heredocs")
            }
            Self::ExceededLineLimit { actual, limit } => {
                write!(f, "exceeded line limit: {actual} lines > {limit} lines")
            }
            Self::ExceededSizeLimit { actual, limit } => {
                write!(f, "exceeded size limit: {actual} bytes > {limit} bytes")
            }
        }
    }
}
940
/// Result of Tier 2 content extraction.
#[derive(Debug)]
pub enum ExtractionResult {
    /// No extractable content found after trigger.
    NoContent,
    /// Successfully extracted content.
    Extracted(Vec<ExtractedContent>),
    /// Extraction was skipped (fail-open with reason for observability).
    Skipped(Vec<SkipReason>),
    /// Some content was extracted while other parts were skipped; carries both.
    ///
    /// NOTE(review): `extract_content` does not construct this variant — when
    /// it has both extracted items and skip reasons it returns `Extracted`.
    /// Confirm whether any other producer uses `Partial`.
    Partial {
        extracted: Vec<ExtractedContent>,
        skipped: Vec<SkipReason>,
    },
    /// Extraction failed (timeout, malformed, etc.) - fail open with warning.
    Failed(String),
}
957
/// Regex locating heredoc operators and their delimiters (compiled once, on first use).
static HEREDOC_EXTRACTOR: LazyLock<Regex> = LazyLock::new(|| {
    // Matches `<<`, an optional `-` or `~`, optional whitespace, then one of:
    // 1. Single-quoted delimiter: 'delim' (Group 2)
    // 2. Double-quoted delimiter: "delim" (Group 3)
    // 3. Unquoted delimiter: delim (Group 4)
    // Group 1 is the operator variant (`-` / `~`; absent for plain `<<`).
    // Note: the quoted alternatives use `*` rather than `+`, so an empty quoted
    // delimiter (`<<''`, valid in bash) still matches; the unquoted alternative
    // requires at least one character.
    // NOTE(review): the pattern is not anchored on its left, so it can also
    // match starting at the second `<` of a `<<<` here-string.
    Regex::new(r#"<<([-~])?\s*(?:'([^']*)'|"([^"]*)"|([\w.-]+))"#).expect("heredoc regex compiles")
});
968
/// Regex for here-string extraction with single quotes (`<<< '...'`).
static HERESTRING_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    // Matches: <<< 'content' - content can contain double quotes (and newlines),
    // but no single quote. Whitespace after `<<<` is optional.
    // Group 1: content (without the surrounding quotes)
    Regex::new(r"<<<\s*'([^']*)'").expect("herestring single-quote regex compiles")
});
975
/// Regex for here-string extraction with double quotes (`<<< "..."`).
static HERESTRING_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    // Matches: <<< "content" - content can contain single quotes (and newlines),
    // but no double quote; escaped `\"` inside the string is not understood.
    // Group 1: content (without the surrounding quotes)
    Regex::new(r#"<<<\s*"([^"]*)""#).expect("herestring double-quote regex compiles")
});
982
/// Regex for here-string extraction without quotes (`<<< word`).
static HERESTRING_UNQUOTED: LazyLock<Regex> = LazyLock::new(|| {
    // Matches: <<< word - unquoted single word (NOT starting with quote)
    // Group 1: content (runs up to the next whitespace)
    // [^'\x22\s] ensures we don't match quoted forms (\x22 is the double quote)
    Regex::new(r"<<<\s*([^'\x22\s]\S*)").expect("herestring unquoted regex compiles")
});
990
/// Regex for inline script flag extraction with single quotes.
static INLINE_SCRIPT_SINGLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    // Matches: interpreter, optional extra flags, then a flag cluster containing
    // one of c/e/E/C/p/r, followed by single-quoted content.
    // The regex is deliberately broad; callers must still check that the
    // matched flag means "inline code" for the matched interpreter.
    // Groups: (1) interpreter, (2) optional "js" suffix for node, (3) flag, (4) content
    // Supports versioned interpreters: python3.11, ruby3.0, perl5.36, node18, nodejs20, etc.
    // Supports Windows .exe extensions: python.exe, python3.11.exe, etc.
    // `(?i:powershell|pwsh)` matches the Windows PowerShell host case-insensitively;
    // `["']?` after the interpreter swallows the closing quote of a quoted full
    // path (e.g. `"...\powershell.exe" -Command '...'`) before flags (#125).
    Regex::new(r#"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?|(?i:powershell|pwsh)(?:\.exe)?)\b["']?(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceECpr][A-Za-z]*)\s*'([^']*)'"#)
        .expect("inline script single-quote regex compiles")
});
1003
/// Regex for inline script flag extraction with double quotes.
static INLINE_SCRIPT_DOUBLE_QUOTE: LazyLock<Regex> = LazyLock::new(|| {
    // Matches: interpreter, optional extra flags, then a flag cluster containing
    // one of c/e/E/C/p/r, followed by double-quoted content (no escaped-quote
    // handling: the content ends at the first `"`).
    // Groups: (1) interpreter, (2) optional "js" suffix for node, (3) flag, (4) content
    // Supports versioned interpreters: python3.11, ruby3.0, perl5.36, node18, nodejs20, etc.
    // Supports Windows .exe extensions: python.exe, python3.11.exe, etc.
    // PowerShell host + quoted-path closing quote handled as in the single-quote
    // variant above (#125).
    Regex::new(r#"\b(python[0-9.]*(?:\.exe)?|ruby[0-9.]*(?:\.exe)?|irb[0-9.]*(?:\.exe)?|perl[0-9.]*(?:\.exe)?|node(js)?[0-9.]*(?:\.exe)?|php[0-9.]*(?:\.exe)?|lua[0-9.]*(?:\.exe)?|sh(?:\.exe)?|bash(?:\.exe)?|zsh(?:\.exe)?|fish(?:\.exe)?|(?i:powershell|pwsh)(?:\.exe)?)\b['"]?(?:\s+(?:--\S+|-[A-Za-z]+))*\s+(-[A-Za-z]*[ceECpr][A-Za-z]*)\s*"([^"]*)""#)
        .expect("inline script double-quote regex compiles")
});
1015
1016// ============================================================================
1017// Robustness: Binary Content Detection
1018// ============================================================================
1019
/// Threshold for non-printable character ratio to consider content binary.
///
/// `check_binary_content` compares with a strict `>`, so content whose ratio
/// is exactly this value is still treated as text.
const BINARY_THRESHOLD: f32 = 0.30; // 30% non-printable characters
1022
1023/// Check if content appears to be binary (contains null bytes or high non-printable ratio).
1024///
1025/// # Returns
1026///
1027/// `Some(SkipReason::BinaryContent)` if the content appears binary, `None` otherwise.
1028#[must_use]
1029#[allow(clippy::cast_precision_loss)] // Precision loss acceptable
1030#[allow(clippy::naive_bytecount)] // Acceptable for bounded content
1031pub fn check_binary_content(content: &str) -> Option<SkipReason> {
1032    let bytes = content.as_bytes();
1033    if bytes.is_empty() {
1034        return None;
1035    }
1036
1037    // Count null bytes (definite binary indicator)
1038    let null_bytes = bytes.iter().filter(|&&b| b == 0).count();
1039    if null_bytes > 0 {
1040        return Some(SkipReason::BinaryContent {
1041            null_bytes,
1042            non_printable_ratio: null_bytes as f32 / bytes.len() as f32,
1043        });
1044    }
1045
1046    // A valid UTF-8 string shouldn't be considered binary just because it has non-ASCII.
1047    // We count actual control characters (excluding whitespace) and U+FFFD (replacement chars).
1048    let mut suspect_chars = 0;
1049    let mut total_chars = 0;
1050
1051    for c in content.chars() {
1052        total_chars += 1;
1053        if (c.is_control() && c != '\n' && c != '\r' && c != '\t')
1054            || c == std::char::REPLACEMENT_CHARACTER
1055        {
1056            suspect_chars += 1;
1057        }
1058    }
1059
1060    let ratio = suspect_chars as f32 / total_chars.max(1) as f32;
1061    if ratio > BINARY_THRESHOLD {
1062        return Some(SkipReason::BinaryContent {
1063            null_bytes: 0,
1064            non_printable_ratio: ratio,
1065        });
1066    }
1067
1068    None
1069}
1070
1071#[inline]
1072fn record_timeout_if_needed(
1073    start_time: Instant,
1074    timeout: Duration,
1075    budget_ms: u64,
1076    skip_reasons: &mut Vec<SkipReason>,
1077) -> bool {
1078    let elapsed = start_time.elapsed();
1079    if elapsed < timeout {
1080        return false;
1081    }
1082
1083    if !skip_reasons
1084        .iter()
1085        .any(|r| matches!(r, SkipReason::Timeout { .. }))
1086    {
1087        let elapsed_ms = u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX);
1088        skip_reasons.push(SkipReason::Timeout {
1089            elapsed_ms,
1090            budget_ms,
1091        });
1092    }
1093
1094    true
1095}
1096
1097/// Extract heredoc and inline script content from a command.
1098///
1099/// This is Tier 2 of the detection pipeline - content extraction with safety bounds.
1100///
1101/// # Guarantees
1102///
1103/// - Bounded memory usage (never allocate >`max_body_bytes` per heredoc)
1104/// - Graceful degradation on malformed input (fail-open with warning)
1105///
1106/// # Examples
1107///
1108/// ```ignore
1109/// use destructive_command_guard::heredoc::{extract_content, ExtractionLimits, ExtractionResult};
1110///
1111/// let result = extract_content(
1112///     "python3 -c 'import os; os.system(\"rm -rf /\")'",
1113///     &ExtractionLimits::default()
1114/// );
1115///
1116/// if let ExtractionResult::Extracted(contents) = result {
1117///     assert_eq!(contents.len(), 1);
1118///     assert!(contents[0].content.contains("os.system"));
1119/// }
1120/// ```
1121#[must_use]
1122#[instrument(skip(command, limits), fields(cmd_len = command.len(), timeout_ms = limits.timeout_ms))]
1123pub fn extract_content(command: &str, limits: &ExtractionLimits) -> ExtractionResult {
1124    let start_time = Instant::now();
1125    let timeout = Duration::from_millis(limits.timeout_ms);
1126    let mut skip_reasons: Vec<SkipReason> = Vec::new();
1127
1128    // Enforce input size limit
1129    if command.len() > limits.max_body_bytes {
1130        warn!(
1131            actual = command.len(),
1132            limit = limits.max_body_bytes,
1133            "tier2_skip: input exceeds size limit"
1134        );
1135        skip_reasons.push(SkipReason::ExceededSizeLimit {
1136            actual: command.len(),
1137            limit: limits.max_body_bytes,
1138        });
1139        return ExtractionResult::Skipped(skip_reasons);
1140    }
1141
1142    // Check for binary content (null bytes or high non-printable ratio)
1143    if let Some(reason) = check_binary_content(command) {
1144        warn!(?reason, "tier2_skip: binary content detected");
1145        skip_reasons.push(reason);
1146        return ExtractionResult::Skipped(skip_reasons);
1147    }
1148
1149    let mut extracted: Vec<ExtractedContent> = Vec::new();
1150
1151    // Enforce time budget (fail open) before doing any further work.
1152    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1153        return ExtractionResult::Skipped(skip_reasons);
1154    }
1155
1156    // Extract inline scripts (-c/-e flags)
1157    extract_inline_scripts(
1158        command,
1159        limits,
1160        start_time,
1161        timeout,
1162        &mut extracted,
1163        &mut skip_reasons,
1164    );
1165    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1166        return if extracted.is_empty() {
1167            ExtractionResult::Skipped(skip_reasons)
1168        } else {
1169            ExtractionResult::Extracted(extracted)
1170        };
1171    }
1172
1173    // Extract here-strings (<<<)
1174    extract_herestrings(
1175        command,
1176        limits,
1177        start_time,
1178        timeout,
1179        &mut extracted,
1180        &mut skip_reasons,
1181    );
1182    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, &mut skip_reasons) {
1183        return if extracted.is_empty() {
1184            ExtractionResult::Skipped(skip_reasons)
1185        } else {
1186            ExtractionResult::Extracted(extracted)
1187        };
1188    }
1189
1190    // Extract heredocs (<<, <<-, <<~)
1191    extract_heredocs(
1192        command,
1193        limits,
1194        start_time,
1195        timeout,
1196        &mut extracted,
1197        &mut skip_reasons,
1198    );
1199
1200    // Return based on what we found
1201    let elapsed_us = start_time.elapsed().as_micros();
1202    match (extracted.is_empty(), skip_reasons.is_empty()) {
1203        (true, true) => {
1204            trace!(elapsed_us, "tier2_complete: no content found");
1205            ExtractionResult::NoContent
1206        }
1207        (true, false) => {
1208            warn!(
1209                elapsed_us,
1210                skip_count = skip_reasons.len(),
1211                "tier2_complete: skipped"
1212            );
1213            ExtractionResult::Skipped(skip_reasons)
1214        }
1215        (false, true) => {
1216            debug!(
1217                elapsed_us,
1218                count = extracted.len(),
1219                "tier2_complete: content extracted"
1220            );
1221            ExtractionResult::Extracted(extracted)
1222        }
1223        (false, false) => {
1224            // Partial extraction with some skips - return what we got
1225            debug!(
1226                elapsed_us,
1227                count = extracted.len(),
1228                skip_count = skip_reasons.len(),
1229                "tier2_complete: partial extraction with skips"
1230            );
1231            ExtractionResult::Extracted(extracted)
1232        }
1233    }
1234}
1235
1236/// Extract inline scripts from -c/-e flags.
1237fn extract_inline_scripts(
1238    command: &str,
1239    limits: &ExtractionLimits,
1240    start_time: Instant,
1241    timeout: Duration,
1242    extracted: &mut Vec<ExtractedContent>,
1243    skip_reasons: &mut Vec<SkipReason>,
1244) {
1245    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1246        return;
1247    }
1248    if extracted.len() >= limits.max_heredocs {
1249        skip_reasons.push(SkipReason::ExceededHeredocLimit {
1250            limit: limits.max_heredocs,
1251        });
1252        return;
1253    }
1254
1255    // Helper to extract from a given regex pattern
1256    let mut hit_limit = false;
1257    let mut extract_from_pattern = |pattern: &Regex| {
1258        for cap in pattern.captures_iter(command) {
1259            if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1260                return;
1261            }
1262            if extracted.len() >= limits.max_heredocs {
1263                hit_limit = true;
1264                break;
1265            }
1266
1267            let cmd_name = cap.get(1).map_or("", |m| m.as_str());
1268            let flag = cap.get(3).map_or("", |m| m.as_str());
1269            // Content is in group 4: (1) interpreter, (2) optional "js", (3) flag, (4) content
1270            let content_match = cap.get(4);
1271            let content = content_match.map_or("", |m| m.as_str());
1272
1273            // The regex covers multiple interpreters; validate that the matched flag actually
1274            // implies inline code for this interpreter (e.g. bash needs -c, perl needs -e/-E).
1275            let is_inline_flag = if cmd_name.starts_with("python") {
1276                flag.contains('c') || flag.contains('e')
1277            } else if cmd_name.starts_with("ruby") || cmd_name.starts_with("irb") {
1278                flag.contains('e')
1279            } else if cmd_name.starts_with("perl") {
1280                flag.contains('e') || flag.contains('E')
1281            } else if cmd_name.starts_with("node") {
1282                flag.contains('e') || flag.contains('p')
1283            } else if cmd_name.starts_with("php") {
1284                flag.contains('r')
1285            } else if cmd_name.starts_with("lua") {
1286                flag.contains('e')
1287            } else if {
1288                // PowerShell host names are case-insensitive on Windows
1289                // (`powershell`, `PowerShell.exe`, `pwsh`). The inline-execution
1290                // flag is `-Command`, which PowerShell accepts as any unambiguous
1291                // prefix (`-c`, `-co`, `-com`, …), case-insensitively. (#125)
1292                let lower = cmd_name.to_ascii_lowercase();
1293                lower.starts_with("powershell") || lower.starts_with("pwsh")
1294            } {
1295                // `-Command` / `-c` (the leading char after `-` is C/c)
1296                let f = flag.to_ascii_lowercase();
1297                f.starts_with("-c")
1298            } else {
1299                // sh/bash/zsh/fish
1300                flag.contains('c')
1301            };
1302
1303            if !is_inline_flag {
1304                continue;
1305            }
1306
1307            // Enforce content size limit
1308            if content.len() > limits.max_body_bytes {
1309                // Skip but don't add to skip_reasons (would be too noisy)
1310                continue;
1311            }
1312
1313            let full_match = cap.get(0).unwrap();
1314            extracted.push(ExtractedContent {
1315                content: content.to_string(),
1316                language: ScriptLanguage::from_command(cmd_name),
1317                delimiter: None,
1318                byte_range: full_match.start()..full_match.end(),
1319                content_range: content_match.map(|m| m.start()..m.end()),
1320                quoted: true, // -c/-e content is always in quotes
1321                heredoc_type: None,
1322                target_command: Some(cmd_name.to_string()), // -c/-e content is executed by the interpreter
1323            });
1324        }
1325    };
1326
1327    // Extract from both single-quoted and double-quoted patterns
1328    extract_from_pattern(&INLINE_SCRIPT_SINGLE_QUOTE);
1329    extract_from_pattern(&INLINE_SCRIPT_DOUBLE_QUOTE);
1330
1331    if hit_limit {
1332        skip_reasons.push(SkipReason::ExceededHeredocLimit {
1333            limit: limits.max_heredocs,
1334        });
1335    }
1336}
1337
1338/// Extract here-strings (<<<).
1339fn extract_herestrings(
1340    command: &str,
1341    limits: &ExtractionLimits,
1342    start_time: Instant,
1343    timeout: Duration,
1344    extracted: &mut Vec<ExtractedContent>,
1345    skip_reasons: &mut Vec<SkipReason>,
1346) {
1347    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1348        return;
1349    }
1350    if extracted.len() >= limits.max_heredocs {
1351        return; // Already hit limit, don't add another skip reason
1352    }
1353
1354    let mut hit_limit = false;
1355
1356    // Helper to extract from a given pattern (quoted patterns have content in group 1)
1357    let mut extract_quoted = |pattern: &Regex, is_quoted: bool| {
1358        for cap in pattern.captures_iter(command) {
1359            if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1360                return;
1361            }
1362            if extracted.len() >= limits.max_heredocs {
1363                hit_limit = true;
1364                break;
1365            }
1366
1367            // Content is in group 1 for all our here-string patterns
1368            let content_match = cap.get(1);
1369            let content = content_match.map_or("", |m| m.as_str());
1370
1371            if content.len() > limits.max_body_bytes {
1372                continue;
1373            }
1374
1375            let full_match = cap.get(0).unwrap();
1376
1377            // Extract the command that receives the here-string
1378            let target_cmd = extract_heredoc_target_command(command, full_match.start());
1379
1380            extracted.push(ExtractedContent {
1381                content: content.to_string(),
1382                language: ScriptLanguage::Bash, // Here-strings are bash-specific
1383                delimiter: None,
1384                byte_range: full_match.start()..full_match.end(),
1385                content_range: content_match.map(|m| m.start()..m.end()),
1386                quoted: is_quoted,
1387                heredoc_type: Some(HeredocType::HereString),
1388                target_command: target_cmd,
1389            });
1390        }
1391    };
1392
1393    // Extract from single-quoted, double-quoted, then unquoted patterns
1394    // Quoted patterns first to avoid unquoted matching the outer quotes
1395    extract_quoted(&HERESTRING_SINGLE_QUOTE, true);
1396    extract_quoted(&HERESTRING_DOUBLE_QUOTE, true);
1397    extract_quoted(&HERESTRING_UNQUOTED, false);
1398
1399    if hit_limit {
1400        skip_reasons.push(SkipReason::ExceededHeredocLimit {
1401            limit: limits.max_heredocs,
1402        });
1403    }
1404}
1405
1406/// Extract heredocs (<<, <<-, <<~).
1407fn extract_heredocs(
1408    command: &str,
1409    limits: &ExtractionLimits,
1410    start_time: Instant,
1411    timeout: Duration,
1412    extracted: &mut Vec<ExtractedContent>,
1413    skip_reasons: &mut Vec<SkipReason>,
1414) {
1415    if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1416        return;
1417    }
1418    if extracted.len() >= limits.max_heredocs {
1419        return; // Already hit limit
1420    }
1421
1422    let mut hit_limit = false;
1423    for cap in HEREDOC_EXTRACTOR.captures_iter(command) {
1424        if record_timeout_if_needed(start_time, timeout, limits.timeout_ms, skip_reasons) {
1425            return;
1426        }
1427        if extracted.len() >= limits.max_heredocs {
1428            hit_limit = true;
1429            break;
1430        }
1431
1432        let operator_variant = cap.get(1).map(|m| m.as_str());
1433
1434        let (delimiter, quoted) = if let Some(m) = cap.get(2) {
1435            (m.as_str(), true)
1436        } else if let Some(m) = cap.get(3) {
1437            (m.as_str(), true)
1438        } else if let Some(m) = cap.get(4) {
1439            (m.as_str(), false)
1440        } else {
1441            // Should be unreachable if regex matched
1442            continue;
1443        };
1444
1445        // Determine heredoc type
1446        let heredoc_type = match operator_variant {
1447            Some("-") => HeredocType::TabStripped,
1448            Some("~") => HeredocType::IndentStripped,
1449            _ => HeredocType::Standard,
1450        };
1451
1452        let full_match = cap.get(0).unwrap();
1453        let mut start_pos = full_match.end();
1454
1455        // Heredoc bodies start on the next line. If there are trailing tokens after the delimiter
1456        // on the same line (pipelines, redirects, etc.), skip them so we don't corrupt the
1457        // extracted body (which can otherwise cause AST parse failures and false negatives).
1458        start_pos = command[start_pos..]
1459            .find('\n')
1460            .map_or(command.len(), |rel| start_pos.saturating_add(rel));
1461
1462        // Find the terminating delimiter
1463        match extract_heredoc_body(
1464            command,
1465            start_pos,
1466            delimiter,
1467            heredoc_type,
1468            limits,
1469            start_time,
1470            timeout,
1471        ) {
1472            Ok((content, end_pos, body_start_abs, body_end_abs)) => {
1473                let (language, _confidence) = ScriptLanguage::detect(command, &content);
1474                // Extract the command that receives the heredoc
1475                let target_cmd = extract_heredoc_target_command(command, full_match.start());
1476                extracted.push(ExtractedContent {
1477                    content,
1478                    language,
1479                    delimiter: Some(delimiter.to_string()),
1480                    byte_range: full_match.start()..end_pos.min(command.len()),
1481                    content_range: Some(body_start_abs..body_end_abs),
1482                    quoted,
1483                    heredoc_type: Some(heredoc_type),
1484                    target_command: target_cmd,
1485                });
1486            }
1487            Err(reason) => {
1488                skip_reasons.push(reason);
1489                if matches!(skip_reasons.last(), Some(SkipReason::Timeout { .. })) {
1490                    return;
1491                }
1492            }
1493        }
1494    }
1495
1496    if hit_limit {
1497        skip_reasons.push(SkipReason::ExceededHeredocLimit {
1498            limit: limits.max_heredocs,
1499        });
1500    }
1501}
1502
1503/// Extract the command that receives a heredoc or here-string.
1504///
1505/// Looks backwards from the heredoc operator position to find the command word.
1506/// Returns `Some(command_name)` if found, `None` otherwise.
1507///
1508/// Examples:
1509/// - `cat <<EOF` -> Some("cat")
1510/// - `bash <<EOF` -> Some("bash")
1511/// - `cat file.txt | tee <<EOF` -> Some("tee")
1512/// - `$(cat <<EOF)` -> Some("cat")
1513fn extract_heredoc_target_command(command: &str, heredoc_start: usize) -> Option<String> {
1514    if heredoc_start == 0 {
1515        return None;
1516    }
1517
1518    let before = &command[..heredoc_start];
1519
1520    // Trim trailing whitespace before the heredoc operator
1521    let trimmed = before.trim_end();
1522    if trimmed.is_empty() {
1523        return None;
1524    }
1525
1526    // Parse tokens backwards, then walk them in original order so we identify
1527    // the command that owns the heredoc rather than the last argument before
1528    // the operator.
1529    let tokens = tokenize_backwards(trimmed);
1530
1531    for token in tokens.iter().rev() {
1532        if is_shell_env_assignment(token) {
1533            continue;
1534        }
1535
1536        // Skip flags
1537        if token.starts_with('-') {
1538            continue;
1539        }
1540
1541        // Skip common shell wrappers until we reach the actual target command.
1542        if SHELL_WRAPPER_COMMANDS.contains(&token.as_str()) {
1543            continue;
1544        }
1545
1546        // Skip quoted strings (arguments like '{print $1}' or "hello world")
1547        if (token.starts_with('\'') && token.ends_with('\''))
1548            || (token.starts_with('"') && token.ends_with('"'))
1549        {
1550            continue;
1551        }
1552
1553        // Skip if this looks like a file path argument
1554        if token.contains('/') {
1555            let basename = token.rsplit('/').next().unwrap_or(token);
1556
1557            // Check if this looks like a command path (/bin/cat, /usr/bin/bash)
1558            // vs a file argument (/tmp/file, /path/to/data)
1559            let is_known_command = NON_EXECUTING_HEREDOC_COMMANDS.contains(&basename)
1560                || [
1561                    "bash", "sh", "zsh", "fish", "ksh", "dash", "python", "perl", "ruby", "node",
1562                ]
1563                .contains(&basename);
1564
1565            // Command paths are typically in standard locations
1566            let looks_like_command_path = token.starts_with("/bin/")
1567                || token.starts_with("/usr/bin/")
1568                || token.starts_with("/usr/local/bin/")
1569                || token.starts_with("/sbin/")
1570                || token.starts_with("/usr/sbin/")
1571                || is_known_command;
1572
1573            if !looks_like_command_path {
1574                // Doesn't look like a command path, skip it
1575                continue;
1576            }
1577
1578            return Some(basename.to_string());
1579        }
1580
1581        // Skip if this looks like a file with extension
1582        let has_extension = token.contains('.') && !token.starts_with('.');
1583        let is_known_command = NON_EXECUTING_HEREDOC_COMMANDS.contains(&token.as_str())
1584            || [
1585                "bash", "sh", "zsh", "fish", "ksh", "dash", "python", "perl", "ruby", "node",
1586            ]
1587            .contains(&token.as_str());
1588        if has_extension && !is_known_command {
1589            continue;
1590        }
1591
1592        return Some(token.clone());
1593    }
1594
1595    None
1596}
1597
/// Returns `true` when `token` has the shape `NAME=value` of a shell
/// environment assignment.
///
/// `NAME` is the text before the first `=`; it must be non-empty, start with
/// an ASCII letter or `_`, and continue with ASCII letters, digits or `_`.
/// The value part is not inspected (it may be empty or contain more `=`).
fn is_shell_env_assignment(token: &str) -> bool {
    let name = match token.split_once('=') {
        Some((name, _)) => name,
        None => return false,
    };

    let mut rest = name.bytes();
    match rest.next() {
        // Leading byte: letter or underscore only (no digit).
        Some(first) if first == b'_' || first.is_ascii_alphabetic() => {
            rest.all(|byte| byte == b'_' || byte.is_ascii_alphanumeric())
        }
        // Empty name or invalid leading byte.
        _ => false,
    }
}
1610
/// Tokenize the tail of a command string backwards, respecting quotes.
///
/// Scanning starts at the end of `s` and stops at the first command boundary:
/// one of `|`, `;`, `&`, `$`, `(`, `)`, or an unescaped newline. A newline
/// preceded by an odd number of backslashes is a line continuation and is
/// skipped like ordinary whitespace.
///
/// Returns tokens in reverse order (last token first). Quoted tokens keep
/// their surrounding quotes.
///
/// Treating the newline as a boundary matters for multi-line input: without
/// it, words from earlier lines would be attributed to the command on the
/// current line (e.g. `"echo hi\nbash"` would yield `bash`, `hi`, `echo`).
///
/// Note: This function does not handle escaped quotes inside double-quoted strings
/// (e.g., `"foo\"bar"`). In such cases, tokenization may be incorrect. This is acceptable
/// because the failure mode is safe - we won't find the target command and thus won't
/// mask the heredoc content, which is the conservative choice for security.
fn tokenize_backwards(s: &str) -> Vec<String> {
    // Bytes that end the current simple command when met outside quotes.
    const fn is_command_separator(byte: u8) -> bool {
        matches!(byte, b'|' | b';' | b'&' | b'$' | b'(' | b')')
    }

    let mut tokens = Vec::new();
    let bytes = s.as_bytes();
    let mut i = s.len();

    'scan: while i > 0 {
        // Skip trailing whitespace; an unescaped newline ends the command.
        while i > 0 && bytes[i - 1].is_ascii_whitespace() {
            if bytes[i - 1] == b'\n' {
                let escapes = bytes[..i - 1]
                    .iter()
                    .rev()
                    .take_while(|&&b| b == b'\\')
                    .count();
                if escapes % 2 == 0 {
                    // Plain newline: command boundary.
                    break 'scan;
                }
                // Line continuation: drop the newline and its backslash.
                i -= 2;
                continue;
            }
            i -= 1;
        }
        if i == 0 {
            break;
        }

        let end = i;

        // Check for quoted string
        if bytes[i - 1] == b'\'' || bytes[i - 1] == b'"' {
            let quote = bytes[i - 1];
            i -= 1;
            // Find matching opening quote
            while i > 0 && bytes[i - 1] != quote {
                i -= 1;
            }
            i = i.saturating_sub(1); // Skip opening quote if present
            tokens.push(s[i..end].to_string());
            continue;
        }

        // Check for command separator (|, ;, &, $, (, ))
        if is_command_separator(bytes[i - 1]) {
            // Stop parsing - we've reached a command boundary
            break;
        }

        // Regular word - scan backwards to whitespace or separator.
        // Only ASCII bytes stop the scan, so `i` stays on a char boundary.
        while i > 0 {
            let c = bytes[i - 1];
            if c.is_ascii_whitespace() || is_command_separator(c) {
                break;
            }
            i -= 1;
        }

        if i < end {
            tokens.push(s[i..end].to_string());
        }
    }

    tokens
}
1669
/// Commands that do NOT execute their stdin/heredoc content as code.
/// Heredocs passed to these commands are DATA, not executable scripts.
///
/// Entries are bare command names; [`is_non_executing_heredoc_command`]
/// compares them against the last path segment of the target command, so
/// lookups are by exact, case-sensitive name only.
///
/// NOTE(review): membership is decided by name alone, not by flags. Tools
/// listed here can be told to read their *program* from stdin (e.g.
/// `awk -f /dev/stdin`, `sed -f /dev/stdin`) — confirm whether that case needs
/// to be handled elsewhere.
const NON_EXECUTING_HEREDOC_COMMANDS: &[&str] = &[
    // Text output commands
    "cat",
    "tee",
    "echo",
    "printf",
    // File writing/appending
    "dd",
    // Text processing (read stdin, output transformed text)
    "head",
    "tail",
    "grep",
    "egrep",
    "fgrep",
    "sed",
    "awk",
    "cut",
    "sort",
    "uniq",
    "tr",
    "wc",
    "rev",
    "nl",
    "fold",
    "fmt",
    "expand",
    "unexpand",
    "column",
    "paste",
    "join",
    // Encoding/compression (transform data, don't execute)
    "base64",
    "xxd",
    "od",
    "hexdump",
    "gzip",
    "gunzip",
    "bzip2",
    "bunzip2",
    "xz",
    "lzma",
    "zcat",
    "bzcat",
    "xzcat",
    // Network (send data, don't execute)
    "nc",
    "netcat",
    "curl",
    "wget",
    // Checksum/hash
    "md5sum",
    "sha1sum",
    "sha256sum",
    "sha512sum",
    "cksum",
    // Diff/comparison
    "diff",
    "cmp",
    "comm",
    // Mail (compose message body)
    "mail",
    "sendmail",
    // Variable assignment (read into variable, don't execute)
    "read",
];
1737
/// Names of commands that run another command given as their arguments
/// (`sudo`, `env`, `command`, `builtin`, `nohup`).
///
/// NOTE(review): the consumer of this list is not visible in this part of the
/// file; presumably these prefixes are skipped when resolving which command
/// actually receives a heredoc — confirm against the caller.
const SHELL_WRAPPER_COMMANDS: &[&str] = &["sudo", "env", "command", "builtin", "nohup"];
1739
1740/// Check if a command executes its heredoc/stdin content as code.
1741///
1742/// Returns `true` if the command is known to NOT execute its input,
1743/// meaning heredoc content passed to it is DATA, not CODE.
1744#[must_use]
1745pub fn is_non_executing_heredoc_command(cmd: &str) -> bool {
1746    // Normalize: strip path prefix if present
1747    let cmd_name = cmd.rsplit('/').next().unwrap_or(cmd);
1748    NON_EXECUTING_HEREDOC_COMMANDS.contains(&cmd_name)
1749}
1750
1751/// Mask heredoc content when the target command doesn't execute it.
1752///
1753/// This prevents false positives where dangerous patterns in DATA (not CODE)
1754/// trigger security blocks. For example, `cat <<EOF\nrm -rf /\nEOF` should
1755/// not be blocked because `cat` just outputs the text - it doesn't execute it.
1756///
1757/// Returns a `Cow::Borrowed` if no masking was needed, or `Cow::Owned` if
1758/// heredoc content was replaced with placeholder text.
1759#[must_use]
1760pub fn mask_non_executing_heredocs(command: &str) -> std::borrow::Cow<'_, str> {
1761    use std::borrow::Cow;
1762
1763    // Quick check: no heredoc operator means nothing to mask
1764    if !command.contains("<<") {
1765        return Cow::Borrowed(command);
1766    }
1767
1768    let mut result = String::new();
1769    let mut pos = 0;
1770    let bytes = command.as_bytes();
1771
1772    while pos < command.len() {
1773        // Find next potential heredoc operator
1774        if let Some(offset) = command[pos..].find("<<") {
1775            let heredoc_start = pos + offset;
1776
1777            // Check for <<< (here-string)
1778            if heredoc_start + 3 <= command.len() && bytes.get(heredoc_start + 2) == Some(&b'<') {
1779                // Extract target command for here-string
1780                let target_cmd = extract_heredoc_target_command(command, heredoc_start);
1781                let should_mask_herestring = target_cmd
1782                    .as_ref()
1783                    .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
1784
1785                if should_mask_herestring {
1786                    // Mask here-string content for non-executing targets
1787                    if let Some((content_start, content_end)) =
1788                        find_herestring_content_bounds(command, heredoc_start + 3)
1789                    {
1790                        // Copy up to the content start (includes <<<)
1791                        if result.is_empty() {
1792                            result = command[..content_start].to_string();
1793                        } else {
1794                            result.push_str(&command[pos..content_start]);
1795                        }
1796                        // Replace content with placeholder
1797                        result.push_str("'MASKED'");
1798                        pos = content_end;
1799                        continue;
1800                    }
1801                }
1802
1803                // Not masking - just advance past <<< and continue
1804                if !result.is_empty() {
1805                    result.push_str(&command[pos..heredoc_start + 3]);
1806                }
1807                pos = heredoc_start + 3;
1808                continue;
1809            }
1810
1811            // Extract target command (what receives the heredoc)
1812            let target_cmd = extract_heredoc_target_command(command, heredoc_start);
1813
1814            // Check if target is non-executing
1815            let should_mask = target_cmd
1816                .as_ref()
1817                .is_some_and(|cmd| is_non_executing_heredoc_command(cmd));
1818
1819            if should_mask {
1820                // Parse the heredoc delimiter
1821                let after_op = &command[heredoc_start + 2..];
1822                if let Some((delimiter, body_start_offset, heredoc_type)) =
1823                    parse_heredoc_delimiter(after_op)
1824                {
1825                    // Find the heredoc body end (terminating delimiter)
1826                    let body_start = heredoc_start + 2 + body_start_offset;
1827                    if let Some(body_end) =
1828                        find_heredoc_terminator(command, body_start, &delimiter, heredoc_type)
1829                    {
1830                        // Mask the heredoc body while preserving length and newlines.
1831                        if result.is_empty() {
1832                            result = command[..body_start].to_string();
1833                        } else {
1834                            result.push_str(&command[pos..body_start]);
1835                        }
1836
1837                        // Identify the start of the terminator line so we keep it intact.
1838                        let body_slice = &command[body_start..body_end];
1839                        let terminator_rel = body_slice.rfind('\n').map_or(0, |idx| idx + 1);
1840                        let terminator_abs = body_start + terminator_rel;
1841
1842                        let masked_body =
1843                            mask_preserve_newlines(&command[body_start..terminator_abs]);
1844                        result.push_str(&masked_body);
1845                        result.push_str(&command[terminator_abs..body_end]);
1846
1847                        pos = body_end;
1848                        continue;
1849                    }
1850                }
1851            }
1852
1853            // Not masking - copy everything up to and including <<
1854            if result.is_empty() {
1855                // First heredoc we're not masking - check if we need to start building result
1856            } else {
1857                result.push_str(&command[pos..heredoc_start + 2]);
1858            }
1859            pos = heredoc_start + 2;
1860        } else {
1861            // No more heredoc operators
1862            if result.is_empty() {
1863                return Cow::Borrowed(command);
1864            }
1865            result.push_str(&command[pos..]);
1866            break;
1867        }
1868    }
1869
1870    if result.is_empty() {
1871        Cow::Borrowed(command)
1872    } else {
1873        Cow::Owned(result)
1874    }
1875}
1876
/// Blank out `input` byte-for-byte while keeping its line structure.
///
/// Every `\n` and `\r` byte is kept; every other byte (including each byte of
/// a multi-byte UTF-8 character) becomes one ASCII space, so the result has
/// exactly the same byte length as `input`.
fn mask_preserve_newlines(input: &str) -> String {
    input
        .bytes()
        .map(|byte| match byte {
            b'\n' => '\n',
            b'\r' => '\r',
            _ => ' ',
        })
        .collect()
}
1887
1888/// Parse a heredoc delimiter after the << operator.
1889/// Returns (delimiter, `body_start_offset`, `heredoc_type`) if successful.
1890fn parse_heredoc_delimiter(after_op: &str) -> Option<(String, usize, HeredocType)> {
1891    let trimmed = after_op.trim_start_matches([' ', '\t']);
1892    let skip_whitespace = after_op.len() - trimmed.len();
1893
1894    if trimmed.is_empty() {
1895        return None;
1896    }
1897
1898    // `<<-` strips leading tabs from each body line (bash); `<<~` is the
1899    // Ruby-style "squiggly" heredoc that strips the common leading
1900    // indentation. Both accept an optional run of whitespace before the
1901    // delimiter (e.g. `cat <<- 'EOF'` is valid). Without consuming that
1902    // whitespace, the delimiter parser falls through to the unquoted branch
1903    // with a leading space and bails — the heredoc body then escapes masking
1904    // and pack matching denies prose like "gh repo delete" inside
1905    // `cat <<- 'EOF'` (issue #109).
1906    //
1907    // The marker MUST be adjacent to `<<` (no whitespace between them).
1908    // `cat << -EOF` is bash-legal as a Standard heredoc whose delimiter is
1909    // the literal `-EOF`; the `-` is part of the delimiter token rather
1910    // than a tab-strip marker. We disambiguate by checking that no leading
1911    // whitespace was consumed before the candidate marker character.
1912    //
1913    // We must distinguish `-` from `~` here because the heredoc body
1914    // terminator is matched by [`find_heredoc_terminator`] using the type:
1915    // `TabStripped` strips only tabs, while `IndentStripped` strips all
1916    // leading whitespace. Conflating the two means a `<<~` heredoc with
1917    // space-indented terminator (`  EOF`) is never recognized, the body
1918    // escapes masking, and pack matching produces false positives on
1919    // documentation prose. The regex-based extractor in [`extract_heredocs`]
1920    // already maps `~` to `IndentStripped`; this path must agree.
1921    let (heredoc_type, marker_len) = if skip_whitespace == 0 {
1922        match trimmed.as_bytes().first() {
1923            Some(b'-') => (HeredocType::TabStripped, 1),
1924            Some(b'~') => (HeredocType::IndentStripped, 1),
1925            _ => (HeredocType::Standard, 0),
1926        }
1927    } else {
1928        (HeredocType::Standard, 0)
1929    };
1930
1931    let after_marker = &trimmed[marker_len..];
1932    let after_marker_trimmed = after_marker.trim_start_matches([' ', '\t']);
1933    let inter_whitespace = after_marker.len() - after_marker_trimmed.len();
1934    let delim_chars = after_marker_trimmed;
1935
1936    // Handle quoted delimiters
1937    let (delimiter, delim_len) = if let Some(stripped) = delim_chars.strip_prefix('"') {
1938        // Find closing quote
1939        let end = stripped.find('"')?;
1940        let (body, _) = stripped.split_at(end);
1941        (body.to_string(), end + 2)
1942    } else if let Some(stripped) = delim_chars.strip_prefix('\'') {
1943        // Find closing quote
1944        let end = stripped.find('\'')?;
1945        let (body, _) = stripped.split_at(end);
1946        (body.to_string(), end + 2)
1947    } else {
1948        // Unquoted - extract word
1949        let end = delim_chars
1950            .find(|c: char| c.is_whitespace() || c == '\n' || c == ';' || c == '&' || c == '|')
1951            .unwrap_or(delim_chars.len());
1952        if end == 0 {
1953            return None;
1954        }
1955        (delim_chars[..end].to_string(), end)
1956    };
1957
1958    // Calculate total offset to body start (skip to newline)
1959    let total_delim_offset = skip_whitespace + marker_len + inter_whitespace + delim_len;
1960    let remaining = &after_op[total_delim_offset..];
1961
1962    // Find the newline that starts the body
1963    let newline_offset = remaining.find('\n').map_or(remaining.len(), |i| i + 1);
1964
1965    Some((delimiter, total_delim_offset + newline_offset, heredoc_type))
1966}
1967
1968/// Find the end of a heredoc body (position after the terminating delimiter line).
1969fn find_heredoc_terminator(
1970    command: &str,
1971    body_start: usize,
1972    delimiter: &str,
1973    heredoc_type: HeredocType,
1974) -> Option<usize> {
1975    if body_start >= command.len() {
1976        return None;
1977    }
1978
1979    let body = &command[body_start..];
1980    let mut line_start = 0;
1981
1982    for line in body.split_inclusive('\n') {
1983        let trimmed = match heredoc_type {
1984            HeredocType::TabStripped => line.trim_start_matches('\t'),
1985            HeredocType::IndentStripped => line.trim_start(),
1986            HeredocType::Standard | HeredocType::HereString => line,
1987        };
1988
1989        let line_content = trimmed.trim_end_matches(['\n', '\r']);
1990
1991        if line_content == delimiter {
1992            // Found terminator - return position after this line
1993            return Some(body_start + line_start + line.len());
1994        }
1995
1996        line_start += line.len();
1997    }
1998
1999    None
2000}
2001
/// Find the byte span of a here-string's content.
///
/// `after_operator` is the offset in `command` just past the `<<<` operator.
/// Returns `(content_start, content_end)` as absolute offsets into `command`:
///
/// * Quoted content (`'...'` or `"..."`): the span covers the content
///   *including* both quote characters. Inside double quotes a backslash
///   escapes the following byte.
/// * Unquoted content: the span covers the word up to (not including) the next
///   whitespace or `;`, `&`, `|`, `)`.
///
/// Returns `None` when nothing follows the operator on the same line, when the
/// unquoted word is empty, or when an opening quote is never closed.
fn find_herestring_content_bounds(command: &str, after_operator: usize) -> Option<(usize, usize)> {
    if after_operator >= command.len() {
        return None;
    }

    let tail = command[after_operator..].as_bytes();

    // Blanks between `<<<` and the content are allowed; a newline is not.
    let lead = tail
        .iter()
        .take_while(|byte| byte.is_ascii_whitespace() && **byte != b'\n')
        .count();
    let first = *tail.get(lead)?;
    if first == b'\n' {
        return None;
    }

    if matches!(first, b'\'' | b'"') {
        let mut idx = lead + 1;
        while let Some(&byte) = tail.get(idx) {
            if byte == first {
                // Span runs from the opening quote through the closing quote.
                return Some((after_operator + lead, after_operator + idx + 1));
            }
            // Only double quotes honour backslash escapes.
            let escaped_pair = first == b'"' && byte == b'\\' && idx + 1 < tail.len();
            idx += if escaped_pair { 2 } else { 1 };
        }
        // Unterminated quote: the scan consumed the rest of the input, so there
        // is no span to report.
        return None;
    }

    let word_len = tail[lead..]
        .iter()
        .take_while(|&&byte| {
            !(byte.is_ascii_whitespace() || matches!(byte, b';' | b'&' | b'|' | b')'))
        })
        .count();

    (word_len > 0).then_some((
        after_operator + lead,
        after_operator + lead + word_len,
    ))
}
2063
/// Extract the body of a heredoc, finding the terminating delimiter.
///
/// `start` is a byte offset into `command` at (or just before the newline
/// preceding) the first body line; a single leading `\n` at `start` is skipped.
/// Lines are compared against `delimiter` after stripping the line ending
/// (`\n`, then one `\r`) and, depending on `heredoc_type`, leading tabs
/// (`TabStripped`) or all leading whitespace (`IndentStripped`).
///
/// On success returns `(content, terminator_end, body_start_abs, body_end_abs)`:
///
/// * `content` - body lines joined with `\n` (CR characters removed), with
///   leading tabs removed for `TabStripped` and the common leading indentation
///   removed for `IndentStripped`.
/// * `terminator_end` - absolute offset just past the terminator text,
///   excluding the terminator's own line ending.
/// * `body_start_abs` - absolute offset of the first body byte.
/// * `body_end_abs` - absolute offset where the body ends, excluding the line
///   ending (`\n` or `\r\n`) that precedes the terminator line.
///
/// # Errors
///
/// * `SkipReason::MalformedInput` - `start` is past the end of `command`.
/// * `SkipReason::Timeout` - `start_time.elapsed()` reached `timeout` while scanning.
/// * `SkipReason::ExceededSizeLimit` - body bytes exceeded `limits.max_body_bytes`.
/// * `SkipReason::ExceededLineLimit` - body lines exceeded `limits.max_body_lines`.
/// * `SkipReason::UnterminatedHeredoc` - no terminator line was found.
fn extract_heredoc_body(
    command: &str,
    start: usize,
    delimiter: &str,
    heredoc_type: HeredocType,
    limits: &ExtractionLimits,
    start_time: Instant,
    timeout: Duration,
) -> Result<(String, usize, usize, usize), SkipReason> {
    if start > command.len() {
        return Err(SkipReason::MalformedInput {
            reason: "heredoc start offset out of bounds".to_string(),
        });
    }

    let remaining = &command[start..];

    // Skip leading newline if present (heredoc body starts on next line)
    let body_start_offset = usize::from(remaining.starts_with('\n'));
    let body_start = &remaining[body_start_offset..];
    let body_start_abs = start + body_start_offset;

    let mut body_lines: Vec<&str> = Vec::new();
    let mut total_bytes: usize = 0;
    let mut cursor: usize = 0; // offset within body_start

    for part in body_start.split_inclusive('\n') {
        // Enforce timeout inside the loop (a single heredoc can be large).
        if start_time.elapsed() >= timeout {
            let elapsed_ms = u64::try_from(start_time.elapsed().as_millis()).unwrap_or(u64::MAX);
            return Err(SkipReason::Timeout {
                elapsed_ms,
                budget_ms: limits.timeout_ms,
            });
        }

        let line = part.strip_suffix('\n').unwrap_or(part);
        // Normalize CRLF line endings so terminator detection works cross-platform and so extracted
        // code doesn't include stray '\r' characters (which can break AST parsing).
        let line = line.strip_suffix('\r').unwrap_or(line);

        // Check if this line is the terminator
        let trimmed = match heredoc_type {
            HeredocType::TabStripped => line.trim_start_matches('\t'),
            HeredocType::IndentStripped => line.trim_start(),
            HeredocType::Standard | HeredocType::HereString => line,
        };

        if trimmed == delimiter {
            // End position should be accurate in the ORIGINAL command (including any indentation
            // before the delimiter). We intentionally exclude the newline after the terminator.
            let terminator_start = body_start_abs + cursor;
            let terminator_end = terminator_start + line.len();
            // Walk back over the `\n` (and an optional preceding `\r`) that separates the last
            // body line from the terminator; an empty body leaves this at `body_start_abs`.
            let mut body_end_abs = terminator_start;
            if body_end_abs > body_start_abs {
                let bytes = command.as_bytes();
                if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\n') {
                    body_end_abs = body_end_abs.saturating_sub(1);
                    if bytes.get(body_end_abs.saturating_sub(1)) == Some(&b'\r') {
                        body_end_abs = body_end_abs.saturating_sub(1);
                    }
                }
            }

            let content = match heredoc_type {
                HeredocType::TabStripped => body_lines
                    .iter()
                    .map(|l| l.trim_start_matches('\t'))
                    .collect::<Vec<_>>()
                    .join("\n"),
                HeredocType::IndentStripped => {
                    // Compute the common leading-whitespace prefix in BYTES
                    // and then walk each line back to a char boundary
                    // before slicing. The naive `&l[min_indent..]` slice
                    // panics when a line's `min_indent`-th byte falls in
                    // the middle of a multi-byte UTF-8 codepoint — which
                    // happens when one line uses ASCII spaces while
                    // another uses a multi-byte whitespace such as NBSP
                    // (`\u{00A0}`, 2 bytes) or the ideographic space
                    // (`\u{3000}`, 3 bytes). Under `panic = "abort"` (the
                    // release profile) such a panic crashes the hook
                    // process, which AGENTS.md forbids — the hook must
                    // fail open. If the boundary doesn't line up we fall
                    // back to `trim_start()` for that line, which is the
                    // conservative interpretation (strip ALL of its
                    // leading whitespace).
                    //
                    // Whitespace-only lines are ignored when computing the
                    // minimum so they cannot force it down to zero.
                    let min_indent = body_lines
                        .iter()
                        .filter(|l| !l.trim().is_empty())
                        .map(|l| l.len() - l.trim_start().len())
                        .min()
                        .unwrap_or(0);

                    body_lines
                        .iter()
                        .map(|l| {
                            if l.len() >= min_indent && l.is_char_boundary(min_indent) {
                                &l[min_indent..]
                            } else {
                                l.trim_start()
                            }
                        })
                        .collect::<Vec<_>>()
                        .join("\n")
                }
                HeredocType::Standard | HeredocType::HereString => body_lines.join("\n"),
            };

            return Ok((content, terminator_end, body_start_abs, body_end_abs));
        }

        // Enforce limits (fail-open by returning a specific skip reason).
        // These run only for non-terminator lines, so the terminator line itself
        // never counts towards the byte or line budget.
        total_bytes = total_bytes.saturating_add(part.len());
        if total_bytes > limits.max_body_bytes {
            return Err(SkipReason::ExceededSizeLimit {
                actual: total_bytes,
                limit: limits.max_body_bytes,
            });
        }

        if body_lines.len() >= limits.max_body_lines {
            return Err(SkipReason::ExceededLineLimit {
                actual: body_lines.len() + 1,
                limit: limits.max_body_lines,
            });
        }

        body_lines.push(line);
        cursor = cursor.saturating_add(part.len());
    }

    // Input ran out before any line matched the delimiter.
    Err(SkipReason::UnterminatedHeredoc {
        delimiter: delimiter.to_string(),
    })
}
2200
2201// ============================================================================
2202// Shell Command Extraction for Evaluator Integration (git_safety_guard-uau)
2203// ============================================================================
2204
2205use ast_grep_core::AstGrep;
2206use ast_grep_language::SupportLang;
2207
/// Extracted shell command with position info for evaluator integration.
///
/// Each command represents a simple command invocation that can be
/// fed to the evaluator for destructive pattern matching. Instances are
/// produced by [`extract_shell_commands`]; all positions refer to the content
/// string passed to that function.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedShellCommand {
    /// The command's text as reported by its AST node.
    pub text: String,
    /// Byte offset in the original content where the command starts.
    pub start: usize,
    /// Byte offset where the command ends (the end of the node's range;
    /// presumably exclusive — TODO confirm against the AST library).
    pub end: usize,
    /// 1-based line number of `start`, computed as one plus the number of
    /// `\n` characters that precede it.
    pub line_number: usize,
}
2223
2224/// Extract executable shell commands from heredoc/script content.
2225///
2226/// This function parses shell content using tree-sitter-bash (via ast-grep)
2227/// and extracts individual commands that should be evaluated against the
2228/// main evaluator pipeline. This keeps all destructive knowledge in packs
2229/// rather than duplicating rules for heredoc content.
2230///
2231/// # What gets extracted
2232///
2233/// - Simple commands: `rm -rf /path`, `git reset --hard`
2234/// - Pipe sources and targets: commands on either side of `|`
2235/// - Commands inside command substitutions: contents of `$(...)`
2236/// - Commands inside subshells: contents of `(...)`
2237///
2238/// # What does NOT get extracted (false positive avoidance)
2239///
2240/// - Comments: `# rm -rf / dangerous` is NOT executed
2241/// - String literals in echo/printf: content inside quotes is data, not execution
2242/// - Heredoc delimiters themselves
2243///
2244/// # Performance
2245///
2246/// Uses ast-grep for parsing which is very fast (<2ms for typical heredocs).
2247/// No timeout is enforced here as the AST matcher already has its own timeout.
2248///
2249/// # Examples
2250///
2251/// ```ignore
2252/// use destructive_command_guard::heredoc::extract_shell_commands;
2253///
2254/// // Simple command
2255/// let commands = extract_shell_commands("rm -rf /tmp/test");
2256/// assert_eq!(commands.len(), 1);
2257/// assert_eq!(commands[0].text, "rm -rf /tmp/test");
2258///
2259/// // Pipeline - both sides extracted
2260/// let commands = extract_shell_commands("find . | xargs rm");
2261/// assert_eq!(commands.len(), 2);
2262///
2263/// // Comment - not extracted
2264/// let commands = extract_shell_commands("# rm -rf / dangerous");
2265/// assert_eq!(commands.len(), 0);
2266/// ```
2267#[must_use]
2268#[instrument(skip(content), fields(content_len = content.len()))]
2269pub fn extract_shell_commands(content: &str) -> Vec<ExtractedShellCommand> {
2270    if content.trim().is_empty() {
2271        trace!("extract_shell_commands: empty content");
2272        return Vec::new();
2273    }
2274
2275    let start = Instant::now();
2276    let ast = AstGrep::new(content, SupportLang::Bash);
2277    let root = ast.root();
2278
2279    let mut commands = Vec::new();
2280
2281    // Walk the AST to find command nodes
2282    // tree-sitter-bash uses "command" nodes for simple commands
2283    collect_commands_recursive(root, content, &mut commands);
2284
2285    debug!(
2286        elapsed_us = start.elapsed().as_micros(),
2287        count = commands.len(),
2288        "extract_shell_commands: AST analysis complete"
2289    );
2290    commands
2291}
2292
2293/// Recursively collect command nodes from the AST.
2294///
2295/// Walks the tree looking for "command" nodes (simple commands in bash).
2296/// Recurses into all child nodes to find nested commands, including:
2297/// - Command substitutions: `$(cmd)`
2298/// - Subshells: `(cmd)`
2299/// - Pipelines, command lists, loops, conditionals, etc.
2300#[allow(clippy::needless_pass_by_value)]
2301fn collect_commands_recursive<D: ast_grep_core::Doc>(
2302    node: ast_grep_core::Node<'_, D>,
2303    content: &str,
2304    commands: &mut Vec<ExtractedShellCommand>,
2305) {
2306    let kind = node.kind();
2307
2308    // "command" in tree-sitter-bash is a simple command
2309    if kind == "command" {
2310        let range = node.range();
2311        let text = node.text().to_string();
2312
2313        // Skip empty commands
2314        if !text.trim().is_empty() {
2315            let line_number = content[..range.start].matches('\n').count() + 1;
2316
2317            commands.push(ExtractedShellCommand {
2318                text,
2319                start: range.start,
2320                end: range.end,
2321                line_number,
2322            });
2323        }
2324    }
2325
2326    // Recurse into all children to find nested commands
2327    // This handles:
2328    // - Pipelines: `cmd1 | cmd2` has command children
2329    // - Command lists: `cmd1 && cmd2` has command children
2330    // - Command substitution: `$(cmd)` contains command
2331    // - Subshells: `(cmd)` contains command
2332    for child in node.children() {
2333        collect_commands_recursive(child, content, commands);
2334    }
2335}
2336
2337// ============================================================================
2338// Tests
2339// ============================================================================
2340
2341#[cfg(test)]
2342mod tests {
2343    use super::*;
2344    #[allow(unused_imports)]
2345    use proptest::prelude::*;
2346
2347    // ========================================================================
2348    // Tier 1: Trigger Detection Tests
2349    // ========================================================================
2350
2351    mod tier1_triggers {
2352        use super::*;
2353
2354        #[test]
2355        fn no_trigger_on_safe_commands() {
2356            // Common safe commands should NOT trigger
2357            let safe_commands = [
2358                "git status",
2359                "ls -la",
2360                "cargo build",
2361                "npm install",
2362                "docker ps",
2363                "kubectl get pods",
2364                "cat file.txt",
2365                "echo hello",
2366                "grep pattern file",
2367                "find . -name '*.rs'",
2368            ];
2369
2370            for cmd in safe_commands {
2371                assert_eq!(
2372                    check_triggers(cmd),
2373                    TriggerResult::NoTrigger,
2374                    "should not trigger on: {cmd}"
2375                );
2376            }
2377        }
2378
2379        #[test]
2380        fn triggers_on_heredoc_basic() {
2381            // Basic heredoc forms
2382            let heredocs = [
2383                "cat << EOF",
2384                "cat <<EOF",
2385                "cat << 'EOF'",
2386                r#"cat << "EOF""#,
2387                "cat <<- EOF",       // Tab-stripping heredoc
2388                "mysql <<< 'query'", // Here-string
2389            ];
2390
2391            for cmd in heredocs {
2392                assert_eq!(
2393                    check_triggers(cmd),
2394                    TriggerResult::Triggered,
2395                    "should trigger on heredoc: {cmd}"
2396                );
2397            }
2398        }
2399
2400        #[test]
2401        fn triggers_on_python_inline() {
2402            let python_commands = [
2403                "python -c 'import os'",
2404                "python3 -c 'import os'",
2405                "python -I -c 'import os'",
2406                "python3 -I -c 'import os'",
2407                "python -e 'print(1)'",
2408                "python3 -e 'print(1)'",
2409            ];
2410
2411            for cmd in python_commands {
2412                assert_eq!(
2413                    check_triggers(cmd),
2414                    TriggerResult::Triggered,
2415                    "should trigger on python inline: {cmd}"
2416                );
2417            }
2418        }
2419
2420        #[test]
2421        fn triggers_on_versioned_interpreters() {
2422            // Tier 1 MUST have zero false negatives - versioned interpreters must trigger
2423            let versioned_commands = [
2424                // Python versions
2425                "python3.11 -c 'import os'",
2426                "python3.12.1 -c 'import os'",
2427                "python3.9 -e 'print(1)'",
2428                // Ruby versions
2429                "ruby3.0 -e 'puts 1'",
2430                "ruby3.2.1 -e 'exit'",
2431                // Perl versions
2432                "perl5.36 -e 'print 1'",
2433                "perl5.38.2 -E 'say 1'",
2434                // Node versions
2435                "node18 -e 'console.log(1)'",
2436                "node20.1 -e 'console.log(1)'",
2437                "nodejs18 -e 'console.log(1)'",
2438                "nodejs20.10.0 -e 'test'",
2439            ];
2440
2441            for cmd in versioned_commands {
2442                assert_eq!(
2443                    check_triggers(cmd),
2444                    TriggerResult::Triggered,
2445                    "should trigger on versioned interpreter: {cmd}"
2446                );
2447            }
2448        }
2449
2450        #[test]
2451        fn triggers_on_ruby_inline() {
2452            let ruby_commands = ["ruby -e 'puts 1'", "ruby -w -e 'puts 1'", "irb -e 'exit'"];
2453
2454            for cmd in ruby_commands {
2455                assert_eq!(
2456                    check_triggers(cmd),
2457                    TriggerResult::Triggered,
2458                    "should trigger on ruby inline: {cmd}"
2459                );
2460            }
2461        }
2462
2463        #[test]
2464        fn triggers_on_perl_inline() {
2465            let perl_commands = [
2466                "perl -e 'print 1'",
2467                "perl -E 'say 1'", // Modern Perl
2468                "perl -pi -e 'print 1'",
2469            ];
2470
2471            for cmd in perl_commands {
2472                assert_eq!(
2473                    check_triggers(cmd),
2474                    TriggerResult::Triggered,
2475                    "should trigger on perl inline: {cmd}"
2476                );
2477            }
2478        }
2479
2480        #[test]
2481        fn triggers_on_node_inline() {
2482            let node_commands = [
2483                "node -e 'console.log(1)'",
2484                "node -p 'process.version'",
2485                "node -pe 'process.version'",
2486            ];
2487
2488            for cmd in node_commands {
2489                assert_eq!(
2490                    check_triggers(cmd),
2491                    TriggerResult::Triggered,
2492                    "should trigger on node inline: {cmd}"
2493                );
2494            }
2495        }
2496
2497        #[test]
2498        fn triggers_on_shell_inline() {
2499            let shell_commands = [
2500                "bash -c 'echo hello'",
2501                "bash -l -c 'echo hello'",
2502                "bash -lc 'echo hello'",
2503                "bash --noprofile --norc -c 'echo hello'",
2504                "sh -c 'ls'",
2505                "zsh -c 'pwd'",
2506                "fish -c 'echo hello'",
2507            ];
2508
2509            for cmd in shell_commands {
2510                assert_eq!(
2511                    check_triggers(cmd),
2512                    TriggerResult::Triggered,
2513                    "should trigger on shell inline: {cmd}"
2514                );
2515            }
2516        }
2517
2518        #[test]
2519        fn triggers_on_xargs() {
2520            let xargs_commands = [
2521                "find . -name '*.bak' | xargs rm",
2522                "ls | xargs -I {} echo {}",
2523                "cat files.txt | xargs -n1 process",
2524            ];
2525
2526            for cmd in xargs_commands {
2527                assert_eq!(
2528                    check_triggers(cmd),
2529                    TriggerResult::Triggered,
2530                    "should trigger on xargs: {cmd}"
2531                );
2532            }
2533        }
2534
2535        #[test]
2536        fn triggers_on_piped_execution() {
2537            let piped_commands = [
2538                "echo 'print(1)' | python",
2539                "cat script.py | python3",
2540                "echo 'puts 1' | ruby",
2541                "echo 'print 1' | perl",
2542                "echo 'console.log(1)' | node",
2543                "echo 'echo hello' | bash",
2544                "echo 'ls' | sh",
2545            ];
2546
2547            for cmd in piped_commands {
2548                assert_eq!(
2549                    check_triggers(cmd),
2550                    TriggerResult::Triggered,
2551                    "should trigger on piped execution: {cmd}"
2552                );
2553            }
2554        }
2555
2556        #[test]
2557        fn triggers_on_eval_exec() {
2558            let eval_commands = [
2559                r#"eval "dangerous code""#,
2560                "eval 'dangerous code'",
2561                r#"exec "command""#,
2562                "exec 'command'",
2563            ];
2564
2565            for cmd in eval_commands {
2566                assert_eq!(
2567                    check_triggers(cmd),
2568                    TriggerResult::Triggered,
2569                    "should trigger on eval/exec: {cmd}"
2570                );
2571            }
2572        }
2573
2574        #[test]
2575        fn matched_triggers_returns_indices() {
2576            // Should return the indices of matching patterns
2577            let matches = matched_triggers("python -c 'test'");
2578            assert!(!matches.is_empty(), "should have matches for python -c");
2579
2580            let no_matches = matched_triggers("git status");
2581            assert!(
2582                no_matches.is_empty(),
2583                "should have no matches for git status"
2584            );
2585        }
2586
2587        #[test]
2588        fn heredoc_syntax_inside_quoted_literals_does_not_trigger() {
2589            // Common false positives: heredoc syntax used as documentation or search patterns.
2590            let commands = [
2591                r#"git commit -m "docs: example heredoc: cat <<EOF rm -rf / EOF""#,
2592                r#"rg "<<EOF" README.md"#,
2593                "echo 'cat <<EOF (docs only)'",
2594            ];
2595
2596            for cmd in commands {
2597                assert_eq!(
2598                    check_triggers(cmd),
2599                    TriggerResult::NoTrigger,
2600                    "should not trigger on quoted literal heredoc syntax: {cmd}"
2601                );
2602            }
2603        }
2604
2605        #[test]
2606        fn heredoc_inside_command_substitution_with_outer_quotes_still_triggers() {
2607            // `$(...)` is executed even when the outer word is double-quoted.
2608            let cmd = "echo \"$(cat <<EOF\nrm -rf /\nEOF)\"";
2609            assert_eq!(check_triggers(cmd), TriggerResult::Triggered);
2610        }
2611
        // Property: zero false negatives — whenever content extraction would
        // find something, trigger detection MUST fire. The test cases above
        // cover this by example; a property test that cross-checks
        // `check_triggers` against Tier 2 extraction would verify it
        // mechanically.
2616    }
2617
2618    // ========================================================================
2619    // Tier 2: Content Extraction Tests
2620    // ========================================================================
2621
2622    mod tier2_extraction {
2623        use super::*;
2624
2625        #[test]
2626        fn extraction_limits_default() {
2627            let limits = ExtractionLimits::default();
2628            assert_eq!(limits.max_body_bytes, 1024 * 1024);
2629            assert_eq!(limits.max_body_lines, 10_000);
2630            assert_eq!(limits.max_heredocs, 10);
2631            assert_eq!(limits.timeout_ms, 50);
2632        }
2633
2634        #[test]
2635        fn extracts_inline_script_single_quotes() {
2636            let result = extract_content("python -c 'import os'", &ExtractionLimits::default());
2637            if let ExtractionResult::Extracted(contents) = result {
2638                assert_eq!(contents.len(), 1);
2639                assert_eq!(contents[0].content, "import os");
2640                assert_eq!(contents[0].language, ScriptLanguage::Python);
2641                assert!(contents[0].quoted);
2642            } else {
2643                panic!("Expected Extracted result");
2644            }
2645        }
2646
2647        #[test]
2648        fn extracts_inline_script_double_quotes() {
2649            let result = extract_content(r#"bash -c "echo hello""#, &ExtractionLimits::default());
2650            if let ExtractionResult::Extracted(contents) = result {
2651                assert_eq!(contents.len(), 1);
2652                assert_eq!(contents[0].content, "echo hello");
2653                assert_eq!(contents[0].language, ScriptLanguage::Bash);
2654            } else {
2655                panic!("Expected Extracted result");
2656            }
2657        }
2658
2659        #[test]
2660        fn extracts_inline_script_with_intervening_flags() {
2661            let result = extract_content("python -I -c 'import os'", &ExtractionLimits::default());
2662            if let ExtractionResult::Extracted(contents) = result {
2663                assert_eq!(contents.len(), 1);
2664                assert_eq!(contents[0].content, "import os");
2665                assert_eq!(contents[0].language, ScriptLanguage::Python);
2666                assert!(contents[0].quoted);
2667            } else {
2668                panic!("Expected Extracted result");
2669            }
2670        }
2671
2672        #[test]
2673        fn extracts_inline_script_with_combined_shell_flags() {
2674            let result = extract_content("bash -lc 'echo hello'", &ExtractionLimits::default());
2675            if let ExtractionResult::Extracted(contents) = result {
2676                assert_eq!(contents.len(), 1);
2677                assert_eq!(contents[0].content, "echo hello");
2678                assert_eq!(contents[0].language, ScriptLanguage::Bash);
2679            } else {
2680                panic!("Expected Extracted result");
2681            }
2682        }
2683
2684        #[test]
2685        fn extracts_inline_script_with_combined_node_flags() {
2686            let result =
2687                extract_content("node -pe 'process.version'", &ExtractionLimits::default());
2688            if let ExtractionResult::Extracted(contents) = result {
2689                assert_eq!(contents.len(), 1);
2690                assert_eq!(contents[0].content, "process.version");
2691                assert_eq!(contents[0].language, ScriptLanguage::JavaScript);
2692            } else {
2693                panic!("Expected Extracted result");
2694            }
2695        }
2696
2697        #[test]
2698        fn extracts_inline_script_with_interleaved_perl_flags() {
2699            let result = extract_content("perl -pi -e 'print 1'", &ExtractionLimits::default());
2700            if let ExtractionResult::Extracted(contents) = result {
2701                assert_eq!(contents.len(), 1);
2702                assert_eq!(contents[0].content, "print 1");
2703                assert_eq!(contents[0].language, ScriptLanguage::Perl);
2704            } else {
2705                panic!("Expected Extracted result");
2706            }
2707        }
2708
2709        /// #125: Codex on Windows executes shell commands as
2710        /// `powershell.exe -Command '<inner>'`. dcg must descend into the
2711        /// `-Command` body and re-evaluate it as a shell command (mapped to
2712        /// `ScriptLanguage::Bash`) so destructive inner commands are caught.
2713        #[test]
2714        fn extracts_powershell_command_body() {
2715            // Bare host name, single-quoted body.
2716            let result = extract_content(
2717                "powershell -Command 'echo hi'",
2718                &ExtractionLimits::default(),
2719            );
2720            if let ExtractionResult::Extracted(contents) = result {
2721                assert_eq!(contents.len(), 1);
2722                assert_eq!(contents[0].content, "echo hi");
2723                assert_eq!(contents[0].language, ScriptLanguage::Bash);
2724            } else {
2725                panic!("Expected Extracted result for `powershell -Command '...'`");
2726            }
2727        }
2728
2729        #[test]
2730        fn extracts_powershell_exe_command_body_double_quotes() {
2731            let result = extract_content(
2732                r#"powershell.exe -Command "echo hi""#,
2733                &ExtractionLimits::default(),
2734            );
2735            if let ExtractionResult::Extracted(contents) = result {
2736                assert_eq!(contents.len(), 1);
2737                assert_eq!(contents[0].content, "echo hi");
2738                assert_eq!(contents[0].language, ScriptLanguage::Bash);
2739            } else {
2740                panic!("Expected Extracted result for `powershell.exe -Command \"...\"`");
2741            }
2742        }
2743
2744        #[test]
2745        fn extracts_pwsh_short_flag_body() {
2746            // PowerShell accepts `-c` as an abbreviation of `-Command`.
2747            let result = extract_content("pwsh -c 'echo hi'", &ExtractionLimits::default());
2748            if let ExtractionResult::Extracted(contents) = result {
2749                assert_eq!(contents.len(), 1);
2750                assert_eq!(contents[0].content, "echo hi");
2751                assert_eq!(contents[0].language, ScriptLanguage::Bash);
2752            } else {
2753                panic!("Expected Extracted result for `pwsh -c '...'`");
2754            }
2755        }
2756
2757        #[test]
2758        fn extracts_powershell_quoted_full_path_body() {
2759            // Codex's exact Windows command_execution shape: a quoted absolute
2760            // path to powershell.exe followed by -Command and the inner command.
2761            let cmd = "\"C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\powershell.exe\" -Command 'echo hi'";
2762            let result = extract_content(cmd, &ExtractionLimits::default());
2763            if let ExtractionResult::Extracted(contents) = result {
2764                assert!(
2765                    contents
2766                        .iter()
2767                        .any(|c| c.content == "echo hi" && c.language == ScriptLanguage::Bash),
2768                    "expected to extract the -Command body from a quoted powershell.exe path; got {contents:?}"
2769                );
2770            } else {
2771                panic!("Expected Extracted result for quoted-full-path powershell.exe -Command");
2772            }
2773        }
2774
2775        #[test]
2776        fn extracts_here_string() {
2777            let result = extract_content("cat <<< 'hello world'", &ExtractionLimits::default());
2778            if let ExtractionResult::Extracted(contents) = result {
2779                assert_eq!(contents.len(), 1);
2780                assert_eq!(contents[0].content, "hello world");
2781                assert_eq!(contents[0].heredoc_type, Some(HeredocType::HereString));
2782            } else {
2783                panic!("Expected Extracted result");
2784            }
2785        }
2786
2787        #[test]
2788        fn extracts_heredoc_basic() {
2789            let cmd = "cat << EOF\nline1\nline2\nEOF";
2790            let result = extract_content(cmd, &ExtractionLimits::default());
2791            if let ExtractionResult::Extracted(contents) = result {
2792                assert_eq!(contents.len(), 1);
2793                assert_eq!(contents[0].content, "line1\nline2");
2794                assert_eq!(contents[0].delimiter, Some("EOF".to_string()));
2795                assert_eq!(contents[0].heredoc_type, Some(HeredocType::Standard));
2796            } else {
2797                panic!("Expected Extracted result, got {result:?}");
2798            }
2799        }
2800
2801        #[test]
2802        fn extracts_heredoc_ignores_trailing_tokens_on_delimiter_line() {
2803            let cmd = "python3 <<EOF | cat\nimport shutil\nshutil.rmtree('/tmp/test')\nEOF";
2804            let result = extract_content(cmd, &ExtractionLimits::default());
2805            if let ExtractionResult::Extracted(contents) = result {
2806                assert_eq!(contents.len(), 1);
2807                assert_eq!(contents[0].language, ScriptLanguage::Python);
2808                assert_eq!(
2809                    contents[0].content,
2810                    "import shutil\nshutil.rmtree('/tmp/test')"
2811                );
2812            } else {
2813                panic!("Expected Extracted result, got {result:?}");
2814            }
2815        }
2816
2817        #[test]
2818        fn extracts_heredoc_with_crlf_line_endings() {
2819            let cmd = "cat <<EOF\r\nline1\r\nEOF\r\n";
2820            let result = extract_content(cmd, &ExtractionLimits::default());
2821            if let ExtractionResult::Extracted(contents) = result {
2822                assert_eq!(contents.len(), 1);
2823                assert_eq!(contents[0].content, "line1");
2824                assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
2825            } else {
2826                panic!("Expected Extracted result, got {result:?}");
2827            }
2828        }
2829
2830        #[test]
2831        fn extracts_heredoc_tab_stripped() {
2832            let cmd = "cat <<- EOF\n\tline1\n\tline2\nEOF";
2833            let result = extract_content(cmd, &ExtractionLimits::default());
2834            if let ExtractionResult::Extracted(contents) = result {
2835                assert_eq!(contents.len(), 1);
2836                // Tab-stripping removes leading tabs
2837                assert_eq!(contents[0].content, "line1\nline2");
2838                assert_eq!(contents[0].heredoc_type, Some(HeredocType::TabStripped));
2839            } else {
2840                panic!("Expected Extracted result");
2841            }
2842        }
2843
2844        #[test]
2845        fn extracts_heredoc_indent_stripped() {
2846            // Indentation-stripping heredoc (<<~) should:
2847            // - accept an indented terminator
2848            // - strip the minimum common indentation from non-empty lines
2849            let cmd = "cat <<~ EOF\n    line1\n    line2\n    EOF";
2850            let result = extract_content(cmd, &ExtractionLimits::default());
2851            if let ExtractionResult::Extracted(contents) = result {
2852                assert_eq!(contents.len(), 1);
2853                assert_eq!(contents[0].content, "line1\nline2");
2854                assert_eq!(contents[0].heredoc_type, Some(HeredocType::IndentStripped));
2855            } else {
2856                panic!("Expected Extracted result, got {result:?}");
2857            }
2858        }
2859
2860        #[test]
2861        fn indent_stripped_heredoc_does_not_panic_on_multibyte_whitespace() {
2862            // Regression: <<~ stripped `min_indent` BYTES off each line.
2863            // If one line uses ASCII spaces (1 byte each) and another uses
2864            // a multi-byte whitespace char (NBSP = 2 bytes, U+3000 = 3
2865            // bytes), the byte offset can land in the middle of a UTF-8
2866            // codepoint and panic the slice. Under release `panic = "abort"`
2867            // that crashes the hook process — a fail-open violation.
2868            //
2869            // Each of these inputs would previously have triggered a
2870            // `byte index N is not a char boundary` panic; after the fix
2871            // they all extract successfully (with the conservative
2872            // fallback of `trim_start()` on lines whose byte offset
2873            // doesn't align to a char boundary).
2874            let cases: &[&str] = &[
2875                // ASCII line + NBSP-prefixed line. min_indent in bytes
2876                // would be 2 (NBSP); slicing the 4-space line at byte 2
2877                // is char-aligned so this case is safe — but the
2878                // ideographic-space variant below is not.
2879                "cat <<~ EOF\n  line1\n\u{00A0}line2\n  EOF",
2880                // ASCII + ideographic space. U+3000 is 3 bytes; min_indent
2881                // could be 2 (the ASCII line) and slicing `\u{3000}f` at
2882                // byte 2 lands inside the codepoint.
2883                "cat <<~ EOF\n  line1\n\u{3000}foo\n  EOF",
2884                // Two multi-byte whitespace lines with different sequence
2885                // lengths. min_indent picks the shorter byte-count; the
2886                // longer-prefixed line's byte offset misaligns.
2887                "cat <<~ EOF\n\u{00A0}line1\n\u{3000}line2\nEOF",
2888            ];
2889            for cmd in cases {
2890                let result = extract_content(cmd, &ExtractionLimits::default());
2891                // Whether content is "Extracted" or "NoContent" depends on
2892                // what the upstream parser did; the only invariant we care
2893                // about is "no panic, returns a value." Using a method
2894                // call ensures we touch the result.
2895                let _ = format!("{result:?}");
2896            }
2897        }
2898
2899        #[test]
2900        fn extracts_heredoc_quoted_delimiter_sets_quoted_flag() {
2901            // Quoted delimiter suppresses expansion in real shells; we track this for context.
2902            let cmd = "cat << 'EOF'\nline1\nEOF";
2903            let result = extract_content(cmd, &ExtractionLimits::default());
2904            if let ExtractionResult::Extracted(contents) = result {
2905                assert_eq!(contents.len(), 1);
2906                assert_eq!(contents[0].content, "line1");
2907                assert_eq!(contents[0].delimiter.as_deref(), Some("EOF"));
2908                assert!(contents[0].quoted, "quoted delimiter must set quoted=true");
2909            } else {
2910                panic!("Expected Extracted result, got {result:?}");
2911            }
2912
2913            let cmd = "cat << EOF\nline1\nEOF";
2914            let result = extract_content(cmd, &ExtractionLimits::default());
2915            if let ExtractionResult::Extracted(contents) = result {
2916                assert_eq!(contents.len(), 1);
2917                assert!(
2918                    !contents[0].quoted,
2919                    "unquoted delimiter must set quoted=false"
2920                );
2921            } else {
2922                panic!("Expected Extracted result, got {result:?}");
2923            }
2924        }
2925
2926        // Regression test for issue #109: bash accepts `<<- 'EOF'` (with a
2927        // space after the `-` tab-strip marker). Before the fix, the
2928        // delimiter parser fell through to the unquoted branch with a
2929        // leading space and bailed, leaving the heredoc body unmasked so
2930        // pack matching denied dangerous-looking prose like "gh repo
2931        // delete" inside `cat <<- 'EOF'`. All four spaced/non-spaced and
2932        // single/double-quoted forms must extract the same delimiter.
2933        #[test]
2934        fn extracts_heredoc_tab_stripped_quoted_with_space_after_dash() {
2935            for (form, cmd) in [
2936                ("<<-'EOF'", "cat <<-'EOF'\n\tgh repo delete\n\tEOF"),
2937                ("<<- 'EOF'", "cat <<- 'EOF'\n\tgh repo delete\n\tEOF"),
2938                ("<<-\"EOF\"", "cat <<-\"EOF\"\n\tgh repo delete\n\tEOF"),
2939                ("<<- \"EOF\"", "cat <<- \"EOF\"\n\tgh repo delete\n\tEOF"),
2940                ("<<~ 'EOF'", "cat <<~ 'EOF'\n\tgh repo delete\n\tEOF"),
2941            ] {
2942                let result = extract_content(cmd, &ExtractionLimits::default());
2943                let ExtractionResult::Extracted(contents) = result else {
2944                    panic!("Expected extraction for {form}, got {result:?}");
2945                };
2946                assert_eq!(
2947                    contents.len(),
2948                    1,
2949                    "{form}: expected single heredoc extraction"
2950                );
2951                assert_eq!(
2952                    contents[0].delimiter.as_deref(),
2953                    Some("EOF"),
2954                    "{form}: delimiter must parse to EOF"
2955                );
2956                assert!(
2957                    contents[0].quoted,
2958                    "{form}: quoted delimiter must set quoted=true"
2959                );
2960            }
2961        }
2962
2963        // Reviewer-eyes catch from the #109 follow-up: bash treats whitespace
2964        // before the marker character as a hard divider, so `cat << -EOF`
2965        // (note the space *before* the dash) is a Standard heredoc whose
2966        // delimiter is the literal `-EOF`, not a tab-stripped heredoc with
2967        // delimiter `EOF`. Pre-fix the parser would mis-classify, the
2968        // terminator search would look for a line `EOF` rather than `-EOF`,
2969        // and the heredoc body would either run past the real terminator
2970        // or never close. The `~` variant cannot reach this path because
2971        // the unquoted-delimiter regex char class is `[\w.-]+` (no tilde),
2972        // so `<< ~FOO` is rejected by the regex before parse_heredoc_delimiter
2973        // runs — only the dash variant is reachable.
2974        #[test]
2975        fn parses_dash_after_space_as_part_of_unquoted_delimiter() {
2976            let cmd = "cat << -EOF\nbody line\n-EOF";
2977            let result = extract_content(cmd, &ExtractionLimits::default());
2978            let ExtractionResult::Extracted(contents) = result else {
2979                panic!("Expected extraction, got {result:?}");
2980            };
2981            assert_eq!(contents.len(), 1, "expected single heredoc extraction");
2982            assert_eq!(
2983                contents[0].delimiter.as_deref(),
2984                Some("-EOF"),
2985                "delimiter must include the leading dash when there is whitespace before it"
2986            );
2987            assert!(
2988                !contents[0].quoted,
2989                "unquoted delimiter must set quoted=false"
2990            );
2991        }
2992
2993        // The mask path (`mask_non_executing_heredocs`) and the regex
2994        // extraction path (`extract_heredocs`) must agree on heredoc type.
2995        // The extractor maps `<<~` -> IndentStripped; if the masker maps
2996        // it to TabStripped instead, a space-indented terminator like
2997        // `  EOF` is never recognized (TabStripped only trims `\t`), the
2998        // body escapes masking, and pack matching produces false positives
2999        // on prose like `rm -rf /` inside `cat <<~EOF` documentation.
3000        #[test]
3001        fn masks_indent_stripped_heredoc_body_with_space_indented_terminator() {
3002            let cmd = "cat <<~EOF\n  rm -rf /\n  EOF";
3003            let masked = mask_non_executing_heredocs(cmd);
3004            assert!(
3005                matches!(masked, std::borrow::Cow::Owned(_)),
3006                "expected the body to be masked (Cow::Owned), got Borrowed: {masked:?}"
3007            );
3008            assert!(
3009                !masked.contains("rm -rf /"),
3010                "masked output still contains body: {masked:?}"
3011            );
3012            // The spaced-quoted form must mask too — same path with extra
3013            // whitespace between the marker and the delimiter (issue #109
3014            // coverage).
3015            let cmd = "cat <<~ 'EOF'\n  rm -rf /\n  EOF";
3016            let masked = mask_non_executing_heredocs(cmd);
3017            assert!(
3018                matches!(masked, std::borrow::Cow::Owned(_)),
3019                "expected the body to be masked (Cow::Owned), got Borrowed: {masked:?}"
3020            );
3021            assert!(
3022                !masked.contains("rm -rf /"),
3023                "masked output still contains body: {masked:?}"
3024            );
3025        }
3026
3027        #[test]
3028        fn heredoc_language_detects_interpreter_prefixes() {
3029            // Regression test: heredoc bodies must not default to Bash when the interpreter is explicit.
3030            let cases = [
3031                ("python3 <<EOF\nprint('hello')\nEOF", ScriptLanguage::Python),
3032                (
3033                    "node <<EOF\nconsole.log('hello');\nEOF",
3034                    ScriptLanguage::JavaScript,
3035                ),
3036                ("ruby <<EOF\nputs 'hello'\nEOF", ScriptLanguage::Ruby),
3037                ("perl <<EOF\nprint \"hello\";\nEOF", ScriptLanguage::Perl),
3038                ("bash <<EOF\necho hello\nEOF", ScriptLanguage::Bash),
3039            ];
3040
3041            for (cmd, expected) in cases {
3042                let result = extract_content(cmd, &ExtractionLimits::default());
3043                if let ExtractionResult::Extracted(contents) = result {
3044                    assert_eq!(
3045                        contents.len(),
3046                        1,
3047                        "expected one heredoc extraction for: {cmd}"
3048                    );
3049                    assert_eq!(
3050                        contents[0].language, expected,
3051                        "expected language {expected:?} for heredoc: {cmd}"
3052                    );
3053                } else {
3054                    panic!("Expected Extracted result for heredoc: {cmd}, got {result:?}");
3055                }
3056            }
3057        }
3058
3059        #[test]
3060        fn heredoc_language_detects_shebang_when_command_unknown() {
3061            let cmd = "cat <<EOF\n#!/usr/bin/env python3\nimport os\nprint('hi')\nEOF";
3062            let result = extract_content(cmd, &ExtractionLimits::default());
3063            if let ExtractionResult::Extracted(contents) = result {
3064                assert_eq!(contents.len(), 1);
3065                assert_eq!(contents[0].language, ScriptLanguage::Python);
3066            } else {
3067                panic!("Expected Extracted result, got {result:?}");
3068            }
3069        }
3070
3071        #[test]
3072        fn extracts_empty_heredoc() {
3073            // Empty heredoc is valid - body is empty but terminator is found
3074            let cmd = "cat << EOF\nEOF";
3075            let result = extract_content(cmd, &ExtractionLimits::default());
3076            if let ExtractionResult::Extracted(contents) = result {
3077                assert_eq!(contents.len(), 1);
3078                assert_eq!(contents[0].content, "");
3079                assert_eq!(contents[0].delimiter, Some("EOF".to_string()));
3080            } else {
3081                panic!("Expected Extracted result for empty heredoc, got {result:?}");
3082            }
3083        }
3084
3085        #[test]
3086        fn heredoc_byte_range_is_correct() {
3087            // Test non-empty heredoc byte_range
3088            let cmd = "python << END\nprint(1)\nEND";
3089            let result = extract_content(cmd, &ExtractionLimits::default());
3090            if let ExtractionResult::Extracted(contents) = result {
3091                assert_eq!(contents.len(), 1);
3092                assert_eq!(contents[0].language, ScriptLanguage::Python);
3093                let range = &contents[0].byte_range;
3094                // byte_range should cover from "<< END" to the final "END"
3095                let extracted_span = &cmd[range.clone()];
3096                assert_eq!(extracted_span, "<< END\nprint(1)\nEND");
3097            } else {
3098                panic!("Expected Extracted result");
3099            }
3100
3101            // Test empty heredoc byte_range
3102            let cmd = "cat << EOF\nEOF";
3103            let result = extract_content(cmd, &ExtractionLimits::default());
3104            if let ExtractionResult::Extracted(contents) = result {
3105                assert_eq!(contents.len(), 1);
3106                let range = &contents[0].byte_range;
3107                let extracted_span = &cmd[range.clone()];
3108                assert_eq!(extracted_span, "<< EOF\nEOF");
3109            } else {
3110                panic!("Expected Extracted result");
3111            }
3112
3113            // Test multi-line heredoc byte_range
3114            let cmd = "cat << EOF\nline1\nline2\nEOF";
3115            let result = extract_content(cmd, &ExtractionLimits::default());
3116            if let ExtractionResult::Extracted(contents) = result {
3117                assert_eq!(contents.len(), 1);
3118                let range = &contents[0].byte_range;
3119                let extracted_span = &cmd[range.clone()];
3120                assert_eq!(extracted_span, "<< EOF\nline1\nline2\nEOF");
3121            } else {
3122                panic!("Expected Extracted result");
3123            }
3124        }
3125
3126        #[test]
3127        fn extracts_here_string_with_nested_quotes() {
3128            // Here-string with double quotes inside single quotes
3129            let result = extract_content(
3130                r#"cat <<< 'hello "world" test'"#,
3131                &ExtractionLimits::default(),
3132            );
3133            if let ExtractionResult::Extracted(contents) = result {
3134                assert_eq!(contents.len(), 1);
3135                assert_eq!(contents[0].content, r#"hello "world" test"#);
3136                assert!(contents[0].quoted);
3137            } else {
3138                panic!("Expected Extracted result");
3139            }
3140
3141            // Here-string with single quotes inside double quotes
3142            let result = extract_content(
3143                r#"cat <<< "hello 'world' test""#,
3144                &ExtractionLimits::default(),
3145            );
3146            if let ExtractionResult::Extracted(contents) = result {
3147                assert_eq!(contents.len(), 1);
3148                assert_eq!(contents[0].content, "hello 'world' test");
3149                assert!(contents[0].quoted);
3150            } else {
3151                panic!("Expected Extracted result");
3152            }
3153        }
3154
3155        #[test]
3156        fn from_command_does_not_false_positive() {
3157            // These should NOT be detected as interpreters
3158            assert_eq!(
3159                ScriptLanguage::from_command("shebang"),
3160                ScriptLanguage::Unknown
3161            );
3162            assert_eq!(
3163                ScriptLanguage::from_command("shell"),
3164                ScriptLanguage::Unknown
3165            );
3166            assert_eq!(
3167                ScriptLanguage::from_command("pythonic"),
3168                ScriptLanguage::Unknown
3169            );
3170            assert_eq!(
3171                ScriptLanguage::from_command("nodemon"),
3172                ScriptLanguage::Unknown
3173            );
3174            assert_eq!(
3175                ScriptLanguage::from_command("perldoc"),
3176                ScriptLanguage::Unknown
3177            );
3178            assert_eq!(
3179                ScriptLanguage::from_command("bashful"),
3180                ScriptLanguage::Unknown
3181            );
3182        }
3183
3184        #[test]
3185        fn from_command_matches_versioned_interpreters() {
3186            // These SHOULD be detected with version suffixes
3187            assert_eq!(
3188                ScriptLanguage::from_command("python3"),
3189                ScriptLanguage::Python
3190            );
3191            assert_eq!(
3192                ScriptLanguage::from_command("python3.11"),
3193                ScriptLanguage::Python
3194            );
3195            assert_eq!(
3196                ScriptLanguage::from_command("python3.11.4"),
3197                ScriptLanguage::Python
3198            );
3199            assert_eq!(
3200                ScriptLanguage::from_command("node18"),
3201                ScriptLanguage::JavaScript
3202            );
3203            assert_eq!(ScriptLanguage::from_command("perl5"), ScriptLanguage::Perl);
3204        }
3205
3206        #[test]
3207        fn no_content_on_safe_command() {
3208            let result = extract_content("git status", &ExtractionLimits::default());
3209            assert!(matches!(result, ExtractionResult::NoContent));
3210        }
3211
3212        #[test]
3213        fn script_language_from_command() {
3214            assert_eq!(
3215                ScriptLanguage::from_command("python3"),
3216                ScriptLanguage::Python
3217            );
3218            assert_eq!(ScriptLanguage::from_command("ruby"), ScriptLanguage::Ruby);
3219            assert_eq!(ScriptLanguage::from_command("perl"), ScriptLanguage::Perl);
3220            assert_eq!(
3221                ScriptLanguage::from_command("node"),
3222                ScriptLanguage::JavaScript
3223            );
3224            assert_eq!(ScriptLanguage::from_command("bash"), ScriptLanguage::Bash);
3225            assert_eq!(
3226                ScriptLanguage::from_command("unknown"),
3227                ScriptLanguage::Unknown
3228            );
3229        }
3230
3231        // =========================================================================
3232        // Language detection tests (git_safety_guard-du4)
3233        // =========================================================================
3234
3235        #[test]
3236        fn from_shebang_detects_direct_path() {
3237            assert_eq!(
3238                ScriptLanguage::from_shebang("#!/bin/bash\necho hello"),
3239                Some(ScriptLanguage::Bash)
3240            );
3241            assert_eq!(
3242                ScriptLanguage::from_shebang("#!/usr/bin/python\nimport os"),
3243                Some(ScriptLanguage::Python)
3244            );
3245            assert_eq!(
3246                ScriptLanguage::from_shebang("#!/usr/bin/ruby\nputs 'hi'"),
3247                Some(ScriptLanguage::Ruby)
3248            );
3249        }
3250
3251        #[test]
3252        fn from_shebang_detects_env_path() {
3253            assert_eq!(
3254                ScriptLanguage::from_shebang("#!/usr/bin/env python3\nimport sys"),
3255                Some(ScriptLanguage::Python)
3256            );
3257            assert_eq!(
3258                ScriptLanguage::from_shebang("#!/usr/bin/env node\nconsole.log('hi')"),
3259                Some(ScriptLanguage::JavaScript)
3260            );
3261            assert_eq!(
3262                ScriptLanguage::from_shebang("#!/usr/bin/env perl\nprint 'hello'"),
3263                Some(ScriptLanguage::Perl)
3264            );
3265        }
3266
3267        #[test]
3268        fn from_shebang_returns_none_for_invalid() {
3269            // No shebang
3270            assert_eq!(ScriptLanguage::from_shebang("import os"), None);
3271            // Empty shebang
3272            assert_eq!(ScriptLanguage::from_shebang("#!\ncode"), None);
3273            // Unknown interpreter
3274            assert_eq!(
3275                ScriptLanguage::from_shebang("#!/usr/bin/unknown\ncode"),
3276                None
3277            );
3278        }
3279
3280        #[test]
3281        fn from_shebang_ignores_interpreter_flags() {
3282            // Direct path with flags
3283            assert_eq!(
3284                ScriptLanguage::from_shebang("#!/bin/bash -e\nset -x"),
3285                Some(ScriptLanguage::Bash)
3286            );
3287            assert_eq!(
3288                ScriptLanguage::from_shebang("#!/bin/bash -ex\necho hello"),
3289                Some(ScriptLanguage::Bash)
3290            );
3291            assert_eq!(
3292                ScriptLanguage::from_shebang("#!/usr/bin/python3 -u\nimport sys"),
3293                Some(ScriptLanguage::Python)
3294            );
3295
3296            // Env-style with flags
3297            assert_eq!(
3298                ScriptLanguage::from_shebang("#!/usr/bin/env python3 -u\nimport sys"),
3299                Some(ScriptLanguage::Python)
3300            );
3301            assert_eq!(
3302                ScriptLanguage::from_shebang("#!/usr/bin/env bash -e\necho hi"),
3303                Some(ScriptLanguage::Bash)
3304            );
3305            assert_eq!(
3306                ScriptLanguage::from_shebang("#!/usr/bin/env ruby -w\nputs 'hi'"),
3307                Some(ScriptLanguage::Ruby)
3308            );
3309        }
3310
3311        #[test]
3312        fn from_shebang_handles_env_flags() {
3313            // env -S splits remaining arguments (GNU coreutils 8.30+)
3314            assert_eq!(
3315                ScriptLanguage::from_shebang("#!/usr/bin/env -S python3 -u\nimport sys"),
3316                Some(ScriptLanguage::Python)
3317            );
3318            assert_eq!(
3319                ScriptLanguage::from_shebang("#!/usr/bin/env -S bash -e\necho hi"),
3320                Some(ScriptLanguage::Bash)
3321            );
3322
3323            // env -i ignores environment
3324            assert_eq!(
3325                ScriptLanguage::from_shebang("#!/usr/bin/env -i python3\nimport os"),
3326                Some(ScriptLanguage::Python)
3327            );
3328
3329            // Multiple env flags
3330            assert_eq!(
3331                ScriptLanguage::from_shebang("#!/usr/bin/env -i -S perl -w\nuse strict;"),
3332                Some(ScriptLanguage::Perl)
3333            );
3334        }
3335
3336        #[test]
3337        fn from_content_detects_python() {
3338            assert_eq!(
3339                ScriptLanguage::from_content("import os\nos.remove('file')"),
3340                Some(ScriptLanguage::Python)
3341            );
3342            assert_eq!(
3343                ScriptLanguage::from_content("from pathlib import Path\nPath('x').unlink()"),
3344                Some(ScriptLanguage::Python)
3345            );
3346        }
3347
3348        #[test]
3349        fn from_content_detects_javascript() {
3350            assert_eq!(
3351                ScriptLanguage::from_content("const fs = require('fs');\nfs.rm('x');"),
3352                Some(ScriptLanguage::JavaScript)
3353            );
3354            assert_eq!(
3355                ScriptLanguage::from_content("let x = 5;\nconsole.log(x);"),
3356                Some(ScriptLanguage::JavaScript)
3357            );
3358        }
3359
3360        #[test]
3361        fn from_content_detects_typescript() {
3362            assert_eq!(
3363                ScriptLanguage::from_content("const x: string = 'hello';"),
3364                Some(ScriptLanguage::TypeScript)
3365            );
3366            assert_eq!(
3367                ScriptLanguage::from_content("interface User { name: string }"),
3368                Some(ScriptLanguage::TypeScript)
3369            );
3370        }
3371
3372        #[test]
3373        fn from_content_detects_ruby() {
3374            // Ruby needs 'end' to reduce false positives
3375            assert_eq!(
3376                ScriptLanguage::from_content("def hello\n  puts 'hi'\nend"),
3377                Some(ScriptLanguage::Ruby)
3378            );
3379            assert_eq!(
3380                ScriptLanguage::from_content("require 'fileutils'\nFileUtils.rm_rf('x')\nend"),
3381                Some(ScriptLanguage::Ruby)
3382            );
3383        }
3384
3385        #[test]
3386        fn from_content_detects_perl() {
3387            assert_eq!(
3388                ScriptLanguage::from_content("use strict;\nmy $x = 5;"),
3389                Some(ScriptLanguage::Perl)
3390            );
3391            assert_eq!(
3392                ScriptLanguage::from_content("my @arr = (1,2,3);"),
3393                Some(ScriptLanguage::Perl)
3394            );
3395        }
3396
3397        #[test]
3398        fn from_content_detects_bash() {
3399            assert_eq!(
3400                ScriptLanguage::from_content("if [ -f file ]; then\n  echo 'exists'\nfi"),
3401                Some(ScriptLanguage::Bash)
3402            );
3403            assert_eq!(
3404                ScriptLanguage::from_content("x=$((1+2))\necho ${x}"),
3405                Some(ScriptLanguage::Bash)
3406            );
3407        }
3408
3409        #[test]
3410        fn from_content_returns_none_for_unknown() {
3411            assert_eq!(ScriptLanguage::from_content("hello world"), None);
3412            assert_eq!(ScriptLanguage::from_content(""), None);
3413        }
3414
3415        #[test]
3416        fn detect_uses_command_prefix_first() {
3417            // Even with Python shebang, command should take precedence
3418            let (lang, confidence) =
3419                ScriptLanguage::detect("ruby -e 'code'", "#!/usr/bin/python\nimport os");
3420            assert_eq!(lang, ScriptLanguage::Ruby);
3421            assert_eq!(confidence, DetectionConfidence::CommandPrefix);
3422        }
3423
3424        #[test]
3425        fn detect_uses_shebang_second() {
3426            // No command interpreter, but has shebang
3427            let (lang, confidence) =
3428                ScriptLanguage::detect("cat script.sh", "#!/bin/bash\necho hello");
3429            assert_eq!(lang, ScriptLanguage::Bash);
3430            assert_eq!(confidence, DetectionConfidence::Shebang);
3431        }
3432
3433        #[test]
3434        fn detect_uses_content_heuristics_third() {
3435            // No command interpreter, no shebang, but has Python imports
3436            let (lang, confidence) =
3437                ScriptLanguage::detect("cat script", "import os\nos.remove('x')");
3438            assert_eq!(lang, ScriptLanguage::Python);
3439            assert_eq!(confidence, DetectionConfidence::ContentHeuristics);
3440        }
3441
3442        #[test]
3443        fn detect_returns_unknown_for_unrecognized() {
3444            let (lang, confidence) = ScriptLanguage::detect("cat file.txt", "hello world");
3445            assert_eq!(lang, ScriptLanguage::Unknown);
3446            assert_eq!(confidence, DetectionConfidence::Unknown);
3447        }
3448
3449        #[test]
3450        fn detect_handles_env_prefix() {
3451            let (lang, confidence) = ScriptLanguage::detect("env python3 -c 'code'", "");
3452            assert_eq!(lang, ScriptLanguage::Python);
3453            assert_eq!(confidence, DetectionConfidence::CommandPrefix);
3454        }
3455
3456        #[test]
3457        fn detect_handles_absolute_path() {
3458            let (lang, confidence) = ScriptLanguage::detect("/usr/bin/python3 -c 'code'", "");
3459            assert_eq!(lang, ScriptLanguage::Python);
3460            assert_eq!(confidence, DetectionConfidence::CommandPrefix);
3461        }
3462
3463        #[test]
3464        fn detection_confidence_labels() {
3465            assert_eq!(DetectionConfidence::CommandPrefix.label(), "command-prefix");
3466            assert_eq!(DetectionConfidence::Shebang.label(), "shebang");
3467            assert_eq!(
3468                DetectionConfidence::ContentHeuristics.label(),
3469                "content-heuristics"
3470            );
3471            assert_eq!(DetectionConfidence::Unknown.label(), "unknown");
3472        }
3473
3474        #[test]
3475        fn detection_confidence_reasons() {
3476            assert!(
3477                DetectionConfidence::CommandPrefix
3478                    .reason()
3479                    .contains("highest")
3480            );
3481            assert!(DetectionConfidence::Shebang.reason().contains("high"));
3482            assert!(
3483                DetectionConfidence::ContentHeuristics
3484                    .reason()
3485                    .contains("lower")
3486            );
3487            assert!(DetectionConfidence::Unknown.reason().contains("could not"));
3488        }
3489
3490        #[test]
3491        fn enforces_max_body_bytes() {
3492            let large_content = "x".repeat(2_000_000); // 2MB
3493            let cmd = format!("python -c '{large_content}'");
3494            let limits = ExtractionLimits {
3495                max_body_bytes: 1_000_000, // 1MB limit
3496                ..Default::default()
3497            };
3498            let result = extract_content(&cmd, &limits);
3499            // Should return Skipped with size limit reason
3500            match result {
3501                ExtractionResult::Skipped(reasons) => {
3502                    assert!(
3503                        reasons
3504                            .iter()
3505                            .any(|r| matches!(r, SkipReason::ExceededSizeLimit { .. }))
3506                    );
3507                }
3508                ExtractionResult::NoContent
3509                | ExtractionResult::Failed(_)
3510                | ExtractionResult::Partial { .. } => {}
3511                ExtractionResult::Extracted(contents) => {
3512                    // If extracted, content should be within limits
3513                    for c in contents {
3514                        assert!(c.content.len() <= limits.max_body_bytes);
3515                    }
3516                }
3517            }
3518        }
3519
3520        #[test]
3521        fn extracts_multiple_inline_scripts() {
3522            let cmd = "python -c 'code1' && ruby -e 'code2'";
3523            let result = extract_content(cmd, &ExtractionLimits::default());
3524            if let ExtractionResult::Extracted(contents) = result {
3525                assert_eq!(contents.len(), 2);
3526                assert_eq!(contents[0].content, "code1");
3527                assert_eq!(contents[1].content, "code2");
3528            } else {
3529                panic!("Expected Extracted result");
3530            }
3531        }
3532
3533        #[test]
3534        fn extracts_versioned_interpreter_scripts() {
3535            // Tier 2 must extract content from versioned interpreters
3536            let cmd = "python3.11 -c 'import os' && nodejs18 -e 'console.log(1)'";
3537            let result = extract_content(cmd, &ExtractionLimits::default());
3538            if let ExtractionResult::Extracted(contents) = result {
3539                assert_eq!(contents.len(), 2, "should extract both scripts");
3540                assert_eq!(contents[0].content, "import os");
3541                assert_eq!(contents[0].language, ScriptLanguage::Python);
3542                assert_eq!(contents[1].content, "console.log(1)");
3543                assert_eq!(contents[1].language, ScriptLanguage::JavaScript);
3544            } else {
3545                panic!("Expected Extracted result for versioned interpreters, got {result:?}");
3546            }
3547        }
3548
3549        // ====================================================================
3550        // Robustness Tests (git_safety_guard-rbst)
3551        // ====================================================================
3552
3553        #[test]
3554        fn skips_binary_content_with_null_bytes() {
3555            // Content with null bytes should be detected as binary
3556            let cmd = "python -c '\x00binary\x00content'";
3557            if let Some(reason) = check_binary_content(cmd) {
3558                assert!(
3559                    matches!(reason, SkipReason::BinaryContent { null_bytes, .. } if null_bytes > 0)
3560                );
3561            } else {
3562                panic!("Expected binary content detection");
3563            }
3564        }
3565
3566        #[test]
3567        fn skips_binary_content_high_non_printable() {
3568            // Content with high ratio of non-printable bytes
3569            let binary_bytes: Vec<u8> = (0u8..50).chain(200u8..255).collect();
3570            let binary_str = String::from_utf8_lossy(&binary_bytes);
3571            if let Some(reason) = check_binary_content(&binary_str) {
3572                assert!(matches!(reason, SkipReason::BinaryContent { .. }));
3573            } else {
3574                panic!("Expected binary content detection for high non-printable ratio");
3575            }
3576        }
3577
3578        #[test]
3579        fn allows_normal_text_content() {
3580            let normal_content = "import os\nprint('hello world')\nfor i in range(10): pass";
3581            assert!(check_binary_content(normal_content).is_none());
3582        }
3583
3584        #[test]
3585        fn tracks_unterminated_heredoc() {
3586            let cmd = "cat << EOF\nunterminated content without closing delimiter";
3587            let result = extract_content(cmd, &ExtractionLimits::default());
3588            match result {
3589                ExtractionResult::Skipped(reasons) => {
3590                    assert!(
3591                        reasons
3592                            .iter()
3593                            .any(|r| matches!(r, SkipReason::UnterminatedHeredoc { .. })),
3594                        "should report UnterminatedHeredoc, not ExceededSizeLimit"
3595                    );
3596                }
3597                _ => panic!("Expected Skipped result for unterminated heredoc"),
3598            }
3599        }
3600
3601        #[test]
3602        fn heredoc_body_line_limit_reports_exceeded_line_limit() {
3603            let cmd = "cat << EOF\nline1\nline2\nline3\nEOF";
3604            let limits = ExtractionLimits {
3605                max_body_lines: 2,
3606                ..Default::default()
3607            };
3608
3609            let result = extract_content(cmd, &limits);
3610            match result {
3611                ExtractionResult::Skipped(reasons) => {
3612                    assert!(
3613                        reasons
3614                            .iter()
3615                            .any(|r| matches!(r, SkipReason::ExceededLineLimit { .. })),
3616                        "should report ExceededLineLimit, not UnterminatedHeredoc"
3617                    );
3618                }
3619                _ => panic!("Expected Skipped result for line-limited heredoc, got {result:?}"),
3620            }
3621        }
3622
3623        #[test]
3624        fn extraction_timeout_is_enforced() {
3625            let cmd = "cat << EOF\nline1\nEOF";
3626            let limits = ExtractionLimits {
3627                timeout_ms: 0,
3628                ..Default::default()
3629            };
3630
3631            let result = extract_content(cmd, &limits);
3632            match result {
3633                ExtractionResult::Skipped(reasons) => {
3634                    assert!(
3635                        reasons
3636                            .iter()
3637                            .any(|r| matches!(r, SkipReason::Timeout { .. })),
3638                        "should include a Timeout skip reason"
3639                    );
3640                }
3641                _ => panic!("Expected Skipped(timeout) result, got {result:?}"),
3642            }
3643        }
3644
3645        #[test]
3646        fn enforces_heredoc_limit() {
3647            // Create a command with many heredocs
3648            let cmd = "cmd1 << A\na\nA && cmd2 << B\nb\nB && cmd3 << C\nc\nC";
3649            let limits = ExtractionLimits {
3650                max_heredocs: 2, // Only allow 2
3651                ..Default::default()
3652            };
3653            let result = extract_content(cmd, &limits);
3654            if let ExtractionResult::Extracted(contents) = result {
3655                assert!(contents.len() <= limits.max_heredocs);
3656            }
3657            // Otherwise, skip result is also acceptable
3658        }
3659
3660        #[test]
3661        fn skip_reason_display() {
3662            // Test Display implementations
3663            let reasons = vec![
3664                SkipReason::ExceededSizeLimit {
3665                    actual: 2000,
3666                    limit: 1000,
3667                },
3668                SkipReason::ExceededLineLimit {
3669                    actual: 200,
3670                    limit: 100,
3671                },
3672                SkipReason::ExceededHeredocLimit { limit: 10 },
3673                SkipReason::BinaryContent {
3674                    null_bytes: 5,
3675                    non_printable_ratio: 0.5,
3676                },
3677                SkipReason::Timeout {
3678                    elapsed_ms: 60,
3679                    budget_ms: 50,
3680                },
3681                SkipReason::UnterminatedHeredoc {
3682                    delimiter: "EOF".to_string(),
3683                },
3684                SkipReason::MalformedInput {
3685                    reason: "test".to_string(),
3686                },
3687            ];
3688
3689            for reason in reasons {
3690                let display = format!("{reason}");
3691                assert!(!display.is_empty(), "Display should produce output");
3692            }
3693        }
3694
3695        #[test]
3696        fn empty_command_returns_no_content() {
3697            let result = extract_content("", &ExtractionLimits::default());
3698            assert!(matches!(result, ExtractionResult::NoContent));
3699        }
3700
3701        #[test]
3702        fn whitespace_only_returns_no_content() {
3703            let result = extract_content("   \t\n  ", &ExtractionLimits::default());
3704            assert!(matches!(result, ExtractionResult::NoContent));
3705        }
3706    }
3707
3708    // ========================================================================
3709    // Shell Command Extraction Tests (git_safety_guard-uau)
3710    // ========================================================================
3711
3712    mod shell_extraction {
3713        use super::*;
3714
3715        // ====================================================================
3716        // Positive fixtures: commands that MUST be extracted
3717        // ====================================================================
3718
3719        #[test]
3720        fn extracts_simple_command() {
3721            let commands = extract_shell_commands("ls -la");
3722            assert_eq!(commands.len(), 1);
3723            assert_eq!(commands[0].text, "ls -la");
3724            assert_eq!(commands[0].line_number, 1);
3725        }
3726
3727        #[test]
3728        fn extracts_rm_rf() {
3729            // Catastrophic command - must be extracted for evaluator
3730            let commands = extract_shell_commands("rm -rf /tmp/test");
3731            assert_eq!(commands.len(), 1);
3732            assert_eq!(commands[0].text, "rm -rf /tmp/test");
3733        }
3734
3735        #[test]
3736        fn extracts_git_reset_hard() {
3737            let commands = extract_shell_commands("git reset --hard");
3738            assert_eq!(commands.len(), 1);
3739            assert_eq!(commands[0].text, "git reset --hard");
3740        }
3741
3742        #[test]
3743        fn extracts_git_clean_fd() {
3744            let commands = extract_shell_commands("git clean -fd");
3745            assert_eq!(commands.len(), 1);
3746            assert_eq!(commands[0].text, "git clean -fd");
3747        }
3748
3749        #[test]
3750        fn extracts_pipeline_both_sides() {
3751            // Both sides of a pipe are executed
3752            let commands = extract_shell_commands("find . -name '*.bak' | xargs rm");
3753            assert_eq!(commands.len(), 2, "pipeline should extract both commands");
3754            assert!(commands[0].text.starts_with("find"));
3755            assert!(commands[1].text.contains("xargs"));
3756        }
3757
3758        #[test]
3759        fn extracts_command_list() {
3760            // Commands separated by && or ;
3761            let commands = extract_shell_commands("cd /tmp && rm -rf test");
3762            assert_eq!(commands.len(), 2, "command list should extract both");
3763        }
3764
3765        #[test]
3766        fn extracts_command_substitution() {
3767            // Commands inside $(...) are executed
3768            let commands = extract_shell_commands("echo $(rm -rf /tmp/test)");
3769            assert!(
3770                commands.len() >= 2,
3771                "should extract command inside substitution"
3772            );
3773            // Should find the rm command inside the substitution
3774            assert!(
3775                commands.iter().any(|c| c.text.contains("rm")),
3776                "should extract rm from command substitution"
3777            );
3778        }
3779
3780        #[test]
3781        fn extracts_subshell_commands() {
3782            // Commands inside (...) subshells are executed
3783            let commands = extract_shell_commands("(cd /tmp && rm -rf test)");
3784            assert!(commands.len() >= 2, "should extract commands from subshell");
3785        }
3786
3787        #[test]
3788        fn extracts_multiline_script() {
3789            let script = r#"#!/bin/bash
3790set -e
3791cd /tmp
3792rm -rf test
3793echo "done""#;
3794            let commands = extract_shell_commands(script);
3795            assert!(
3796                commands.len() >= 4,
3797                "should extract all commands from multiline script"
3798            );
3799            // Should have rm command
3800            assert!(
3801                commands.iter().any(|c| c.text.contains("rm")),
3802                "should extract rm"
3803            );
3804        }
3805
3806        #[test]
3807        fn extracts_docker_system_prune() {
3808            // Docker destructive commands (if pack enabled)
3809            let commands = extract_shell_commands("docker system prune -af");
3810            assert_eq!(commands.len(), 1);
3811            assert_eq!(commands[0].text, "docker system prune -af");
3812        }
3813
3814        #[test]
3815        fn line_numbers_are_correct() {
3816            let script = "echo first\nrm -rf /tmp\necho last";
3817            let commands = extract_shell_commands(script);
3818            assert!(commands.len() >= 3);
3819
3820            let rm_cmd = commands.iter().find(|c| c.text.contains("rm")).unwrap();
3821            assert_eq!(rm_cmd.line_number, 2, "rm should be on line 2");
3822        }
3823
3824        // ====================================================================
3825        // Negative fixtures: content that must NOT be extracted as commands
3826        // ====================================================================
3827
3828        #[test]
3829        fn skips_comments() {
3830            // Comments mentioning dangerous commands should NOT be extracted
3831            // tree-sitter-bash parses "# ..." as a comment node, not a command node
3832            let commands = extract_shell_commands("# rm -rf / would be bad");
3833            assert!(
3834                commands.is_empty(),
3835                "comment-only content should produce zero commands, got: {commands:?}"
3836            );
3837        }
3838
3839        #[test]
3840        fn echo_string_is_data_not_execution() {
3841            // The string inside echo is data, not a command
3842            let commands = extract_shell_commands("echo 'rm -rf /'");
3843            // Should extract echo, but not the rm inside the string
3844            assert!(
3845                commands.len() == 1,
3846                "should only extract echo, not the string content"
3847            );
3848            // The command should be the echo, not rm
3849            assert!(
3850                commands[0].text.starts_with("echo"),
3851                "extracted command should be echo"
3852            );
3853        }
3854
3855        #[test]
3856        fn printf_string_is_data_not_execution() {
3857            let commands = extract_shell_commands(r#"printf "rm -rf %s" /tmp"#);
3858            assert!(
3859                commands.len() == 1,
3860                "should only extract printf, not the format string content"
3861            );
3862            assert!(commands[0].text.starts_with("printf"));
3863        }
3864
3865        #[test]
3866        fn empty_content_returns_no_commands() {
3867            let commands = extract_shell_commands("");
3868            assert!(commands.is_empty());
3869        }
3870
3871        #[test]
3872        fn whitespace_only_returns_no_commands() {
3873            let commands = extract_shell_commands("   \n\t  ");
3874            assert!(commands.is_empty());
3875        }
3876
3877        #[test]
3878        fn comment_only_returns_no_commands() {
3879            // tree-sitter-bash parses "# ..." as a comment node, not a command node
3880            let commands = extract_shell_commands("# This is just a comment");
3881            assert!(
3882                commands.is_empty(),
3883                "comment-only content should produce zero commands, got: {commands:?}"
3884            );
3885        }
3886
3887        #[test]
3888        fn heredoc_delimiter_is_not_command() {
3889            // The EOF itself is not a command, and heredoc body content is DATA not commands
3890            let script = r"cat << EOF
3891some content
3892rm -rf / mentioned in text
3893EOF";
3894            let commands = extract_shell_commands(script);
3895
3896            // Should extract cat command
3897            assert!(
3898                commands.iter().any(|c| c.text.starts_with("cat")),
3899                "should extract cat command"
3900            );
3901
3902            // CRITICAL: heredoc body content must NOT be extracted as commands
3903            // The "rm -rf /" text inside the heredoc is DATA, not an executable command
3904            let rm_commands: Vec<_> = commands
3905                .iter()
3906                .filter(|c| c.text.contains("rm") && !c.text.contains("cat"))
3907                .collect();
3908            assert!(
3909                rm_commands.is_empty(),
3910                "heredoc body content must NOT be extracted as commands, but found: {rm_commands:?}"
3911            );
3912        }
3913
3914        #[test]
3915        fn safe_tmp_cleanup_is_extracted() {
3916            // Policy says /tmp cleanup might be allowed - but we still extract it
3917            // for the evaluator to decide based on pack rules/allowlists
3918            let commands = extract_shell_commands("rm -rf /tmp/build_cache");
3919            assert_eq!(commands.len(), 1);
3920            // Extraction happens - policy decision is for evaluator
3921        }
3922
3923        // ====================================================================
3924        // Edge cases and robustness
3925        // ====================================================================
3926
3927        #[test]
3928        fn handles_complex_pipeline() {
3929            let commands = extract_shell_commands("cat file | grep pattern | wc -l");
3930            assert_eq!(commands.len(), 3, "should extract all pipeline stages");
3931        }
3932
3933        #[test]
3934        fn handles_background_command() {
3935            let commands = extract_shell_commands("long_process &");
3936            assert_eq!(commands.len(), 1);
3937            assert_eq!(commands[0].text, "long_process");
3938        }
3939
3940        #[test]
3941        fn handles_redirections() {
3942            let commands = extract_shell_commands("rm -rf /tmp/test > /dev/null 2>&1");
3943            assert_eq!(commands.len(), 1);
3944            // The command text includes redirections
3945            assert!(commands[0].text.contains("rm"));
3946        }
3947
3948        #[test]
3949        fn handles_variable_expansion_in_command() {
3950            // Commands with variables should still be extracted
3951            let commands = extract_shell_commands("rm -rf $DIR");
3952            assert_eq!(commands.len(), 1);
3953            assert!(commands[0].text.contains("rm"));
3954        }
3955
3956        #[test]
3957        fn handles_if_then_else() {
3958            let script = r#"if [ -f /tmp/test ]; then
3959    rm -rf /tmp/test
3960else
3961    echo "not found"
3962fi"#;
3963            let commands = extract_shell_commands(script);
3964            // Should extract the commands inside the if/else
3965            assert!(
3966                commands.iter().any(|c| c.text.contains("rm")),
3967                "should extract rm from if body"
3968            );
3969            assert!(
3970                commands.iter().any(|c| c.text.contains("echo")),
3971                "should extract echo from else body"
3972            );
3973        }
3974
3975        #[test]
3976        fn handles_for_loop() {
3977            let script = "for f in *.txt; do rm -f \"$f\"; done";
3978            let commands = extract_shell_commands(script);
3979            assert!(
3980                commands.iter().any(|c| c.text.contains("rm")),
3981                "should extract rm from for loop body"
3982            );
3983        }
3984
3985        #[test]
3986        fn byte_ranges_are_correct() {
3987            let script = "echo hello";
3988            let commands = extract_shell_commands(script);
3989            assert_eq!(commands.len(), 1);
3990            assert_eq!(commands[0].start, 0);
3991            assert_eq!(commands[0].end, script.len());
3992
3993            // Extract the text using the range
3994            let extracted = &script[commands[0].start..commands[0].end];
3995            assert_eq!(extracted, "echo hello");
3996        }
3997    }
3998
3999    proptest! {
4000        /// Tier 1 trigger detection must be a superset of Tier 2 extraction.
4001        /// If Tier 2 extracts any content, Tier 1 must have triggered.
4002        #[test]
4003        fn tier1_is_superset_of_tier2_extraction(cmd in prop_oneof![
4004            // Random UTF-8
4005            "\\PC{0,2000}",
4006            // Heredoc-ish inputs (multi-line)
4007            "\\PC{0,400}".prop_map(|body| format!("cat <<EOF\n{body}\nEOF")),
4008            "\\PC{0,400}".prop_map(|body| format!("cat <<'EOF'\n{body}\nEOF")),
4009            // Inline interpreters
4010            "\\PC{0,400}".prop_map(|body| format!("python -c \"{}\"", body.replace('\"', ""))),
4011            "\\PC{0,400}".prop_map(|body| format!("bash -c \"{}\"", body.replace('\"', ""))),
4012            "\\PC{0,400}".prop_map(|body| format!("node -e \"{}\"", body.replace('\"', ""))),
4013        ]) {
4014            let limits = ExtractionLimits {
4015                max_body_bytes: 10_000,
4016                max_body_lines: 1_000,
4017                max_heredocs: 5,
4018                timeout_ms: 50,
4019            };
4020
4021            let extracted = extract_content(&cmd, &limits);
4022            if let ExtractionResult::Extracted(contents) = extracted {
4023                if !contents.is_empty() {
4024                    prop_assert_eq!(
4025                        check_triggers(&cmd),
4026                        TriggerResult::Triggered,
4027                        "Tier 2 extracted but Tier 1 did not trigger for: {:?}",
4028                        cmd
4029                    );
4030                }
4031            }
4032        }
4033    }
4034
4035    #[test]
4036    fn detects_language_in_pipeline() {
4037        // Regression test: now detects python in pipeline via pipe scanning
4038        let cmd = "cat <<EOF | python";
4039        let content = "print('hello')"; // ambiguous content
4040        let (lang, _) = ScriptLanguage::detect(cmd, content);
4041        assert_eq!(lang, ScriptLanguage::Python);
4042    }
4043
4044    #[test]
4045    fn extract_heredoc_target_command_prefers_command_over_arguments() {
4046        let cat_cmd = "cat bash <<EOF\nrm -rf /\nEOF";
4047        let cat_start = cat_cmd.find("<<").expect("cat heredoc");
4048        assert_eq!(
4049            extract_heredoc_target_command(cat_cmd, cat_start).as_deref(),
4050            Some("cat")
4051        );
4052
4053        let grep_cmd = "grep pattern . <<EOF\nrm -rf /\nEOF";
4054        let grep_start = grep_cmd.find("<<").expect("grep heredoc");
4055        assert_eq!(
4056            extract_heredoc_target_command(grep_cmd, grep_start).as_deref(),
4057            Some("grep")
4058        );
4059    }
4060
4061    #[test]
4062    fn extract_heredoc_target_command_skips_assignments_and_wrappers() {
4063        let env_cmd = "FOO=1 env -i /bin/cat <<EOF\npayload\nEOF";
4064        let env_start = env_cmd.find("<<").expect("env heredoc");
4065        assert_eq!(
4066            extract_heredoc_target_command(env_cmd, env_start).as_deref(),
4067            Some("cat")
4068        );
4069
4070        let sudo_cmd = "sudo bash <<EOF\necho hi\nEOF";
4071        let sudo_start = sudo_cmd.find("<<").expect("sudo heredoc");
4072        assert_eq!(
4073            extract_heredoc_target_command(sudo_cmd, sudo_start).as_deref(),
4074            Some("bash")
4075        );
4076    }
4077}
destructive_command_guard/heredoc.rs

destructive_command_guard/
heredoc.rs