Skip to main content

vtcode_core/command_safety/
shell_parser.rs

1//! Shell script parser for `bash -lc` and similar commands.
2//!
3//! This module parses shell commands like:
4//! ```sh
5//! bash -lc "git status && cargo check"
6//! ```
7//!
8//! Into individual command vectors for independent safety checking:
9//! ```text
10//! [["git", "status"], ["cargo", "check"]]
11//! ```
12//!
13//! **Phase 4 Implementation**: Uses tree-sitter for accurate bash AST parsing.
14//! Falls back to basic tokenization for minimal shell syntax.
15
16use std::sync::Mutex;
17use std::sync::OnceLock;
18
19/// Lazy-initialized tree-sitter bash parser (wrapped in Mutex for mutation)
20static BASH_PARSER: OnceLock<Result<Mutex<tree_sitter::Parser>, String>> = OnceLock::new();
21
22/// Gets or initializes the bash parser
23fn get_bash_parser() -> Result<&'static Mutex<tree_sitter::Parser>, String> {
24    BASH_PARSER
25        .get_or_init(|| {
26            let mut parser = tree_sitter::Parser::new();
27            let lang: tree_sitter::Language = tree_sitter_bash::LANGUAGE.into();
28            parser
29                .set_language(&lang)
30                .map_err(|e| format!("Failed to load bash grammar: {e}"))?;
31            Ok(Mutex::new(parser))
32        })
33        .as_ref()
34        .map_err(Clone::clone)
35}
36
37/// Ensures the bash tree-sitter parser is initialized.
38pub fn prewarm_bash_parser() -> Result<(), String> {
39    let _ = get_bash_parser()?;
40    Ok(())
41}
42
43/// Parses a shell script into individual commands using tree-sitter bash grammar
44///
45/// # Example
46/// ```text
47/// Input:  "git status && cargo check"
48/// Output: Ok([["git", "status"], ["cargo", "check"]])
49/// ```
50///
51/// # Fallback
52/// If tree-sitter parsing fails, falls back to simple tokenization
53pub fn parse_shell_commands(script: &str) -> Result<Vec<Vec<String>>, String> {
54    // Try tree-sitter parsing first
55    match parse_with_tree_sitter(script) {
56        Ok(commands) if !commands.is_empty() => return Ok(commands),
57        Ok(_) => {} // Empty result, fall through to basic parsing
58        Err(e) => {
59            tracing::debug!(
60                "Tree-sitter bash parsing failed: {}, falling back to basic tokenization",
61                e
62            );
63        }
64    }
65
66    // Fallback to simple tokenization
67    parse_with_basic_tokenization(script)
68}
69
70/// Parses a shell script using tree-sitter bash grammar only (no fallback tokenization).
71///
72/// Use this when caller behavior must be strictly gated on bash grammar validity.
73pub fn parse_shell_commands_tree_sitter(script: &str) -> Result<Vec<Vec<String>>, String> {
74    parse_with_tree_sitter(script)
75}
76
77/// Parses shell script using tree-sitter bash grammar
78fn parse_with_tree_sitter(script: &str) -> Result<Vec<Vec<String>>, String> {
79    let parser_guard = get_bash_parser()?;
80    let mut parser = parser_guard
81        .lock()
82        .map_err(|e| format!("Failed to lock parser: {}", e))?;
83
84    let tree = parser
85        .parse(script, None)
86        .ok_or_else(|| "Failed to parse script".to_string())?;
87
88    let mut commands = Vec::new();
89    let root = tree.root_node();
90
91    // Walk tree-sitter AST and extract command nodes
92    let mut cursor = root.walk();
93
94    for child in root.children(&mut cursor) {
95        if is_command_node(child)
96            && let Some(cmd) = extract_command_from_node(child, script)
97            && !cmd.is_empty()
98        {
99            commands.push(cmd);
100        }
101    }
102
103    Ok(commands)
104}
105
106/// Checks if a tree-sitter node represents a command
107fn is_command_node(node: tree_sitter::Node) -> bool {
108    matches!(
109        node.kind(),
110        "command" | "pipeline" | "compound_command" | "simple_command"
111    )
112}
113
114/// Extracts a command vector from a tree-sitter node
115fn extract_command_from_node(node: tree_sitter::Node, source: &str) -> Option<Vec<String>> {
116    let mut command = Vec::new();
117    let mut cursor = node.walk();
118
119    // For pipeline nodes, extract the first command in the pipeline
120    if node.kind() == "pipeline" {
121        for child in node.children(&mut cursor) {
122            if child.kind() == "command" || child.kind() == "simple_command" {
123                return extract_command_from_node(child, source);
124            }
125        }
126    }
127
128    // Extract arguments from command node
129    for child in node.children(&mut cursor) {
130        if child.kind() == "command_name" {
131            if let Ok(arg) = child.utf8_text(source.as_bytes()) {
132                let trimmed = arg.trim();
133                if !trimmed.is_empty() {
134                    command.push(trimmed.to_string());
135                }
136            }
137            continue;
138        }
139
140        if matches!(
141            child.kind(),
142            "word" | "string" | "simple_expansion" | "variable_expansion"
143        ) {
144            let text = child.utf8_text(source.as_bytes());
145            if let Ok(arg) = text {
146                let trimmed = arg.trim();
147                if !trimmed.is_empty() {
148                    command.push(trimmed.to_string());
149                }
150            }
151        }
152    }
153
154    if command.is_empty() {
155        None
156    } else {
157        Some(command)
158    }
159}
160
161/// Fallback: Parses shell script with simple tokenization
162fn parse_with_basic_tokenization(script: &str) -> Result<Vec<Vec<String>>, String> {
163    let mut commands = Vec::new();
164    let mut current_command = String::new();
165    let mut in_quotes = false;
166    let mut quote_char = ' ';
167    let mut escaped = false;
168
169    for ch in script.chars() {
170        if escaped {
171            current_command.push(ch);
172            escaped = false;
173            continue;
174        }
175
176        match ch {
177            '\\' => {
178                escaped = true;
179            }
180            '\'' | '"' if !in_quotes => {
181                in_quotes = true;
182                quote_char = ch;
183            }
184            c if c == quote_char && in_quotes => {
185                in_quotes = false;
186            }
187            '&' | '|' | ';' if !in_quotes => {
188                if !current_command.trim().is_empty()
189                    && let Ok(cmd) = tokenize_command(&current_command)
190                {
191                    commands.push(cmd);
192                }
193                current_command.clear();
194            }
195            _ => current_command.push(ch),
196        }
197    }
198
199    if !current_command.trim().is_empty()
200        && let Ok(cmd) = tokenize_command(&current_command)
201    {
202        commands.push(cmd);
203    }
204
205    Ok(commands)
206}
207
208/// Splits a command string into arguments
209/// Respects quoted strings and escapes
210fn tokenize_command(cmd: &str) -> Result<Vec<String>, String> {
211    shell_words::split(cmd).map_err(|err| format!("failed to tokenize command: {err}"))
212}
213
214/// Parses `bash -lc "script"` style invocations
215///
216/// # Example
217/// ```text
218/// Input:  vec!["bash", "-lc", "git status && rm /"]
219/// Output: Some([["git", "status"], ["rm", "/"]])
220/// ```
221pub fn parse_bash_lc_commands(command: &[String]) -> Option<Vec<Vec<String>>> {
222    if command.is_empty() {
223        return None;
224    }
225
226    let cmd_name = command[0].as_str();
227    let base_cmd = std::path::Path::new(cmd_name)
228        .file_name()
229        .and_then(|osstr| osstr.to_str())
230        .unwrap_or("");
231
232    if base_cmd != "bash" && base_cmd != "zsh" && base_cmd != "sh" {
233        return None;
234    }
235
236    // Look for -lc or -c pattern
237    for window in command.windows(2) {
238        if matches!(window[0].as_str(), "-lc" | "-c" | "-il" | "-ic") {
239            let script = &window[1];
240            return parse_shell_commands(script).ok();
241        }
242    }
243
244    None
245}
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned argument vector from string literals.
    fn argv(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| part.to_string()).collect()
    }

    #[test]
    fn tokenize_simple_command() {
        let tokens = tokenize_command("git status").unwrap();
        assert_eq!(tokens, vec!["git", "status"]);
    }

    #[test]
    fn tokenize_quoted_arguments() {
        let tokens = tokenize_command(r#"echo "hello world""#).unwrap();
        assert_eq!(tokens, vec!["echo", "hello world"]);
    }

    #[test]
    fn parse_single_command() {
        let parsed = parse_shell_commands("git status").unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0][0], "git");
    }

    #[test]
    fn parse_chained_commands_with_and() {
        let parsed = parse_shell_commands("git status && cargo check").unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0][0], "git");
        assert_eq!(parsed[1][0], "cargo");
    }

    #[test]
    fn parse_chained_commands_with_semicolon() {
        let parsed = parse_shell_commands("git status; cargo check").unwrap();
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn parse_bash_lc_git_status() {
        let invocation = argv(&["bash", "-lc", "git status"]);
        let parsed = parse_bash_lc_commands(&invocation);
        assert!(parsed.is_some());
        let parsed = parsed.unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0][0], "git");
    }

    #[test]
    fn parse_bash_lc_chained() {
        let invocation = argv(&["bash", "-lc", "git status && cargo check"]);
        let parsed = parse_bash_lc_commands(&invocation);
        assert!(parsed.is_some());
        let parsed = parsed.unwrap();
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn parse_non_bash_command_returns_none() {
        let invocation = argv(&["echo", "hello"]);
        assert!(parse_bash_lc_commands(&invocation).is_none());
    }

    #[test]
    fn parse_bash_without_lc_returns_none() {
        let invocation = argv(&["bash", "script.sh"]);
        assert!(parse_bash_lc_commands(&invocation).is_none());
    }

    // Phase 4 tests: tree-sitter based parsing

    #[test]
    fn parse_complex_pipeline() {
        let parsed = parse_shell_commands("cat file.txt | grep -i pattern | sort").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_with_pipes_and_redirects() {
        let parsed = parse_shell_commands("ls -la | grep file > output.txt").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_command_substitution_fallback() {
        let parsed = parse_shell_commands("echo $(git status)").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_escaped_quotes() {
        let parsed = parse_shell_commands(r#"echo "hello \"world\"""#).unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_tree_sitter_preserves_command_name_with_quoted_args() {
        let parsed = parse_shell_commands_tree_sitter(r#"echo "fish and chips""#).unwrap();
        assert!(!parsed.is_empty());
        assert_eq!(parsed[0][0], "echo");
    }

    #[test]
    fn parse_bash_lc_with_pipe() {
        let invocation = argv(&["bash", "-lc", "ls -la | head -5"]);
        let parsed = parse_bash_lc_commands(&invocation);
        assert!(parsed.is_some());
        let parsed = parsed.unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_dangerous_shell_command() {
        let parsed = parse_shell_commands("rm -rf /; echo done").unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0][0], "rm");
    }

    #[test]
    fn prewarm_bash_parser_initializes_successfully() {
        prewarm_bash_parser().expect("bash parser should initialize");
    }
}
394
395// === Injection detection (moved from tools::validation::commands) ===
396
397use anyhow::{Result, bail};
398
/// Quote state for shell segment splitting.
///
/// `Debug` is derived so the state can appear in `assert_eq!` failures and log
/// output.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum QuoteState {
    /// Not inside any quoted region.
    None,
    /// Inside a `'...'` region.
    Single,
    /// Inside a `"..."` region.
    Double,
}
406
407/// Split a shell command into segments on unquoted `|` and `&` boundaries,
408/// while detecting injection patterns (`;`, backticks, `$()`, newlines).
409pub(crate) fn split_shell_segments(command: &str) -> Result<Vec<String>> {
410    let mut segments = Vec::new();
411    let mut state = QuoteState::None;
412    let mut escaped = false;
413    let mut segment_start = 0usize;
414    let mut chars = command.char_indices().peekable();
415
416    while let Some((idx, ch)) = chars.next() {
417        match state {
418            QuoteState::Single => {
419                if ch == '\'' {
420                    state = QuoteState::None;
421                }
422            }
423            QuoteState::Double => {
424                if escaped {
425                    escaped = false;
426                    continue;
427                }
428
429                match ch {
430                    '\\' => escaped = true,
431                    '"' => state = QuoteState::None,
432                    '`' => bail!("Command injection pattern detected"),
433                    '$' if matches!(chars.peek(), Some((_, '('))) => {
434                        bail!("Command injection pattern detected");
435                    }
436                    _ => {}
437                }
438            }
439            QuoteState::None => {
440                if escaped {
441                    escaped = false;
442                    continue;
443                }
444
445                match ch {
446                    '\\' => escaped = true,
447                    '\'' => state = QuoteState::Single,
448                    '"' => state = QuoteState::Double,
449                    '`' => bail!("Command injection pattern detected"),
450                    '$' if matches!(chars.peek(), Some((_, '('))) => {
451                        bail!("Command injection pattern detected");
452                    }
453                    ';' => bail!("Unquoted command chaining detected"),
454                    '\n' => bail!("Command injection pattern detected"),
455                    '|' | '&' => {
456                        push_segment(command, segment_start, idx, &mut segments);
457                        segment_start = idx + ch.len_utf8();
458                        if let Some((next_idx, next_ch)) = chars.peek().copied()
459                            && next_ch == ch
460                        {
461                            chars.next();
462                            segment_start = next_idx + next_ch.len_utf8();
463                        }
464                    }
465                    _ => {}
466                }
467            }
468        }
469    }
470
471    push_segment(command, segment_start, command.len(), &mut segments);
472    Ok(segments)
473}
474
/// Appends the trimmed `command[start..end]` slice to `segments`, skipping
/// slices that are empty or whitespace-only.
fn push_segment(command: &str, start: usize, end: usize, segments: &mut Vec<String>) {
    let trimmed = command[start..end].trim();
    if trimmed.is_empty() {
        return;
    }
    segments.push(trimmed.to_owned());
}
481
482/// Check for additional dangerous patterns not covered by the central dangerous-command detector.
483pub(crate) fn additional_dangerous_pattern(segment: &str) -> Option<&'static str> {
484    let segment_lower = segment.to_ascii_lowercase();
485    if segment_lower.starts_with(":(){:|:&};:") {
486        return Some(":(){:|:&};:");
487    }
488
489    let tokens = shell_words::split(segment).unwrap_or_else(|_| {
490        segment
491            .split_whitespace()
492            .map(ToString::to_string)
493            .collect()
494    });
495    let first = tokens.first()?;
496    let command_name = base_command_name(strip_wrapping_quotes(first)).to_ascii_lowercase();
497
498    match command_name.as_str() {
499        "rmdir" => Some("rmdir"),
500        "wget" => Some("wget"),
501        "curl" => Some("curl"),
502        "chmod"
503            if tokens
504                .iter()
505                .skip(1)
506                .any(|arg| strip_wrapping_quotes(arg).starts_with("777")) =>
507        {
508            Some("chmod 777")
509        }
510        "chown"
511            if tokens.iter().skip(1).any(|arg| {
512                let arg = strip_wrapping_quotes(arg).to_ascii_lowercase();
513                arg == "root" || arg.starts_with("root:")
514            }) =>
515        {
516            Some("chown root")
517        }
518        _ => None,
519    }
520}
521
/// Removes one pair of matching quotes around `token`.
///
/// Single quotes are tried first, then double quotes. A token that is not
/// wrapped in a matching pair is returned unchanged.
fn strip_wrapping_quotes(token: &str) -> &str {
    for quote in ['\'', '"'] {
        let unwrapped = token
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote));
        if let Some(inner) = unwrapped {
            return inner;
        }
    }
    token
}
533
/// Returns the final path component of `command`, or `command` itself when
/// there is no final component or it is not valid UTF-8.
fn base_command_name(command: &str) -> &str {
    match std::path::Path::new(command).file_name() {
        Some(name) => name.to_str().unwrap_or(command),
        None => command,
    }
}