use std::sync::Mutex;
use std::sync::OnceLock;
/// Lazily-initialized, process-wide bash parser. Grammar loading happens at
/// most once; a failure is cached and reported on every subsequent call.
static BASH_PARSER: OnceLock<Result<Mutex<tree_sitter::Parser>, String>> = OnceLock::new();

/// Returns the shared bash parser, initializing it on first use.
///
/// The parser lives behind a `Mutex` because `tree_sitter::Parser` is
/// stateful and callers may run on multiple threads.
fn get_bash_parser() -> Result<&'static Mutex<tree_sitter::Parser>, String> {
    let slot = BASH_PARSER.get_or_init(|| {
        let mut parser = tree_sitter::Parser::new();
        let language: tree_sitter::Language = tree_sitter_bash::LANGUAGE.into();
        match parser.set_language(&language) {
            Ok(()) => Ok(Mutex::new(parser)),
            Err(e) => Err(format!("Failed to load bash grammar: {e}")),
        }
    });
    match slot {
        Ok(mutex) => Ok(mutex),
        Err(reason) => Err(reason.clone()),
    }
}
/// Eagerly initializes the shared bash parser so later parses do not pay the
/// one-time grammar-loading cost; surfaces the initialization error, if any.
pub fn prewarm_bash_parser() -> Result<(), String> {
    get_bash_parser().map(|_parser| ())
}
/// Splits a shell script into per-command argv vectors.
///
/// Tries the tree-sitter bash grammar first; if parsing fails or yields no
/// commands, falls back to a simple character-level tokenization.
pub fn parse_shell_commands(script: &str) -> Result<Vec<Vec<String>>, String> {
    match parse_with_tree_sitter(script) {
        Ok(parsed) => {
            if !parsed.is_empty() {
                return Ok(parsed);
            }
            // Empty result: fall through to the basic tokenizer silently.
        }
        Err(reason) => {
            tracing::debug!(
                "Tree-sitter bash parsing failed: {}, falling back to basic tokenization",
                reason
            );
        }
    }
    parse_with_basic_tokenization(script)
}
/// Parses `script` using only the tree-sitter bash grammar — no fallback
/// tokenization. Any tree-sitter failure is returned to the caller, unlike
/// [`parse_shell_commands`], which swallows it and retries with a basic split.
pub fn parse_shell_commands_tree_sitter(script: &str) -> Result<Vec<Vec<String>>, String> {
    parse_with_tree_sitter(script)
}
fn parse_with_tree_sitter(script: &str) -> Result<Vec<Vec<String>>, String> {
let parser_guard = get_bash_parser()?;
let mut parser = parser_guard
.lock()
.map_err(|e| format!("Failed to lock parser: {}", e))?;
let tree = parser
.parse(script, None)
.ok_or_else(|| "Failed to parse script".to_string())?;
let mut commands = Vec::new();
let root = tree.root_node();
let mut cursor = root.walk();
for child in root.children(&mut cursor) {
if is_command_node(child)
&& let Some(cmd) = extract_command_from_node(child, script)
&& !cmd.is_empty()
{
commands.push(cmd);
}
}
Ok(commands)
}
/// AST node kinds that represent an executable command at the top level.
const COMMAND_NODE_KINDS: &[&str] = &["command", "pipeline", "compound_command", "simple_command"];

/// Reports whether `node` is a command-like node worth extracting.
fn is_command_node(node: tree_sitter::Node) -> bool {
    COMMAND_NODE_KINDS.contains(&node.kind())
}
/// Collects the command name and argument words under `node` into an argv
/// vector. Returns `None` when nothing usable was found.
fn extract_command_from_node(node: tree_sitter::Node, source: &str) -> Option<Vec<String>> {
    let mut cursor = node.walk();

    // For a pipeline, descend into its first command child and extract from
    // there; a pipeline with no such child falls through to the generic scan.
    if node.kind() == "pipeline" {
        let first_command = node
            .children(&mut cursor)
            .find(|child| child.kind() == "command" || child.kind() == "simple_command");
        if let Some(child) = first_command {
            return extract_command_from_node(child, source);
        }
    }

    let mut argv = Vec::new();
    for child in node.children(&mut cursor) {
        // The command name plus plain words, strings, and expansions all
        // contribute tokens; anything else (redirects, operators) is skipped.
        let contributes = child.kind() == "command_name"
            || matches!(
                child.kind(),
                "word" | "string" | "simple_expansion" | "variable_expansion"
            );
        if !contributes {
            continue;
        }
        if let Ok(text) = child.utf8_text(source.as_bytes()) {
            let token = text.trim();
            if !token.is_empty() {
                argv.push(token.to_string());
            }
        }
    }

    if argv.is_empty() { None } else { Some(argv) }
}
fn parse_with_basic_tokenization(script: &str) -> Result<Vec<Vec<String>>, String> {
let mut commands = Vec::new();
let mut current_command = String::new();
let mut in_quotes = false;
let mut quote_char = ' ';
let mut escaped = false;
for ch in script.chars() {
if escaped {
current_command.push(ch);
escaped = false;
continue;
}
match ch {
'\\' => {
escaped = true;
}
'\'' | '"' if !in_quotes => {
in_quotes = true;
quote_char = ch;
}
c if c == quote_char && in_quotes => {
in_quotes = false;
}
'&' | '|' | ';' if !in_quotes => {
if !current_command.trim().is_empty()
&& let Ok(cmd) = tokenize_command(¤t_command)
{
commands.push(cmd);
}
current_command.clear();
}
_ => current_command.push(ch),
}
}
if !current_command.trim().is_empty()
&& let Ok(cmd) = tokenize_command(¤t_command)
{
commands.push(cmd);
}
Ok(commands)
}
/// Splits a single command string into shell words (POSIX-style quoting),
/// mapping tokenizer errors to a human-readable message.
fn tokenize_command(cmd: &str) -> Result<Vec<String>, String> {
    match shell_words::split(cmd) {
        Ok(words) => Ok(words),
        Err(err) => Err(format!("failed to tokenize command: {err}")),
    }
}
/// If `command` invokes a shell (`bash`/`zsh`/`sh`, possibly via an absolute
/// path) with an inline-script flag (`-lc`, `-c`, `-il`, `-ic`), parses the
/// script argument into per-command argv vectors. Returns `None` for any
/// other command shape or when parsing fails.
pub fn parse_bash_lc_commands(command: &[String]) -> Option<Vec<Vec<String>>> {
    let executable = command.first()?;
    // Compare against the basename so `/usr/bin/bash` is also recognized.
    let shell = std::path::Path::new(executable.as_str())
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("");
    if !matches!(shell, "bash" | "zsh" | "sh") {
        return None;
    }
    // The script is the argument immediately following the first script flag.
    for pair in command.windows(2) {
        if matches!(pair[0].as_str(), "-lc" | "-c" | "-il" | "-ic") {
            return parse_shell_commands(&pair[1]).ok();
        }
    }
    None
}
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience: build an argv-style Vec<String> from string slices.
    fn argv(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|p| p.to_string()).collect()
    }

    #[test]
    fn tokenize_simple_command() {
        assert_eq!(tokenize_command("git status").unwrap(), vec!["git", "status"]);
    }

    #[test]
    fn tokenize_quoted_arguments() {
        let tokens = tokenize_command(r#"echo "hello world""#).unwrap();
        assert_eq!(tokens, vec!["echo", "hello world"]);
    }

    #[test]
    fn parse_single_command() {
        let parsed = parse_shell_commands("git status").unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0][0], "git");
    }

    #[test]
    fn parse_chained_commands_with_and() {
        let parsed = parse_shell_commands("git status && cargo check").unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0][0], "git");
        assert_eq!(parsed[1][0], "cargo");
    }

    #[test]
    fn parse_chained_commands_with_semicolon() {
        let parsed = parse_shell_commands("git status; cargo check").unwrap();
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn parse_bash_lc_git_status() {
        let parsed = parse_bash_lc_commands(&argv(&["bash", "-lc", "git status"]))
            .expect("bash -lc should be recognized");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0][0], "git");
    }

    #[test]
    fn parse_bash_lc_chained() {
        let parsed = parse_bash_lc_commands(&argv(&["bash", "-lc", "git status && cargo check"]))
            .expect("bash -lc should be recognized");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn parse_non_bash_command_returns_none() {
        assert!(parse_bash_lc_commands(&argv(&["echo", "hello"])).is_none());
    }

    #[test]
    fn parse_bash_without_lc_returns_none() {
        assert!(parse_bash_lc_commands(&argv(&["bash", "script.sh"])).is_none());
    }

    #[test]
    fn parse_complex_pipeline() {
        let parsed = parse_shell_commands("cat file.txt | grep -i pattern | sort").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_with_pipes_and_redirects() {
        let parsed = parse_shell_commands("ls -la | grep file > output.txt").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_command_substitution_fallback() {
        let parsed = parse_shell_commands("echo $(git status)").unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_escaped_quotes() {
        let parsed = parse_shell_commands(r#"echo "hello \"world\"""#).unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_tree_sitter_preserves_command_name_with_quoted_args() {
        let parsed = parse_shell_commands_tree_sitter(r#"echo "fish and chips""#).unwrap();
        assert!(!parsed.is_empty());
        assert_eq!(parsed[0][0], "echo");
    }

    #[test]
    fn parse_bash_lc_with_pipe() {
        let parsed = parse_bash_lc_commands(&argv(&["bash", "-lc", "ls -la | head -5"]))
            .expect("bash -lc should be recognized");
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_dangerous_shell_command() {
        let parsed = parse_shell_commands("rm -rf /; echo done").unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0][0], "rm");
    }

    #[test]
    fn prewarm_bash_parser_initializes_successfully() {
        prewarm_bash_parser().expect("bash parser should initialize");
    }
}