use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ShellType {
Posix,
Fish,
PowerShell,
}
impl std::str::FromStr for ShellType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
"fish" => Ok(ShellType::Fish),
"powershell" | "pwsh" => Ok(ShellType::PowerShell),
_ => Err(format!("unknown shell type: {s}")),
}
}
}
#[derive(Debug, Clone)]
pub struct Segment {
pub raw: String,
pub command: Option<String>,
pub args: Vec<String>,
pub preceding_separator: Option<String>,
}
pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
match shell {
ShellType::Posix => tokenize_posix(input),
ShellType::Fish => tokenize_fish(input),
ShellType::PowerShell => tokenize_powershell(input),
}
}
fn tokenize_posix(input: &str) -> Vec<Segment> {
let mut segments = Vec::new();
let mut current = String::new();
let mut preceding_sep = None;
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
let ch = chars[i];
match ch {
'\\' if i + 1 < len => {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
continue;
}
'\'' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '\'' {
current.push(chars[i]);
i += 1;
}
if i < len {
current.push(chars[i]); i += 1;
}
continue;
}
'"' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '"' {
if chars[i] == '\\' && i + 1 < len {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
} else {
current.push(chars[i]);
i += 1;
}
}
if i < len {
current.push(chars[i]); i += 1;
}
continue;
}
'|' => {
if i + 1 < len && chars[i + 1] == '|' {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("||".to_string());
i += 2;
continue;
} else if i + 1 < len && chars[i + 1] == '&' {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("|&".to_string());
i += 2;
continue;
} else {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("|".to_string());
i += 1;
continue;
}
}
'&' if i + 1 < len && chars[i + 1] == '&' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("&&".to_string());
i += 2;
continue;
}
';' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some(";".to_string());
i += 1;
continue;
}
'\n' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("\n".to_string());
i += 1;
continue;
}
_ => {
current.push(ch);
i += 1;
}
}
}
push_segment(&mut segments, ¤t, preceding_sep.take());
segments
}
fn tokenize_fish(input: &str) -> Vec<Segment> {
tokenize_posix(input)
}
fn tokenize_powershell(input: &str) -> Vec<Segment> {
let mut segments = Vec::new();
let mut current = String::new();
let mut preceding_sep = None;
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
let ch = chars[i];
match ch {
'`' if i + 1 < len => {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
continue;
}
'\'' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '\'' {
current.push(chars[i]);
i += 1;
}
if i < len {
current.push(chars[i]);
i += 1;
}
continue;
}
'"' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '"' {
if chars[i] == '`' && i + 1 < len {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
} else {
current.push(chars[i]);
i += 1;
}
}
if i < len {
current.push(chars[i]);
i += 1;
}
continue;
}
'|' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("|".to_string());
i += 1;
continue;
}
';' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some(";".to_string());
i += 1;
continue;
}
'-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
let remaining = &input[i..];
if remaining.starts_with("-and")
&& remaining[4..]
.chars()
.next()
.is_none_or(|c| c.is_whitespace())
{
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("-and".to_string());
i += 4;
continue;
} else if remaining.starts_with("-or")
&& remaining[3..]
.chars()
.next()
.is_none_or(|c| c.is_whitespace())
{
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("-or".to_string());
i += 3;
continue;
}
current.push(ch);
i += 1;
}
'\n' => {
push_segment(&mut segments, ¤t, preceding_sep.take());
current.clear();
preceding_sep = Some("\n".to_string());
i += 1;
continue;
}
_ => {
current.push(ch);
i += 1;
}
}
}
push_segment(&mut segments, ¤t, preceding_sep.take());
segments
}
fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
let trimmed = raw.trim();
if trimmed.is_empty() {
return;
}
let words = split_words(trimmed);
let command = words.first().cloned();
let args = if words.len() > 1 {
words[1..].to_vec()
} else {
Vec::new()
};
segments.push(Segment {
raw: trimmed.to_string(),
command,
args,
preceding_separator: preceding_sep,
});
}
fn split_words(input: &str) -> Vec<String> {
let mut words = Vec::new();
let mut current = String::new();
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
let ch = chars[i];
match ch {
' ' | '\t' if !current.is_empty() => {
words.push(current.clone());
current.clear();
i += 1;
while i < len && (chars[i] == ' ' || chars[i] == '\t') {
i += 1;
}
}
' ' | '\t' => {
i += 1;
}
'\'' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '\'' {
current.push(chars[i]);
i += 1;
}
if i < len {
current.push(chars[i]);
i += 1;
}
}
'"' => {
current.push(ch);
i += 1;
while i < len && chars[i] != '"' {
if chars[i] == '\\' && i + 1 < len {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
} else {
current.push(chars[i]);
i += 1;
}
}
if i < len {
current.push(chars[i]);
i += 1;
}
}
'\\' if i + 1 < len => {
current.push(chars[i]);
current.push(chars[i + 1]);
i += 2;
}
_ => {
current.push(ch);
i += 1;
}
}
}
if !current.is_empty() {
words.push(current);
}
words
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_pipe() {
let segs = tokenize("echo hello | grep world", ShellType::Posix);
assert_eq!(segs.len(), 2);
assert_eq!(segs[0].command.as_deref(), Some("echo"));
assert_eq!(segs[1].command.as_deref(), Some("grep"));
assert_eq!(segs[1].preceding_separator.as_deref(), Some("|"));
}
#[test]
fn test_quoted_pipe() {
let segs = tokenize(r#"echo "hello | world" | bash"#, ShellType::Posix);
assert_eq!(segs.len(), 2);
assert_eq!(segs[0].raw, r#"echo "hello | world""#);
assert_eq!(segs[1].command.as_deref(), Some("bash"));
}
#[test]
fn test_and_or() {
let segs = tokenize("cmd1 && cmd2 || cmd3", ShellType::Posix);
assert_eq!(segs.len(), 3);
assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
assert_eq!(segs[2].preceding_separator.as_deref(), Some("||"));
}
#[test]
fn test_semicolon() {
let segs = tokenize("cmd1; cmd2", ShellType::Posix);
assert_eq!(segs.len(), 2);
assert_eq!(segs[1].preceding_separator.as_deref(), Some(";"));
}
#[test]
fn test_pipe_ampersand() {
let segs = tokenize("cmd1 |& cmd2", ShellType::Posix);
assert_eq!(segs.len(), 2);
assert_eq!(segs[1].preceding_separator.as_deref(), Some("|&"));
}
#[test]
fn test_powershell_pipe() {
let segs = tokenize("iwr url | iex", ShellType::PowerShell);
assert_eq!(segs.len(), 2);
assert_eq!(segs[0].command.as_deref(), Some("iwr"));
assert_eq!(segs[1].command.as_deref(), Some("iex"));
}
#[test]
fn test_powershell_backtick() {
let segs = tokenize("echo `| not a pipe", ShellType::PowerShell);
assert_eq!(segs.len(), 1);
}
#[test]
fn test_single_quotes() {
let segs = tokenize("echo 'hello | world' | bash", ShellType::Posix);
assert_eq!(segs.len(), 2);
}
#[test]
fn test_backslash_escape() {
let segs = tokenize("echo hello\\|world | bash", ShellType::Posix);
assert_eq!(segs.len(), 2);
}
#[test]
fn test_empty_input() {
let segs = tokenize("", ShellType::Posix);
assert!(segs.is_empty());
}
#[test]
fn test_whitespace_only() {
let segs = tokenize(" ", ShellType::Posix);
assert!(segs.is_empty());
}
#[test]
fn test_args_extraction() {
let segs = tokenize("curl -sSL https://example.com", ShellType::Posix);
assert_eq!(segs.len(), 1);
assert_eq!(segs[0].command.as_deref(), Some("curl"));
assert_eq!(segs[0].args.len(), 2);
}
}