use std::fmt;
use thiserror::Error;
/// A lexical token produced by [`Lexer::tokenize`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
/// A run of word characters. Quote characters, backticks and `$` / `${...}`
/// text are kept verbatim inside the word rather than being stripped.
Word(String),
/// `:` seen outside a recipe line and outside `${...}`.
Colon,
/// `=` seen outside a recipe line and outside `${...}`.
Equals,
/// `<` seen at the start of a word, outside a recipe line and outside `${...}`.
Include,
/// `|` seen at the start of a word, outside a recipe line and outside `${...}`.
Pipe,
/// Emitted once for a line that begins with a space or a tab; the rest of
/// that leading whitespace is skipped.
Indent,
/// End of a line (also emitted for the newline that terminates a `#` comment).
Newline,
/// End of input; always the last token of a successful tokenization.
Eof,
}
/// Selects which shell's quoting and line-continuation rules the lexer applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShellMode {
/// `"` starts a double-quoted span, and a backslash followed by a newline
/// is read as a single space.
Sh,
/// `"` is an ordinary character, `''` inside single quotes stands for one
/// quote character, and a backslash followed by a newline is dropped.
Rc,
}
/// Renders the mode as its lowercase shell name: `sh` or `rc`.
impl fmt::Display for ShellMode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match *self {
            ShellMode::Sh => "sh",
            ShellMode::Rc => "rc",
        };
        f.write_str(name)
    }
}
/// Errors reported by the lexer.
///
/// In both variants `pos` is the lexer position (an index into the input's
/// sequence of `char`s, not a byte offset) recorded just before the opening
/// delimiter was read.
#[derive(Debug, Error)]
pub enum LexError {
/// A single- or double-quoted span was still open at end of input.
#[error("unterminated quote at position {pos}")]
UnterminatedQuote { pos: usize },
/// A backtick command substitution was still open at end of input.
#[error("unterminated backtick at position {pos}")]
UnterminatedBacktick { pos: usize },
}
/// Character-level lexer state.
pub struct Lexer {
// The whole input, decoded into `char`s so it can be indexed by position.
chars: Vec<char>,
// Index into `chars` of the next character to read.
pos: usize,
// Quoting / line-continuation rules in effect.
mode: ShellMode,
// True while lexing a line that began with a space or tab.
in_recipe: bool,
}
impl Lexer {
    /// Creates a lexer over `input` that applies the quoting and
    /// line-continuation rules of `mode`.
    pub fn new(input: &str, mode: ShellMode) -> Self {
        Lexer {
            chars: input.chars().collect(),
            pos: 0,
            mode,
            in_recipe: false,
        }
    }

    /// Consumes the remaining input and returns its tokens, always ending
    /// with [`Token::Eof`].
    ///
    /// A line that starts with a space or tab yields [`Token::Indent`] and is
    /// lexed as a recipe line: `:`, `=`, `<`, `|` and backticks are then plain
    /// word characters. Outside recipe lines `:` and `=` are separate tokens
    /// unless they occur inside `${...}`, and `<` / `|` are tokens only at the
    /// start of a word. `#` starts a comment that runs to the end of the line.
    ///
    /// # Errors
    ///
    /// Returns [`LexError::UnterminatedQuote`] or
    /// [`LexError::UnterminatedBacktick`] when a quoted span or a backtick
    /// substitution is still open at end of input.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::new();
        let mut word = String::new();
        let mut at_line_start = true;
        // Nesting depth of `${`; while non-zero, `:` `=` `<` `|` stay in the word.
        let mut brace_depth: u32 = 0;

        while self.pos < self.chars.len() {
            if at_line_start {
                at_line_start = false;
                // Leading blank decides whether this whole line is a recipe line.
                self.in_recipe = matches!(self.peek_char(), Some(' ') | Some('\t'));
                if self.in_recipe {
                    tokens.push(Token::Indent);
                    self.skip_whitespace();
                    continue;
                }
            }

            let start = self.pos;
            let c = match self.next_char() {
                Some(c) => c,
                None => break,
            };
            // True where mk-level punctuation is meaningful.
            let structural = brace_depth == 0 && !self.in_recipe;

            match c {
                '\'' => {
                    word.push(c);
                    self.read_single_quoted(&mut word, start)?;
                }
                '"' if self.mode == ShellMode::Sh => {
                    word.push(c);
                    self.read_double_quoted(&mut word, start)?;
                }
                '`' => {
                    word.push(c);
                    // In a recipe the backtick is passed through verbatim.
                    if !self.in_recipe {
                        self.read_backtick(&mut word, start)?;
                    }
                }
                '#' => {
                    Self::flush_word(&mut word, &mut tokens);
                    // Discard the comment; the newline that ends it still counts.
                    while let Some(skipped) = self.peek_char() {
                        self.pos += 1;
                        if skipped == '\n' {
                            tokens.push(Token::Newline);
                            at_line_start = true;
                            break;
                        }
                    }
                }
                ':' if structural => {
                    Self::flush_word(&mut word, &mut tokens);
                    tokens.push(Token::Colon);
                }
                '=' if structural => {
                    Self::flush_word(&mut word, &mut tokens);
                    tokens.push(Token::Equals);
                }
                '<' if structural && word.is_empty() => tokens.push(Token::Include),
                '|' if structural && word.is_empty() => tokens.push(Token::Pipe),
                ' ' | '\t' => Self::flush_word(&mut word, &mut tokens),
                '\n' => {
                    Self::flush_word(&mut word, &mut tokens);
                    tokens.push(Token::Newline);
                    at_line_start = true;
                }
                '$' => {
                    word.push(c);
                    if self.peek_char() == Some('{') {
                        self.pos += 1;
                        word.push('{');
                        brace_depth += 1;
                    }
                }
                '}' => {
                    brace_depth = brace_depth.saturating_sub(1);
                    word.push(c);
                }
                _ => word.push(c),
            }
        }

        Self::flush_word(&mut word, &mut tokens);
        tokens.push(Token::Eof);
        Ok(tokens)
    }

    /// Moves a non-empty pending `word` into `tokens` as a [`Token::Word`],
    /// leaving `word` empty.
    fn flush_word(word: &mut String, tokens: &mut Vec<Token>) {
        if !word.is_empty() {
            tokens.push(Token::Word(std::mem::take(word)));
        }
    }

    /// Reads the next logical character, or `None` at end of input.
    ///
    /// Carriage returns are skipped. A backslash directly followed by a
    /// newline (`\n` or `\r\n`) is a line continuation: in `Sh` mode it reads
    /// as a single space, in `Rc` mode it is dropped and reading continues
    /// with the following character.
    ///
    /// Implemented as a loop rather than by recursion so that long runs of
    /// `\r` or of rc continuations cannot exhaust the stack.
    fn next_char(&mut self) -> Option<char> {
        loop {
            let c = self.peek_char()?;
            self.pos += 1;
            if c == '\r' {
                continue;
            }
            if c != '\\' {
                return Some(c);
            }
            // Number of characters forming the newline after the backslash.
            let newline_len = match (self.peek_char(), self.chars.get(self.pos + 1).copied()) {
                (Some('\n'), _) => 1,
                (Some('\r'), Some('\n')) => 2,
                _ => return Some(c),
            };
            self.pos += newline_len;
            match self.mode {
                ShellMode::Sh => return Some(' '),
                ShellMode::Rc => continue,
            }
        }
    }

    /// Returns the raw character at the current position without consuming
    /// it (no `\r` skipping, no continuation handling).
    fn peek_char(&self) -> Option<char> {
        self.chars.get(self.pos).copied()
    }

    /// Advances past any run of spaces and tabs at the current position.
    fn skip_whitespace(&mut self) {
        while matches!(self.peek_char(), Some(' ') | Some('\t')) {
            self.pos += 1;
        }
    }

    /// Appends the remainder of a single-quoted span to `word`, up to and
    /// including the closing quote; the opening quote has already been
    /// consumed and pushed by the caller.
    ///
    /// In `Rc` mode a doubled quote (`''`) contributes one `'` to `word` and
    /// does not end the span.
    ///
    /// # Errors
    ///
    /// [`LexError::UnterminatedQuote`] carrying `start_pos` if input ends first.
    fn read_single_quoted(&mut self, word: &mut String, start_pos: usize) -> Result<(), LexError> {
        loop {
            let c = self
                .next_char()
                .ok_or(LexError::UnterminatedQuote { pos: start_pos })?;
            word.push(c);
            if c != '\'' {
                continue;
            }
            let doubled = self.mode == ShellMode::Rc && self.peek_char() == Some('\'');
            if !doubled {
                return Ok(());
            }
            // Swallow the second quote of the `''` escape.
            self.pos += 1;
        }
    }

    /// Appends the remainder of a double-quoted span to `word`, up to and
    /// including the closing `"`; the opening quote has already been consumed
    /// and pushed by the caller. A backslash keeps the following character
    /// from ending the span; both characters are copied verbatim.
    ///
    /// # Errors
    ///
    /// [`LexError::UnterminatedQuote`] carrying `start_pos` if input ends first.
    fn read_double_quoted(&mut self, word: &mut String, start_pos: usize) -> Result<(), LexError> {
        loop {
            let c = self
                .next_char()
                .ok_or(LexError::UnterminatedQuote { pos: start_pos })?;
            word.push(c);
            match c {
                '"' => return Ok(()),
                '\\' => {
                    let escaped = self
                        .next_char()
                        .ok_or(LexError::UnterminatedQuote { pos: start_pos })?;
                    word.push(escaped);
                }
                _ => {}
            }
        }
    }

    /// Appends a backtick command substitution to `word`; the opening
    /// backtick has already been consumed and pushed by the caller.
    ///
    /// Blanks directly after the backtick are skipped. If the next character
    /// is `{` the substitution is rc-style: it ends at the first `}` and a
    /// closing backtick is appended to `word`, so both styles end in a
    /// backtick. Otherwise it is sh-style and ends at the next backtick,
    /// which may be the very first character (an empty substitution).
    ///
    /// # Errors
    ///
    /// [`LexError::UnterminatedBacktick`] carrying `start_pos` if input ends
    /// before the terminator.
    fn read_backtick(&mut self, word: &mut String, start_pos: usize) -> Result<(), LexError> {
        let first = loop {
            match self.next_char() {
                Some(' ') | Some('\t') => continue,
                other => break other,
            }
        };

        let terminator = match first {
            None => return Err(LexError::UnterminatedBacktick { pos: start_pos }),
            Some('`') => {
                // Empty sh-style substitution: the first character already closes it.
                word.push('`');
                return Ok(());
            }
            Some('{') => {
                word.push('{');
                self.skip_whitespace();
                '}'
            }
            Some(c) => {
                word.push(c);
                '`'
            }
        };

        loop {
            let c = self
                .next_char()
                .ok_or(LexError::UnterminatedBacktick { pos: start_pos })?;
            word.push(c);
            if c == terminator {
                if terminator == '}' {
                    word.push('`');
                }
                return Ok(());
            }
        }
    }
}
/// Tokenizes `input` in one call by building a [`Lexer`] for `mode` and
/// running it to completion.
///
/// # Errors
///
/// Propagates any [`LexError`] returned by [`Lexer::tokenize`].
pub fn tokenize(input: &str, mode: ShellMode) -> Result<Vec<Token>, LexError> {
    let mut lexer = Lexer::new(input, mode);
    lexer.tokenize()
}
// Unit tests for the free `tokenize` function. Each test feeds a small input
// through the lexer and compares the complete token stream, including the
// trailing `Token::Eof`.
#[cfg(test)]
mod tests {
use super::*;
// Appends the `Token::Eof` that every successful tokenization ends with, so
// expectations only need to list the tokens that precede it.
fn tks(tokens: Vec<Token>) -> Vec<Token> {
let mut v = tokens;
v.push(Token::Eof);
v
}
// Shorthand for building a `Token::Word` from a string slice.
fn w(s: &str) -> Token {
Token::Word(s.to_string())
}
// --- Basic words, rule headers, comments and assignments ---
#[test]
fn empty_input() {
assert_eq!(tokenize("", ShellMode::Sh).unwrap(), vec![Token::Eof]);
}
#[test]
fn single_word() {
assert_eq!(
tokenize("hello", ShellMode::Sh).unwrap(),
tks(vec![w("hello")])
);
}
#[test]
fn rule_header() {
assert_eq!(
tokenize("target: prereq\n", ShellMode::Sh).unwrap(),
tks(vec![w("target"), Token::Colon, w("prereq"), Token::Newline,])
);
}
// The newline that ends a comment is still reported as `Token::Newline`.
#[test]
fn comment_line() {
assert_eq!(
tokenize("# this is a comment\nword\n", ShellMode::Sh).unwrap(),
tks(vec![Token::Newline, w("word"), Token::Newline,])
);
}
#[test]
fn trailing_comment() {
assert_eq!(
tokenize("word # comment\n", ShellMode::Sh).unwrap(),
tks(vec![w("word"), Token::Newline,])
);
}
#[test]
fn assignment() {
assert_eq!(
tokenize("CC = gcc\n", ShellMode::Sh).unwrap(),
tks(vec![w("CC"), Token::Equals, w("gcc"), Token::Newline,])
);
}
// --- Includes (`<` and `<|`) ---
#[test]
fn include_file() {
assert_eq!(
tokenize("< mkfile\n", ShellMode::Sh).unwrap(),
tks(vec![Token::Include, w("mkfile"), Token::Newline,])
);
}
#[test]
fn include_command() {
assert_eq!(
tokenize("<| gcc -M *.c\n", ShellMode::Sh).unwrap(),
tks(vec![
Token::Include,
Token::Pipe,
w("gcc"),
w("-M"),
w("*.c"),
Token::Newline,
])
);
}
// --- Recipe lines (lines that start with a space or tab) ---
#[test]
fn recipe_line() {
assert_eq!(
tokenize("\tcc -c a.c\n", ShellMode::Sh).unwrap(),
tks(vec![
Token::Indent,
w("cc"),
w("-c"),
w("a.c"),
Token::Newline,
])
);
}
#[test]
fn recipe_block() {
assert_eq!(
tokenize("target:\n\tcmd1\n\tcmd2\n\n", ShellMode::Sh).unwrap(),
tks(vec![
w("target"),
Token::Colon,
Token::Newline,
Token::Indent,
w("cmd1"),
Token::Newline,
Token::Indent,
w("cmd2"),
Token::Newline,
Token::Newline,
])
);
}
// --- Line continuation: a space in sh mode, nothing in rc mode ---
#[test]
fn escaped_newline_sh() {
assert_eq!(
tokenize("foo\\\nbar\n", ShellMode::Sh).unwrap(),
tks(vec![w("foo"), w("bar"), Token::Newline,])
);
}
#[test]
fn escaped_newline_rc() {
assert_eq!(
tokenize("foo\\\nbar\n", ShellMode::Rc).unwrap(),
tks(vec![w("foobar"), Token::Newline,])
);
}
// --- Quoting: quote characters are kept inside the resulting word ---
#[test]
fn single_quoted() {
assert_eq!(
tokenize("'hello world'", ShellMode::Sh).unwrap(),
tks(vec![w("'hello world'")])
);
}
#[test]
fn double_quoted_sh() {
assert_eq!(
tokenize("\"hello world\"", ShellMode::Sh).unwrap(),
tks(vec![w("\"hello world\"")])
);
}
// In rc mode `"` is an ordinary character, so the space splits the word.
#[test]
fn double_quoted_rc_not_special() {
assert_eq!(
tokenize("\"hello world\"", ShellMode::Rc).unwrap(),
tks(vec![w("\"hello"), w("world\"")])
);
}
// --- Backtick command substitution outside recipes ---
#[test]
fn backtick_command() {
assert_eq!(
tokenize("`echo hello`", ShellMode::Sh).unwrap(),
tks(vec![w("`echo hello`")])
);
}
// The brace form ends at `}`; the lexer appends a closing backtick to the word.
#[test]
fn backtick_command_rc_style() {
assert_eq!(
tokenize("`{echo hello}", ShellMode::Sh).unwrap(),
tks(vec![w("`{echo hello}`")])
);
}
// Blanks between the backtick and `{` are dropped.
#[test]
fn backtick_command_rc_style_with_space() {
assert_eq!(
tokenize("` {echo hello}", ShellMode::Sh).unwrap(),
tks(vec![w("`{echo hello}`")])
);
}
#[test]
fn rc_style_backtick_unterminated() {
let result = tokenize("`{unclosed", ShellMode::Sh);
assert!(matches!(result, Err(LexError::UnterminatedBacktick { .. })));
}
// NOTE(review): same input and expectation as `backtick_command` above.
#[test]
fn sh_style_backtick_still_works() {
assert_eq!(
tokenize("`echo hello`", ShellMode::Sh).unwrap(),
tks(vec![w("`echo hello`")])
);
}
// --- Rule header variants and blank lines ---
#[test]
fn attribute_rule() {
assert_eq!(
tokenize("target:VQ: prereq\n", ShellMode::Sh).unwrap(),
tks(vec![
w("target"),
Token::Colon,
w("VQ"),
Token::Colon,
w("prereq"),
Token::Newline,
])
);
}
#[test]
fn multiple_targets() {
assert_eq!(
tokenize("a b c: d e\n", ShellMode::Sh).unwrap(),
tks(vec![
w("a"),
w("b"),
w("c"),
Token::Colon,
w("d"),
w("e"),
Token::Newline,
])
);
}
#[test]
fn blank_line() {
assert_eq!(
tokenize("a\n\nb\n", ShellMode::Sh).unwrap(),
tks(vec![
w("a"),
Token::Newline,
Token::Newline, w("b"),
Token::Newline,
])
);
}
// --- Error cases: delimiters left open at end of input ---
#[test]
fn unterminated_quote() {
let result = tokenize("'hello", ShellMode::Sh);
assert!(matches!(result, Err(LexError::UnterminatedQuote { .. })));
}
#[test]
fn unterminated_backtick() {
let result = tokenize("`hello", ShellMode::Sh);
assert!(matches!(result, Err(LexError::UnterminatedBacktick { .. })));
}
// NOTE(review): exercises the same path as `rc_style_backtick_unterminated`.
#[test]
fn unterminated_rc_backtick() {
let result = tokenize("`{hello", ShellMode::Sh);
assert!(matches!(result, Err(LexError::UnterminatedBacktick { .. })));
}
// --- `$` and `${...}`: `:` and `=` inside the braces stay in the word ---
#[test]
fn dollar_in_word() {
assert_eq!(
tokenize("$CC -o $target", ShellMode::Sh).unwrap(),
tks(vec![w("$CC"), w("-o"), w("$target")])
);
}
#[test]
fn dollar_brace_in_word() {
assert_eq!(
tokenize("${CFLAGS}", ShellMode::Sh).unwrap(),
tks(vec![w("${CFLAGS}")])
);
}
#[test]
fn dollar_brace_with_special_chars() {
assert_eq!(
tokenize("${VAR:a=b}", ShellMode::Sh).unwrap(),
tks(vec![w("${VAR:a=b}")])
);
}
#[test]
fn nested_dollar_brace() {
assert_eq!(
tokenize("${VAR:${OTHER}}", ShellMode::Sh).unwrap(),
tks(vec![w("${VAR:${OTHER}}")])
);
}
#[test]
fn shell_mode_display() {
assert_eq!(format!("{}", ShellMode::Sh), "sh");
assert_eq!(format!("{}", ShellMode::Rc), "rc");
}
// --- Escapes inside quoted spans ---
// In rc mode `''` inside single quotes yields one quote and keeps the span open.
#[test]
fn rc_single_quote_escape() {
assert_eq!(
tokenize("'hello''world'", ShellMode::Rc).unwrap(),
tks(vec![w("'hello'world'")])
);
}
#[test]
fn escaped_double_quote() {
assert_eq!(
tokenize(r#""hello\"world""#, ShellMode::Sh).unwrap(),
tks(vec![w(r#""hello\"world""#)])
);
}
#[test]
fn backslash_escape_in_double_quote() {
assert_eq!(
tokenize(r#""hello\nworld""#, ShellMode::Sh).unwrap(),
tks(vec![w(r#""hello\nworld""#)])
);
}
// --- Whitespace, end-of-input and blank-line edge cases ---
#[test]
fn multi_word_line() {
assert_eq!(
tokenize("cc -O2 -c file.c\n", ShellMode::Sh).unwrap(),
tks(vec![
w("cc"),
w("-O2"),
w("-c"),
w("file.c"),
Token::Newline,
])
);
}
#[test]
fn no_trailing_newline() {
assert_eq!(
tokenize("hello world", ShellMode::Sh).unwrap(),
tks(vec![w("hello"), w("world")])
);
}
#[test]
fn consecutive_blank_lines() {
assert_eq!(
tokenize("a\n\n\nb\n", ShellMode::Sh).unwrap(),
tks(vec![
w("a"),
Token::Newline,
Token::Newline,
Token::Newline,
w("b"),
Token::Newline,
])
);
}
#[test]
fn comment_at_eof() {
assert_eq!(
tokenize("word # no newline", ShellMode::Sh).unwrap(),
tks(vec![w("word")])
);
}
// A line holding only blanks still produces `Token::Indent`.
#[test]
fn whitespace_only_line() {
assert_eq!(
tokenize(" \n", ShellMode::Sh).unwrap(),
tks(vec![Token::Indent, Token::Newline])
);
}
// --- `<`, `|`, `:` and `=` in recipes and in the middle of words ---
#[test]
fn angle_in_recipe() {
assert_eq!(
tokenize("target:\n\tcat < file.txt\n", ShellMode::Sh).unwrap(),
tks(vec![
w("target"),
Token::Colon,
Token::Newline,
Token::Indent,
w("cat"),
w("<"),
w("file.txt"),
Token::Newline,
])
);
}
#[test]
fn pipe_in_recipe() {
assert_eq!(
tokenize("target:\n\techo hello | grep world\n", ShellMode::Sh).unwrap(),
tks(vec![
w("target"),
Token::Colon,
Token::Newline,
Token::Indent,
w("echo"),
w("hello"),
w("|"),
w("grep"),
w("world"),
Token::Newline,
])
);
}
// `<` and `|` are tokens only at the start of a word.
#[test]
fn angle_in_middle_of_word() {
assert_eq!(tokenize("a<b", ShellMode::Sh).unwrap(), tks(vec![w("a<b")]));
}
#[test]
fn pipe_in_middle_of_word() {
assert_eq!(tokenize("a|b", ShellMode::Sh).unwrap(), tks(vec![w("a|b")]));
}
// `:` and `=` split a word wherever they occur outside recipes and `${...}`.
#[test]
fn colon_splits_word() {
assert_eq!(
tokenize("a:b", ShellMode::Sh).unwrap(),
tks(vec![w("a"), Token::Colon, w("b")])
);
}
#[test]
fn equals_splits_word() {
assert_eq!(
tokenize("a=b", ShellMode::Sh).unwrap(),
tks(vec![w("a"), Token::Equals, w("b")])
);
}
// --- Indentation with tabs or spaces ---
#[test]
fn tab_indented_recipe() {
assert_eq!(
tokenize("\tcmd\n", ShellMode::Sh).unwrap(),
tks(vec![Token::Indent, w("cmd"), Token::Newline])
);
}
#[test]
fn spaces_indent() {
assert_eq!(
tokenize(" cmd\n", ShellMode::Sh).unwrap(),
tks(vec![Token::Indent, w("cmd"), Token::Newline])
);
}
// --- Backticks inside recipes are ordinary word characters ---
#[test]
fn backtick_in_recipe_passed_verbatim() {
let result = tokenize("target:\n\tcmd `backtick` arg\n", ShellMode::Sh).unwrap();
let tokens = tks(vec![
w("target"),
Token::Colon,
Token::Newline,
Token::Indent,
w("cmd"),
w("`backtick`"), w("arg"),
Token::Newline,
]);
assert_eq!(result, tokens);
}
#[test]
fn backtick_brace_in_recipe_passed_verbatim() {
let result = tokenize("target:\n\techo `{uptime}`\n", ShellMode::Sh).unwrap();
let tokens = tks(vec![
w("target"),
Token::Colon,
Token::Newline,
Token::Indent,
w("echo"),
w("`{uptime}`"), Token::Newline,
]);
assert_eq!(result, tokens);
}
}