use std::os::fd::RawFd;
use anyhow::anyhow;
use nom::{
branch::alt,
bytes::complete::{is_not, tag, take_while, take_while1},
character::complete::{char, digit1, line_ending, multispace0, not_line_ending},
combinator::{eof, map, opt, value},
error::ErrorKind,
multi::{fold_many0, many0, many_till},
sequence::{delimited, preceded, terminated, tuple},
Finish, IResult,
};
/// A file-descriptor pair used by pipes and fd redirections: the first fd is
/// always present, the second is optional (e.g. `[2=1]` vs `[2]`).
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct FdPair(RawFd, Option<RawFd>);

/// Access mode of an fd-duplication operator (`<`, `>`, `<>`).
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Mode {
    Read,
    Write,
    ReadWrite,
}

/// Direction of a file redirection (`<`, `>`, `<>`, `>>`).
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Redir {
    Input,
    Output,
    InputOutput,
    DoubleOutput,
}

/// Lexical tokens produced by [`tokenize`].
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    Eof,
    For,
    In,
    While,
    If,
    Not,
    Twiddle,
    Bang,
    Subshell,
    Switch,
    Fn,
    OpenBrace,
    CloseBrace,
    OpenParen,
    CloseParen,
    AndAnd,
    OrOr,
    Caret,
    Equal,
    Ampersand,
    SemiColon,
    /// `|`, optionally carrying an fd pair such as `|[2]`.
    Pipe(Option<FdPair>),
    /// `<<` delimiter; the bool records whether the delimiter was quoted.
    HereDoc(String, bool),
    /// Fd duplication, e.g. `>[2=1]`.
    FdRedir(FdPair, Mode),
    /// File redirection: target word, optional fd qualifier, direction.
    FileRedir(String, Option<RawFd>, Redir),
    /// `$name`, optionally with a `(subscript)`.
    Argument(String, Option<String>),
    /// `$"name` form.
    ArgumentQuote(String),
    /// `$#name` form.
    ArgumentSize(String),
    /// `` `{...} `` command substitution; holds the nested token stream.
    CmdRedir(Vec<Token>, bool),
    /// `` `sep{...} `` command substitution split on `sep`.
    CmdSplit(Vec<Token>, Box<Token>),
    /// A bare or single-quoted word.
    Word(String),
    /// `<{...}` / `>{...}`; the bool is true for the input (`<`) form.
    Cmd(Vec<Token>),
}
pub fn tokenize(input: &str) -> Result<Vec<Token>, anyhow::Error> {
tokenize_many(input)
.finish()
.map(|(_, tokens)| tokens)
.map_err(|error| anyhow!("! {error:?}"))
}
/// Lexes `input` into tokens, repeating until `eof` matches.
///
/// The alternatives are split into two nested `alt` groups, presumably
/// because nom's `alt` tuple has a maximum arity — confirm. Ordering is
/// load-bearing: multi-character operators, redirections, and keyword
/// parsers must be tried before the catch-all `bareword_token` at the end.
fn tokenize_many(input: &str) -> IResult<&str, Vec<Token>> {
    many_till(
        alt((
            alt((
                quoted_string_token,
                argument_token,
                caret_token,
                pipe_token,
                bang_token,
                subshell_token,
                andand_token,
                oror_token,
                heredoc_token,
                split_command_token,
                command_token,
                command_redir_token,
                fd_redir_token,
                file_redir_token,
                if_token,
                while_token,
                for_token,
                switch_token,
            )),
            alt((
                close_paren_token,
                not_token,
                open_paren_token,
                in_token,
                twiddle_token,
                fn_token,
                open_brace_token,
                close_brace_token,
                equal_token,
                ampersand_token,
                semicolon_token,
                // Catch-all: matches any remaining bare word (or Token::Eof
                // on an empty match), so it must stay last.
                bareword_token,
            )),
        )),
        eof,
    )(input)
    // Drop the `eof` output, keeping only the accumulated tokens.
    .map(|(remaining, (tokens, _))| (remaining, tokens))
}
/// True for characters allowed in a variable name: alphanumerics plus the
/// literals `_`, `*`, and `$`.
fn is_id_char(c: char) -> bool {
    matches!(c, '_' | '*' | '$') || c.is_alphanumeric()
}
/// True for characters that may appear in a bare (unquoted) word: anything
/// that is neither whitespace nor one of the shell metacharacters below.
fn is_word_char(c: char) -> bool {
    const METACHARS: &str = r#"#;&|^$=`'{}()<>"#;
    !(c.is_whitespace() || METACHARS.contains(c))
}
fn variable_name(input: &str) -> IResult<&str, &str> {
take_while1(|c: char| is_id_char(c))(input)
}
/// Parses a variable name optionally followed by a parenthesised subscript,
/// e.g. `foo` or `foo(1)`. Returns `(name, Some(subscript))` when a `(` was
/// seen.
///
/// The closing `)` is required once a `(` has been consumed: a missing `)`
/// is a hard parse error here, not a backtrack (which is why the whole
/// group is not simply wrapped in `opt(delimited(...))`).
fn variable_name_subscript(input: &str) -> IResult<&str, (&str, Option<&str>)> {
    let (input, name) = variable_name(input)?;
    // Subscript body is everything up to the next ')'; it may be empty.
    let (mut input, subscript) = opt(preceded(tag("("), take_while(|c: char| c != ')')))(input)?;
    if subscript.is_some() {
        (input, _) = tag(")")(input)?;
    }
    Ok((input, (name, subscript)))
}
/// Parses a `$`-prefixed argument token. Alternatives are tried in order so
/// the longer sigils win: `$"name` -> `ArgumentQuote`, `$#name` ->
/// `ArgumentSize`, then plain `$name(sub)` -> `Argument`.
fn argument_token(input: &str) -> IResult<&str, Token> {
    // Skip any full comment lines preceding the token.
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    alt((
        map(
            tuple((multispace0, tag("$\""), variable_name)),
            |(_, _, name)| Token::ArgumentQuote(name.to_string()),
        ),
        map(
            tuple((multispace0, tag("$#"), variable_name)),
            |(_, _, name)| Token::ArgumentSize(name.to_string()),
        ),
        // Plain `$name`, optionally with a `(subscript)`.
        map(
            tuple((multispace0, tag("$"), variable_name_subscript)),
            |(_, _, (name, subscript))| {
                Token::Argument(name.to_string(), subscript.map(|s| s.to_string()))
            },
        ),
    ))(input)
}
/// Parses a file redirection such as `< in`, `> out`, `>> log`, `<> io`,
/// optionally with an fd qualifier: `>[2] err`.
fn file_redir_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    // Two-character operators must be tried before their one-char prefixes.
    let (input, redirection) =
        preceded(multispace0, alt((tag(">>"), tag("<>"), tag("<"), tag(">"))))(input)?;
    let redirection = match redirection {
        "<" => Redir::Input,
        ">" => Redir::Output,
        "<>" => Redir::InputOutput,
        ">>" => Redir::DoubleOutput,
        _ => unreachable!(),
    };
    // Optional `[fd]` qualifier, then the target file word.
    let (input, fd_redir) = opt(fd_redir1)(input)?;
    let (input, file_redir) = preceded(multispace0, word)(input)?;
    // `word` already returns an owned String; the original's `.to_string()`
    // here was a redundant clone.
    Ok((input, Token::FileRedir(file_redir, fd_redir, redirection)))
}
fn pipe_token(input: &str) -> IResult<&str, Token> {
let (input, _) = opt(preceded(multispace0, comment))(input)?;
let (input, _) = preceded(multispace0, tag("|"))(input)?;
let (input, fd_redir) = opt(fd_redir2)(input)?;
Ok((input, Token::Pipe(fd_redir)))
}
/// Parses a heredoc introducer: `<<` followed by a delimiter word.
/// The bool in `HereDoc` is true when the delimiter was single-quoted —
/// presumably to suppress expansion in the body; confirm against the
/// evaluator.
fn heredoc_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, _) = preceded(multispace0, tag("<<"))(input)?;
    let (input, token) = preceded(
        multispace0,
        alt((
            // Quoted delimiter first, so `word` (which also accepts quoted
            // strings) cannot shadow the quoted=true case.
            map(quoted_string, |qs| Token::HereDoc(qs, true)),
            map(word, |w| Token::HereDoc(w, false)),
        )),
    )(input)?;
    Ok((input, token))
}
/// Parses an fd-duplication token: an operator (`<>`, `<`, `>`) followed by
/// a mandatory bracketed fd pair, e.g. `>[2=1]`.
/// `<>` must be tried before `<` and `>` so the two-character form wins.
fn fd_redir_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, operator) = preceded(multispace0, alt((tag("<>"), tag("<"), tag(">"))))(input)?;
    let operator = match operator {
        "<>" => Mode::ReadWrite,
        "<" => Mode::Read,
        ">" => Mode::Write,
        _ => unreachable!(),
    };
    // The `[..]` pair is required; without it this parser fails and
    // `file_redir_token` (tried next in `tokenize_many`) gets its chance.
    let (input, fd_redir) = preceded(multispace0, fd_redir2)(input)?;
    Ok((input, Token::FdRedir(fd_redir, operator)))
}
fn fd_redir1<'a>(input: &'a str) -> IResult<&'a str, RawFd> {
let (input, _) = preceded(multispace0, tag("["))(input)?;
let (input, fd) = preceded(multispace0, digit1)(input)?;
let (input, _) = preceded(multispace0, tag("]"))(input)?;
Ok((input, fd.parse().unwrap()))
}
fn fd_redir2<'a>(input: &'a str) -> IResult<&'a str, FdPair> {
let (input, _) = preceded(multispace0, tag("["))(input)?;
let (input, fd) = preceded(multispace0, digit1)(input)?;
let (input, _) = opt(preceded(multispace0, tag("=")))(input)?;
let (input, operator) = opt(preceded(multispace0, digit1))(input)?;
let (input, _) = preceded(multispace0, tag("]"))(input)?;
Ok((
input,
FdPair(fd.parse().unwrap(), operator.map(|op| op.parse().unwrap())),
))
}
/// Parses a command substitution `` `{ ... } ``, recursively tokenizing the
/// brace-delimited body.
fn command_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, _) = preceded(multispace0, tag("`{"))(input)?;
    // NOTE(review): `split_at` takes a BYTE offset; `find_matching_brace`
    // must therefore return byte indices, or this mis-splits (or panics) on
    // multi-byte UTF-8 inside the braces — verify.
    let (_, end_pos) = find_matching_brace(input)?;
    let (command_content, _) = input.split_at(end_pos);
    let (_, nested_tokens) = tokenize_many(command_content)?;
    // +1 skips the closing '}' (always a single byte).
    let (_, input) = input.split_at(end_pos + 1);
    Ok((input, Token::Cmd(nested_tokens)))
}
/// Parses a split command substitution `` `SEP{ ... } ``, where SEP is the
/// token (argument, quoted string, or bare word) to split the command's
/// output on.
fn split_command_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, _) = preceded(multispace0, tag("`"))(input)?;
    let (input, split_token) = alt((argument_token, quoted_string_token, bareword_token))(input)?;
    let (input, _) = tag("{")(input)?;
    // NOTE(review): `split_at` takes a BYTE offset; `find_matching_brace`
    // must return byte indices or this mis-splits on multi-byte UTF-8 —
    // verify.
    let (_, end_pos) = find_matching_brace(input)?;
    let (command_content, _) = input.split_at(end_pos);
    let (_, nested_tokens) = tokenize_many(command_content)?;
    // +1 skips the closing '}' (always a single byte).
    let (_, input) = input.split_at(end_pos + 1);
    Ok((input, Token::CmdSplit(nested_tokens, Box::new(split_token))))
}
/// Parses a command redirection `<{ ... }` or `>{ ... }`, recursively
/// tokenizing the body. The bool in `CmdRedir` is true for the input (`<`)
/// form.
fn command_redir_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, redirection) = preceded(multispace0, alt((tag("<"), tag(">"))))(input)?;
    let (input, _) = preceded(multispace0, tag("{"))(input)?;
    // NOTE(review): `split_at` takes a BYTE offset; `find_matching_brace`
    // must return byte indices or this mis-splits on multi-byte UTF-8 —
    // verify.
    let (_, end_pos) = find_matching_brace(input)?;
    let (command_content, _) = input.split_at(end_pos);
    let (_, nested_tokens) = tokenize_many(command_content)?;
    // +1 skips the closing '}' (always a single byte).
    let (_, input) = input.split_at(end_pos + 1);
    Ok((input, Token::CmdRedir(nested_tokens, redirection == "<")))
}
/// Parses a bare (unquoted) word as a `Word` token.
///
/// An empty match yields `Token::Eof` instead of failing; this makes the
/// parser a catch-all terminal alternative in `tokenize_many` (it is listed
/// last there for that reason).
fn bareword_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, word) = preceded(multispace0, non_quoted_word)(input)?;
    if !word.is_empty() {
        // `non_quoted_word` already returns an owned String; the original's
        // `.to_string()` here was a redundant clone.
        Ok((input, Token::Word(word)))
    } else {
        Ok((input, Token::Eof))
    }
}
/// Parses a single-quoted string as a `Word` token.
fn quoted_string_token(input: &str) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    // `quoted_string` already yields an owned String, so it maps straight
    // into Token::Word; the original's `.to_string()` was a redundant clone.
    map(preceded(multispace0, quoted_string), Token::Word)(input)
}
/// Parses either a single-quoted string or a bare word, skipping leading
/// whitespace.
fn word(input: &str) -> IResult<&str, String> {
    preceded(multispace0, alt((quoted_string, non_quoted_word)))(input)
}
fn non_quoted_word(input: &str) -> IResult<&str, String> {
let (input, word) = map(take_while(|c| c != '#' && is_word_char(c)), |s: &str| {
s.to_string()
})(input)?;
Ok((input, word))
}
/// Parses a single-quoted string body, un-escaping doubled quotes
/// (`''` -> `'`), and returns the accumulated contents.
fn quoted_string(input: &str) -> IResult<&str, String> {
    delimited(
        char('\''),
        fold_many0(
            alt((
                // Escaped quote first, so `''` is consumed as a literal `'`
                // rather than terminating the string.
                map(value("'", tag("''")), |s: &str| s.to_string()),
                // Then any run of non-quote characters.
                map(is_not("'"), |s: &str| s.to_string()),
            )),
            || String::new(),
            |mut acc: String, item: String| {
                acc.push_str(&item);
                acc
            },
        ),
        char('\''),
    )(input)
}
/// Consumes zero or more full `#` comment lines; always succeeds (callers
/// additionally wrap it in `opt`).
///
/// NOTE(review): each comment must be terminated by a line ending, so a
/// trailing `#` comment at end-of-input is left unconsumed — confirm that
/// is intended.
fn comment(input: &str) -> IResult<&str, ()> {
    value(
        (),
        many0(terminated(
            terminated(char('#'), not_line_ending),
            line_ending,
        )),
    )(input)
}
// --- Simple keyword and operator token parsers ---
// Each wraps `tag_token` (multi-character literals) or `char_token`
// (single characters); see those helpers for whitespace/comment handling
// and the bare-prefix matching caveat on keywords.

fn for_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "for", Token::For)
}
fn in_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "in", Token::In)
}
fn while_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "while", Token::While)
}
fn if_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "if", Token::If)
}
fn not_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "not", Token::Not)
}
fn switch_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "switch", Token::Switch)
}
fn fn_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "fn", Token::Fn)
}
fn andand_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "&&", Token::AndAnd)
}
fn oror_token(input: &str) -> IResult<&str, Token> {
    tag_token(input, "||", Token::OrOr)
}
fn twiddle_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '~', Token::Twiddle)
}
fn bang_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '!', Token::Bang)
}
fn subshell_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '@', Token::Subshell)
}
fn ampersand_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '&', Token::Ampersand)
}
fn semicolon_token(input: &str) -> IResult<&str, Token> {
    char_token(input, ';', Token::SemiColon)
}
fn open_brace_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '{', Token::OpenBrace)
}
fn close_brace_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '}', Token::CloseBrace)
}
fn open_paren_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '(', Token::OpenParen)
}
fn close_paren_token(input: &str) -> IResult<&str, Token> {
    char_token(input, ')', Token::CloseParen)
}
fn caret_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '^', Token::Caret)
}
fn equal_token(input: &str) -> IResult<&str, Token> {
    char_token(input, '=', Token::Equal)
}
/// Matches the literal `token_tag` (after optional comments/whitespace) and
/// yields the supplied `token`. The clone satisfies `map`'s `FnMut`
/// closure, which may not move `token` out.
///
/// NOTE(review): this matches a bare prefix — `for` also matches the start
/// of `format`, leaving `mat` to lex as a separate word. Confirm whether
/// keywords are meant to require a word boundary.
fn tag_token<'a>(input: &'a str, token_tag: &'static str, token: Token) -> IResult<&'a str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, token) = map(preceded(multispace0, tag(token_tag)), |_| token.clone())(input)?;
    Ok((input, token))
}
/// Matches the single literal character `token_char` (after optional
/// comments/whitespace) and yields the supplied `token`. The clone
/// satisfies `map`'s `FnMut` closure, which may not move `token` out.
fn char_token(input: &str, token_char: char, token: Token) -> IResult<&str, Token> {
    let (input, _) = opt(preceded(multispace0, comment))(input)?;
    let (input, token) = map(preceded(multispace0, char(token_char)), |_| token.clone())(input)?;
    Ok((input, token))
}
/// Scans `input` (the text immediately after an opening `{`) for the
/// matching closing `}`, honoring single-quoted strings and the `''`
/// quote-escape, and returns its BYTE offset.
///
/// Callers feed the returned offset to `str::split_at`, which requires a
/// byte index. The original used `chars().enumerate()`, yielding CHAR
/// counts — wrong (mis-split or panic) whenever the body contains
/// multi-byte UTF-8. `char_indices()` yields byte offsets, fixing that;
/// for pure-ASCII input the result is unchanged.
fn find_matching_brace(input: &str) -> IResult<&str, usize> {
    let mut brace_count = 1;
    let mut in_single_quotes = false;
    let mut skip_next_quote = false;
    for (idx, c) in input.char_indices() {
        if skip_next_quote {
            skip_next_quote = false;
            if c == '\'' {
                // `''` inside a quoted string is an escaped quote: stay
                // inside the string and consume this character.
                continue;
            }
            // The previous `'` actually closed the string; fall through and
            // process the current character normally.
            in_single_quotes = false;
        }
        match c {
            '{' if !in_single_quotes => brace_count += 1,
            '}' if !in_single_quotes => {
                brace_count -= 1;
                if brace_count == 0 {
                    return Ok((input, idx));
                }
            }
            '\'' => {
                if in_single_quotes {
                    // Might be a closing quote or the start of `''`; decide
                    // on the next character.
                    skip_next_quote = true;
                } else {
                    in_single_quotes = true;
                }
            }
            _ => {}
        }
    }
    // Ran out of input before the brace balanced.
    Err(nom::Err::Error(nom::error::Error::new(
        input,
        ErrorKind::Eof,
    )))
}
#[cfg(test)]
mod tests {
    use anyhow::Result;
    use super::*;
    // The original test referenced `Redir::InputRedir`/`Redir::OutputRedir`
    // and a `CmdRedir(Redir, String)` shape, none of which exist — the
    // actual variant is `CmdRedir(Vec<Token>, bool)` with `true` for the
    // `<{...}` form — so the test could not compile. Expectations below
    // match the real token shapes produced by `command_redir_token`.
    #[test]
    fn command_redirection() -> Result<()> {
        assert_eq!(
            tokenize("<{cmd}")?,
            vec![Token::CmdRedir(vec![Token::Word("cmd".to_string())], true)]
        );
        assert_eq!(
            tokenize(">{cmd}")?,
            vec![Token::CmdRedir(vec![Token::Word("cmd".to_string())], false)]
        );
        assert_eq!(
            tokenize("cmp <{old} <{new}")?,
            vec![
                Token::Word("cmp".to_string()),
                Token::CmdRedir(vec![Token::Word("old".to_string())], true),
                Token::CmdRedir(vec![Token::Word("new".to_string())], true),
            ]
        );
        Ok(())
    }
}