//! mssh 0.0.0
//!
//! Mssh Simple SHell. A bash interpreter/compiler. It will not support all of bash's functionality.
use super::errors::*;
use super::tokens::{self, *};

use std::collections::HashSet;
use std::fmt;

/// Facade of the actual lexer: owns the set of single-byte literal
/// tokens and spawns a fresh `LexerState` for every input (see
/// `Lexer::tokenize`).
pub struct Lexer {
    // bytes that form a one-character token on their own, e.g. '=', '(', '|'
    literal_tokens: HashSet<u8>,
}

/// Mutable cursor over one tokenize() run; created per input and
/// consumed by `LexerState::tokenize`.
struct LexerState<'a, 'b> {
    text: &'a [u8], // remaining characters to process
    tokens: Tokens, // buffer to store parsed tokens
    row: usize,     // 1-based line of the next unread byte
    col: usize,     // 1-based column; best effort — not every consuming path advances it (TODO confirm)
    literal_tokens: &'b HashSet<u8>, // borrowed from the owning `Lexer`
}

impl Lexer {
    /// Builds a lexer whose single-byte literal tokens are the shell
    /// metacharacters `= ( ) { } < > | &`.
    pub fn new() -> Self {
        let literal_tokens =
            HashSet::from([b'=', b'(', b')', b'{', b'}', b'<', b'>', b'|', b'&']);
        Lexer { literal_tokens }
    }

    /// Tokenizes `text`, returning the token stream or a human-readable
    /// error message (e.g. for an unclosed quote).
    ///
    /// Each call runs on a fresh `LexerState`, so one `Lexer` can be
    /// reused across inputs. Lifetimes are elided on purpose: the
    /// previous impl-level `'a`/`'b` needlessly coupled the `&self`
    /// borrow to the input slice.
    pub fn tokenize(&self, text: &[u8]) -> Result<Tokens, String> {
        let state = LexerState::new(text, &self.literal_tokens);
        state.tokenize()
    }
}

impl Default for Lexer {
    fn default() -> Self {
        Self::new()
    }
}

impl<'a, 'b> LexerState<'a, 'b> {
    fn new(text: &'a [u8], literal_tokens: &'b HashSet<u8>) -> Self {
        Self {
            text,
            tokens: Vec::new(),
            row: 1,
            col: 1,
            literal_tokens,
        }
    }

    pub fn tokenize(mut self) -> Result<Tokens, String> {
        // position on the line starts from 1
        self.col += self.trim();
        self.row = 1;
        let mut last_row = 0;
        let mut last_col = 0;
        let mut no_update = 0;

        loop {
            self.remove_new_lines_and_comments();
            // lexer can crash if it doesn't exit here
            // no handling of special edge cases
            if self.text.is_empty() {
                break;
            }

            self.extract_literal_tokens();
            // only opening quote determine if vars are replaced or not
            // open quote before EOL will probably crash
            // not an issue as cannot happen in interactive mode
            // does not support nested quotes
            if self.text[0] == '"' as u8 || self.text[0] == '\'' as u8 {
                match self.extract_quoted_words() {
                    Ok(_) => {}
                    Err(e) => return Err(e),
                }
            }
            // extract a single word if no special symbol was encountered
            self.extract_token_matching_condition_as(is_alphanum, TokenVal::var_or_const_from_u8);
            self.col += self.trim();

            // lexer does not handle corner cases
            // this avoid infinite loops
            // valid "irsh" script does not require this
            if self.col == last_col && self.row == last_row {
                no_update += 1;
            }

            if no_update > 5 {
                panic!("no update during lexing loops, most likely due to syntax divergence from bash at {}:{}", self.row, self.col);
            }

            last_col = self.col;
            last_row = self.row;
        }
        Ok(self.tokens)
    }

    fn extract_quoted_words(&mut self) -> Result<(), String> {
        // might crash if last line is not properly closed?
        let quote = self.extract_chars(1)[0];
        let n = self.count_matching_condition(|char| {
            char != '\'' as u8 && char != '\n' as u8 && char != '"' as u8
        });
        let end_of_the_current_line = self.text[n];
        if end_of_the_current_line == '\n' as u8 {
            return Err(unclosed_quote(self.col, self.row));
        }
        if quote == '\'' as u8 {
            // extract single quoted constant

            let val = self.extract_chars(n);
            let val = std::str::from_utf8(val).unwrap().to_owned();

            self.tokens.push(Token {
                val: TokenVal::Const(val),
                col: self.col,
                row: self.row,
            });

            _ = self.extract_chars(1); // get rid of closing quote
            return Ok(());
        }
        if self.text[0] == '$' as u8 && self.text[1] == '(' as u8 {
            // try to process a subcommand
            if self.text[n - 1] != ')' as u8 {
                print_chars(&self.text[..n]);
                return Err(unclosed_block(self.col, self.row));
            }
            _ = self.extract_chars(2); // remove the $(
            self.extract_sub_command();
            _ = self.extract_chars(2); // remove the )'
            return Ok(());
        }

        self.extract_token_as(n, TokenVal::var_or_const_from_u8);
        _ = self.extract_chars(1); // get rid of closing quote

        Ok(())
    }

    fn extract_sub_command(&mut self) {
        // sub command is composed of var/consts and redirection tokens only
        // assuming that it is properly closed
        // crashes or infinite loop otherwise,
        // easy to fix but it's an unnecessary edge case
        let mut tokens: Vec<TokenVal> = Vec::new();
        while self.text[0] != ')' as u8 {
            //print_chars(&self.text[..end_col]);
            // will bug if ) is matched here
            if self.literal_tokens.contains(&self.text[0]) {
                let token = self._extract_literal_token();
                tokens.push(token);
                self.col += 1;
                self.trim();
            }

            let len = self.count_matching_condition(|char| {
                char != '$' as u8 && char != ')' as u8 && char != ' ' as u8
            });
            if len > 0 {
                let val = self.extract_chars(len);
                self.col += len;
                tokens.push(TokenVal::var_or_const_from_u8(val));
            }
            self.trim();
        }

        // tokens.push(tokens::from_u8s(res));
        let token = TokenVal::SubCmd(tokens);
        self.tokens.push(Token {
            val: token,
            col: self.col,
            row: self.row,
        });

        //_ = self.extract_chars(2); // get rid of )"
    }

    fn _extract_literal_token(&mut self) -> TokenVal {
        let val = self.extract_chars(1)[0];
        return tokens::from_u8(val);
    }

    fn extract_literal_tokens(&mut self) {
        if self.literal_tokens.contains(&self.text[0]) {
            let token = self._extract_literal_token();

            self.tokens.push(Token {
                val: token,
                col: self.col,
                row: self.row,
            });
            self.col += 1;
        }
    }

    fn remove_new_lines_and_comments(&mut self) {
        if is_newline(self.text[0]) || is_comment(self.text[0]) {
            let n = self.count_matching_condition(|char| !is_newline(char));
            _ = self.extract_chars(n + 1);
            self.row += 1;
            self.col = 1;
        }
    }

    fn extract_chars(&mut self, n: usize) -> &'a [u8] {
        // unsafe use with caution
        let chars = &self.text[0..n];
        self.text = &self.text[n..];
        return chars;
    }

    fn trim(&mut self) -> usize {
        let mut n = 0;
        let len = self.text.len();
        while n < len && is_space(self.text[n]) {
            n += 1;
        }
        self.text = &self.text[n..];
        return n;
    }

    fn extract_token_matching_condition_as<P, T>(&mut self, f: P, t: T)
    where
        P: Fn(u8) -> bool,
        T: Fn(&[u8]) -> TokenVal,
    {
        let n = self.count_matching_condition(|char| f(char));
        if n > 0 {
            self.extract_token_as(n, t);
        }
    }

    fn extract_token_as<T>(&mut self, n: usize, t: T)
    where
        T: Fn(&[u8]) -> TokenVal,
    {
        let val = self.extract_chars(n);
        let token = t(val);
        self.tokens.push(Token {
            val: token,
            col: self.col,
            row: self.row,
        });
        self.col += n;
    }

    fn count_matching_condition<P>(&self, f: P) -> usize
    where
        P: Fn(u8) -> bool,
    {
        let mut n = 0;
        let len = self.text.len();
        while n < len && f(self.text[n]) {
            n += 1;
        }
        return n;
    }
}

/// True for bytes allowed inside a bare word: ASCII letters and digits
/// plus '_', ':', '/', '-' — so paths, flags and env-style names lex as
/// a single token. Self-contained now instead of routing the digit
/// check through the misnamed `is_alpha`.
fn is_alphanum(char: u8) -> bool {
    char.is_ascii_alphanumeric() || matches!(char, b'_' | b':' | b'/' | b'-')
}

/// True for an ASCII digit ('0'..='9').
///
/// NOTE(review): the name suggests alphabetic but the check is numeric;
/// kept as-is because renaming would break callers (e.g. `is_alphanum`).
fn is_alpha(char: u8) -> bool {
    char.is_ascii_digit()
}
/// Horizontal whitespace only: space or tab (newlines are handled
/// separately by `is_newline`).
fn is_space(char: u8) -> bool {
    matches!(char, b' ' | b'\t')
}

/// True only for '\n'; carriage returns are not treated as newlines.
fn is_newline(char: u8) -> bool {
    matches!(char, b'\n')
}

/// True when the byte starts a '#' comment.
fn is_comment(char: u8) -> bool {
    matches!(char, b'#')
}

/// Debug helper: prints a byte slice as text on its own line.
///
/// Uses a lossy conversion so invalid UTF-8 cannot panic the lexer
/// while it is in the middle of reporting an error (the previous
/// `from_utf8(...).unwrap()` did exactly that).
fn print_chars(chars: &[u8]) {
    println!("{}", String::from_utf8_lossy(chars));
}

impl<'a> fmt::Display for LexerState<'a, 'a> {
    /// Shows the remainder of the current input line (everything up to
    /// the next '\n'), useful when reporting lexer errors.
    ///
    /// Fixes the original scan condition `!self.text[n] != '\n' as u8`,
    /// which applied bitwise NOT to the byte and therefore stopped at
    /// byte 0xF5 instead of the newline. Also avoids `unwrap()` on a
    /// slice that could split a multi-byte character.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let line_len = self
            .text
            .iter()
            .position(|&c| c == b'\n')
            .unwrap_or(self.text.len());
        write!(f, "{}", String::from_utf8_lossy(&self.text[..line_len]))
    }
}

#[cfg(test)]
mod lexer_tests {
    use super::*;
    use insta::{self, *};

    #[test]
    fn it_trims_spaces() {
        // leading "\t  " is three bytes of horizontal whitespace
        let text = "\t  hello world  \n\n h".as_bytes();
        let l = Lexer::new();
        let mut state = LexerState::new(text, &l.literal_tokens);

        assert_eq!(3, state.trim());
    }

    #[test]
    fn it_can_extract_tokens_from_a_file() {
        // snapshot test: lexes a sample script and compares the rendered
        // token stream against the stored insta snapshot under tests/snapshots
        let text = include_bytes!("../../tests/inputs/parser/script1.sh");
        let l = Lexer::new();
        let tokens = l.tokenize(text).unwrap();

        let mut settings = insta::Settings::clone_current();
        settings.set_snapshot_path("../../tests/snapshots");
        settings.set_description("line number: for each token in the line  TokenValType(value), literal tokens like {} as displayed as is");
        settings.bind(|| assert_snapshot!(tokens_to_string(&tokens)));
    }
}