use super::errors::*;
use super::tokens::{self, *};
use std::collections::HashSet;
use std::fmt;
/// Tokenizer for a bash-like syntax.
///
/// Holds the fixed set of single-byte tokens (`=`, `(`, `)`, `{`, `}`,
/// `<`, `>`, `|`, `&`) that are emitted as standalone literal tokens;
/// the per-call cursor state lives in `LexerState`.
pub struct Lexer {
    // Bytes that form standalone one-character tokens.
    literal_tokens: HashSet<u8>,
}
/// Mutable cursor held for the duration of one `tokenize` call: the
/// unconsumed input, the tokens produced so far, and the current
/// row/column used to position tokens and error messages.
struct LexerState<'a, 'b> {
    text: &'a [u8],   // unconsumed remainder of the input
    tokens: Tokens,   // output accumulated so far
    row: usize,       // 1-based current line
    col: usize,       // 1-based current column
    literal_tokens: &'b HashSet<u8>, // borrowed from the owning `Lexer`
}
impl Lexer {
    /// Builds a lexer recognizing the fixed set of single-byte literal
    /// tokens (`= ( ) { } < > | &`).
    pub fn new() -> Self {
        // Byte literals instead of `'x' as u8` casts.
        let literal_tokens =
            HashSet::from([b'=', b'(', b')', b'{', b'}', b'<', b'>', b'|', b'&']);
        Lexer { literal_tokens }
    }

    /// Tokenizes `text` into a token stream.
    ///
    /// A fresh `LexerState` is consumed per call, so the lexer itself is
    /// reusable. Lifetimes are elided here instead of being carried as
    /// unconstrained parameters on the impl block — the elided form is
    /// strictly more general and identical for all existing callers.
    ///
    /// # Errors
    /// Propagates unclosed-quote / unclosed-block errors from lexing.
    pub fn tokenize(&self, text: &[u8]) -> Result<Tokens, String> {
        let state = LexerState::new(text, &self.literal_tokens);
        state.tokenize()
    }
}
impl<'a, 'b> LexerState<'a, 'b> {
    /// Creates a state positioned at the start of `text` (row 1, col 1).
    fn new(text: &'a [u8], literal_tokens: &'b HashSet<u8>) -> Self {
        Self {
            text,
            tokens: Vec::new(),
            row: 1,
            col: 1,
            literal_tokens,
        }
    }

    /// Consumes the state and turns the whole input into a token stream.
    ///
    /// Each loop iteration strips newlines/comments, then tries, in
    /// order: a literal one-byte token, a quoted word (or `$(...)`), and
    /// a bare word. A watchdog panics when several iterations make no
    /// progress, which indicates input none of the extractors recognize.
    ///
    /// # Errors
    /// Returns an error for quotes or `$(` blocks left unclosed.
    pub fn tokenize(mut self) -> Result<Tokens, String> {
        self.col += self.trim();
        self.row = 1;
        let mut last_row = 0;
        let mut last_col = 0;
        let mut no_update = 0;
        loop {
            self.remove_new_lines_and_comments();
            if self.text.is_empty() {
                break;
            }
            self.extract_literal_tokens();
            if self.text[0] == b'"' || self.text[0] == b'\'' {
                // Propagate unclosed-quote/-block errors directly.
                self.extract_quoted_words()?;
            }
            self.extract_token_matching_condition_as(is_alphanum, TokenVal::var_or_const_from_u8);
            self.col += self.trim();
            // Watchdog: count iterations where the position did not move.
            // NOTE(review): the counter is cumulative, not consecutive —
            // presumably intentional, since any genuine stall never moves
            // the cursor again.
            if self.col == last_col && self.row == last_row {
                no_update += 1;
            }
            if no_update > 5 {
                panic!("no update during lexing loops, most likely due to syntax divergence from bash at {}:{}", self.row, self.col);
            }
            last_col = self.col;
            last_row = self.row;
        }
        Ok(self.tokens)
    }

    /// Extracts a quoted word at the cursor: `'...'` becomes a single
    /// `Const`, `"$(...)"` becomes a `SubCmd`, and any other `"..."`
    /// becomes a var-or-const token.
    ///
    /// NOTE(review): the quote characters themselves are consumed
    /// without advancing `col`, so column tracking drifts after quoted
    /// words — left as-is to keep token positions stable.
    ///
    /// # Errors
    /// `unclosed_quote` when the line (or the input) ends before the
    /// closing quote; `unclosed_block` when a `$(` span lacks `)`.
    fn extract_quoted_words(&mut self) -> Result<(), String> {
        let quote = self.extract_chars(1)[0];
        let n = self.count_matching_condition(|c| c != b'\'' && c != b'\n' && c != b'"');
        // Hitting a newline — or running out of input entirely — before
        // a closing quote means the quote is never closed on this line.
        // (`get` instead of indexing: an unclosed quote at end of input
        // previously panicked instead of reporting the error.)
        match self.text.get(n) {
            None | Some(&b'\n') => return Err(unclosed_quote(self.col, self.row)),
            _ => {}
        }
        if quote == b'\'' {
            // Single quotes: the contents are one verbatim constant.
            let val = self.extract_chars(n);
            let val = std::str::from_utf8(val).unwrap().to_owned();
            self.tokens.push(Token {
                val: TokenVal::Const(val),
                col: self.col,
                row: self.row,
            });
            _ = self.extract_chars(1); // drop the closing quote
            return Ok(());
        }
        // `starts_with` is bounds-safe where indexing text[1] could panic
        // on a one-byte remainder.
        if self.text.starts_with(b"$(") {
            // `"$(...)"`: the quoted span must end with the closing paren.
            if self.text[n - 1] != b')' {
                print_chars(&self.text[..n]);
                return Err(unclosed_block(self.col, self.row));
            }
            _ = self.extract_chars(2); // drop `$(`
            self.extract_sub_command();
            _ = self.extract_chars(2); // drop `)` and the closing quote
            return Ok(());
        }
        self.extract_token_as(n, TokenVal::var_or_const_from_u8);
        _ = self.extract_chars(1); // drop the closing quote
        Ok(())
    }

    /// Lexes the interior of a `$( ... )` substitution into one `SubCmd`
    /// token holding the nested token values.
    ///
    /// NOTE(review): assumes a closing `)` exists in the remaining input
    /// (the caller validated this); an unclosed paren would panic here.
    fn extract_sub_command(&mut self) {
        let mut tokens: Vec<TokenVal> = Vec::new();
        while self.text[0] != b')' {
            if self.literal_tokens.contains(&self.text[0]) {
                let token = self._extract_literal_token();
                tokens.push(token);
                self.col += 1;
                self.trim();
            }
            let len = self.count_matching_condition(|c| c != b'$' && c != b')' && c != b' ');
            if len > 0 {
                let val = self.extract_chars(len);
                self.col += len;
                tokens.push(TokenVal::var_or_const_from_u8(val));
            }
            self.trim();
        }
        let token = TokenVal::SubCmd(tokens);
        self.tokens.push(Token {
            val: token,
            col: self.col,
            row: self.row,
        });
    }

    /// Consumes exactly one byte and maps it to its literal token value.
    fn _extract_literal_token(&mut self) -> TokenVal {
        let val = self.extract_chars(1)[0];
        tokens::from_u8(val)
    }

    /// Emits a literal one-byte token (`=`, `(`, `)` …) when the cursor
    /// sits on one; otherwise does nothing.
    fn extract_literal_tokens(&mut self) {
        if self.literal_tokens.contains(&self.text[0]) {
            let token = self._extract_literal_token();
            self.tokens.push(Token {
                val: token,
                col: self.col,
                row: self.row,
            });
            self.col += 1;
        }
    }

    /// Skips one newline, or a `#` comment through its newline, resetting
    /// the cursor to the start of the next row.
    fn remove_new_lines_and_comments(&mut self) {
        // Guard: `tokenize` calls this before its emptiness check, so an
        // empty input must not index text[0].
        if self.text.is_empty() {
            return;
        }
        if is_newline(self.text[0]) || is_comment(self.text[0]) {
            let n = self.count_matching_condition(|c| !is_newline(c));
            // `+ 1` also consumes the newline itself; clamp for input
            // ending without a trailing newline (previously panicked).
            let skip = (n + 1).min(self.text.len());
            _ = self.extract_chars(skip);
            self.row += 1;
            self.col = 1;
        }
    }

    /// Consumes and returns the first `n` bytes of the remaining input.
    /// Panics if fewer than `n` bytes remain.
    fn extract_chars(&mut self, n: usize) -> &'a [u8] {
        let chars = &self.text[0..n];
        self.text = &self.text[n..];
        chars
    }

    /// Consumes leading spaces/tabs and returns how many were removed;
    /// the caller is responsible for advancing `col`.
    fn trim(&mut self) -> usize {
        let mut n = 0;
        let len = self.text.len();
        while n < len && is_space(self.text[n]) {
            n += 1;
        }
        self.text = &self.text[n..];
        n
    }

    /// Extracts the longest prefix whose bytes satisfy `f` (if any) and
    /// pushes it as a token built by `t`.
    fn extract_token_matching_condition_as<P, T>(&mut self, f: P, t: T)
    where
        P: Fn(u8) -> bool,
        T: Fn(&[u8]) -> TokenVal,
    {
        let n = self.count_matching_condition(f);
        if n > 0 {
            self.extract_token_as(n, t);
        }
    }

    /// Consumes `n` bytes, pushes them as a token built by `t`, and
    /// advances the column past them.
    fn extract_token_as<T>(&mut self, n: usize, t: T)
    where
        T: Fn(&[u8]) -> TokenVal,
    {
        let val = self.extract_chars(n);
        let token = t(val);
        self.tokens.push(Token {
            val: token,
            col: self.col,
            row: self.row,
        });
        self.col += n;
    }

    /// Returns the length of the longest prefix whose bytes all satisfy
    /// `f`; consumes nothing.
    fn count_matching_condition<P>(&self, f: P) -> usize
    where
        P: Fn(u8) -> bool,
    {
        let mut n = 0;
        let len = self.text.len();
        while n < len && f(self.text[n]) {
            n += 1;
        }
        n
    }
}
/// True for bytes that may appear in a bare word: ASCII letters and
/// digits plus `_`, `:`, `/` and `-` (covers identifiers, paths and
/// flag-like arguments).
///
/// Replaces hand-rolled character ranges (and a digit check routed
/// through the misnamed `is_alpha`) with the standard-library
/// classifier; the accepted set is unchanged.
fn is_alphanum(c: u8) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, b'_' | b':' | b'/' | b'-')
}
/// True for ASCII digit bytes (`'0'..='9'`).
///
/// NOTE(review): the name is a misnomer — this tests digits, not
/// letters — but it is kept unchanged because `is_alphanum` (and
/// possibly other callers) depend on it.
fn is_alpha(c: u8) -> bool {
    c.is_ascii_digit()
}
/// True for the horizontal whitespace bytes the lexer skips.
fn is_space(c: u8) -> bool {
    matches!(c, b' ' | b'\t')
}
/// True for a line-feed byte.
fn is_newline(c: u8) -> bool {
    c == b'\n'
}
/// True for the byte that starts a line comment.
fn is_comment(c: u8) -> bool {
    c == b'#'
}
/// Prints a byte slice as text for diagnostics.
///
/// Uses a lossy conversion (invalid UTF-8 becomes U+FFFD) instead of
/// `from_utf8(..).unwrap()`: this function runs on error-reporting
/// paths, where panicking on non-UTF-8 input would mask the real error.
fn print_chars(chars: &[u8]) {
    println!("{}", String::from_utf8_lossy(chars));
}
impl<'a> fmt::Display for LexerState<'a, 'a> {
    /// Shows the remainder of the current line (up to the next newline),
    /// mainly useful when debugging the lexer.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // BUG FIX: the previous scan compared `!self.text[n]` (bitwise
        // NOT of the byte) against b'\n', so it effectively never
        // stopped at a newline (only at byte 0xF5). Compare the byte
        // itself instead.
        let end = self
            .text
            .iter()
            .position(|&c| c == b'\n')
            .unwrap_or(self.text.len());
        // Lossy conversion so Display can never panic on non-UTF-8 input.
        write!(f, "{}", String::from_utf8_lossy(&self.text[..end]))
    }
}
#[cfg(test)]
mod lexer_tests {
    use super::*;
    use insta::{self, *};

    // Unit test for `trim`: it should consume the leading run of
    // tabs/spaces and report how many bytes were removed.
    // NOTE(review): the expected count of 3 implies the literal starts
    // with a tab plus two spaces — verify the literal's exact
    // whitespace matches, since only one space is visible here.
    #[test]
    fn it_trims_spaces() {
        let text = "\t hello world \n\n h".as_bytes();
        let l = Lexer::new();
        let mut state = LexerState::new(text, &l.literal_tokens);
        assert_eq!(3, state.trim());
    }

    // End-to-end snapshot test: lexes a full script fixture and compares
    // the rendered token stream against an `insta` snapshot stored under
    // tests/snapshots.
    #[test]
    fn it_can_extract_tokens_from_a_file() {
        let text = include_bytes!("../../tests/inputs/parser/script1.sh");
        let l = Lexer::new();
        let tokens = l.tokenize(text).unwrap();
        let mut settings = insta::Settings::clone_current();
        settings.set_snapshot_path("../../tests/snapshots");
        settings.set_description("line number: for each token in the line TokenValType(value), literal tokens like {} as displayed as is");
        settings.bind(|| assert_snapshot!(tokens_to_string(&tokens)));
    }
}