luallaby 0.1.0-alpha.3

**Work in progress:** a pure-Rust Lua interpreter/compiler.
Documentation
use std::convert::TryInto;
use std::iter::{Enumerate, Peekable};
use std::str::Chars;

use crate::num::parse_f64_hex;

/// A value of type `T` together with the source position attached to it.
pub struct Spanned<T> {
    /// The wrapped value.
    inner: T,
    /// The position associated with `inner`.
    pos: Pos,
}

/// A source position stored as `(line, column)`.
///
/// The first field is what `line()` returns and the second is what `col()` returns.
#[derive(Clone, Copy, Debug, Default)]
pub struct Pos(usize, usize);

impl<T> Spanned<T> {
    pub fn tuple(&self) -> (Pos, &T) {
        (self.pos, &self.inner)
    }

    pub fn into_tuple(self) -> (Pos, T) {
        (self.pos, self.inner)
    }

    pub fn inner(&self) -> &T {
        &self.inner
    }

    pub fn pos(&self) -> Pos {
        self.pos
    }
}

impl Pos {
    pub fn line(&self) -> usize {
        self.0
    }

    pub fn col(&self) -> usize {
        self.1
    }
}

/// A lexical token produced by the tokenizer.
///
/// The symbol noted on each delimiter variant is the character sequence that
/// `read_token` maps to it.
#[derive(Debug, Clone)]
pub enum Token {
    // Generic
    /// A name that is not one of the keywords below.
    Ident(String),
    /// Contents of a quoted or long-bracket string, with escapes already decoded.
    String(String),
    /// An integer numeric literal.
    Integer(i64),
    /// A floating-point numeric literal.
    Float(f64),
    // Keywords
    /// `and`
    And,
    /// `break`
    Break,
    /// `do`
    Do,
    /// `else`
    Else,
    /// `elseif`
    Elseif,
    /// `end`
    End,
    /// `false`
    False,
    /// `for`
    For,
    /// `function`
    Function,
    /// `goto`
    Goto,
    /// `if`
    If,
    /// `in`
    In,
    /// `local`
    Local,
    /// `nil`
    Nil,
    /// `not`
    Not,
    /// `or`
    Or,
    /// `repeat`
    Repeat,
    /// `return`
    Return,
    /// `then`
    Then,
    /// `true`
    True,
    /// `until`
    Until,
    /// `while`
    While,
    // Delimiters
    /// `+`
    Add,
    /// `-`
    Sub,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `^`
    Pow,
    /// `#`
    Hash,
    /// `&`
    Amp,
    /// `~` (when not followed by `=`)
    Tilde,
    /// `|`
    Bar,
    /// `<<`
    Shl,
    /// `>>`
    Shr,
    /// `//`
    FlDiv,
    /// `==`
    Eq,
    /// `~=`
    Neq,
    /// `<=`
    Leq,
    /// `>=`
    Geq,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `=` (a single equals sign)
    Is,
    /// `(`
    ParO,
    /// `)`
    ParC,
    /// `{`
    CurlO,
    /// `}`
    CurlC,
    /// `[`
    SqrO,
    /// `]`
    SqrC,
    /// `::` (a double colon)
    Ass,
    /// `;`
    Semi,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `.`
    Point,
    /// `..`
    Conc,
    /// `...`
    Dots,
}

/// Lexer that turns source text into a stream of [`Spanned`] [`Token`]s.
pub struct Tokenizer<'a> {
    /// Remaining input as `(character index, character)` pairs, with one item of lookahead.
    input: Peekable<Enumerate<Chars<'a>>>,
    /// Number of the line currently being read; starts at 0.
    line: usize,
    /// Character index at which the current line starts; subtracted from a character's
    /// index to obtain its column.
    line_offset: usize,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `input`, starting at line 0 with a line offset of 0.
    pub fn read(input: &'a str) -> Self {
        let input = input.chars().enumerate().peekable();
        Self {
            input,
            line: 0,
            line_offset: 0,
        }
    }

    /// Records that the newline at character index `pos` ended the current line.
    fn line_inc(&mut self, pos: usize) {
        // The character right after the newline becomes column 0 of the new line.
        self.line_offset = pos + 1;
        self.line += 1;
    }

    fn read_token(&mut self) -> Option<Spanned<Token>> {
        let (pos, char) = loop {
            self.consume_whitespace();
            let (pos, char) = self.input.next()?;
            if char == '-' && matches!(self.input.peek(), Some((_, '-'))) {
                self.input.next();
                self.consume_comment();
            } else {
                break (pos, char);
            }
        };
        let pos = Pos(self.line, pos.saturating_sub(self.line_offset));

        let token = match char {
            '+' => Token::Add,
            '-' => Token::Sub,
            '*' => Token::Mul,
            '/' => {
                if let Some((_, '/')) = self.input.peek() {
                    self.input.next();
                    Token::FlDiv
                } else {
                    Token::Div
                }
            }
            '%' => Token::Mod,
            '^' => Token::Pow,
            '#' => Token::Hash,
            '&' => Token::Amp,
            '~' => {
                if let Some((_, '=')) = self.input.peek() {
                    self.input.next();
                    Token::Neq
                } else {
                    Token::Tilde
                }
            }
            '|' => Token::Bar,
            '<' => {
                if let Some((_, '<')) = self.input.peek() {
                    self.input.next();
                    Token::Shl
                } else if let Some((_, '=')) = self.input.peek() {
                    self.input.next();
                    Token::Leq
                } else {
                    Token::Lt
                }
            }
            '>' => {
                if let Some((_, '>')) = self.input.peek() {
                    self.input.next();
                    Token::Shr
                } else if let Some((_, '=')) = self.input.peek() {
                    self.input.next();
                    Token::Geq
                } else {
                    Token::Gt
                }
            }
            '=' => {
                if let Some((_, '=')) = self.input.peek() {
                    self.input.next();
                    Token::Eq
                } else {
                    Token::Is
                }
            }
            '(' => Token::ParO,
            ')' => Token::ParC,
            '{' => Token::CurlO,
            '}' => Token::CurlC,
            '[' => match self.input.peek() {
                Some((_, '[')) | Some((_, '=')) => self.consume_long_string(),
                _ => Token::SqrO,
            },
            ']' => Token::SqrC,
            ':' => {
                if let Some((_, ':')) = self.input.peek() {
                    self.input.next();
                    Token::Ass
                } else {
                    Token::Colon
                }
            }
            ';' => Token::Semi,
            ',' => Token::Comma,
            '.' => match self.input.peek().cloned() {
                Some((_, '.')) => {
                    self.input.next();
                    if let Some((_, '.')) = self.input.peek() {
                        self.input.next();
                        Token::Dots
                    } else {
                        Token::Conc
                    }
                }
                Some((_, c)) if is_numeric_char(c) => self.consume_numeric('.'),
                _ => Token::Point,
            },
            c if is_start_name_char(c) => self.consume_name(c),
            c if matches!(c, '"' | '\'') => self.consume_string(c),
            c if is_numeric_char(c) => self.consume_numeric(c),
            c => panic!("{}", c),
        };

        Some(Spanned { inner: token, pos })
    }

    /// Skips over whitespace, bumping the line counter for every `\n` passed.
    ///
    /// Stops in front of the first non-whitespace character or at the end of input.
    fn consume_whitespace(&mut self) {
        while let Some(&(index, c)) = self.input.peek() {
            if !is_whitespace_char(c) {
                break;
            }
            self.input.next();
            if c == '\n' {
                self.line_inc(index);
            }
        }
    }

    /// Reads the rest of a name whose first character, `start`, was already consumed.
    ///
    /// Returns the matching keyword token when the name is a reserved word and
    /// `Token::Ident` with the full name otherwise.
    ///
    /// # Panics
    ///
    /// Panics if `start` is not a valid first character of a name.
    fn consume_name(&mut self, start: char) -> Token {
        assert!(is_start_name_char(start));

        let mut name = String::new();
        name.push(start);
        while let Some(&(_, c)) = self.input.peek() {
            if !is_name_char(c) {
                break;
            }
            name.push(c);
            self.input.next();
        }

        // Reserved words get their own variant; everything else is an identifier.
        match name.as_str() {
            "and" => Token::And,
            "break" => Token::Break,
            "do" => Token::Do,
            "else" => Token::Else,
            "elseif" => Token::Elseif,
            "end" => Token::End,
            "false" => Token::False,
            "for" => Token::For,
            "function" => Token::Function,
            "goto" => Token::Goto,
            "if" => Token::If,
            "in" => Token::In,
            "local" => Token::Local,
            "nil" => Token::Nil,
            "not" => Token::Not,
            "or" => Token::Or,
            "repeat" => Token::Repeat,
            "return" => Token::Return,
            "then" => Token::Then,
            "true" => Token::True,
            "until" => Token::Until,
            "while" => Token::While,
            _ => Token::Ident(name),
        }
    }

    /// Reads a quoted string whose opening `delim` was already consumed.
    ///
    /// Decodes the escapes `\a \b \t \n \v \f \r \\ \" \'`, a backslash followed by a
    /// line feed, `\z` (skips following whitespace), `\xXX`, `\ddd` (one to three decimal
    /// digits) and `\u{XXX}`. Reading stops at the closing `delim`; if the input ends
    /// first, whatever was collected so far is returned.
    ///
    /// # Panics
    ///
    /// Panics on an unescaped line feed, an unknown or incomplete escape sequence, a
    /// decimal escape above 255, and a `\u{...}` value that is not a valid `char`.
    fn consume_string(&mut self, delim: char) -> Token {
        let mut text = String::new();

        loop {
            let current = match self.input.next() {
                Some((_, c)) => c,
                None => break,
            };
            if current == delim {
                break;
            }

            match current {
                '\n' => panic!(),
                '\\' => {
                    let (index, escape) = match self.input.next() {
                        Some(item) => item,
                        None => panic!(),
                    };
                    match escape {
                        'a' => text.push('\x07'),
                        'b' => text.push('\x08'),
                        't' => text.push('\t'),
                        'n' => text.push('\n'),
                        'v' => text.push('\x0B'),
                        'f' => text.push('\x0C'),
                        'r' => text.push('\x0D'),
                        '\\' | '"' | '\'' => text.push(escape),
                        '\n' => {
                            // An escaped line break stays in the string and counts as a line.
                            self.line_inc(index);
                            text.push('\n');
                        }
                        'z' => self.consume_whitespace(),
                        'x' => {
                            let decoded = self.hex_escape();
                            text.push(decoded);
                        }
                        'u' => {
                            let decoded = self.unicode_escape();
                            text.push(decoded);
                        }
                        digit if is_numeric_char(digit) => {
                            let decoded = self.decimal_escape(digit);
                            text.push(decoded);
                        }
                        _ => panic!(),
                    }
                }
                other => text.push(other),
            }
        }

        Token::String(text)
    }

    /// Decodes the two hexadecimal digits following `\x` into a character.
    fn hex_escape(&mut self) -> char {
        let mut pair = ['\0'; 2];
        for slot in pair.iter_mut() {
            *slot = match self.input.next() {
                Some((_, d)) => d,
                None => panic!(),
            };
        }
        if !pair.iter().all(|&d| is_hex_digit_char(d)) {
            panic!()
        }

        let digits: String = pair.iter().collect();
        let value = u8::from_str_radix(&digits, 16).unwrap();
        value.into()
    }

    /// Decodes a decimal escape of up to three digits, the first being `first`.
    fn decimal_escape(&mut self, first: char) -> char {
        let mut digits = String::new();
        digits.push(first);
        // At most two further digits belong to the escape.
        for _ in 0..2 {
            match self.input.peek().cloned() {
                Some((_, d)) if is_numeric_char(d) => {
                    self.input.next();
                    digits.push(d);
                }
                _ => break,
            }
        }

        let value = digits.parse::<u8>().unwrap();
        value.into()
    }

    /// Decodes the `{XXX}` part following `\u` into a character.
    fn unicode_escape(&mut self) -> char {
        if !matches!(self.input.next(), Some((_, '{'))) {
            panic!()
        }

        let mut digits = String::new();
        loop {
            match self.input.next() {
                Some((_, '}')) => break,
                Some((_, c)) if is_hex_digit_char(c) => digits.push(c),
                _ => panic!(),
            }
        }
        if digits.is_empty() {
            panic!()
        }

        let value = u32::from_str_radix(&digits, 16).unwrap();
        value.try_into().unwrap()
    }

    /// Reads a long-bracket string (`[[...]]`, `[=[...]=]`, ...) whose first `[` was
    /// already consumed.
    ///
    /// The number of `=` signs between the two opening brackets is the level; only a
    /// closing bracket with exactly that many `=` signs ends the string. A line break
    /// directly after the opening bracket is dropped, and every other `\n`, `\r`,
    /// `\n\r` or `\r\n` sequence is stored as a single `\n`.
    ///
    /// # Panics
    ///
    /// Panics if the opening bracket is malformed or if the input ends before the
    /// matching closing bracket.
    fn consume_long_string(&mut self) -> Token {
        // Finish the opening bracket: any number of `=` followed by `[`.
        let mut level: usize = 0;
        loop {
            match self.input.next() {
                Some((_, '[')) => break,
                Some((_, '=')) => level += 1,
                _ => panic!(),
            }
        }

        let mut string = String::new();
        // True only while looking at the very first character after the opening bracket.
        let mut fresh = true;

        loop {
            let (pos, c) = match self.input.next() {
                Some(item) => item,
                None => panic!(),
            };
            let leading = std::mem::replace(&mut fresh, false);

            match c {
                ']' => {
                    // Only `=` signs are consumed while checking for a closing bracket, so a
                    // failed candidate never swallows a `]` that starts the real one, nor a
                    // newline that still needs line accounting.
                    let mut seen: usize = 0;
                    while let Some((_, '=')) = self.input.peek() {
                        self.input.next();
                        seen += 1;
                    }

                    if seen == level && matches!(self.input.peek(), Some((_, ']'))) {
                        self.input.next();
                        break;
                    }

                    // Not the closing bracket: everything read so far is string content.
                    string.push(']');
                    string.extend(std::iter::repeat('=').take(seen));
                }
                '\n' => {
                    self.line_inc(pos);
                    if let Some((_, '\r')) = self.input.peek() {
                        self.input.next();
                    }

                    // Only the line break right after the opening bracket is skipped.
                    if !leading {
                        string.push('\n');
                    }
                }
                '\r' => {
                    if let Some(&(next_pos, '\n')) = self.input.peek() {
                        self.input.next();
                        self.line_inc(next_pos);
                    }

                    if !leading {
                        string.push('\n');
                    }
                }
                other => string.push(other),
            }
        }

        Token::String(string)
    }

    /// Reads a numeric literal whose first character, `digit`, was already consumed.
    ///
    /// `digit` is a decimal digit, or `.` for literals such as `.5`. Decimal and
    /// `0x`/`0X` hexadecimal literals are supported, each with an optional fractional
    /// part and an optional exponent (`e`/`E` for decimal, `p`/`P` for hexadecimal).
    ///
    /// A literal with a radix point or an exponent yields `Token::Float`. Otherwise a
    /// hexadecimal literal always yields `Token::Integer`, wrapping around modulo 2^64
    /// when it overflows, and a decimal literal yields `Token::Integer` when it fits in
    /// an `i64` and `Token::Float` when it does not.
    ///
    /// # Panics
    ///
    /// Panics on malformed literals: an `x` after a first digit other than `0`, a second
    /// radix point, an exponent marker that does not match the radix, a hexadecimal
    /// prefix without digits, or text that the float parsers reject.
    fn consume_numeric(&mut self, digit: char) -> Token {
        let mut hex = false;
        let mut int = digit != '.';
        let mut digits = String::new();
        digits.push(digit);

        // A hexadecimal prefix is only valid directly after a leading `0`.
        if let Some(&(_, c)) = self.input.peek() {
            if matches!(c, 'X' | 'x') {
                self.input.next();
                if digit == '0' {
                    hex = true;
                    digits.clear();
                } else {
                    panic!();
                }
            }
        }

        while let Some((_, c)) = self.input.peek().cloned() {
            match c {
                // Hex digits are tested first, so `e`/`E` inside a hexadecimal literal is
                // a digit rather than an exponent marker.
                c if (hex && is_hex_digit_char(c)) || (!hex && is_numeric_char(c)) => {
                    self.input.next();
                    digits.push(c);
                }
                '.' => {
                    if !int {
                        panic!()
                    }
                    self.input.next();
                    digits.push(c);
                    int = false;
                }
                'E' | 'e' | 'P' | 'p' => {
                    // `p` introduces the exponent of hexadecimal literals, `e` that of
                    // decimal ones; any other combination is malformed.
                    let binary_exponent = matches!(c, 'P' | 'p');
                    if binary_exponent != hex {
                        panic!()
                    }
                    self.input.next();
                    int = false;
                    digits.push(c);
                    self.consume_exponent(&mut digits);
                    break;
                }
                _ => break,
            }
        }

        if !int {
            let value: f64 = if hex {
                parse_f64_hex(&format!("0x{}", digits)).unwrap()
            } else {
                digits.parse().unwrap()
            };
            return Token::Float(value);
        }

        if hex {
            if digits.is_empty() {
                // `0x` with nothing after it.
                panic!()
            }
            // Hexadecimal integers wrap around instead of overflowing.
            let value = digits.chars().fold(0u64, |acc, d| {
                let nibble = d.to_digit(16).expect("only hex digits are collected");
                acc.wrapping_mul(16).wrapping_add(u64::from(nibble))
            });
            // Reinterpreting the bits is intended: `0xffffffffffffffff` denotes -1.
            Token::Integer(value as i64)
        } else {
            // A decimal integer that does not fit in an `i64` becomes a float.
            match digits.parse::<i64>() {
                Ok(value) => Token::Integer(value),
                Err(..) => Token::Float(digits.parse().unwrap()),
            }
        }
    }

    /// Reads an optional sign and the decimal digits of an exponent into `digits`.
    ///
    /// A `-` sign is recorded; a `+` sign is consumed but not recorded.
    fn consume_exponent(&mut self, digits: &mut String) {
        match self.input.peek().cloned() {
            Some((_, '-')) => {
                self.input.next();
                digits.push('-');
            }
            Some((_, '+')) => {
                self.input.next();
            }
            _ => {}
        }

        while let Some(&(_, c)) = self.input.peek() {
            if !is_numeric_char(c) {
                break;
            }
            self.input.next();
            digits.push(c);
        }
    }

    /// Skips a comment whose leading `--` was already consumed.
    ///
    /// A comment starting with a long opening bracket (`[[`, `[=[`, ...) runs up to the
    /// closing bracket with the same number of `=` signs, or to the end of input if there
    /// is none. Any other comment runs up to and including the next `\n`. Every `\n`
    /// passed bumps the line counter.
    fn consume_comment(&mut self) {
        // Detect a long opening bracket using lookahead only, so that nothing is consumed
        // that does not belong to the bracket. In particular the `\n` ending an empty
        // `--` comment must stay available for the line-comment branch below.
        let mut long = None;
        if let Some((_, '[')) = self.input.peek() {
            self.input.next();

            let mut level: usize = 0;
            while let Some((_, '=')) = self.input.peek() {
                self.input.next();
                level += 1;
            }
            if let Some((_, '[')) = self.input.peek() {
                self.input.next();
                long = Some(level);
            }
        }

        let level = match long {
            Some(level) => level,
            None => {
                // Line comment: drop everything through the end of the line.
                if let Some((pos, _)) = self.input.find(|&(_, c)| c == '\n') {
                    self.line_inc(pos);
                }
                return;
            }
        };

        // False positive, iterator is also used on inside
        #[allow(clippy::while_let_on_iterator)]
        while let Some((pos, c)) = self.input.next() {
            match c {
                '\n' => self.line_inc(pos),
                ']' => {
                    // Only `=` signs are consumed while checking for the closing bracket,
                    // so a failed candidate leaves a following `]` or `\n` to be examined
                    // by the next iteration.
                    let mut seen: usize = 0;
                    while let Some((_, '=')) = self.input.peek() {
                        self.input.next();
                        seen += 1;
                    }

                    if seen == level && matches!(self.input.peek(), Some((_, ']'))) {
                        self.input.next();
                        break;
                    }
                }
                _ => {}
            }
        }
    }
}

/// Iterating a `Tokenizer` yields its tokens one by one until the input is exhausted.
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Spanned<Token>;

    /// Returns the result of `read_token`, so it panics wherever that method does.
    fn next(&mut self) -> Option<Self::Item> {
        self.read_token()
    }
}

/// Returns `true` for a space and for the control characters from tab (`\t`) through
/// carriage return (`\r`), which includes line feed, vertical tab and form feed.
#[inline]
fn is_whitespace_char(chr: char) -> bool {
    chr == ' ' || ('\t'..='\r').contains(&chr)
}

/// Returns `true` if `chr` may start a name: an ASCII letter or an underscore.
#[inline]
fn is_start_name_char(chr: char) -> bool {
    chr == '_' || chr.is_ascii_alphabetic()
}

/// Returns `true` if `chr` may appear inside a name: an ASCII letter, an ASCII digit or
/// an underscore.
#[inline]
fn is_name_char(chr: char) -> bool {
    chr == '_' || chr.is_ascii_alphanumeric()
}

/// Returns `true` if `chr` is one of the ASCII digits `0` through `9`.
#[inline]
fn is_numeric_char(chr: char) -> bool {
    chr.is_ascii_digit()
}

/// Returns `true` if `chr` is an ASCII hexadecimal digit: `0`-`9`, `a`-`f` or `A`-`F`.
#[inline]
fn is_hex_digit_char(chr: char) -> bool {
    chr.is_ascii_hexdigit()
}