lex_lua 0.1.8

A lexer for Lua
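
A short usage sketch (hypothetical consumer code: it assumes the crate is
depended on as lex_lua and uses only the public items defined in the source
below):

// Hypothetical consumer of the crate whose source follows.
use lex_lua::{Lexer, SpannedLexer};

fn main() {
    let src = b"local x = 1 -- a comment";

    // Plain tokens via Lexer's Iterator impl.
    for tok in Lexer::new(src) {
        println!("{:?}", tok);
    }

    // The same tokens, each paired with its byte Span.
    for item in SpannedLexer::new(src) {
        println!("{:?} @ {}..{}", item.token, item.span.start, item.span.end);
    }
}
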
use std::{borrow::Cow, iter::Peekable};

use bstr::{BStr, ByteSlice, Chars};

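/// A streaming lexer over raw Lua source bytes; pull `Token`s with `next_token`
/// or through the `Iterator` impl.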
pub struct Lexer<'a> {
    buffer: Peekable<Chars<'a>>,
    original: &'a BStr,
    pos: usize,
}

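/// Wraps a `Lexer` and yields `Item`s: each token paired with its byte `Span`
/// in the original input.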
pub struct SpannedLexer<'a> {
    inner: Lexer<'a>,
}

impl<'a> SpannedLexer<'a> {
    pub fn new(s: &'a [u8]) -> Self {
        Self {
            inner: Lexer::new(s),
        }
    }
}

impl<'a> Lexer<'a> {
    pub fn new(s: &'a [u8]) -> Self {
        let bs: &BStr = s.into();
        Self {
            buffer: bs.chars().peekable(),
            original: bs,
            pos: 0,
        }
    }

    pub fn current_pos(&self) -> usize {
        self.pos
    }

    fn next_char(&mut self) -> Option<char> {
        let c = self.buffer.next()?;
        if c == '\u{FFFD}' {
            // `bstr` substitutes U+FFFD for invalid UTF-8; treat it as a single
            // byte of input when advancing `pos`.
            self.pos += 1;
        } else {
            self.pos += c.len_utf8();
        }
        Some(c)
    }

    pub fn next_token(&mut self) -> Option<Token<'a>> {
        // A '#' is only a shebang-style comment at the very start of the input.
        if self.pos == 0 && self.eat('#') {
            return Some(self.comment(1));
        }
        self.skip_whitespace();
        let next = self.next_char()?;
        if next.is_digit(10) {
            return Some(self.numeral(next));
        }
        if next == '"' || next == '\'' {
            return Some(self.literal_string(next));
        }

        if next.is_ascii_alphabetic() || next == '_' {
            return Some(self.name(next));
        }

        Some(self.punct(next))
    }

    fn next_spanned(&mut self) -> Option<Item<'a>> {
        if self.pos == 0 && self.eat('#') {
            let token = self.comment(1);
            return Some(Item::new(token, 0, self.pos));
        }
        self.skip_whitespace();
        let start = self.pos;
        let next = self.next_char()?;
        if next.is_digit(10) {
            let token = self.numeral(next);
            return Some(Item::new(token, start, self.pos));
        }
        if next == '"' || next == '\'' {
            let token = self.literal_string(next);
            return Some(Item::new(token, start, self.pos));
        }

        if next.is_ascii_alphabetic() || next == '_' {
            let token = self.name(next);
            return Some(Item::new(token, start, self.pos));
        }
        let token = self.punct(next);
        Some(Item::new(token, start, self.pos))
    }

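    /// Scans a numeral; `c` is the first, already-consumed character (possibly
    /// `.` when called from `punct`). Hex literals (`0x`/`0X`) accept hex digits,
    /// an optional fraction, and an exponent; decimal literals an optional
    /// fraction and an `e` exponent.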
    fn numeral(&mut self, c: char) -> Token<'a> {
        let start = self.pos - 1;
        if c == '0' && self.eat_ascii_ignore_case('x') {
            while self.at_digit(16) {
                let _ = self.next_char();
            }
            if self.eat('.') {
                while self.at_digit(16) {
                    let _ = self.next_char();
                }
            }
            if self.eat_ascii_ignore_case('e') {
                let _ = self.eat('-') || self.eat('+');
                while self.at_digit(16) {
                    let _ = self.next_char();
                }
            }
            if self.eat_ascii_ignore_case('p') {
                let _ = self.eat('-') || self.eat('+');
                while self.at_digit(16) {
                    let _ = self.next_char();
                }
            }
        } else {
            while self.at_digit(10) {
                let _ = self.next_char();
            }
            if c != '.' && self.eat('.') {
                while self.at_digit(10) {
                    let _ = self.next_char();
                }
            }
            if self.eat_ascii_ignore_case('e') {
                let _ = self.eat('-') || self.eat('+');
                while self.at_digit(10) {
                    let _ = self.next_char();
                }
            }
        }
        Token::numeral(&self.original[start..self.pos])
    }

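    /// Scans a string literal. `c` is the already-consumed opening character:
    /// `'"'` or `'\''` for short strings, `'['` or `'='` when `punct` detected
    /// the start of a long-bracket string.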
    fn literal_string(&mut self, c: char) -> Token<'a> {
        let start = self.pos - 1;

        if c == '=' || c == '[' {
            let mut eq_ct = 0;
            while self.eat('=') {
                eq_ct += 1;
            }
            self.seek_long_string_end(eq_ct);
        } else if c == '"' || c == '\'' {
            let mut escaped = false;
            while !self.at(c) || escaped {
                if let Some(ch) = self.next_char() {
                    escaped = ch == '\\' && !escaped;
                } else {
                    return Token::Unknown(Cow::Borrowed(&self.original[start..self.pos]));
                }
            }
            self.eat(c);
        }
        Token::LiteralString(Cow::Borrowed(&self.original[start..self.pos]))
    }

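    /// Consumes input up to and including the closing long bracket
    /// (`]`, `eq_ct` equals signs, `]`), or to the end of input if the bracket
    /// is never closed.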
    fn seek_long_string_end(&mut self, eq_ct: usize) {
        'retry: loop {
            while !self.eat(']') && !self.at_end() {
                let _ = self.next_char();
            }
            // Unterminated long bracket: stop at end of input instead of
            // looping forever.
            if self.at_end() {
                return;
            }
            let mut found_eq = 0;
            while found_eq < eq_ct {
                if self.eat('=') {
                    found_eq += 1;
                } else {
                    continue 'retry;
                }
            }
            if self.eat(']') {
                return;
            }
        }
    }

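    /// Scans an identifier starting with `c`. The nested `eat` chains form a
    /// hand-rolled trie over Lua's reserved words; anything else falls through
    /// to a plain `Name` token.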
    fn name(&mut self, c: char) -> Token<'a> {
        let start = self.pos - 1;
        match c {
            'a' => {
                if self.eat('n') {
                    if self.eat('d') {
                        if !self.at_name_cont() {
                            return Token::Keyword(Keyword::And);
                        }
                    }
                }
            }
            'b' => {
                if self.eat('r') {
                    if self.eat('e') {
                        if self.eat('a') {
                            if self.eat('k') {
                                if !self.at_name_cont() {
                                    return Token::Keyword(Keyword::Break);
                                }
                            }
                        }
                    }
                }
            }
            'd' => {
                if self.eat('o') {
                    if !self.at_name_cont() {
                        return Token::Keyword(Keyword::Do);
                    }
                }
            }
            'e' => {
                if self.eat('l') {
                    if self.eat('s') {
                        if self.eat('e') {
                            if !self.at_name_cont() {
                                return Token::Keyword(Keyword::Else);
                            } else if self.eat('i') {
                                if self.eat('f') {
                                    if !self.at_name_cont() {
                                        return Token::Keyword(Keyword::ElseIf);
                                    }
                                }
                            }
                        }
                    }
                } else if self.eat('n') {
                    if self.eat('d') {
                        if !self.at_name_cont() {
                            return Token::Keyword(Keyword::End);
                        }
                    }
                }
            }
            'f' => {
                if self.eat('a') {
                    if self.eat('l') {
                        if self.eat('s') {
                            if self.eat('e') {
                                if !self.at_name_cont() {
                                    return Token::Keyword(Keyword::False);
                                }
                            }
                        }
                    }
                } else if self.eat('o') {
                    if self.eat('r') {
                        if !self.at_name_cont() {
                            return Token::Keyword(Keyword::For);
                        }
                    }
                } else if self.eat('u') {
                    if self.eat('n') {
                        if self.eat('c') {
                            if self.eat('t') {
                                if self.eat('i') {
                                    if self.eat('o') {
                                        if self.eat('n') {
                                            if !self.at_name_cont() {
                                                return Token::Keyword(Keyword::Function);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            'g' => {
                if self.eat('o') {
                    if self.eat('t') {
                        if self.eat('o') {
                            if !self.at_name_cont() {
                                return Token::Keyword(Keyword::GoTo);
                            }
                        }
                    }
                }
            }
            'i' => {
                if self.eat('n') {
                    if !self.at_name_cont() {
                        return Token::Keyword(Keyword::In);
                    }
                } else if self.eat('f') {
                    if !self.at_name_cont() {
                        return Token::Keyword(Keyword::If);
                    }
                }
            }
            'l' => {
                if self.eat('o') {
                    if self.eat('c') {
                        if self.eat('a') {
                            if self.eat('l') {
                                if !self.at_name_cont() {
                                    return Token::Keyword(Keyword::Local);
                                }
                            }
                        }
                    }
                }
            }
            'n' => {
                if self.eat('i') {
                    if self.eat('l') {
                        if !self.at_name_cont() {
                            return Token::Keyword(Keyword::Nil);
                        }
                    }
                } else if self.eat('o') {
                    if self.eat('t') {
                        if !self.at_name_cont() {
                            return Token::Keyword(Keyword::Not);
                        }
                    }
                }
            }
            'o' => {
                if self.eat('r') {
                    if !self.at_name_cont() {
                        return Token::Keyword(Keyword::Or);
                    }
                }
            }
            'r' => {
                if self.eat('e') {
                    if self.eat('p') {
                        if self.eat('e') {
                            if self.eat('a') {
                                if self.eat('t') {
                                    if !self.at_name_cont() {
                                        return Token::Keyword(Keyword::Repeat);
                                    }
                                }
                            }
                        }
                    } else if self.eat('t') {
                        if self.eat('u') {
                            if self.eat('r') {
                                if self.eat('n') {
                                    if !self.at_name_cont() {
                                        return Token::Keyword(Keyword::Return);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            't' => {
                if self.eat('h') {
                    if self.eat('e') {
                        if self.eat('n') {
                            if !self.at_name_cont() {
                                return Token::Keyword(Keyword::Then);
                            }
                        }
                    }
                } else if self.eat('r') {
                    if self.eat('u') {
                        if self.eat('e') {
                            if !self.at_name_cont() {
                                return Token::Keyword(Keyword::True);
                            }
                        }
                    }
                }
            }
            'u' => {
                if self.eat('n') {
                    if self.eat('t') {
                        if self.eat('i') {
                            if self.eat('l') {
                                if !self.at_name_cont() {
                                    return Token::Keyword(Keyword::Until);
                                }
                            }
                        }
                    }
                }
            }
            'w' => {
                if self.eat('h') {
                    if self.eat('i') {
                        if self.eat('l') {
                            if self.eat('e') {
                                if !self.at_name_cont() {
                                    return Token::Keyword(Keyword::While);
                                }
                            }
                        }
                    }
                }
            }
            _ => {
                if !c.is_ascii_alphanumeric() && c != '_' {
                    return Token::Unknown(Cow::Borrowed(&self.original[start..self.pos]));
                }
            }
        }
        while self.at_name_cont() {
            let _ = self.next_char();
        }
        Token::name(&self.original[start..self.pos])
    }

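    /// Maps an operator or delimiter to a `Punct`, peeking ahead for the
    /// multi-character forms, and re-dispatches to comments (`--`), numerals
    /// (`.5`), and long-bracket strings (`[[`, `[=`).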
    fn punct(&mut self, c: char) -> Token<'a> {
        let p = match c {
            '+' => Punct::Plus,
            '-' => {
                if self.eat('-') {
                    return self.comment(2);
                } else {
                    Punct::Minus
                }
            }
            '*' => Punct::Asterisk,
            '%' => Punct::Percent,
            '^' => Punct::Caret,
            '#' => Punct::Hash,
            '&' => Punct::Ampersand,
            '~' => {
                if self.eat('=') {
                    Punct::TildeEqual
                } else {
                    Punct::Tilde
                }
            }
            '|' => Punct::Pipe,
            '(' => Punct::OpenParen,
            ')' => Punct::CloseParen,
            '{' => Punct::OpenBrace,
            '}' => Punct::CloseBrace,
            ';' => Punct::SemiColon,
            ',' => Punct::Comma,
            ']' => Punct::CloseBracket,
            '.' => {
                if self.at_digit(10) {
                    return self.numeral('.');
                } else if self.eat('.') {
                    if self.eat('.') {
                        Punct::Ellipsis
                    } else {
                        Punct::DoubleDot
                    }
                } else {
                    Punct::Dot
                }
            }
            ':' => {
                if self.eat(':') {
                    Punct::DoubleColon
                } else {
                    Punct::Colon
                }
            }
            '/' => {
                if self.eat('/') {
                    Punct::DoubleForwardSlash
                } else {
                    Punct::ForwardSlash
                }
            }
            '=' => {
                if self.eat('=') {
                    Punct::DoubleEqual
                } else {
                    Punct::Equal
                }
            }
            '<' => {
                if self.eat('<') {
                    Punct::DoubleLessThan
                } else if self.eat('=') {
                    Punct::LessThanEqual
                } else {
                    Punct::LessThan
                }
            }
            '>' => {
                if self.eat('>') {
                    Punct::DoubleGreaterThan
                } else if self.eat('=') {
                    Punct::GreaterThanEqual
                } else {
                    Punct::GreaterThan
                }
            }
            '[' => {
                if self.at('[') {
                    return self.literal_string('[');
                } else if self.at('=') {
                    return self.literal_string('=');
                } else {
                    Punct::OpenBracket
                }
            }
            _ => return Token::Unknown(Cow::Borrowed(&self.original[self.pos - 1..self.pos])),
        };
        Token::Punct(p)
    }

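    /// Consumes a comment whose marker (`--` or a leading `#`) has already been
    /// read; `start_len` is the marker's byte length. Long-bracket comments are
    /// delegated to `seek_long_string_end`.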
    fn comment(&mut self, start_len: usize) -> Token<'a> {
        let start = self.pos - start_len;
        if self.eat('[') {
            let eq_ct = self.consume_long_comment_sep();
            if self.eat('[') {
                self.seek_long_string_end(eq_ct);
                return Token::comment(&self.original[start..self.pos]);
            }
        }
        while !self.at('\n') && !self.at_end() {
            let _ = self.next_char();
        }
        Token::comment(&self.original[start..self.pos])
    }

    fn consume_long_comment_sep(&mut self) -> usize {
        let mut ret = 0;
        while self.eat('=') {
            ret += 1;
        }
        ret
    }

    fn at(&mut self, c: char) -> bool {
        if let Some(ch) = self.buffer.peek() {
            *ch == c
        } else {
            false
        }
    }

    fn eat(&mut self, c: char) -> bool {
        if let Some(ch) = self.buffer.peek() {
            if *ch == c {
                let _ = self.next_char();
                return true;
            }
        }
        false
    }

    fn eat_ascii_ignore_case(&mut self, c: char) -> bool {
        if let Some(ch) = self.buffer.peek() {
            if ch.eq_ignore_ascii_case(&c) {
                let _ = self.next_char();
                return true;
            }
        }
        false
    }

    fn skip_whitespace(&mut self) {
        while let Some(ch) = self.buffer.peek() {
            if ch.is_ascii_whitespace() {
                let _ = self.next_char();
            } else {
                break;
            }
        }
    }

    fn at_name_cont(&mut self) -> bool {
        if let Some(ch) = self.buffer.peek() {
            ch.is_ascii_alphanumeric() || *ch == '_'
        } else {
            false
        }
    }

    fn at_digit(&mut self, radix: u32) -> bool {
        if let Some(ch) = self.buffer.peek() {
            ch.is_digit(radix)
        } else {
            false
        }
    }

    fn at_end(&mut self) -> bool {
        self.buffer.peek().is_none()
    }
}

impl<'a> std::iter::Iterator for Lexer<'a> {
    type Item = Token<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}

impl<'a> std::iter::Iterator for SpannedLexer<'a> {
    type Item = Item<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next_spanned()
    }
}

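/// A `Token` paired with the `Span` it was read from.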
#[derive(Debug, Clone)]
pub struct Item<'a> {
    pub token: Token<'a>,
    pub span: Span,
}

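/// Byte offsets of a token in the original input; `start` is inclusive,
/// `end` exclusive.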
#[derive(Debug, Clone, Copy)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl<'a> Item<'a> {
    pub fn new(token: Token<'a>, start: usize, end: usize) -> Self {
        Self {
            token,
            span: Span { start, end },
        }
    }
}

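/// A single lexical element. String-carrying variants borrow from the input
/// where possible (lossy UTF-8 conversion may allocate for `Name` and
/// `Numeral`).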
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Token<'a> {
    Name(Cow<'a, str>),
    Numeral(Cow<'a, str>),
    LiteralString(Cow<'a, BStr>),
    Punct(Punct),
    Keyword(Keyword),
    Comment(Cow<'a, BStr>),
    Unknown(Cow<'a, BStr>),
}

impl<'a> Token<'a> {
    pub fn name(s: &'a BStr) -> Self {
        Self::Name(s.to_str_lossy())
    }
    pub fn numeral(s: &'a BStr) -> Self {
        Self::Numeral(s.to_str_lossy())
    }
    pub fn literal_string(s: &'a BStr) -> Self {
        Self::LiteralString(Cow::Borrowed(s))
    }
    pub fn comment(s: &'a BStr) -> Self {
        Self::Comment(Cow::Borrowed(s))
    }
}

#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub enum Punct {
    Ampersand,
    Asterisk,
    Caret,
    CloseBrace,
    CloseBracket,
    CloseParen,
    Colon,
    Comma,
    Dot,
    DoubleColon,
    DoubleDot,
    DoubleEqual,
    DoubleForwardSlash,
    DoubleGreaterThan,
    DoubleLessThan,
    Ellipsis,
    Equal,
    ForwardSlash,
    GreaterThan,
    GreaterThanEqual,
    Hash,
    LessThan,
    LessThanEqual,
    Minus,
    OpenBrace,
    OpenBracket,
    OpenParen,
    Percent,
    Pipe,
    Plus,
    SemiColon,
    Tilde,
    TildeEqual,
}

impl std::str::FromStr for Punct {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let p = match s {
            "&" => Punct::Ampersand,
            "*" => Punct::Asterisk,
            "^" => Punct::Caret,
            "}" => Punct::CloseBrace,
            "]" => Punct::CloseBracket,
            ")" => Punct::CloseParen,
            ":" => Punct::Colon,
            "," => Punct::Comma,
            "." => Punct::Dot,
            "::" => Punct::DoubleColon,
            ".." => Punct::DoubleDot,
            "==" => Punct::DoubleEqual,
            "//" => Punct::DoubleForwardSlash,
            ">>" => Punct::DoubleGreaterThan,
            "<<" => Punct::DoubleLessThan,
            "..." => Punct::Ellipsis,
            "=" => Punct::Equal,
            "/" => Punct::ForwardSlash,
            ">" => Punct::GreaterThan,
            ">=" => Punct::GreaterThanEqual,
            "#" => Punct::Hash,
            "<" => Punct::LessThan,
            "<=" => Punct::LessThanEqual,
            "-" => Punct::Minus,
            "{" => Punct::OpenBrace,
            "[" => Punct::OpenBracket,
            "(" => Punct::OpenParen,
            "%" => Punct::Percent,
            "|" => Punct::Pipe,
            "+" => Punct::Plus,
            ";" => Punct::SemiColon,
            "~" => Punct::Tilde,
            "~=" => Punct::TildeEqual,
            _ => return Err(format!("Invalid punct: {:?}", s)),
        };
        Ok(p)
    }
}

#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub enum Keyword {
    And,
    Or,
    Function,
    Local,
    For,
    Do,
    End,
    Nil,
    True,
    False,
    Not,
    Return,
    In,
    While,
    GoTo,
    Break,
    Repeat,
    Then,
    ElseIf,
    Else,
    Until,
    If,
}

impl Keyword {
    pub fn as_str(&self) -> &str {
        match self {
            Self::And => "and",
            Self::Or => "or",
            Self::Function => "function",
            Self::Local => "local",
            Self::For => "for",
            Self::Do => "do",
            Self::End => "end",
            Self::Nil => "nil",
            Self::True => "true",
            Self::False => "false",
            Self::Not => "not",
            Self::Return => "return",
            Self::In => "in",
            Self::While => "while",
            Self::GoTo => "goto",
            Self::Break => "break",
            Self::Repeat => "repeat",
            Self::Then => "then",
            Self::ElseIf => "elseif",
            Self::Else => "else",
            Self::Until => "until",
            Self::If => "if",
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn numerals() {
        let strings = &[
            "3",
            "345",
            "0xff",
            "0xBEBADA",
            "3.0",
            "3.1416",
            "314.16e-2",
            "0.31416E1",
            "34e1",
            "0x0.1E",
            "0xA23p-4",
            "0X1.921FB54442D18P+1",
            ".01",
        ];
        for &string in strings {
            let mut t = Lexer::new(string.as_bytes());
            assert_eq!(
                t.next_token().unwrap(),
                Token::Numeral(Cow::Borrowed(string.into()))
            )
        }
    }

    #[test]
    fn puncts() {
        let strings = &[
            "+", "-", "*", "/", "//", "^", "%", "&", "~", "|", ">>", "<<", ".", "..", "...", "=",
            "<", "<=", ">", ">=", "==", "~=", "#", "~", "(", ")", "[", "]", "{", "}", ",", ":",
            "::", ";",
        ];
        for &string in strings {
            println!("testing {:?}", string);
            // Prefix with a newline so that '#' (the length operator) at
            // position 0 is not lexed as a shebang comment
            let prefixed = format!("\n{}", string);
            let mut t = Lexer::new(prefixed.as_bytes());
            assert_eq!(
                t.next_token().unwrap(),
                Token::Punct(string.parse().unwrap())
            )
        }
        let mut t = Lexer::new(b"$");
        assert_eq!(
            t.next_token().unwrap(),
            Token::Unknown(Cow::Borrowed("$".into())),
        )
    }

    #[test]
    fn keywords() {
        let keywords = &[
            Keyword::And,
            Keyword::Or,
            Keyword::Function,
            Keyword::Local,
            Keyword::For,
            Keyword::Do,
            Keyword::End,
            Keyword::Nil,
            Keyword::True,
            Keyword::False,
            Keyword::Not,
            Keyword::Return,
            Keyword::In,
            Keyword::While,
            Keyword::GoTo,
            Keyword::Break,
            Keyword::Repeat,
            Keyword::Then,
            Keyword::ElseIf,
            Keyword::Else,
            Keyword::Until,
            Keyword::If,
        ];
        for &keyword in keywords {
            println!("testing {:?}", keyword.as_str());
            let mut t = Lexer::new(keyword.as_str().as_bytes());
            assert_eq!(
                t.next_token().unwrap(),
                Token::Keyword(keyword)
            )
        }
        let mut t = Lexer::new(b"$");
        assert_eq!(
            t.next_token().unwrap(),
            Token::Unknown(Cow::Borrowed("$".into())),
        )
    }

    #[test]
    fn literal_string() {
        let strings = &[
            r#"'alo\n123"'"#,
            r#""alo\n123\"""#,
            r#"[[alo
123"]]"#,
            r#"[==[
alo
123"]==]"#,
            r#"[==[
alo
123"]=]==]"#,
        ];
        for (i, &s) in strings.iter().enumerate() {
            println!("{} testing {:?}", i, s);
            let mut t = Lexer::new(s.as_bytes());
            assert_eq!(
                t.next_token().unwrap(),
                Token::LiteralString(Cow::Borrowed(s.into()))
            );
        }
    }

    #[test]
    fn comments() {
        let strings = &[
            "#!hashbang",
            "--single line comment",
            "--[[multi
            line comment]]",
            "--[==[
                [[multi with seps]]]]]]]
            ]==]",
        ];
        for &s in strings {
            let mut t = Lexer::new(s.as_bytes());
            assert_eq!(
                t.next_token().unwrap(),
                Token::Comment(Cow::Borrowed(s.into()))
            )
        }
    }

    #[test]
    fn strange_comments_interspersed() {
        let lua = "print(--[[ hello ]]'hello world'--[[world]])";
        let expected = &[
            Token::Name(Cow::Borrowed("print")),
            Token::Punct(Punct::OpenParen),
            Token::Comment(Cow::Borrowed("--[[ hello ]]".into())),
            Token::literal_string("'hello world'".into()),
            Token::Comment(Cow::Borrowed("--[[world]]".into())),
            Token::Punct(Punct::CloseParen),
        ];
        let mut t = Lexer::new(lua.as_bytes());
        for tok in expected {
            assert_eq!(
                t.next_token().unwrap(),
                *tok
            )
        }
    }
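
    // Sanity check for SpannedLexer: spans are byte offsets into the input,
    // with an inclusive start and exclusive end.
    #[test]
    fn spanned_lexer_offsets() {
        let mut t = SpannedLexer::new(b"local x");
        let first = t.next().unwrap();
        assert_eq!(first.token, Token::Keyword(Keyword::Local));
        assert_eq!((first.span.start, first.span.end), (0, 5));
        let second = t.next().unwrap();
        assert_eq!(second.token, Token::name("x".into()));
        assert_eq!((second.span.start, second.span.end), (6, 7));
    }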
}