zoisite 0.1.0

Zoisite is a programming language designed specifically for competitive programming.
Documentation
use rowan::{TextRange, TextSize};
use unscanny::Scanner;

use crate::diagnostic::{Diagnostic, DiagnosticKind};
use crate::syntax_kind::SyntaxKind;
use crate::token::Token;

pub struct Lexer<'a> {
    s: Scanner<'a>,
    tokens: Vec<Token<'a>>,
    current_diagnostic_kind: Option<DiagnosticKind>,
}

impl<'a> Lexer<'a> {
    pub fn new(text: &'a str) -> Self {
        Self {
            s: Scanner::new(text),
            tokens: Vec::new(),
            current_diagnostic_kind: None,
        }
    }

    pub fn tokenize(mut self) -> (Vec<Token<'a>>, Vec<Diagnostic>) {
        let mut diagnostics = Vec::new();
        loop {
            let start = self.s.cursor();
            let kind = self.identify_token(start);
            let end = self.s.cursor();
            let range = TextRange::new(TextSize::new(start as u32), TextSize::new(end as u32));
            let text = self.s.get(range.into());
            self.tokens.push(Token {
                kind,
                text,
                range,
            });
            if let Some(diagnostic_kind) = self.current_diagnostic_kind.take() {
                diagnostics.push(Diagnostic::new(diagnostic_kind, range));
            }
            if kind.is_eof() {
                return (self.tokens, diagnostics);
            }
        }
    }

    fn error(&mut self, diagnostic_kind: DiagnosticKind) -> SyntaxKind {
        self.current_diagnostic_kind = Some(diagnostic_kind);
        SyntaxKind::Error
    }

    fn identify_token(&mut self, start: usize) -> SyntaxKind {
        match self.s.eat() {
            Some(c) if c.is_ascii_whitespace() => self.whitespace(),
            Some(c) if c.is_ascii_digit() => self.number(),
            Some(c) if Self::is_ident_start(c) => self.ident(start),
            Some('=') if self.s.peek() == Some('=') => { self.s.eat(); SyntaxKind::EqEq },
            Some('!') if self.s.peek() == Some('=') => { self.s.eat(); SyntaxKind::Neq },
            Some('>') if self.s.peek() == Some('=') => { self.s.eat(); SyntaxKind::Ge },
            Some('<') if self.s.peek() == Some('=') => { self.s.eat(); SyntaxKind::Le },
            Some('&') if self.s.peek() == Some('&') => { self.s.eat(); SyntaxKind::And },
            Some('|') if self.s.peek() == Some('|') => { self.s.eat(); SyntaxKind::Or },
            Some('>') => SyntaxKind::Gt,
            Some('<') => SyntaxKind::Lt,
            Some('/') if self.s.peek() == Some('/') => self.line_comment(),
            Some('"') => self.string_literal(),
            Some('\'') => self.char_literal(),
            Some(',') => SyntaxKind::Comma,
            Some('.') => SyntaxKind::Dot,
            Some(':') => SyntaxKind::Colon,
            Some(';') => SyntaxKind::Semicolon,
            Some('?') => SyntaxKind::Question,
            Some('=') => SyntaxKind::Equals,
            Some('+') => SyntaxKind::Plus,
            Some('-') => SyntaxKind::Minus,
            Some('*') => SyntaxKind::Star,
            Some('/') => SyntaxKind::Slash,
            Some('%') => SyntaxKind::Percent,
            Some('(') => SyntaxKind::OpenParen,
            Some(')') => SyntaxKind::CloseParen,
            Some('[') => SyntaxKind::OpenBracket,
            Some(']') => SyntaxKind::CloseBracket,
            Some('{') => SyntaxKind::OpenBrace,
            Some('}') => SyntaxKind::CloseBrace,
            None => SyntaxKind::Eof,
            _ => self.error(DiagnosticKind::UnexpectedCharacter),
        }
    }

    fn is_ident_start(c: char) -> bool {
        c.is_ascii_alphabetic() || c == '_'
    }

    fn is_ident_continue(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '_'
    }

    fn whitespace(&mut self) -> SyntaxKind {
        self.s.eat_while(char::is_ascii_whitespace);
        SyntaxKind::Whitespace
    }

    fn line_comment(&mut self) -> SyntaxKind {
        self.s.eat_while(|c| c != '\n');
        SyntaxKind::LineComment
    }

    fn string_literal(&mut self) -> SyntaxKind {
        self.s.eat_while(|c| c != '"' && c != '\n');
        if self.s.eat_if('"') {
            SyntaxKind::String
        } else {
            self.error(DiagnosticKind::UnterminatedStringLiteral)
        }
    }

    fn char_literal(&mut self) -> SyntaxKind {
        let inner_literal = self.s.eat();
        if let Some('\'') = inner_literal {
            self.error(DiagnosticKind::EmptyCharLiteral)
        } else if inner_literal.is_none() || !self.s.eat_if('\'') {
            self.error(DiagnosticKind::UnterminatedCharLiteral)
        } else {
            SyntaxKind::Char
        }
    }

    fn ident(&mut self, start: usize) -> SyntaxKind {
        self.s.eat_while(Self::is_ident_continue);
        let ident = self.s.from(start);
        match ident {
            "let" => SyntaxKind::LetKw,
            "if" => SyntaxKind::IfKw,
            "else" => SyntaxKind::ElseKw,
            "while" => SyntaxKind::WhileKw,
            "break" => SyntaxKind::BreakKw,
            "continue" => SyntaxKind::ContinueKw,
            "fun" => SyntaxKind::FunKw,
            "none" => SyntaxKind::NoneKw,
            "true" => SyntaxKind::TrueKw,
            "false" => SyntaxKind::FalseKw,
            _ => SyntaxKind::Ident,
        }
    }

    fn number(&mut self) -> SyntaxKind {
        self.s.eat_while(char::is_ascii_digit);
        if self.s.eat_if('.') {
            self.s.eat_while(char::is_ascii_digit);
            SyntaxKind::Float
        } else {
            SyntaxKind::Integer
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::diagnostic::Diagnostic;
    use crate::lexer::Lexer;
    use crate::token::Token;

    fn tokenize(text: &str) -> (Vec<Token>, Vec<Diagnostic>) {
        let lexer = Lexer::new(text);
        lexer.tokenize()
    }

    #[test]
    fn number() {
        insta::assert_debug_snapshot!(tokenize("0 23 1.1"));
    }

    #[test]
    fn string() {
        insta::assert_debug_snapshot!(tokenize("\"Zoisite\""));
    }

    #[test]
    fn char() {
        insta::assert_debug_snapshot!(tokenize("\'Z\'"));
    }

    #[test]
    fn operator() {
        insta::assert_debug_snapshot!(tokenize("+ - * / % == != >= <= > < && ||"));
    }

    #[test]
    fn paren() {
        insta::assert_debug_snapshot!(tokenize("( ) { } [ ]"));
    }

    #[test]
    fn ketword() {
        insta::assert_debug_snapshot!(tokenize("let if else while break continue fun true false"));
    }

    #[test]
    fn ident() {
        insta::assert_debug_snapshot!(tokenize("a bc"));
    }
}