//! reic 0.1.0
//!
//! A compiler that just works.
//! Documentation
use derive_new::new;
use logos::{Logos, Skip};
use std::{ops::Range, slice::Iter};

// ---------------
// LEXING
// ---------------

/// A token in rei
#[derive(Logos, Debug, PartialEq, Clone, Copy)]
pub enum Token {
    #[regex("//.*")]
    SinglelineComment,
    // mainly for doc comments, e.g. at the top of the file
    #[regex("#.*")]
    HashComment,
    #[regex("/[\\*]([^\\*]|([\\*][^/]))*[\\*]+/")]
    MultilineComment,
    #[regex("#[\\*]([^\\*]|([\\*][^/]))*[\\*]+#")]
    MultilineHashComment,
    // for block scopes for notebooks and scripts
    #[token("%%")]
    ScriptBlock,

    #[token("mod")]
    Module,
    // Not defined in the standard, but pasm
    // #[token("namespace")]
    // Namespace,
    #[token("internal")]
    Internal,
    #[token("pub")]
    Pub,
    #[token("export")]
    Export,
    #[token("use")]
    Use,
    #[token("data")]
    Data,
    #[token("class")]
    Class,
    #[token("fn")]
    Function,
    #[token("enum")]
    Enum,
    #[token("self")]
    SelfKeyword,
    #[token("super")]
    Super,
    #[token("outer")]
    Outer,
    // I was going to make this a core::function
    // #[token("macro")]
    // Macro,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("static")]
    Static,
    #[token("new")]
    New,
    #[token("unsafe")]
    Unsafe,

    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("return")]
    Return,
    #[token("break")]
    Break,
    #[token("while")]
    While,
    #[token("for")]
    For,
    // no such thing as 'default'. Just match each possible case either T or None for enums
    #[token("match")]
    Match,
    #[token("continue")]
    Continue,
    #[token("loop")]
    Loop,
    #[token("yield")]
    Yield,

    // Conditions
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("and")]
    And,
    #[token("or")]
    Or,
    #[token("not")]
    Not,
    #[token("is")]
    Is,
    #[token("as")]
    As,
    #[token("in")]
    In,

    // @
    #[token("@")]
    At,
    // Usually in annotations and lists/tuples
    #[token(",")]
    Comma,

    // Parentheses can be overloaded like C++ classes
    #[token("(")]
    ParenLeft,
    #[token(")")]
    ParenRight,
    // Context sensitive, signifies scoping
    #[token("{")]
    CurlyBraceLeft,
    #[token("}")]
    CurlyBraceRight,

    // Compiler directive on f-strings
    #[token("$")]
    DollarSign,

    // SPECIAL
    #[token("->")]
    FnReturn,
    #[token("=>")]
    ArrowFn,
    /// Variadic args, generics or macros
    #[token("...")]
    TripleDot,

    // OVERLOADABLE or SPECIFIC
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("/")]
    LeftSlash,
    #[token("*")]
    Star,
    // LArrow Ident RArrow = <id> which means Option<id>
    #[token("<")]
    LeftArrow,
    // RArrow in an annotation means "apply this next" std::/core:: does not have an order
    #[token(">")]
    RightArrow,
    #[token("=")]
    Equals,
    #[token("==")]
    Identity,
    #[token("!")]
    Exclamation,
    #[token("?")]
    Question,
    #[token(".")]
    Dot,
    // mainly for range based loops
    // Parser: range_expr or range_op
    #[token("..")]
    DoubleDot,
    #[token(":")]
    Colon,
    // usually module qualification
    // Parser: scan the inner namespace for that specific mod Identifier's existence
    // ? Hmm, could maybe use a callback to form an identifier instead
    #[token("::")]
    DoubleColon,
    #[token("'")]
    SingleQuote,
    #[token("\"")]
    DoubleQuote,
    #[token("^")]
    UpArrow,

    // EOF
    #[regex("\x26")]
    EOF,

    #[regex(r"[_a-zA-Z]\w*")]
    Identifier,
    // #[regex(r"[_a-zA-Z]((\w+)|(::))*\w+", priority = 1)]
    // ModIdentifier,
    #[regex("[-][0-9]+", |lex| lex.slice().parse())]
    Int(i64),
    #[regex("-?[0-9]+\\.[0-9]+", |lex| lex.slice().parse())]
    Float(f64),
    // For ascii printable "strings" (without backslash)
    // I dont think it works for \escaped " quotes. \\ doesnt work
    #[regex("\"(?:[^\"]|\\.)*\"")]
    DoubleQuotedString,
    // For 'strings'
    #[regex("'(?:[^\"]|\\.)*'")]
    SingleQuotedString,
    // For `strings`
    #[regex("`(?:[^\"]|\\.)*`")]
    DashQuotedString,

    // Logos requires one token variant to handle errors, we can also use this variant to define whitespace, or any other matches we wish to skip.
    #[error]
    #[regex(r"[ \t\n\f]+", |_| Skip)]
    Whitespace,
}

/*
Order of precedence (overloadable operators)
Order of precedence matters quite a lot and should be predictable

COMMON
()
::
.
*
/
+
-

*=
/=
+=
-=

**
++
--

COMPARISON
||
&&
<
>
..

LIST
,
|

EQUIVALENCE
==
*/

// NOTE: | means bitwise OR when used with numeric types. On other types, it's free to overload

/// Lexes `file` into a list of tokens paired with their byte spans in the
/// source string.
pub fn tokenise(file: &str) -> Vec<(Token, Range<usize>)> {
    // Build symbol table (DONE WITH LAZY STATIC)
    // let symtab = SymbolTable::default();

    // No `mut` needed: `spanned()` consumes the lexer by value.
    let tokens = Token::lexer(file);

    log::info!("\n====SOURCE====\n{}\n==============", tokens.source());

    // Generating labels for scoped blocks. In the end, you cant use namespaces because ELF doesnt have an idea of what that is. Later just generate random names for all the labels
    // 1. collect all the explicitly labelled blocks and add them to the symtab

    // The Range in each pair records where the token starts and ends in `file`.
    tokens.spanned().collect()
}

/// Convenience wrapper: lex `file` and wrap the resulting token stream in a
/// `Parser` positioned at the first token.
pub fn tokenise_into_parser(file: &str) -> Parser {
    Parser::new(tokenise(file), 0, file.to_string())
}

/// Recursive-descent parser state over a lexed token stream.
///
/// The `new` derive (from `derive_new`) generates
/// `Parser::new(tokens, curr_index, input_string)`.
#[derive(Debug, new)]
pub struct Parser {
    // tokens paired with their byte spans into `input_string`
    tokens: Vec<(Token, Range<usize>)>,
    // index of the next token to consume
    curr_index: usize,
    // original source text, used to slice token lexemes out by span
    input_string: String,
}

impl Parser {
    pub fn next_sym(&mut self) {
        self.curr_index += 1;
    }

    pub fn accept(&mut self, token: Token) -> Result<String, Token> {
        let t = &self.tokens[self.curr_index];

        if token == t.0 {
            let res = Ok(self.input_string[t.1.clone()].to_owned());
            self.next_sym();

            return res;
        }

        Err(t.0)
    }

    /// Simply returns true or false
    pub fn accept_ok(&mut self, token: Token) -> bool {
        self.accept(token).is_ok()
    }

    pub fn expect(&mut self, token: Token) -> String {
        match self.accept(token) {
            Ok(t) => t,
            Err(e) => panic!("Token \"{token:?}\" was not expected... \"{e:?}\" was the actual token"),
        }
    }

    pub fn log_tokens(&self) {
        for token in &self.tokens {
            log::info!("token = {:?}", token.0);
            log::info!(" range = {:?}", token.1);
        }
    }

    pub fn print_tokens(&self) {
        for token in &self.tokens {
            println!("token = {:?}", token.0);
            println!(" range = {:?}", token.1);
        }
    }
}

#[test]
fn test_parser() {
    let example = "
    mod root {
        fn main() {
            output(\"Hello, world!\")
        }
    }
    ";

    let mut parser = tokenise_into_parser(example);
    parser.print_tokens();

    // Try to consume the leading `mod root` tokens, reporting each outcome.
    for expected in [Token::Module, Token::Identifier] {
        match parser.accept(expected) {
            Ok(ok) => println!("token = {ok}"),
            Err(err) => println!("Didnt match, got {err:?} instead"),
        }
    }
}