// mice 0.11.1 - Docs.rs

//! It's not yet time, but a start nonetheless.

/// Placeholder type: it has no fields yet.
// NOTE(review): presumably meant to describe a source file — confirm intent.
struct File {}

/// Id for a source program fragment.
/// Linked to some data store, so the original source
/// can be fetched from it when necessary.
///
/// The id wraps a plain `u32`, so it is cheap to copy. It derives the
/// comparison and hashing traits so it can serve as a lookup key, and
/// `Debug` so it can appear in diagnostics and test assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct FragmentId(u32);

/// A source program fragment.
/// Valid instances include:
/// - Function definitions
/// - Command definitions
/// - Dice operator declarations
/// - Heterogeneous collections of the above
struct Fragment {
    /// Id of this fragment.
    id: FragmentId,
}

// Lifted wholesale from the lexer of another language I was working on.
//
// String escapes are not handled yet, so this will likely grow more
// complex later. It looks a lot like `line_comment`, but the two are
// expected to diverge, so they are deliberately kept separate.
/// Lexer callback that extends the current lexeme through the next `"`.
///
/// Searches the not-yet-lexed input for a double quote. If one is found,
/// the lexer is advanced past it and `true` is returned. If the input ends
/// first, nothing is consumed and `false` is returned.
fn string_literal(lex: &mut ::logos::Lexer<Token>) -> bool {
    match lex.remainder().iter().position(|&byte| byte == b'"') {
        Some(quote_index) => {
            // Bump by one more than the quote's index so the quote itself
            // becomes part of the lexeme.
            lex.bump(quote_index + 1);
            true
        }
        None => false,
    }
}
// Mostly pasted from the lexer of another language I was working on. :D
/// Lexer callback that extends the current lexeme up to the end of the line.
///
/// Every byte of the remaining input before the first `\n` is consumed; the
/// newline itself is left unlexed. If there is no newline, the rest of the
/// input is consumed. Always returns `true`.
fn line_comment(lex: &mut ::logos::Lexer<Token>) -> bool {
    // TODO: support other line endings
    let comment_len = lex
        .remainder()
        .iter()
        .take_while(|&&byte| byte != b'\n')
        .count();
    lex.bump(comment_len);
    true
}

/// Reports whether the current lexeme is one of the reserved words.
///
/// These words are set aside because we might want to turn them into
/// keywords in the future. The comparison is against the whole lexeme,
/// byte for byte.
// NOTE(review): when used as a logos callback on a unit variant, a `false`
// result is understood to produce the error token rather than fall back to
// another definition — confirm at the call site.
fn reserved_word(lex: &mut ::logos::Lexer<Token>) -> bool {
    matches!(
        lex.slice(),
        b"__I_AM_A_RESERVED_WORD_THAT_NOBODY_WILL_EVER_USE__"
            | b"type"
            | b"prim"
            | b"intrinsic"
            | b"op"
            | b"while"
            | b"true"
            | b"false"
    )
}

/// Tokens produced by the lexer.
///
/// Input is lexed as raw bytes. Keywords, punctuation and reserved words
/// are literal `#[token]` definitions; numbers, identifiers and whitespace
/// are regexes; string literals and line comments start from a literal
/// prefix and are extended by a callback.
// `Debug`/`PartialEq` allow tokens to be used in `assert_eq!`; every variant
// is a unit variant, so `Clone`/`Copy`/`Eq` come for free.
#[derive(::logos::Logos, Debug, Clone, Copy, PartialEq, Eq)]
enum Token {
    // I sort token definitions in order of most
    // to least specific, shortest to longest, but it bears noting that
    // Logos doesn't actually care about variant order.
    // It will perform token disambiguation according
    // to its own rules, as described at
    // https://docs.rs/logos/0.12.0/logos/index.html#token-disambiguation
    #[token(b".")]
    StatementTerminator,
    // I would name this token after its semantic purpose,
    // but it's likely to have two.
    // - Path separation
    // - Type annotation
    #[token(b":")]
    Colon,
    #[token(b",")]
    Comma,
    #[token(b"(")]
    OpenParen,
    #[token(b")")]
    CloseParen,
    #[token(b"{")]
    OpenCurly,
    #[token(b"}")]
    CloseCurly,
    #[token(b"=")]
    Assign,
    #[token(b"@")]
    At,
    #[token(b"#")]
    Quote,
    // I was going to make this a `&`, but taking a reference
    // is not important enough in this language to allow
    // easy confusion between it and logical `and`.
    // Or, worse, bitwise `and`, which commonly is exactly the same symbol.
    // I use `%` instead because frankly using a percent sign
    // for modular arithmetic would be terrible for users as well.
    #[token(b"%")]
    Ref,
    #[token(b"&&")]
    And,
    #[token(b"==")]
    Equal,
    #[token(b"->")]
    Arrow,
    #[token(b"=>")]
    FatArrow,
    #[token(b"do")]
    Do,
    #[token(b"fn")]
    Fn,
    #[token(b"if")]
    If,
    #[token(b"cmd")]
    Cmd,
    #[token(b"let")]
    Let,
    // TODO: determine if I want constructors like this
    #[token(b"new")]
    New,
    #[token(b"loop")]
    Loop,
    #[token(b"else")]
    Else,
    #[token(b"meta")]
    Meta,
    #[token(b"break")]
    Break,
    #[token(b"const")]
    Const,
    // This keyword is a little strange, in that it acts
    // exactly like an ident, but it cannot be assigned to.
    // It also provides access to special semantic constructs,
    // like reading the context of a containing command.
    #[token(b"builtin")]
    Builtin,
    #[regex("[0-9]+")]
    Integer,
    #[regex("[0-9]*d[0-9]+")]
    DiceTerm,
    #[token(b"\"", string_literal)]
    StringLiteral,
    #[token(b"//", line_comment)]
    LineComment,
    // Words we might want to use as keywords in the future.
    //
    // These are spelled out as literal tokens rather than filtered out of
    // the identifier regex with a `bool` callback. When two definitions
    // match the same input, Logos keeps only the higher-priority one, and
    // a callback returning `false` yields `Error` instead of falling back
    // to the other definition. With the callback approach every ordinary
    // identifier would therefore lex as `Error` and `Ident` would never
    // be produced. Literal tokens outrank the identifier regex on equal
    // length (exactly like the keywords above), while a longer identifier
    // such as `typed` still wins by length.
    #[token(b"__I_AM_A_RESERVED_WORD_THAT_NOBODY_WILL_EVER_USE__")]
    #[token(b"type")]
    #[token(b"prim")]
    #[token(b"intrinsic")]
    #[token(b"op")]
    #[token(b"while")]
    #[token(b"true")]
    #[token(b"false")]
    ReservedWord,
    #[regex(br##"[_a-zA-Z]+"##)]
    Ident,
    #[regex("[\t ]+")]
    Whitespace,
    #[error]
    Error,
}

#[cfg(test)]
mod tests {
    use super::Token;
    use logos::Logos;

    /// Checks that a reserved word is lexed as [`Token::ReservedWord`]
    /// and not as [`Token::Ident`], i.e. that the former takes
    /// precedence when both could match.
    #[test]
    fn reserved_words() {
        let source: &[u8] = b"__I_AM_A_RESERVED_WORD_THAT_NOBODY_WILL_EVER_USE__";
        let first = Token::lexer(source).next();
        assert!(matches!(first, Some(Token::ReservedWord)));
    }
}

/// Parses `input` into a [`Fragment`].
///
/// Not implemented yet: a lexer over `input` is created, but no tokens
/// are consumed and `id` is not used.
///
/// # Panics
///
/// Always panics via `todo!()`.
fn parse_fragment(input: &[u8], id: FragmentId) -> Fragment {
    // The trait only needs to be in scope for `Token::lexer`.
    use ::logos::Logos as _;

    let mut tokens = Token::lexer(input);

    todo!()
}