//! trixy 0.4.0
//!
//! A Rust crate used to generate multi-language APIs for your application.
/*
* Copyright (C) 2023 - 2024:
* The Trinitrix Project <soispha@vhack.eu, antifallobst@systemausfall.org>
* SPDX-License-Identifier: GPL-3.0-or-later
*
* This file is part of the Trixy crate for Trinitrix.
*
* Trixy is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* and the GNU General Public License along with this program.
* If not, see <https://www.gnu.org/licenses/>.
*/

use std::{borrow::Cow, fmt::Display};

use regex::Regex;

use self::{error::SpannedLexingError, tokenizer::Tokenizer};

pub mod error;
mod tokenizer;

#[cfg(test)]
mod test;

/// A list of [`Token`]s produced by [`TokenStream::lex`], together with the
/// source text they were lexed from.
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq, Clone)]
pub struct TokenStream {
    /// The desugared source the tokens were produced from (after
    /// [`TokenStream::replace`]); presumably kept for resolving
    /// [`TokenSpan`]s later — confirm against callers.
    pub original_file: String,
    // The lexed tokens. After `reverse()` they are stored back-to-front so
    // that `pop()`/`peek()` yield them in source order.
    tokens: Vec<Token>,
}

impl TokenStream {
    /// Try to remove syntax sugar by applying regex matching to the input string
    pub fn replace(src: &str) -> Cow<str> {
        // vim regex
        // :%s/\v^(\s*)\/\/\/(|[^/].*)$/\1#[doc = r#"\2"#]
        let re = Regex::new(r"(?m)^(?<space>\s*)///(?<content>|[^/].*)$").unwrap();

        // Replace all doc comments with their attribute
        let src_new = re.replace_all(src, r##"$space#[doc = r#"$content"#]"##);
        src_new
    }

    /// Turn a string of valid Trixy code into a list of tokens, including the
    /// location of that token's start and end point in the original source code.
    ///
    /// Note the token indices represent the half-open interval `[start, end)`,
    /// equivalent to `start .. end` in Rust.
    pub fn lex(src: &str) -> Result<Self, SpannedLexingError> {
        let src = Self::replace(src);

        let mut tokenizer = Tokenizer::new(&src);
        let mut tokens = Vec::new();

        while let Some(tok) = tokenizer.next_token()? {
            tokens.push(tok);
        }

        // filter out comments
        let tokens = tokens
            .into_iter()
            .filter(|token| !matches!(token.kind, TokenKind::Comment(_)))
            .collect();

        Ok(Self {
            tokens,
            original_file: src.to_string(),
        })
    }

    /// Get a token by index
    pub fn get(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a reference to the uppermost token, without modifying the token list
    pub fn peek(&self) -> Option<&Token> {
        self.tokens.last()
    }

    /// Remove to the uppermost token
    pub fn pop(&mut self) -> Token {
        self.tokens.pop().expect("This should not be emtpy")
    }

    /// Reverses the underlying tokes vector
    /// This is facilitates using the pop and peek methods to parse the tokens from the beginning,
    /// not the end
    pub fn reverse(&mut self) {
        self.tokens.reverse()
    }

    /// Check if the TokenStream is empty.
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
}

/// A token span is recorded in chars starting from the beginning of the file:
/// A token span like this, for example:
/// ```dont_run
///# use trixy_lang_parser::lexing::TokenSpan;
/// TokenSpan {
///     start: 20,
///     end: 23,
/// }
/// ```
/// signals that the token starts at the 20th char in the source file and ends
/// just before the 23rd — the span is the half-open interval `[start, end)`,
/// matching the convention documented on [`TokenStream::lex`].
#[derive(Debug, Default, PartialEq, PartialOrd, Ord, Eq, Clone, Copy)]
pub struct TokenSpan {
    /// The start of the token span (inclusive)
    pub start: usize,
    /// The end of the token span (exclusive)
    pub end: usize,
}

impl TokenSpan {
    pub fn from_range(start: TokenSpan, end: TokenSpan) -> Self {
        Self {
            start: start.start,
            end: end.end,
        }
    }
}

/// A Token
#[derive(Debug, Default, PartialEq, PartialOrd, Ord, Eq, Clone)]
pub struct Token {
    /// The token's original location in the source file
    pub span: TokenSpan,
    /// What sort of token this is, including any payload (e.g. an
    /// identifier's name or a comment's text)
    pub kind: TokenKind,
}

impl Token {
    /// Return the TokenKind of a token
    pub fn kind(&self) -> &TokenKind {
        &self.kind
    }

    /// Return the TokenSpan of a token
    pub fn span(&self) -> &TokenSpan {
        &self.span
    }
}

/// Possible kinds of tokens
#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub enum TokenKind {
    /// A language keyword (`mod`, `fn`, `struct`, `enum`)
    Keyword(Keyword),
    /// A keyword that is only valid inside an attribute (`derive`, `doc`, `error`)
    AttributeKeyword(AttributeKeyword),
    /// A user-defined name
    Identifier(String),
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `->`
    Arrow,
    /// `#`
    PoundSign,
    /// `=`
    EqualsSign,
    /// A string literal; the payload is the literal's text
    StringLiteral(String),

    /// `{`
    CurlyBracketOpen,
    /// `}`
    CurlyBracketClose,
    /// `(`
    CurvedBracketOpen,
    /// `)`
    CurvedBracketClose,
    /// `<`
    AngledBracketOpen,
    /// `>`
    AngledBracketClose,
    /// `[`
    SquareBracketOpen,
    /// `]`
    SquareBracketClose,

    /// A comment; filtered out of the stream by [`TokenStream::lex`]
    Comment(String),

    #[default]
    /// This token kind exists to support usages of [`mem::take`]; it should
    /// never be exposed in the final parsed output.
    DefaultTokenKind,
}

impl TokenKind {
    /// Compare two token kinds structurally: payload-carrying variants
    /// (identifiers, attribute keywords, string literals, comments) match any
    /// other value of the same variant regardless of payload; all other kinds
    /// fall back to full equality.
    pub fn same_kind(&self, other: &TokenKind) -> bool {
        matches!(
            (self, other),
            (TokenKind::Identifier(_), TokenKind::Identifier(_))
                | (TokenKind::AttributeKeyword(_), TokenKind::AttributeKeyword(_))
                | (TokenKind::StringLiteral(_), TokenKind::StringLiteral(_))
                | (TokenKind::Comment(_), TokenKind::Comment(_))
        ) || self == other
    }
}

impl Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Payload-carrying variants format their payload inline and return
        // early; every other variant maps to a fixed tag written at the end.
        let tag = match self {
            TokenKind::Keyword(word) => return write!(f, "KEYWORD({})", word),
            TokenKind::AttributeKeyword(word) => {
                return write!(f, "ATTRIBUTE_KEYWORD({})", word)
            }
            // An empty identifier (as produced by `token![Ident]`) is printed
            // without parentheses.
            TokenKind::Identifier(ident) if ident.is_empty() => "IDENTIFIER",
            TokenKind::Identifier(ident) => return write!(f, "IDENTIFIER({})", ident),
            TokenKind::StringLiteral(text) => {
                return write!(f, r#"STRING_LITERAL("{}")"#, text)
            }
            TokenKind::Comment(text) => return write!(f, "COMMENT({})", text),

            TokenKind::EqualsSign => "EQUALS_SIGN",
            TokenKind::PoundSign => "POUND_SIGN",
            TokenKind::Colon => "COLON",
            TokenKind::Semicolon => "SEMICOLON",
            TokenKind::Comma => "COMMA",
            TokenKind::Arrow => "ARROW",

            TokenKind::CurlyBracketOpen => "CURLY_BRACKET_OPEN",
            TokenKind::CurlyBracketClose => "CURLY_BRACKET_CLOSE",
            TokenKind::CurvedBracketOpen => "CURVED_BRACKET_OPEN",
            TokenKind::CurvedBracketClose => "CURVED_BRACKET_CLOSE",
            TokenKind::AngledBracketOpen => "ANGLED_BRACKET_OPEN",
            TokenKind::AngledBracketClose => "ANGLED_BRACKET_CLOSE",
            TokenKind::SquareBracketOpen => "SQUARE_BRACKET_OPEN",
            TokenKind::SquareBracketClose => "SQUARE_BRACKET_CLOSE",

            TokenKind::DefaultTokenKind => "DEFAULT_TOKEN_KIND",
        };
        f.write_str(tag)
    }
}

/// Keywords used in the language
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq, Clone, Copy)]
pub enum Keyword {
    /// Start a namespace declaration
    #[allow(non_camel_case_types)]
    r#mod,
    /// Start a function declaration
    #[allow(non_camel_case_types)]
    r#fn,
    /// Start a structure declaration
    #[allow(non_camel_case_types)]
    r#struct,
    /// Start an enum declaration
    #[allow(non_camel_case_types)]
    r#enum,
}

/// Keywords used in attributes: (`#[<keyword>(<value>)]`)
#[derive(Debug, PartialEq, PartialOrd, Ord, Eq, Clone, Copy)]
pub enum AttributeKeyword {
    /// Derive a trait (only `Error` for now)
    #[allow(non_camel_case_types)]
    derive,

    /// Document the attached item
    #[allow(non_camel_case_types)]
    doc,

    /// Add an error message
    #[allow(non_camel_case_types)]
    error,
}

impl Display for Keyword {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each keyword renders exactly as it appears in source code.
        f.write_str(match self {
            Keyword::r#mod => "mod",
            Keyword::r#fn => "fn",
            Keyword::r#struct => "struct",
            Keyword::r#enum => "enum",
        })
    }
}

impl Display for AttributeKeyword {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            AttributeKeyword::derive => f.write_str("derive"),
            AttributeKeyword::doc => f.write_str("doc"),
            AttributeKeyword::error => f.write_str("error"),
        }
    }
}

/// Shorthand macro for generating a [`TokenKind`] from the punctuation it
/// stands for (`;`, `->`, `<`, …), its variant name (`Semicolon`, `Arrow`, …)
/// or a keyword (`mod`, `fn`, …).
///
/// # Examples
///
/// ```
/// use crate::parser::token;
///# fn main() {
/// token![mod];
/// token![;];
/// token![Arrow];
///# }
/// ```
#[macro_export]
macro_rules! token {
    [Semicolon] => { $crate::parser::lexing::TokenKind::Semicolon };
    [;] => { $crate::parser::lexing::TokenKind::Semicolon };
    [Colon]    => { $crate::parser::lexing::TokenKind::Colon };
    [:]    => { $crate::parser::lexing::TokenKind::Colon };
    [Comma]    => { $crate::parser::lexing::TokenKind::Comma };
    [,]    => { $crate::parser::lexing::TokenKind::Comma };
    [Arrow]    => { $crate::parser::lexing::TokenKind::Arrow };
    [->] => { $crate::parser::lexing::TokenKind::Arrow };
    [PoundSign]    => { $crate::parser::lexing::TokenKind::PoundSign };
    [#]    => { $crate::parser::lexing::TokenKind::PoundSign };
    [EqualsSign]    => { $crate::parser::lexing::TokenKind::EqualsSign };
    [=]    => { $crate::parser::lexing::TokenKind::EqualsSign };


    [AngledBracketOpen]    => { $crate::parser::lexing::TokenKind::AngledBracketOpen };
    [<] => { $crate::parser::lexing::TokenKind::AngledBracketOpen };

    [AngledBracketClose]    => { $crate::parser::lexing::TokenKind::AngledBracketClose };
    [>] => { $crate::parser::lexing::TokenKind::AngledBracketClose };

    // The punctuation forms of the remaining brackets are disabled: `{`, `(`
    // and `[` cannot be used as plain macro_rules matchers.
    [CurlyBracketOpen]    => { $crate::parser::lexing::TokenKind::CurlyBracketOpen};
    // [{]    => { $crate::parser::lexing::TokenKind::CurlyBracketOpen };
    [CurlyBracketClose]    => { $crate::parser::lexing::TokenKind::CurlyBracketClose};
    // [}]    => { $crate::parser::lexing::TokenKind::CurlyBracketClose };
    [CurvedBracketOpen]    => { $crate::parser::lexing::TokenKind::CurvedBracketOpen};
    // [(]    => { $crate::parser::lexing::TokenKind::ParenthesisOpen };
    [CurvedBracketClose]    => { $crate::parser::lexing::TokenKind::CurvedBracketClose};
    // [)]    => { $crate::parser::lexing::TokenKind::ParenthesisClose };
    [SquareBracketOpen]    => { $crate::parser::lexing::TokenKind::SquareBracketOpen};
    // [[]    => { $crate::parser::lexing::TokenKind::ParenthesisOpen };
    [SquareBracketClose]    => { $crate::parser::lexing::TokenKind::SquareBracketClose};
    // []]    => { $crate::parser::lexing::TokenKind::ParenthesisClose };


    [mod] => { $crate::parser::lexing::TokenKind::Keyword($crate::parser::lexing::Keyword::r#mod) };
    [fn] => { $crate::parser::lexing::TokenKind::Keyword($crate::parser::lexing::Keyword::r#fn) };
    [struct] => { $crate::parser::lexing::TokenKind::Keyword($crate::parser::lexing::Keyword::r#struct) };
    [enum] => { $crate::parser::lexing::TokenKind::Keyword($crate::parser::lexing::Keyword::r#enum) };

    // The `derive` here is completely arbitrary. It is only for comparisons (see `same_kind`)
    [AttributeKeyword] => { $crate::parser::lexing::TokenKind::AttributeKeyword($crate::parser::lexing::AttributeKeyword::derive) };

    // The empty payloads below only work for *kind* checks
    // (see the `same_kind` method on TokenKind)
    [Ident] => { $crate::parser::lexing::TokenKind::Identifier("".to_owned()) };
    [Identifier] => { $crate::parser::lexing::TokenKind::Identifier("".to_owned()) };

    [StringLiteral]  => { $crate::parser::lexing::TokenKind::StringLiteral("".to_owned()) };

    [Comment] => { $crate::parser::lexing::TokenKind::Comment("".to_owned()) };
}

#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::token;

    /// Generate a test asserting that `token![$from]` expands to `$to`.
    ///
    /// The original contained a second, byte-identical macro arm; duplicate
    /// macro_rules arms are unreachable dead code, so it was removed.
    macro_rules! token_macro_test {
        ($name:ident, $from:tt, => $to:expr) => {
            #[test]
            fn $name() {
                let got: TokenKind = token![$from];
                let should_be = $to;

                assert_eq!(got, should_be);
            }
        };
    }

    token_macro_test!(tok_expands_to_arrow, ->, => TokenKind::Arrow);
    token_macro_test!(tok_expands_to_semicolon, Semicolon, => TokenKind::Semicolon);
    token_macro_test!(tok_expands_to_mod, mod, => TokenKind::Keyword(crate::parser::lexing::Keyword::r#mod));
    token_macro_test!(tok_expands_to_fn, fn, => TokenKind::Keyword(crate::parser::lexing::Keyword::r#fn));
}