trixy 0.4.0

A Rust crate used to generate multi-language APIs for your application
/*
* Copyright (C) 2023 - 2024:
* The Trinitrix Project <soispha@vhack.eu, antifallobst@systemausfall.org>
* SPDX-License-Identifier: GPL-3.0-or-later
*
* This file is part of the Trixy crate for Trinitrix.
*
* Trixy is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program.
* If not, see <https://www.gnu.org/licenses/>.
*/

// This code is heavily inspired by: https://michael-f-bryan.github.io/static-analyser-in-rust/book/lex.html

use crate::parser::{
    error::ErrorContext,
    lexing::{Keyword, TokenSpan},
};

use super::{
    error::{LexingError, SpannedLexingError},
    AttributeKeyword, Token, TokenKind,
};

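/// A hand-rolled tokenizer that walks over the input text and produces one
/// token at a time.
///
/// A minimal usage sketch (illustrative, not part of the original source):
///
/// ```ignore
/// let mut tokenizer = Tokenizer::new("fn exit();");
/// while let Some(token) = tokenizer.next_token()? {
///     // hand `token` to the parser
/// }
/// ```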
pub(super) struct Tokenizer<'a> {
    current_index: usize,
    remaining_text: &'a str,
    original_text: &'a str,
}

impl<'a> Tokenizer<'a> {
    pub(super) fn new(input: &'a str) -> Self {
        Self {
            current_index: 0,
            remaining_text: input,
            original_text: input,
        }
    }

    pub(super) fn next_token(&mut self) -> Result<Option<Token>, SpannedLexingError> {
        self.skip_ignored_tokens();
        if self.remaining_text.is_empty() {
            Ok(None)
        } else {
            let start = self.current_index;

            let (token_kind, index) = self.get_next_tokenkind().map_err(|e| {
                let context = ErrorContext::from_index(start, self.original_text);

                SpannedLexingError { source: e, context }
            })?;

            self.chomp(index);
            let end = self.current_index;
            Ok(Some(Token {
                span: TokenSpan { start, end },
                kind: token_kind,
            }))
        }
    }

    fn get_next_tokenkind(&mut self) -> Result<(TokenKind, usize), LexingError> {
        let next = match self.remaining_text.chars().next() {
            Some(c) => c,
            None => return Err(LexingError::UnexpectedEOF),
        };

        let (tok, length) = match next {
            '(' => (TokenKind::CurvedBracketOpen, 1),
            ')' => (TokenKind::CurvedBracketClose, 1),
            '{' => (TokenKind::CurlyBracketOpen, 1),
            '}' => (TokenKind::CurlyBracketClose, 1),
            '<' => (TokenKind::AngledBracketOpen, 1),
            '>' => (TokenKind::AngledBracketClose, 1),
            '[' => (TokenKind::SquareBracketOpen, 1),
            ']' => (TokenKind::SquareBracketClose, 1),

            ':' => (TokenKind::Colon, 1),
            ';' => (TokenKind::Semicolon, 1),
            ',' => (TokenKind::Comma, 1),
            '#' => (TokenKind::PoundSign, 1),
            '=' => (TokenKind::EqualsSign, 1),
            '"' => tokenize_literal_string(self.remaining_text, "\"")?,
            'r' => try_to_tokenize_raw_literal_string(self.remaining_text)?,

            '-' => tokenize_arrow(self.remaining_text)?,
            '/' => tokenize_comment(self.remaining_text)?,

            // can't combine these two arms with an OR (`|`) pattern, as the
            // guard would then apply to both of them
            c if c.is_alphabetic() => tokenize_ident(self.remaining_text)?,
            '_' => tokenize_ident(self.remaining_text)?,

            other => return Err(LexingError::UnknownCharacter(other)),
        };

        Ok((tok, length))
    }

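    /// Skips whitespace and block comments until neither of them matches anymore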
    fn skip_ignored_tokens(&mut self) {
        loop {
            let ws = self.skip_whitespace();
            let comments = self.skip_block_comment();
            if ws + comments == 0 {
                return;
            }
        }
    }

    /// Skip past any whitespace characters
    fn skip_whitespace(&mut self) -> usize {
        let skipped = match take_while(self.remaining_text, |ch| ch.is_whitespace()) {
            Ok((_, bytes_skipped)) => bytes_skipped,
            _ => 0,
        };
        self.chomp(skipped);
        skipped
    }
    fn skip_block_comment(&mut self) -> usize {
        let pairs = [("/*", "*/")];

        let src = self.remaining_text;

        for &(start, end) in &pairs {
            if src.starts_with(start) {
                let leftovers = skip_until(src, end);
                let skip = src.len() - leftovers.len();
                self.chomp(skip);
                return skip;
            }
        }

        0
    }

    /// Advances the tokenizer by the given number of bytes
    fn chomp(&mut self, num_bytes: usize) {
        self.remaining_text = &self.remaining_text[num_bytes..];
        self.current_index += num_bytes;
    }
}

/// Checks whether the next char in the input is a newline (`\n` or `\r`)
fn end_of_line(text: &str) -> bool {
    matches!(text.chars().next(), Some('\n') | Some('\r'))
}

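/// Tokenizes a line comment, excluding the two leading slashes and any
/// trailing whitespace. For example (illustrative), the input `//hi` followed
/// by a newline should yield `Comment("hi")` and a consumed length of 4 bytes
/// (the newline itself is left to `skip_whitespace`).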
fn tokenize_comment(text: &str) -> Result<(TokenKind, usize), LexingError> {
    // every comment starts with two slashes
    if !text.starts_with("//") {
        Err(LexingError::ExpectedComment)
    } else {
        let text: &str = &text[2..];
        if text.is_empty() {
            // the comment is cut short by the end of input
            Ok((TokenKind::Comment("".to_owned()), 2))
        } else if end_of_line(text) {
            // the comment is empty; consume the two slashes and the newline char
            Ok((TokenKind::Comment("".to_owned()), 1 + 2))
        } else {
            let (comment, chars_read) = take_while(text, |ch| ch != '\n' && ch != '\r')?;

            // trim trailing whitespace (only at the end, to avoid removing wanted whitespace)
            let comment = comment.trim_end();

            Ok((TokenKind::Comment(comment.to_owned()), chars_read + 2))
        }
    }
}

/// Checks whether the 'r' begins a raw string literal; otherwise, tokenizes an identifier.
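///
/// For example (illustrative): `r"raw"` and `r#"raw"#` should both yield a
/// `StringLiteral`, while a bare `raw` falls through to `tokenize_ident`.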
fn try_to_tokenize_raw_literal_string(text: &str) -> Result<(TokenKind, usize), LexingError> {
    // remove the 'r' at the beginning
    let text_without_r = &text[1..];

    match text_without_r.chars().next() {
        Some('#') => {
            // The string is also escaped; count the hashtags
            let (delimiter, chars_read) = take_while(text_without_r, |ch| ch == '#')?;

            let delimiter = format!("\"{}", delimiter);

            let (token, length) = tokenize_literal_string(&text_without_r[chars_read..], &delimiter)?;

            // The 1 is the size of the extra 'r'
            Ok((token, length + 1 + chars_read))
        }
        Some('"') => {
            // regular raw string literal
            let (token, length) = tokenize_literal_string(text_without_r, "\"")?;
            // The 1 is the size of the extra 'r'
            Ok((token, length + 1))
        }
        // if the 'r' is not followed by either a '#' or a '"', it must be
        // part of an identifier
        _ => tokenize_ident(text),
    }
}

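/// Tokenizes a string literal, which starts at a quote and runs until the
/// given closing `delimiter`.
///
/// For example (illustrative): with `delimiter == "\""`, the input `"hi" rest`
/// should yield `StringLiteral("hi")` and a consumed length of 4 bytes (the
/// opening quote, `hi`, and the closing quote).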
fn tokenize_literal_string(text: &str, delimiter: &str) -> Result<(TokenKind, usize), LexingError> {
    // The first char is always a quote (")
    assert_eq!(&text[..1], "\"");
    let text_without_quote = &text[1..];

    if text_without_quote.starts_with(delimiter) {
        // The literal string does not contain anything
        Ok((TokenKind::StringLiteral("".to_owned()), 1 + delimiter.len()))
    } else {
        // One predicate per char of the closing delimiter: the literal ends
        // where all of them match on successive chars
        let mut predicates: Vec<_> = delimiter.chars().map(|ch| move |ch2| ch2 == ch).collect();
        let (literal, chars_read) =
            take_until_successive_match(text_without_quote, &mut predicates)?;

        // The + 1 is the quote taken at the beginning
        Ok((TokenKind::StringLiteral(literal.to_owned()), chars_read + 1))
    }
}

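/// Tokenizes an identifier, mapping reserved words (e.g. `mod`, `fn`) and
/// attribute keywords (e.g. `derive`, `doc`, `error`) to their dedicated
/// token kinds.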
fn tokenize_ident(text: &str) -> Result<(TokenKind, usize), LexingError> {
    let (got, chars_read) = take_while(text, |ch| ch == '_' || ch.is_alphanumeric())?;

    // Filter out keywords
    let tokenkind = match got {
        "mod" => TokenKind::Keyword(Keyword::r#mod),
        "fn" => TokenKind::Keyword(Keyword::r#fn),
        "struct" => TokenKind::Keyword(Keyword::r#struct),
        "enum" => TokenKind::Keyword(Keyword::r#enum),

        "derive" => TokenKind::AttributeKeyword(AttributeKeyword::derive),
        "doc" => TokenKind::AttributeKeyword(AttributeKeyword::doc),
        "error" => TokenKind::AttributeKeyword(AttributeKeyword::error),

        other => TokenKind::Identifier(other.to_string()),
    };

    Ok((tokenkind, chars_read))
}

fn tokenize_arrow(text: &str) -> Result<(TokenKind, usize), LexingError> {
    if text.starts_with("->") {
        Ok((TokenKind::Arrow, 2))
    } else {
        Err(LexingError::ExpectedArrow)
    }
}

/// Consumes bytes while a predicate evaluates to true.
/// Returns the matched prefix and its length in bytes; errors if nothing matched.
fn take_while<F>(data: &str, mut pred: F) -> Result<(&str, usize), LexingError>
where
    F: FnMut(char) -> bool,
{
    let mut current_index = 0;

    for ch in data.chars() {
        let should_continue = pred(ch);

        if !should_continue {
            break;
        }

        current_index += ch.len_utf8();
    }

    if current_index == 0 {
        Err(LexingError::NoMatchesTaken)
    } else {
        Ok((&data[..current_index], current_index))
    }
}
/// Consumes bytes until all predicates match on successive characters.
///
/// Returns the text before the match and the total number of bytes consumed,
/// including the matched characters themselves.
fn take_until_successive_match<'a, F>(
    data: &'a str,
    preds: &mut [F],
) -> Result<(&'a str, usize), LexingError>
where
    F: FnMut(char) -> bool,
{
    assert!(!preds.is_empty(), "Predicates need to be provided");

    // Byte indices are used to slice `data`, char indices to look characters
    // up with `chars().nth()`
    let mut current_byte_index = 0;
    let mut current_char_index = 0;
    let mut current_predicate_byte_index = 0;
    let mut current_predicate_char_index;

    'outer: for ch in data.chars() {
        let first_matches = preds[0](ch);

        if first_matches {
            current_predicate_byte_index = current_byte_index;
            current_predicate_char_index = current_char_index;

            if preds.len() == 1 {
                // The only predicate matched, so the whole match is done
                current_predicate_byte_index += ch.len_utf8();
                break 'outer;
            }

            'inner: for predicate_index in 1..preds.len() {
                let preds_len = preds.len();
                let pred = &mut preds[predicate_index];
                current_predicate_byte_index += ch.len_utf8();
                current_predicate_char_index += 1;

                let ch: char = match data.chars().nth(current_predicate_char_index) {
                    Some(ch) => ch,
                    // The input ended before the delimiter could fully match
                    None => return Err(LexingError::RunawayQuote),
                };

                if pred(ch) && predicate_index == preds_len - 1 {
                    // All predicates matched on successive chars

                    // TODO(@soispha): Why is this needed? <2024-03-26>
                    current_predicate_byte_index += ch.len_utf8();
                    break 'outer;
                } else if pred(ch) {
                    // This predicate matched, but it was not the last one
                    continue;
                } else {
                    // The chain of matches broke; resume the outer scan
                    break 'inner;
                }
            }
        }

        current_byte_index += ch.len_utf8();
        current_char_index += 1;
    }

    if current_byte_index == 0 {
        Err(LexingError::NoMatchesTaken)
    } else if current_byte_index > current_predicate_byte_index {
        // The predicates never all matched before the input ran out
        Err(LexingError::RunawayQuote)
    } else {
        Ok((&data[..current_byte_index], current_predicate_byte_index))
    }
}

/// Skips input until the remaining string starts with the given pattern, then
/// skips the pattern itself
fn skip_until<'a>(mut src: &'a str, pattern: &str) -> &'a str {
    while !src.is_empty() && !src.starts_with(pattern) {
        let next_char_size = src
            .chars()
            .next()
            .expect("The string isn't empty")
            .len_utf8();
        src = &src[next_char_size..];
    }

    // If the pattern was never found (e.g. an unterminated block comment),
    // `src` is empty and there is nothing left to skip
    src.get(pattern.len()..).unwrap_or("")
}
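
// A minimal test sketch (illustrative, not part of the original source). It
// assumes that `TokenKind`, `Keyword`, and `SpannedLexingError` derive
// `Debug` (and `TokenKind`/`Keyword` also `PartialEq`); if they do not,
// compare the variants individually instead.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_a_commented_function_declaration() {
        let mut tokenizer = Tokenizer::new("//hi\nfn greet(name: String);");

        let mut kinds = Vec::new();
        while let Some(token) = tokenizer.next_token().expect("the input should lex") {
            kinds.push(token.kind);
        }

        assert_eq!(
            kinds,
            vec![
                TokenKind::Comment("hi".to_owned()),
                TokenKind::Keyword(Keyword::r#fn),
                TokenKind::Identifier("greet".to_owned()),
                TokenKind::CurvedBracketOpen,
                TokenKind::Identifier("name".to_owned()),
                TokenKind::Colon,
                TokenKind::Identifier("String".to_owned()),
                TokenKind::CurvedBracketClose,
                TokenKind::Semicolon,
            ]
        );
    }
}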