// alkale 2.0.0
//
// A simple LL(1) lexer library for Rust.
// Documentation:
//! Alkale is a library focused on streamlining the production of hand-written lexers.
//!
//! A lexer, generally speaking, is a function that converts source code into
//! a [`FinalizedLexerResult`].
//!
//! A typical lexer function will look something like this.
//! ```rust
//! use alkale::{SourceCodeScanner, LexerResult, FinalizedLexerResult};
//!
//! enum MyTokenData {
//!     // ...
//! }
//!
//! fn lexer(source: &str) -> FinalizedLexerResult<MyTokenData> {
//!     // This will serve as an interface into the code for processing.
//!     let scanner = SourceCodeScanner::new(source);
//!
//!     // This serves as a collection of our produced tokens and notifications.
//!     let mut result = LexerResult::new();
//!
//!     while scanner.has_next() {
//!         // Main body goes here, processing the scanner into
//!         // tokens and notifications to be passed into the result.
//!     }
//!
//!     // Finalize the result and return it.
//!     result.finalize()
//! }
//! ```
//!
//! Many methods exist on [`SourceCodeScanner`] to consume the source code in various ways,
//! see its documentation for more details. Regardless, valid data from the source code should be
//! converted into [`Token`]s, and invalid data into [`Notification`]s, both to be reported to the
//! [`LexerResult`]. These four datatypes, as well as [`Span`][crate::span::Span] (used to create
//! [`Token`]s) pretty much make up the backbone of every single lexer.
//!
//! # Features
//! Alkale has a single feature, `common`. This feature is enabled by default and
//! introduces a huge amount of helper methods to [`SourceCodeScanner`] for things
//! such as number parsing, strings, identifiers, etc.
//!
//! # Example
//! Here is an example of a simple lexer that tokenizes words in the program,
//! ignoring whitespace and throwing an error for everything else.
//!
//! ```rust
//! use alkale::{
//!     format_notification, notification::NotificationSeverity, token::Token, FinalizedLexerResult,
//!     LexerResult, SourceCodeScanner,
//! };
//!
//! type Word<'a> = &'a str;
//!
//! fn lexer(source: &str) -> FinalizedLexerResult<Word<'_>> {
//!     let scanner = SourceCodeScanner::new(source);
//!     let mut result = LexerResult::new();
//!
//!     while scanner.has_next() {
//!         // Try to parse out a word.
//!         if let Some(identifier) = scanner.try_consume_standard_identifier() {
//!             // We found a word, push it and restart the loop.
//!             result.push_token(Token::from_spanned(identifier));
//!             continue;
//!         }
//!
//!         // No word was found, consume one character.
//!         if let Some(char) = scanner.next_span() {
//!             // If this character wasn't whitespace (i.e. illegal char) then
//!             // report a notification.
//!             if !char.is_whitespace() {
//!                 format_notification!("Unrecognized character '{}'", char.data)
//!                     .span(char.span)
//!                     .severity(NotificationSeverity::Error)
//!                     .report(&mut result);
//!             }
//!         }
//!     }
//!
//!     result.finalize()
//! }
//! ```
//!
//! This example should give a basic overview of what the main loop of a lexer should look like.
//! Check for a pattern, if it was found, parse it into a token and reset the loop. If the pattern wasn't
//! found, continue onto the next pattern until you reach a base case.

#![cfg_attr(docsrs, feature(doc_auto_cfg))]

use core::fmt::Debug;

use notification::{Notification, NotificationAcceptor, NotificationList};
use token::Token;

#[cfg(feature = "common")]
pub mod common;
pub mod notification;
mod scanner;
pub mod span;
pub mod token;

pub use scanner::SourceCodeScanner;

/// Used to accumulate [`Notification`]s and [`Token`]s during lexing.
///
/// `TokenData` is the user-defined payload carried by each [`Token`];
/// `N` is the custom data type attached to each [`Notification`]
/// (defaults to `()`).
///
/// Once lexing is complete, [`finalize`][Self::finalize] can convert this
/// into a [`FinalizedLexerResult`] to be handled as appropriate.
#[derive(Debug)]
pub struct LexerResult<TokenData, N = ()> {
    /// The notifications that were reported by the lexer.
    notifications: NotificationList<N>,
    /// The vec of tokens generated by the lexer.
    tokens: Vec<Token<TokenData>>,
}

impl<T, N> NotificationAcceptor<N> for LexerResult<T, N> {
    /// Record `notification` in this result's internal [`NotificationList`].
    ///
    /// This lets builder-style notification APIs (e.g. `format_notification!`)
    /// report directly into a `LexerResult`.
    #[inline]
    fn report(&mut self, notification: Notification<N>) {
        self.notifications.push(notification);
    }
}

impl<T, N> Default for LexerResult<T, N> {
    fn default() -> Self {
        Self {
            notifications: NotificationList::new(),
            tokens: vec![],
        }
    }
}

impl<T, N> LexerResult<T, N> {
    /// Create a new, empty [`LexerResult`] with no tokens or notifications.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            notifications: NotificationList::new(),
            tokens: Vec::new(),
        }
    }

    /// Append `token` to this result's token list.
    #[inline]
    pub fn push_token(&mut self, token: Token<T>) {
        self.tokens.push(token);
    }

    /// Convert this [`LexerResult`] into a [`FinalizedLexerResult`], containing the
    /// [`Token`] list as well as any generated [`Notification`]s.
    #[inline]
    #[must_use]
    pub fn finalize(self) -> FinalizedLexerResult<T, N> {
        // Move both fields straight across; finalizing performs no processing.
        let Self {
            notifications,
            tokens,
        } = self;

        FinalizedLexerResult {
            notifications,
            tokens,
        }
    }
}

/// The final result of a lexer. This is returned by
/// a [`LexerResult`]'s [`finalize`][LexerResult::finalize]
/// method.
///
/// Unlike [`LexerResult`], this type is read-only: it only exposes
/// accessors over the accumulated notifications and tokens.
#[derive(Debug)]
pub struct FinalizedLexerResult<TokenData, N = ()> {
    /// The notifications that were reported by the lexer.
    notifications: NotificationList<N>,
    /// The vec of tokens generated by the lexer.
    tokens: Vec<Token<TokenData>>,
}

impl<T, N> FinalizedLexerResult<T, N> {
    /// Get this result's [`NotificationList`], containing all reported notifications.
    #[inline]
    #[must_use]
    pub const fn notifications(&self) -> &NotificationList<N> {
        &self.notifications
    }

    /// Get this result's list of tokens.
    ///
    /// Returns [`None`] if at least one reported notification has severity
    /// [`Error`][notification::NotificationSeverity::Error] or higher.
    #[inline]
    #[must_use]
    pub fn tokens(&self) -> Option<&Vec<Token<T>>> {
        if self.notifications.is_valid() {
            Some(&self.tokens)
        } else {
            None
        }
    }

    /// Returns a 2-tuple containing this result's [`NotificationList`] and tokens.
    ///
    /// If the notification list contains at least 1 notification of severity error
    /// or higher, then this pair will contain no tokens. ([`None`])
    #[inline]
    #[must_use]
    pub fn into_pair(self) -> (NotificationList<N>, Option<Vec<Token<T>>>) {
        // Decide validity (and move the tokens out) before giving up
        // ownership of the notification list.
        let tokens = self.notifications.is_valid().then_some(self.tokens);

        (self.notifications, tokens)
    }
}

#[cfg(test)]
mod tests {
    use crate::{span::Span, token::Token, LexerResult, SourceCodeScanner};

    /// Build a fresh scanner over the shared fixture string `"Testing."`
    /// (8 characters, byte offsets 0..=7).
    pub fn common() -> SourceCodeScanner<'static> {
        SourceCodeScanner::new("Testing.")
    }

    /// `next` yields every character in order, then `None` once exhausted.
    #[test]
    fn next() {
        let code = common();

        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        assert_eq!(code.next(), Some('s'));
        assert_eq!(code.next(), Some('t'));
        assert_eq!(code.next(), Some('i'));
        assert_eq!(code.next(), Some('n'));
        assert_eq!(code.next(), Some('g'));
        assert_eq!(code.next(), Some('.'));
        assert_eq!(code.next(), None);
    }

    /// `next_span` yields each character wrapped with its single-width
    /// [`Span`], advancing the offset by 1 per call, then `None` at the end.
    #[test]
    fn next_span() {
        let code = common();

        // SAFETY: Spans are all valid.
        unsafe {
            assert_eq!(code.next_span(), Some(Span::new(0, 1).wrap('T')));
            assert_eq!(code.next_span(), Some(Span::new(1, 1).wrap('e')));
            assert_eq!(code.next_span(), Some(Span::new(2, 1).wrap('s')));
            assert_eq!(code.next_span(), Some(Span::new(3, 1).wrap('t')));
            assert_eq!(code.next_span(), Some(Span::new(4, 1).wrap('i')));
            assert_eq!(code.next_span(), Some(Span::new(5, 1).wrap('n')));
            assert_eq!(code.next_span(), Some(Span::new(6, 1).wrap('g')));
            assert_eq!(code.next_span(), Some(Span::new(7, 1).wrap('.')));
            assert_eq!(code.next_span(), None);
        }
    }

    /// `peek` returns the upcoming character without consuming it: the
    /// following `next` must return the same character, and `peek` is `None`
    /// at (and past) the end of input.
    #[test]
    fn peek() {
        let code = common();

        assert_eq!(code.peek(), Some('T'));
        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        assert_eq!(code.next(), Some('s'));
        assert_eq!(code.next(), Some('t'));
        assert_eq!(code.peek(), Some('i'));
        assert_eq!(code.next(), Some('i'));
        assert_eq!(code.next(), Some('n'));
        assert_eq!(code.next(), Some('g'));
        assert_eq!(code.peek(), Some('.'));
        assert_eq!(code.next(), Some('.'));
        assert_eq!(code.peek(), None);
        assert_eq!(code.next(), None);
        assert_eq!(code.peek(), None);
    }

    /// `has_next` is true while characters remain and false once the final
    /// character has been consumed (and stays false afterwards).
    #[test]
    fn has_next() {
        let code = common();

        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        assert!(code.has_next());
        assert_eq!(code.next(), Some('s'));
        assert_eq!(code.next(), Some('t'));
        assert!(code.has_next());
        assert_eq!(code.next(), Some('i'));
        assert_eq!(code.next(), Some('n'));
        assert_eq!(code.next(), Some('g'));
        assert!(code.has_next());
        assert_eq!(code.next(), Some('.'));
        assert!(!code.has_next());
        assert_eq!(code.next(), None);
        assert!(!code.has_next());
    }

    /// `skip` discards exactly one character (here 's', 'n', and then the
    /// end-of-input position, where skipping is a no-op).
    #[test]
    fn skip() {
        let code = common();

        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        code.skip();
        assert_eq!(code.next(), Some('t'));
        assert_eq!(code.next(), Some('i'));
        code.skip();
        assert_eq!(code.next(), Some('g'));
        code.skip();
        assert_eq!(code.next(), None);
    }

    /// `peek_is` matches the upcoming character without consuming it;
    /// `peek_is_not` is its negation and — per the final assertions —
    /// returns true at end of input (nothing to equal the probe char).
    #[test]
    fn peek_is() {
        let code = common();

        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        assert!(code.peek_is('s'));
        assert_eq!(code.next(), Some('s'));
        assert_eq!(code.next(), Some('t'));
        assert_eq!(code.next(), Some('i'));
        assert!(code.peek_is('n'));
        assert_eq!(code.next(), Some('n'));
        assert!(code.peek_is_not('P'));
        assert_eq!(code.next(), Some('g'));
        assert!(code.peek_is('.'));
        assert_eq!(code.next(), Some('.'));
        assert!(code.peek_is_not('P'));
        assert_eq!(code.next(), None);
        assert!(code.peek_is_not('P'));
    }

    /// `peek_is_map` applies a predicate to the upcoming character without
    /// consuming it, and — per the final assertions — returns false at end
    /// of input regardless of the predicate.
    #[test]
    fn peek_is_map() {
        let code = common();

        assert!(code.peek_is_map(|x| x.is_uppercase()));
        assert_eq!(code.next(), Some('T'));
        assert_eq!(code.next(), Some('e'));
        assert!(!code.peek_is_map(|x| x.is_uppercase()));
        assert_eq!(code.next(), Some('s'));
        assert_eq!(code.next(), Some('t'));
        assert_eq!(code.next(), Some('i'));
        assert!(code.peek_is_map(|x| x.is_lowercase()));
        assert_eq!(code.next(), Some('n'));
        assert_eq!(code.next(), Some('g'));
        assert!(!code.peek_is_map(|x| x.is_lowercase()));
        assert_eq!(code.next(), Some('.'));
        assert!(!code.peek_is_map(|_| true));
        assert_eq!(code.next(), None);
        assert!(!code.peek_is_map(|_| true));
    }

    /// `span` reports the cursor position without advancing: a single-width
    /// span at the upcoming character while input remains, and an empty span
    /// at offset 8 (one past the last character) once exhausted. Repeated
    /// calls are idempotent.
    #[test]
    fn span() {
        let code = common();

        // SAFETY: Spans are all valid.
        unsafe {
            assert_eq!(code.next(), Some('T'));
            assert_eq!(code.next(), Some('e'));
            assert_eq!(code.span(), Span::new_single(2));
            assert_eq!(code.next(), Some('s'));
            assert_eq!(code.next(), Some('t'));
            assert_eq!(code.next(), Some('i'));
            assert_eq!(code.span(), Span::new_single(5));
            assert_eq!(code.next(), Some('n'));
            assert_eq!(code.next(), Some('g'));
            assert_eq!(code.span(), Span::new_single(7));
            assert_eq!(code.span(), Span::new_single(7));
            assert_eq!(code.next(), Some('.'));
            assert_eq!(code.span(), Span::new_empty(8));
            assert_eq!(code.next(), None);
            assert_eq!(code.span(), Span::new_empty(8));
        }
    }

    // push_token() and result()
    /// Tokens pushed via `push_token` survive `finalize` in insertion order,
    /// and with no notifications reported the notification list is empty.
    #[test]
    fn meta() {
        let mut result = LexerResult::<_, ()>::new();

        // SAFETY: Spans are all valid.
        unsafe {
            let bloo = Token::new("bloo", Span::new(0, 4));
            let ploo = Token::new("ploo", Span::new(4, 4));

            result.push_token(bloo.clone());
            result.push_token(ploo.clone());

            let result = result.finalize();

            assert_eq!(result.notifications.into_sorted_vec(), vec![]);

            assert_eq!(result.tokens, vec![bloo, ploo]);
        }
    }
}