// logic-parser 1.3.0
//
// A simple lexer & parser for logical expressions that supports output as AST, JSON and SVG.
// Documentation
use std::ops::Not;
use crate::errors::LexerError;
use super::token::{TokenKind, Token};

pub type Result<T> = std::result::Result<T, LexerError>;

/// Splits a source string into [`Token`]s.
///
/// Which characters may start a word and which may continue it is decided by
/// two user-replaceable predicates.
pub struct Lexer<'a> {
    /// Text currently being tokenized; replaced on every call to `tokenize`.
    src: &'a str,
    /// Byte offset into `src` of the next character to read.
    pos: usize,
    /// Returns `true` for characters that may appear inside a word.
    is_in_alphabet: fn(char) -> bool,
    /// Returns `true` for characters that may begin a word.
    is_in_start_chars_alphabet: fn(char) -> bool,
}

/// Default predicate for characters allowed inside a word: alphanumerics and `_`.
pub const DEFAULT_ALPHABET: fn(char) -> bool = |c| c.is_alphanumeric() || c == '_';
/// Default predicate for characters allowed to start a word: alphabetics and `_`.
pub const DEFAULT_START_ALPHABET: fn(char) -> bool = |c| c.is_alphabetic() || c == '_';

impl<'a> Lexer<'a> {
    /// Creates a new Lexer with the default alphabet.
    ///
    /// This is conceptually equivalent to doing
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    ///
    /// let mut lexer = Lexer::with_alphabets(
    ///     |c| c.is_alphanumeric() || c == '_',
    ///     |c| c.is_alphabetic() || c == '_'
    /// );
    /// ```
    pub fn new() -> Self {
        Self::with_alphabets(DEFAULT_ALPHABET, DEFAULT_START_ALPHABET)
    }

    /// This allows you to define a custom alphabet for the lexer.
    ///
    /// `start_chars_alphabet` decides which characters may begin a word, while
    /// `alphabet` decides which characters may continue it.
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    /// use logic_parser::parsing::{Parser, ASTNode};
    ///
    /// let query = "(tag:pink || tag:anime) && (mime:image/* || mime:video/*)";
    /// let mut lexer = Lexer::with_alphabets(
    ///     |c| c.is_alphanumeric() || c == '_' || c == ':' || c == '*' || c == '/',
    ///     |c| c.is_alphabetic(),
    /// );
    ///
    /// let tokens = lexer.tokenize(query).unwrap();
    ///
    /// let mut parser = Parser::new(&tokens);
    /// parser.parse().unwrap();
    /// ```
    ///
    /// # Note
    ///
    /// The first character of a word only has to satisfy `start_chars_alphabet`; it is
    /// always consumed, so a start character that is not part of `alphabet` still yields
    /// a word (possibly a single character long) instead of stalling the lexer.
    ///
    pub fn with_alphabets(alphabet: fn(char) -> bool, start_chars_alphabet: fn(char) -> bool) -> Self {
        Lexer {
            is_in_alphabet: alphabet,
            is_in_start_chars_alphabet: start_chars_alphabet,
            src: "",
            pos: 0
        }
    }

    /// Creates a lexer that uses the same alphabet for the start characters and the rest.
    ///
    /// Equivalent to doing
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    ///
    /// let custom_alphabet = |c: char| c.is_alphanumeric() || c == '_';
    ///
    /// let mut lexer = Lexer::with_alphabets(custom_alphabet, custom_alphabet);
    /// lexer.tokenize("_puppies_").unwrap();
    /// ```
    pub fn with_alphabet(alphabet: fn(char) -> bool) -> Self {
        Self::with_alphabets(alphabet, alphabet)
    }

    /// Tokenizes `src` from its beginning and returns every token found, in order.
    ///
    /// Any state left over from a previous call is discarded.
    ///
    /// # Errors
    ///
    /// Returns [`LexerError::UnknownToken`] when a character can not start any token,
    /// or whatever error an incomplete operator (such as a lone `<`) produces.
    pub fn tokenize(&mut self, src: &'a str) -> Result<Vec<Token>> {
        self.src = src;
        self.pos = 0;

        let mut tokens = Vec::new();
        while let Some(token) = self.next_token()? {
            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Reads the next token, or returns `Ok(None)` once the input is exhausted.
    fn next_token(&mut self) -> Result<Option<Token>> {
        self.skip_whitespaces();
        let start = self.pos;

        let c = match self.peek_char() {
            Some(c) => c,
            None => return Ok(None)
        };

        let kind = match c {
            '~' | '!' => { self.consume(); TokenKind::Not },
            '(' => { self.consume(); TokenKind::OpenParen },
            ')' => { self.consume(); TokenKind::CloseParen },
            '&' => {
                match_any_or_syntax_error!(self, ["&&", "&"], TokenKind::And)
            },
            '|' => {
                match_any_or_syntax_error!(self, ["||", "|"], TokenKind::Or)
            },
            '=' | '-' => {
                match_any_or_syntax_error!(self, ["=>", "->"], TokenKind::Implies)
            },
            '<' => {
                match_any_or_syntax_error!(self, ["<->", "<=>"], TokenKind::IfAndOnlyIf)
            },
            c if (self.is_in_start_chars_alphabet)(c) => {
                self.next_word()
            },
            _ => {
                // Positions are byte offsets, so the span has to cover the whole
                // (possibly multi-byte) character rather than a single byte.
                return Err(
                    LexerError::UnknownToken(c, (start, start + c.len_utf8()).into())
                )
            }
        };

        Ok(Some(Token::new(kind, (start, self.pos))))
    }

    /// Advances past the next character and returns it, or `None` at end of input.
    fn consume(&mut self) -> Option<char> {
        let next = self.peek_char()?;
        self.pos += next.len_utf8();
        Some(next)
    }

    /// Advances past up to `n` characters, stopping early at end of input.
    fn skip(&mut self, n: usize) {
        for _ in 0..n {
            self.consume();
        }
    }

    /// Returns whether the remaining input starts with `to_match`.
    fn next_matches(&self, to_match: &str) -> bool {
        self.src[self.pos..].starts_with(to_match)
    }

    /// Returns the next character without consuming it.
    fn peek_char(&self) -> Option<char> {
        self.src[self.pos..].chars().next()
    }

    /// A word can be a literal ([`TokenKind::Literal`]) or an identifier
    /// ([`TokenKind::Identifier`])
    fn next_word(&mut self) -> TokenKind {
        let start = self.pos;
        // The caller only validated the first character against the start alphabet,
        // so consume it unconditionally. Otherwise a start character that is not in
        // the regular alphabet would produce an empty word and never advance.
        self.consume();
        self.take_while(self.is_in_alphabet);

        match &self.src[start..self.pos] {
            "true" => TokenKind::Literal(true),
            "false" => TokenKind::Literal(false),
            word => TokenKind::Identifier(word.into()),
        }
    }

    /// Skips tabs, spaces and carriage returns, returning how many bytes were skipped.
    ///
    /// Line feeds are not skipped.
    fn skip_whitespaces(&mut self) -> usize {
        self.take_while(|c| matches!(c, '\t' | ' ' | '\r'))
    }

    /// Advances while `pred` holds for the next character.
    ///
    /// Returns the number of bytes consumed.
    fn take_while<F>(&mut self, pred: F) -> usize
    where F: Fn(char) -> bool {
        let rest = &self.src[self.pos..];
        // `find` yields the byte offset of the first rejected character; when every
        // remaining character is accepted the whole rest of the input is taken.
        let taken = rest.find(|c: char| pred(c).not()).unwrap_or(rest.len());
        self.pos += taken;
        taken
    }
}

impl<'a> Default for Lexer<'a> {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn no_whitespaces_should_return_zero() {
        let mut lexer = Lexer::new();
        lexer.src = "testing => this";
        let n = Lexer::skip_whitespaces(&mut lexer);
        assert_eq!(n, 0);
    }

    #[test]
    fn literal_booleans_work() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("false & true").unwrap();
        assert_eq!(
            tokens.iter().map(|t| &t.kind).collect::<Vec<&TokenKind>>(),
            vec![
                &TokenKind::Literal(false),
                &TokenKind::And,
                &TokenKind::Literal(true)
            ]
        );
    }

    #[test]
    fn take_while_returns_zero_if_no_matches() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| false);
        assert_eq!(n, 0);
    }

    #[test]
    fn take_while_returns_correct_amount() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| true);
        assert_eq!(n, 7);
    }

    #[test]
    fn skip_whitespaces_works_properly() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("\t\r puppies").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Identifier("puppies".into()));
    }

    #[test]
    fn error_is_returned_when_alphabet_doesnt_match() {
        let mut lexer = Lexer::with_alphabets(
            |c| ['a', 'b', 'c', '1', '2', '3'].contains(&c),
            |c| ['a', 'b', 'c'].contains(&c)
        );
        match lexer.tokenize("abcf").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, 'f');
                assert_eq!(span, (3, 4).into());
            },
            _ => unreachable!()
        };

        match lexer.tokenize("123abc").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, '1');
                assert_eq!(span, (0, 1).into());
            },
            _ => unreachable!()
        };
    }

    #[test]
    fn propositions_cant_start_with_numbers() {
        let mut lexer = Lexer::new();
        // Sanity check: a purely alphabetic proposition is accepted.
        assert!(lexer.tokenize("pqrs").is_ok());

        // A leading digit is not in the default start alphabet, so the very first
        // character must be rejected. Asserting on the error itself (instead of
        // relying on `#[should_panic]`) keeps unrelated panics from passing the test.
        match lexer.tokenize("69p").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, '6');
                assert_eq!(span, (0, 1).into());
            },
            _ => unreachable!()
        };
    }
}