logic_parser/lexing/lexer.rs

use std::ops::Not;
use crate::errors::LexerError;
use super::token::{TokenKind, Token};

pub type Result<T> = std::result::Result<T, LexerError>;

pub struct Lexer<'a> {
    /// Accepts every character allowed inside an identifier.
    is_in_alphabet: fn(char) -> bool,
    /// Accepts every character allowed to *start* an identifier.
    is_in_start_chars_alphabet: fn(char) -> bool,
    src: &'a str,
    pos: usize
}

pub const DEFAULT_ALPHABET: fn(char) -> bool = |c| char::is_alphanumeric(c) || c == '_';
pub const DEFAULT_START_ALPHABET: fn(char) -> bool = |c| char::is_alphabetic(c) || c == '_';
impl<'a> Lexer<'a> {
    /// Creates a new Lexer with the default alphabet.
    ///
    /// This is conceptually equivalent to doing
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    ///
    /// let mut lexer = Lexer::with_alphabets(
    ///     |c| c.is_alphanumeric() || c == '_',
    ///     |c| c.is_alphabetic() || c == '_'
    /// );
    /// ```
    pub fn new() -> Self {
        Lexer {
            is_in_alphabet: DEFAULT_ALPHABET,
            is_in_start_chars_alphabet: DEFAULT_START_ALPHABET,
            src: "",
            pos: 0
        }
    }

    /// Creates a lexer with custom alphabets: one for the characters an
    /// identifier may contain, and one for the characters it may start with.
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    /// use logic_parser::parsing::{Parser, ASTNode};
    ///
    /// let query = "(tag:pink || tag:anime) && (mime:image/* || mime:video/*)";
    /// let mut lexer = Lexer::with_alphabets(
    ///     |c| c.is_alphanumeric() || c == '_' || c == ':' || c == '*' || c == '/',
    ///     |c| c.is_alphabetic(),
    /// );
    ///
    /// let tokens = lexer.tokenize(query).unwrap();
    ///
    /// let mut parser = Parser::new(&tokens);
    /// parser.parse().unwrap();
    /// ```
    ///
    /// # WARNING
    ///
    /// If `start_chars_alphabet` accepts a character that `alphabet` rejects,
    /// the lexer will keep producing empty identifier tokens without ever
    /// advancing past that character, and `tokenize` will loop forever. In
    /// other words: every character accepted by `start_chars_alphabet` must
    /// also be accepted by `alphabet`.
    pub fn with_alphabets(alphabet: fn(char) -> bool, start_chars_alphabet: fn(char) -> bool) -> Self {
        Lexer {
            is_in_alphabet: alphabet,
            is_in_start_chars_alphabet: start_chars_alphabet,
            src: "",
            pos: 0
        }
    }

    /// Creates a lexer that uses the same alphabet for the start characters and the rest.
    ///
    /// Equivalent to doing
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    ///
    /// let custom_alphabet = |c: char| c.is_alphanumeric() || c == '_';
    ///
    /// let mut lexer = Lexer::with_alphabets(custom_alphabet, custom_alphabet);
    /// lexer.tokenize("_puppies_").unwrap();
    /// ```
    pub fn with_alphabet(alphabet: fn(char) -> bool) -> Self {
        Self::with_alphabets(alphabet, alphabet)
    }

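    /// Tokenizes `src`, resetting the lexer's position first so the same
    /// lexer can be reused across inputs. Returns the tokens, or the first
    /// [`LexerError`] encountered.
    ///
    /// A minimal usage sketch (the token count follows the operator rules
    /// in `next_token` below):
    ///
    /// ```
    /// use logic_parser::lexing::Lexer;
    ///
    /// let mut lexer = Lexer::new();
    /// let tokens = lexer.tokenize("p & !q").unwrap();
    /// assert_eq!(tokens.len(), 4); // p, &, !, q
    /// ```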
    pub fn tokenize(&mut self, src: &'a str) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        self.src = src;
        self.pos = 0;

        while let Some(token) = self.next_token()? {
            tokens.push(token);
        }
        Ok(tokens)
    }

    fn next_token(&mut self) -> Result<Option<Token>> {
        self.skip_whitespaces();
        let start = self.pos;

        // End of input: no more tokens.
        let c = match self.peek_char() {
            Some(c) => c,
            None => return Ok(None)
        };

        let kind = match c {
            '~' | '!' => { self.consume(); TokenKind::Not },
            '(' => { self.consume(); TokenKind::OpenParen },
            ')' => { self.consume(); TokenKind::CloseParen },
            // Multi-character operators: the crate's `match_any_or_syntax_error!`
            // macro consumes the first listed spelling that matches, or returns
            // a syntax error.
            '&' => {
                match_any_or_syntax_error!(self, ["&&", "&"], TokenKind::And)
            },
            '|' => {
                match_any_or_syntax_error!(self, ["||", "|"], TokenKind::Or)
            },
            '=' | '-' => {
                match_any_or_syntax_error!(self, ["=>", "->"], TokenKind::Implies)
            },
            '<' => {
                match_any_or_syntax_error!(self, ["<->", "<=>"], TokenKind::IfAndOnlyIf)
            },
            c if (self.is_in_start_chars_alphabet)(c) => {
                self.next_word()
            },
            _ => {
                return Err(
                    LexerError::UnknownToken(c, (start, start + 1).into())
                )
            }
        };

        Ok(Some(Token::new(kind, (start, self.pos))))
    }

    /// Consumes the next character, advancing `pos` by its UTF-8 length.
    fn consume(&mut self) -> Option<char> {
        let next = self.src[self.pos..].chars().next();
        if let Some(c) = next {
            self.pos += c.len_utf8();
        }
        next
    }

    /// Consumes the next `n` characters (or fewer at the end of input).
    fn skip(&mut self, n: usize) {
        for _ in 0..n {
            self.consume();
        }
    }

    /// Returns whether the remaining input starts with `to_match`.
    fn next_matches(&self, to_match: &str) -> bool {
        self.src[self.pos..].starts_with(to_match)
    }

    fn peek_char(&self) -> Option<char> {
        self.src[self.pos..].chars().next()
    }

    /// A word is either a boolean literal ([`TokenKind::Literal`]) or an
    /// identifier ([`TokenKind::Identifier`]).
    fn next_word(&mut self) -> TokenKind {
        let start = self.pos;
        // The first character was only peeked in `next_token`, so
        // `take_while` consumes it here along with the rest of the word.
        let token_len = self.take_while(self.is_in_alphabet);
        let p = &self.src[start..start + token_len];
        if p == "true" || p == "false" {
            TokenKind::Literal(p == "true")
        }
        else {
            TokenKind::Identifier(p.into())
        }
    }

    /// Skips runs of spaces, tabs, carriage returns, and newlines.
    fn skip_whitespaces(&mut self) -> usize {
        self.take_while(|c| c == '\t' || c == ' ' || c == '\r' || c == '\n')
    }

    /// Advances while `pred` holds and returns how many bytes were consumed.
    fn take_while<F>(&mut self, pred: F) -> usize
    where F: Fn(char) -> bool {
        let from = self.pos;

        for c in self.src[self.pos..].chars() {
            if pred(c).not() {
                break;
            }
            self.pos += c.len_utf8();
        }

        self.pos - from
    }
}

impl<'a> Default for Lexer<'a> {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn no_whitespaces_should_return_zero() {
        let mut lexer = Lexer::new();
        lexer.src = "testing => this";
        let n = lexer.skip_whitespaces();
        assert_eq!(n, 0);
    }

    #[test]
    fn literal_booleans_work() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("false & true").unwrap();
        assert_eq!(
            tokens.iter().map(|t| &t.kind).collect::<Vec<&TokenKind>>(),
            vec![
                &TokenKind::Literal(false),
                &TokenKind::And,
                &TokenKind::Literal(true)
            ]
        );
    }
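
    // A small additional check (a sketch based on the `next_token` match
    // arms): `!` and `~` are interchangeable spellings of negation.
    #[test]
    fn negation_and_parens_tokenize() {
        let mut lexer = Lexer::new();
        for src in ["!(p)", "~(p)"] {
            let tokens = lexer.tokenize(src).unwrap();
            assert_eq!(
                tokens.iter().map(|t| &t.kind).collect::<Vec<&TokenKind>>(),
                vec![
                    &TokenKind::Not,
                    &TokenKind::OpenParen,
                    &TokenKind::Identifier("p".into()),
                    &TokenKind::CloseParen
                ]
            );
        }
    }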

    #[test]
    fn take_while_returns_zero_if_no_matches() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| false);
        assert_eq!(n, 0);
    }

    #[test]
    fn take_while_returns_correct_amount() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| true);
        assert_eq!(n, 7);
    }

    #[test]
    fn skip_whitespaces_works_properly() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("\t\r puppies").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Identifier("puppies".into()));
    }

    #[test]
    fn error_is_returned_when_alphabet_doesnt_match() {
        let mut lexer = Lexer::with_alphabets(
            |c| ['a', 'b', 'c', '1', '2', '3'].contains(&c),
            |c| ['a', 'b', 'c'].contains(&c)
        );
        match lexer.tokenize("abcf").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, 'f');
                assert_eq!(span, (3, 4).into());
            },
            _ => unreachable!()
        };

        match lexer.tokenize("123abc").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, '1');
                assert_eq!(span, (0, 1).into());
            },
            _ => unreachable!()
        };
    }
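
    // A sketch of the multi-character operators: both spellings of
    // implication and of the biconditional should map to the same kinds,
    // per the match arms in `next_token`.
    #[test]
    fn implication_operators_tokenize() {
        let mut lexer = Lexer::new();
        for src in ["p => q", "p -> q"] {
            let tokens = lexer.tokenize(src).unwrap();
            assert_eq!(tokens[1].kind, TokenKind::Implies);
        }
        for src in ["p <-> q", "p <=> q"] {
            let tokens = lexer.tokenize(src).unwrap();
            assert_eq!(tokens[1].kind, TokenKind::IfAndOnlyIf);
        }
    }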

    #[test]
    #[should_panic]
    fn propositions_cant_start_with_numbers() {
        // Sanity check: a plain identifier must tokenize; bail out early
        // (and fail, since nothing panics) if it doesn't.
        let mut lexer = Lexer::new();
        if lexer.tokenize("pqrs").is_err() { return }

        // Digits are not in the default start alphabet, so this panics.
        let mut lexer = Lexer::new();
        let _ = lexer.tokenize("69p").unwrap();
    }
}