// logic_parser/lexing/lexer.rs
use std::ops::Not;
2use crate::errors::LexerError;
3use super::token::{TokenKind, Token};
4
5pub type Result<T> = std::result::Result<T, LexerError>;
6
/// A configurable tokenizer for propositional-logic source text.
///
/// The two function pointers define which characters may start and
/// continue an identifier; `src`/`pos` hold the tokenization cursor
/// state between calls to the private `next_token` helper.
pub struct Lexer<'a> {
    // Predicate for characters allowed *inside* an identifier.
    is_in_alphabet: fn(char) -> bool,
    // Predicate for characters allowed to *start* an identifier.
    is_in_start_chars_alphabet: fn(char) -> bool,
    // Source text currently being tokenized; set by `tokenize`.
    src: &'a str,
    // Byte offset (not char index) of the next unread character in `src`.
    pos: usize
}
13
/// Default identifier-continuation alphabet: alphanumerics plus `_`.
pub const DEFAULT_ALPHABET: fn(char) -> bool = |c| { char::is_alphanumeric(c) || c == '_' };
/// Default identifier-start alphabet: alphabetic chars plus `_` (no digits).
pub const DEFAULT_START_ALPHABET: fn(char) -> bool = |c| { char::is_alphabetic(c) || c == '_' };
16
17impl<'a> Lexer<'a> {
18 pub fn new() -> Self {
31 Lexer {
32 is_in_alphabet: DEFAULT_ALPHABET,
33 is_in_start_chars_alphabet: DEFAULT_START_ALPHABET,
34 src: "",
35 pos: 0
36 }
37 }
38
39 pub fn with_alphabets(alphabet: fn(char) -> bool, start_chars_alphabet: fn(char) -> bool) -> Self {
65 Lexer {
66 is_in_alphabet: alphabet,
67 is_in_start_chars_alphabet: start_chars_alphabet,
68 src: "",
69 pos: 0
70 }
71 }
72
73 pub fn with_alphabet(alphabet: fn(char) -> bool) -> Self {
86 Self::with_alphabets(alphabet, alphabet)
87 }
88
89 pub fn tokenize(&mut self, src: &'a str) -> Result<Vec<Token>> {
90 let mut tokens = Vec::new();
91 self.src = src;
92 self.pos = 0;
93
94 while let Some(token) = self.next_token()? {
95 tokens.push(token);
96 }
97 Ok(tokens)
98 }
99
100 fn next_token(&mut self) -> Result<Option<Token>> {
101 self.skip_whitespaces();
102 let start = self.pos;
103
104 let c = match self.peek_char() {
105 Some(c) => c,
106 None => return Ok(None)
107 };
108
109 let kind = match c {
110 '~' | '!' => { self.consume(); TokenKind::Not },
111 '(' => { self.consume(); TokenKind::OpenParen },
112 ')' => { self.consume(); TokenKind::CloseParen },
113 '&' => {
114 match_any_or_syntax_error!(self, ["&&", "&"], TokenKind::And)
115 },
116 '|' => {
117 match_any_or_syntax_error!(self, ["||", "|"], TokenKind::Or)
118 },
119 '=' | '-' => {
120 match_any_or_syntax_error!(self, ["=>", "->"], TokenKind::Implies)
121 },
122 '<' => {
123 match_any_or_syntax_error!(self, ["<->", "<=>"], TokenKind::IfAndOnlyIf)
124 },
125 c if (self.is_in_start_chars_alphabet)(c) => {
126 self.next_word()
127 },
128 _ => {
129 return Err(
130 LexerError::UnknownToken(c, (start, start+1).into())
131 )
132 }
133 };
134
135 Ok(Some(Token::new(kind, (start, self.pos))))
136 }
137
138 fn consume(&mut self) -> Option<char> {
139 let next = self.src[self.pos..].chars().next();
140 if let Some(c) = next {
141 self.pos += c.len_utf8();
142 }
143 next
144 }
145
146 fn skip(&mut self, n: usize) {
147 for _ in 0..n {
148 self.consume();
149 }
150 }
151
152 fn next_matches(&self, to_match: &str) -> bool {
153 let mut chars = self.src[self.pos..].chars();
154 for c in to_match.chars() {
155 if chars.next() != Some(c) {
156 return false;
157 }
158 }
159 true
160 }
161
162 fn peek_char(&self) -> Option<char> {
163 self.src[self.pos..].chars().next()
164 }
165
166 fn next_word(&mut self) -> TokenKind {
169 let start = self.pos;
170 let token_len = self.take_while(self.is_in_alphabet);
172 let p = &self.src[start..start + token_len];
173 if p == "true" || p == "false" {
174 TokenKind::Literal(p == "true")
175 }
176 else {
177 TokenKind::Identifier(p.into())
178 }
179 }
180
181 fn skip_whitespaces(&mut self) -> usize {
182 self.take_while(|c| c == '\t' || c == ' ' || c == '\r')
183 }
184
185 fn take_while<F>(&mut self, pred: F) -> usize
186 where F: Fn(char) -> bool {
187 let from = self.pos;
188
189 for c in self.src[self.pos..].chars() {
190 if pred(c).not() {
191 break;
192 }
193 self.pos += c.len_utf8();
194 }
195
196 self.pos - from
197 }
198}
199
200impl<'a> Default for Lexer<'a> {
201 fn default() -> Self {
202 Self::new()
203 }
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn no_whitespaces_should_return_zero() {
        let mut lexer = Lexer::new();
        lexer.src = "testing => this";
        // Source starts with a non-whitespace char, so nothing is skipped.
        // (Idiomatic method-call syntax instead of the UFCS form
        // `Lexer::skip_whitespaces(&mut lexer)`.)
        let n = lexer.skip_whitespaces();
        assert_eq!(n, 0);
    }

    #[test]
    fn literal_booleans_work() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("false & true").unwrap();
        // `true`/`false` must lex as boolean literals, not identifiers.
        assert_eq!(
            tokens.iter().map(|t| &t.kind).collect::<Vec<&TokenKind>>(),
            vec![
                &TokenKind::Literal(false),
                &TokenKind::And,
                &TokenKind::Literal(true)
            ]
        );
    }

    #[test]
    fn take_while_returns_zero_if_no_matches() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| false);
        assert_eq!(n, 0);
    }

    #[test]
    fn take_while_returns_correct_amount() {
        let mut lexer = Lexer::new();
        lexer.src = "kittens";
        let n = lexer.take_while(|_| true);
        assert_eq!(n, 7);
    }

    #[test]
    fn skip_whitespaces_works_properly() {
        let mut lexer = Lexer::new();
        let tokens = lexer.tokenize("\t\r puppies").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Identifier("puppies".into()));
    }

    #[test]
    fn error_is_returned_when_alphabet_doesnt_match() {
        let mut lexer = Lexer::with_alphabets(
            |c| ['a', 'b', 'c', '1', '2', '3'].contains(&c),
            |c| ['a', 'b', 'c'].contains(&c)
        );
        // 'f' is outside the continuation alphabet.
        match lexer.tokenize("abcf").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, 'f');
                assert_eq!(span, (3, 4).into());
            },
            _ => unreachable!()
        };

        // '1' is in the continuation alphabet but not the start alphabet.
        match lexer.tokenize("123abc").unwrap_err() {
            LexerError::UnknownToken(token, span) => {
                assert_eq!(token, '1');
                assert_eq!(span, (0, 1).into());
            },
            _ => unreachable!()
        };
    }

    #[test]
    #[should_panic]
    fn propositions_cant_start_with_numbers() {
        // Sanity guard: a purely alphabetic proposition must tokenize.
        // (`is_err()` replaces the non-idiomatic `!….is_ok()`; clippy's
        // `nonminimal_bool` flags the negated form.)
        let mut lexer = Lexer::new();
        if lexer.tokenize("pqrs").is_err() { return }

        // A word beginning with a digit must fail to tokenize, so this
        // `unwrap` is the panic that `#[should_panic]` expects.
        let mut lexer = Lexer::new();
        let _ = lexer.tokenize("69p").unwrap();
    }
}
288}