use tenda_common::source::IdentifiedSource;

use crate::scanner_error::LexicalError;
use crate::source_iter::SourceIter;
use crate::token::{Literal, Token, TokenKind};
use std::char;

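/// Hand-written lexer for Tenda source text: walks the input character by
/// character through a [`SourceIter`] and produces the token stream.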
pub struct Scanner<'a> {
    source: SourceIter<'a>,
}

impl<'a> Scanner<'a> {
    pub fn new(source: &'a str, source_id: IdentifiedSource) -> Scanner<'a> {
        Scanner {
            source: SourceIter::new(source, source_id),
        }
    }

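    /// Scans the entire input, returning the full token stream (terminated by
    /// an end-of-file token) on success, or the lexical errors collected along
    /// the way on failure.
    ///
    /// A minimal usage sketch, assuming a `source_id` has already been obtained
    /// from `tenda_common` (constructing an `IdentifiedSource` is not shown
    /// here):
    ///
    /// ```ignore
    /// let mut scanner = Scanner::new("seja x = 1\n", source_id);
    /// let tokens = scanner.scan().expect("input should be lexically valid");
    /// ```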
    pub fn scan(&mut self) -> Result<Vec<Token>, Vec<LexicalError>> {
        let mut tokens: Vec<Token> = Vec::new();
        let mut errors = Vec::new();
        let mut had_error = false;

        while let Some(c) = self.source.next() {
            let token = self.consume_token(c, tokens.last());

            match token {
                Ok(Some(value)) => {
                    had_error = false;
                    tokens.push(value)
                }
                // Only report the first error of a run; follow-up errors are
                // usually noise caused by the same bad character sequence.
                Err(err) if !had_error => {
                    had_error = true;
                    errors.push(err);
                }
                _ => (),
            };
        }

        tokens.push(self.source.consume_eof());

        if errors.is_empty() {
            Ok(tokens)
        } else {
            Err(errors)
        }
    }

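    /// Dispatches on the first character of a lexeme and consumes the rest of
    /// it. Returns `Ok(None)` for input that produces no token: whitespace,
    /// comments, and newlines that would only repeat a previous newline.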
    fn consume_token(
        &mut self,
        char: char,
        previous_token: Option<&Token>,
    ) -> Result<Option<Token>, LexicalError> {
        match char {
            // Collapse runs of newlines: only emit a Newline token when the
            // previous token was not already a newline.
            '\n' => match previous_token {
                Some(token) if token.kind != TokenKind::Newline => {
                    self.source.consume_token(TokenKind::Newline, "\n").into()
                }
                _ => {
                    self.source.ignore_char();
                    Ok(None)
                }
            },
            c if c.is_whitespace() => {
                self.source.ignore_char();
                Ok(None)
            }
            '(' => self.source.consume_token(TokenKind::LeftParen, "(").into(),
            ')' => self.source.consume_token(TokenKind::RightParen, ")").into(),
            '[' => self
                .source
                .consume_token(TokenKind::LeftBracket, "[")
                .into(),
            ']' => self
                .source
                .consume_token(TokenKind::RightBracket, "]")
                .into(),
            '{' => self.source.consume_token(TokenKind::LeftBrace, "{").into(),
            '}' => self.source.consume_token(TokenKind::RightBrace, "}").into(),
            ':' => self.source.consume_token(TokenKind::Colon, ":").into(),
            '+' => self.source.consume_token(TokenKind::Plus, "+").into(),
            '-' => {
                if let Some('>') = self.source.peek() {
                    self.source.next();
                    self.source.consume_token(TokenKind::Arrow, "->").into()
                } else {
                    self.source.consume_token(TokenKind::Minus, "-").into()
                }
            }
            '*' => self.source.consume_token(TokenKind::Star, "*").into(),
            '^' => self.source.consume_token(TokenKind::Caret, "^").into(),
            '%' => self.source.consume_token(TokenKind::Percent, "%").into(),
            '=' => self.source.consume_token(TokenKind::EqualSign, "=").into(),
            '"' => self.consume_string(char).map(Some),
            ',' => self.source.consume_token(TokenKind::Comma, ",").into(),
            '.' => self.source.consume_token(TokenKind::Dot, ".").into(),
            '>' => match self.source.peek() {
                Some('=') => {
                    self.source.next();
                    self.source
                        .consume_token(TokenKind::GreaterOrEqual, ">=")
                        .into()
                }
                _ => self.source.consume_token(TokenKind::Greater, ">").into(),
            },
            '<' => match self.source.peek() {
                Some('=') => {
                    self.source.next();
                    self.source
                        .consume_token(TokenKind::LessOrEqual, "<=")
                        .into()
                }
                _ => self.source.consume_token(TokenKind::Less, "<").into(),
            },
            c if c.is_ascii_digit() => self.consume_number(c).map(Some),
            c if c.is_alphabetic() || c == '_' => self.consume_identifier(c).map(Some),
            '/' => match self.source.peek() {
                Some('/') => {
                    self.consume_comment();
                    Ok(None)
                }
                Some('*') => {
                    self.consume_multiline_comment();
                    Ok(None)
                }
                _ => self.source.consume_token(TokenKind::Slash, "/").into(),
            },
            _ => Err(LexicalError::UnexpectedChar {
                character: char,
                span: self.source.consume_span(),
            }),
        }
    }

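    /// Consumes a double-quoted string literal. The opening quote has already
    /// been consumed by the caller; escape sequences are resolved into their
    /// actual characters here.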
    fn consume_string(&mut self, first_quote: char) -> Result<Token, LexicalError> {
        let mut buf = String::new();
        let mut closed = false;

        buf.push(first_quote);

        while let Some(&ch) = self.source.peek() {
            match ch {
                '"' => {
                    self.source.next();
                    closed = true;
                    break;
                }
                '\n' => {
                    return Err(LexicalError::UnexpectedStringEol {
                        span: self.source.consume_span(),
                    });
                }
                '\\' => {
                    self.source.next();

                    let esc = self
                        .source
                        .next()
                        .ok_or(LexicalError::UnexpectedStringEol {
                            span: self.source.consume_span(),
                        })?;
                    let resolved = match esc {
                        '0' => Some('\0'),
                        'a' => Some('\x07'),
                        'b' => Some('\x08'),
                        'e' => Some('\x1B'),
                        'f' => Some('\x0C'),
                        'n' => Some('\n'),
                        'r' => Some('\r'),
                        't' => Some('\t'),
                        'v' => Some('\x0B'),
                        '\\' => Some('\\'),
                        '\'' => Some('\''),
                        '"' => Some('"'),
                        // \xNN: exactly two hex digits, always a valid byte.
                        'x' => {
                            let hi = self.read_hex_digit()?;
                            let lo = self.read_hex_digit()?;
                            Some(char::from(
                                u8::from_str_radix(&format!("{hi}{lo}"), 16).unwrap(),
                            ))
                        }
                        // \uNNNN and \UNNNNNNNN: Unicode code points; `None`
                        // here means the value is not a valid scalar value.
                        'u' => {
                            let code = self.read_n_hex(4)?;
                            char::from_u32(code)
                        }
                        'U' => {
                            let code = self.read_n_hex(8)?;
                            char::from_u32(code)
                        }
                        // \NNN: three octal digits; values above 0o377 do not
                        // fit in a byte and are rejected.
                        d @ '1'..='7' => {
                            let d2 = self.read_octal_digit()?;
                            let d3 = self.read_octal_digit()?;
                            let val = u8::from_str_radix(&format!("{d}{d2}{d3}"), 8).map_err(
                                |_| LexicalError::InvalidOctalEscape {
                                    span: self.source.consume_span(),
                                },
                            )?;
                            Some(char::from(val))
                        }
                        _ => {
                            return Err(LexicalError::UnknownEscape {
                                span: self.source.consume_span(),
                                found: esc,
                            })
                        }
                    };

                    if let Some(c) = resolved {
                        buf.push(c);
                    } else {
                        return Err(LexicalError::InvalidUnicodeEscape {
                            span: self.source.consume_span(),
                        });
                    }
                }
                _ => {
                    buf.push(ch);
                    self.source.next();
                }
            }
        }

        if !closed {
            return Err(LexicalError::UnexpectedStringEol {
                span: self.source.consume_span(),
            });
        }

        // Drop the opening quote that was pushed into the buffer above.
        let literal = buf[1..].to_owned();

        Ok(self.source.consume_token_with_literal(
            TokenKind::String,
            literal.clone(),
            Literal::String(literal),
        ))
    }

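    /// Consumes a numeric literal: a `0b`/`0o`/`0x` prefixed integer or a
    /// decimal number with optional fraction, exponent, and `_` separators.
    /// Every value is stored as an `f64`.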
    fn consume_number(&mut self, first: char) -> Result<Token, LexicalError> {
        let mut raw = String::new();
        raw.push(first);

        // Radix-prefixed integers: 0b/0B, 0o/0O, 0x/0X.
        if first == '0' {
            if let Some(&next) = self.source.peek() {
                match next {
                    'b' | 'B' | 'o' | 'O' | 'x' | 'X' => {
                        self.source.next();
                        raw.push(next);

                        let (radix, valid_digit): (u32, fn(char) -> bool) = match next {
                            'b' | 'B' => (2, |c: char| c == '0' || c == '1'),
                            'o' | 'O' => (8, |c: char| ('0'..='7').contains(&c)),
                            'x' | 'X' => (16, |c: char| c.is_ascii_hexdigit()),
                            _ => unreachable!(),
                        };

                        let mut digits = String::new();

                        while let Some(&ch) = self.source.peek() {
                            // `_` digit separators are skipped entirely here.
                            if ch == '_' {
                                self.source.next();
                                continue;
                            }
                            if valid_digit(ch) {
                                digits.push(ch);
                                raw.push(ch);
                                self.source.next();
                            } else {
                                break;
                            }
                        }

                        // A bare prefix like `0x` with no digits is an error.
                        if digits.is_empty() {
                            return Err(LexicalError::UnexpectedChar {
                                character: next,
                                span: self.source.consume_span(),
                            });
                        }

                        let value = u64::from_str_radix(&digits, radix).unwrap() as f64;

                        return Ok(self.source.consume_token_with_literal(
                            TokenKind::Number,
                            raw,
                            Literal::Number(value),
                        ));
                    }
                    _ => (),
                }
            }
        }

        // Decimal literals: integer part, optional fraction, optional exponent.
        let mut matched_dot = first == '.';
        let mut matched_exp = false;

        while let Some(&ch) = self.source.peek() {
            match ch {
                '_' => {
                    raw.push(ch);
                    self.source.next();
                }
                d if d.is_ascii_digit() => {
                    raw.push(d);
                    self.source.next();
                }
                '.' if !matched_dot && !matched_exp => {
                    matched_dot = true;
                    raw.push('.');
                    self.source.next();
                }
                'e' | 'E' if !matched_exp => {
                    matched_exp = true;
                    raw.push(ch);
                    self.source.next();

                    if let Some(&sign @ ('+' | '-')) = self.source.peek() {
                        raw.push(sign);
                        self.source.next();
                    }

                    // The exponent marker must be followed by at least one
                    // digit so that the final `parse` below cannot fail.
                    match self.source.peek() {
                        Some(d) if d.is_ascii_digit() => (),
                        _ => {
                            return Err(LexicalError::UnexpectedChar {
                                character: ch,
                                span: self.source.consume_span(),
                            });
                        }
                    }
                }
                c if c.is_alphabetic() => {
                    return Err(LexicalError::UnexpectedChar {
                        character: c,
                        span: self.source.consume_span(),
                    });
                }

                _ => break,
            }
        }

        // `_` separators are kept in the lexeme but stripped before parsing.
        let cleaned: String = raw.chars().filter(|c| *c != '_').collect();
        let value: f64 = cleaned.parse().unwrap();

        Ok(self
            .source
            .consume_token_with_literal(TokenKind::Number, raw, Literal::Number(value)))
    }

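    /// Consumes an identifier and maps keywords and the boolean/nil literals
    /// to their dedicated token kinds; anything else becomes an `Identifier`.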
    fn consume_identifier(&mut self, char: char) -> Result<Token, LexicalError> {
        let mut identifier = String::new();

        identifier.push(char);

        while let Some(&peeked) = self.source.peek() {
            if peeked.is_alphanumeric() || peeked == '_' {
                identifier.push(peeked);
                self.source.next();
            } else {
                break;
            }
        }

        let token = match identifier.as_str() {
            Literal::TRUE_LITERAL => self.source.consume_token_with_literal(
                TokenKind::True,
                Literal::TRUE_LITERAL.to_string(),
                Literal::Boolean(true),
            ),
            Literal::FALSE_LITERAL => self.source.consume_token_with_literal(
                TokenKind::False,
                Literal::FALSE_LITERAL.to_string(),
                Literal::Boolean(false),
            ),
            Literal::NIL_LITERAL => self.source.consume_token_with_literal(
                TokenKind::Nil,
                Literal::NIL_LITERAL.to_string(),
                Literal::Nil,
            ),
            "função" => self.source.consume_token(TokenKind::Function, "função"),
            "não" => self.source.consume_token(TokenKind::Not, "não"),
            "é" => self.source.consume_token(TokenKind::Equals, "é"),
            "seja" => self.source.consume_token(TokenKind::Let, "seja"),
            "se" => self.source.consume_token(TokenKind::If, "se"),
            "então" => self.source.consume_token(TokenKind::Then, "então"),
            "retorna" => self.source.consume_token(TokenKind::Return, "retorna"),
            "senão" => self.source.consume_token(TokenKind::Else, "senão"),
            "fim" => self.source.consume_token(TokenKind::BlockEnd, "fim"),
            "ou" => self.source.consume_token(TokenKind::Or, "ou"),
            "e" => self.source.consume_token(TokenKind::And, "e"),
            "até" => self.source.consume_token(TokenKind::Until, "até"),
            "para" => self.source.consume_token(TokenKind::ForOrBreak, "para"),
            "cada" => self.source.consume_token(TokenKind::Each, "cada"),
            "em" => self.source.consume_token(TokenKind::In, "em"),
            "tem" => self.source.consume_token(TokenKind::Has, "tem"),
            "enquanto" => self.source.consume_token(TokenKind::While, "enquanto"),
            "faça" => self.source.consume_token(TokenKind::Do, "faça"),
            "continua" => self.source.consume_token(TokenKind::Continue, "continua"),
            identifier => self.source.consume_token_with_literal(
                TokenKind::Identifier,
                identifier.to_string(),
                Literal::String(identifier.to_string()),
            ),
        };

        Ok(token)
    }

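    /// Skips a `//` line comment, leaving the terminating newline in the input
    /// so it can still be tokenized.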
    fn consume_comment(&mut self) {
        while let Some(&peeked) = self.source.peek() {
            if peeked == '\n' {
                break;
            }

            self.source.next();
        }

        self.source.ignore_char();
    }

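    /// Skips a `/* ... */` block comment; an unterminated comment simply runs
    /// to the end of the input.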
    fn consume_multiline_comment(&mut self) {
        while self.source.next().is_some() {
            if self.peek_match("*/") {
                break;
            }
        }

        self.source.ignore_char();
    }
}

impl Scanner<'_> {
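    /// Returns `true` when the upcoming characters match `expected` in order,
    /// consuming each character that matches along the way.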
    fn peek_match(&mut self, expected: &str) -> bool {
        for c in expected.chars() {
            match self.source.peek() {
                // Consume the character only when it matches; running out of
                // input or hitting a different character is a failed match.
                Some(&peeked) if peeked == c => {
                    self.source.next();
                }
                _ => return false,
            }
        }

        true
    }

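    /// Reads a single hexadecimal digit, failing with `InvalidHexEscape` when
    /// the next character is missing or not a hex digit.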
    fn read_hex_digit(&mut self) -> Result<char, LexicalError> {
        self.source
            .next()
            .filter(|c| c.is_ascii_hexdigit())
            .ok_or(LexicalError::InvalidHexEscape {
                span: self.source.consume_span(),
            })
    }

    fn read_n_hex(&mut self, n: usize) -> Result<u32, LexicalError> {
        let mut s = String::new();
        for _ in 0..n {
            s.push(self.read_hex_digit()?);
        }
        u32::from_str_radix(&s, 16).map_err(|_| LexicalError::InvalidHexEscape {
            span: self.source.consume_span(),
        })
    }

    fn read_octal_digit(&mut self) -> Result<char, LexicalError> {
        self.source
            .next()
            .filter(|c| ('0'..='7').contains(c))
            .ok_or(LexicalError::InvalidOctalEscape {
                span: self.source.consume_span(),
            })
    }
}