1use std::{
4 fmt,
5 num::{ParseFloatError, ParseIntError},
6 str::Chars,
7};
8
9use thiserror::Error;
10use unicode_xid;
11
12use crate::utils::Location;
13
14use super::token::{
15 LiteralKind::*,
16 Token,
17 TokenKind::{self, *},
18};
19
/// Character-level reader over the lexer input that also tracks the current
/// source position.
struct Cursor<'a> {
    /// Byte length of the whole input (`input.len()`); `location` subtracts the
    /// length of the unread remainder from it to obtain the current byte offset.
    initial_len: usize,
    /// Iterator over the characters that have not been consumed yet.
    chars: Chars<'a>,
    /// Current line number; starts at 1 and grows by one per consumed `'\n'`.
    lineno: u32,
    /// Current column; starts at 1, grows by one per consumed character and is
    /// reset to 1 right after a `'\n'` has been consumed.
    column: u32,
    /// Most recently consumed character. Only present in debug builds, where it
    /// backs the `debug_assert!` checks of the scanning routines.
    #[cfg(debug_assertions)]
    prev: char,
}
33
/// Sentinel character handed out when a read goes past the end of the input.
/// A literal NUL in the source yields the same value, which is why `is_eof`
/// inspects the remaining input instead of comparing against this constant.
const EOF_CHAR: char = '\0';
35
36impl<'a> Cursor<'a> {
37 fn new(input: &'a str) -> Cursor<'a> {
38 Cursor {
39 initial_len: input.len(),
40 chars: input.chars(),
41 lineno: 1,
42 column: 1,
43 #[cfg(debug_assertions)]
44 prev: EOF_CHAR,
45 }
46 }
47
48 fn prev(&self) -> char {
51 #[cfg(debug_assertions)]
52 {
53 self.prev
54 }
55
56 #[cfg(not(debug_assertions))]
57 {
58 EOF_CHAR
59 }
60 }
61
62 fn first(&self) -> char {
67 self.chars.clone().next().unwrap_or(EOF_CHAR)
69 }
70
71 fn is_eof(&self) -> bool {
73 self.chars.as_str().is_empty()
74 }
75
76 fn bump(&mut self) -> Option<char> {
78 let c = self.chars.next()?;
79
80 if c == '\n' {
81 self.lineno += 1;
82 self.column = 0;
83 }
84 self.column += 1;
85
86 #[cfg(debug_assertions)]
87 {
88 self.prev = c;
89 }
90
91 Some(c)
92 }
93
94 fn location(&self) -> Location {
96 Location {
97 lineno: self.lineno,
98 column: self.column,
99 offset: (self.initial_len - self.chars.as_str().len()) as u32,
100 }
101 }
102
103 fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
105 while predicate(self.first()) && !self.is_eof() {
106 self.bump();
107 }
108 }
109}
110
/// Radix of a numeric literal, as selected by `Cursor::number` from the
/// literal's prefix.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Base {
    /// Literal introduced by `0b`.
    Binary,
    /// Literal introduced by `0o`.
    Octal,
    /// Literal introduced by `0x`.
    Hexadecimal,
    /// Literal without a radix prefix.
    Decimal,
}
123
124pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
126 let mut cursor = Cursor::new(input);
127 std::iter::from_fn(move || loop {
128 if cursor.is_eof() {
129 break None;
130 } else {
131 let t = cursor.advance_token();
132 match t.kind {
133 LineComment | BlockComment | Whitespace => (),
134 _ => break Some(t),
135 }
136 }
137 })
138}
139
/// Returns `true` when `c` is treated as insignificant whitespace by the lexer.
///
/// The line feed (`'\n'`) is deliberately absent from the table: the lexer
/// turns it into its own `EOL` token instead of skipping it.
pub fn is_whitespace(c: char) -> bool {
    const WHITESPACE: [char; 10] = [
        '\u{0009}', // horizontal tab
        '\u{000B}', // vertical tab
        '\u{000C}', // form feed
        '\u{000D}', // carriage return
        '\u{0020}', // space
        '\u{0085}', // next line
        '\u{200E}', // left-to-right mark
        '\u{200F}', // right-to-left mark
        '\u{2028}', // line separator
        '\u{2029}', // paragraph separator
    ];
    WHITESPACE.contains(&c)
}
168
169pub fn is_id_start(c: char) -> bool {
171 c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
173}
174
175pub fn is_id_continue(c: char) -> bool {
177 unicode_xid::UnicodeXID::is_xid_continue(c)
178}
179
180pub fn is_ident(string: &str) -> bool {
182 let mut chars = string.chars();
183 if let Some(start) = chars.next() {
184 is_id_start(start) && chars.all(is_id_continue)
185 } else {
186 false
187 }
188}
189
190impl Cursor<'_> {
191 pub fn advance_token(&mut self) -> Token {
192 let start = self.location();
193 let first_char = self.bump().unwrap_or(EOF_CHAR);
194 let token_kind = match first_char {
195 '/' => match self.first() {
197 '/' => self.line_comment(),
198 '*' => self.block_comment(),
199 '=' => {
200 self.bump();
201 DivAssign
202 }
203 _ => Div,
204 },
205
206 c if is_whitespace(c) => {
208 self.eat_while(is_whitespace);
209 Whitespace
210 }
211
212 'r' => match self.first() {
214 c @ ('"' | '\'') => self.string(c, true),
215 _ => self.ident_or_reserved_word('r'),
216 },
217
218 c if is_id_start(c) => self.ident_or_reserved_word(c),
221
222 c @ '0'..='9' => self.number(c),
224
225 c @ ('"' | '\'') => self.string(c, false),
227
228 ':' if self.first() == ':' => {
230 self.bump();
231 DoubleColon
232 }
233 '=' if self.first() == '=' => {
234 self.bump();
235 Eq
236 }
237 '!' if self.first() == '=' => {
238 self.bump();
239 NotEq
240 }
241 '<' if self.first() == '=' => {
242 self.bump();
243 LtEq
244 }
245 '>' if self.first() == '=' => {
246 self.bump();
247 GtEq
248 }
249 '+' if self.first() == '=' => {
250 self.bump();
251 AddAssign
252 }
253 '-' if self.first() == '=' => {
254 self.bump();
255 SubAssign
256 }
257 '*' if self.first() == '=' => {
258 self.bump();
259 MulAssign
260 }
261 '%' if self.first() == '=' => {
262 self.bump();
263 ModAssign
264 }
265
266 '\n' => self.eol(),
268 '\\' if self.first() == '\n' => {
269 self.bump();
270 Whitespace
271 }
272 ',' => Comma,
273 '.' => Dot,
274 '(' => OpenParen,
275 ')' => CloseParen,
276 '{' => OpenBrace,
277 '}' => CloseBrace,
278 '[' => OpenBracket,
279 ']' => CloseBracket,
280 '#' => Pound,
281 '?' => Question,
282 ':' => Colon,
283 '=' => Assign,
284 '<' => Lt,
285 '>' => Gt,
286 '|' => VBar,
287 '+' => Add,
288 '-' => Sub,
289 '*' => Mul,
290 '%' => Mod,
291 c => Unknown(c),
292 };
293 Token::new(token_kind, start, self.location())
294 }
295
296 fn eol(&mut self) -> TokenKind {
297 debug_assert!(self.prev() == '\n');
298 self.eat_while(|c| c == '\n');
299 EOL
300 }
301
302 fn line_comment(&mut self) -> TokenKind {
303 debug_assert!(self.prev() == '/' && self.first() == '/');
304 self.bump();
305 self.eat_while(|c| c != '\n');
306 LineComment
307 }
308
309 fn block_comment(&mut self) -> TokenKind {
310 debug_assert!(self.prev() == '/' && self.first() == '*');
311 self.bump();
312 let mut depth = 1usize;
313 while let Some(c) = self.bump() {
314 match c {
315 '/' if self.first() == '*' => {
316 self.bump();
317 depth += 1;
318 }
319 '*' if self.first() == '/' => {
320 self.bump();
321 depth -= 1;
322 if depth == 0 {
323 break;
327 }
328 }
329 _ => (),
330 }
331 }
332 BlockComment
333 }
334
335 fn ident_or_reserved_word(&mut self, first_char: char) -> TokenKind {
336 debug_assert!(is_id_start(self.prev()));
337 let mut value = String::from(first_char);
338 loop {
339 let c = self.first();
340 if is_id_continue(c) {
341 value.push(c);
342 } else {
343 break;
344 }
345 self.bump();
346 }
347
348 match value.as_str() {
349 "if" => If,
350 "else" => Else,
351 "loop" => Loop,
352 "while" => While,
353 "for" => For,
354 "in" => In,
355 "break" => Break,
356 "continue" => Continue,
357 "throw" => Throw,
358 "return" => Return,
359 "global" => Global,
360 "import" => Import,
361 "as" => As,
362 "is" => Is,
363 "not" => Not,
364 "and" => And,
365 "or" => Or,
366 "try" => Try,
367 "fn" => Fn,
368 "do" => Do,
369 "null" => Null,
370 "true" => True,
371 "false" => False,
372 _ => Ident(value),
373 }
374 }
375
376 fn number(&mut self, first_digit: char) -> TokenKind {
377 debug_assert!('0' <= self.prev() && self.prev() <= '9');
378 let mut base = Base::Decimal;
379 let mut value = String::new();
380 let mut has_point = false;
381 let mut has_exponent = false;
382 if first_digit == '0' {
383 match self.first() {
385 'b' => {
386 base = Base::Binary;
387 self.bump();
388 }
389 'o' => {
390 base = Base::Octal;
391 self.bump();
392 }
393 'x' => {
394 base = Base::Hexadecimal;
395 self.bump();
396 }
397 '0'..='9' | '_' | '.' | 'e' | 'E' => {
399 base = Base::Decimal;
400 value.push('0');
401 }
402 _ => return Literal(Int(Ok(0))),
404 };
405 } else {
406 value.push(first_digit);
407 }
408 loop {
409 let t = self.first();
410 match t {
411 '_' => {
412 self.bump();
413 continue;
414 }
415 '.' if base == Base::Decimal => {
416 if has_point {
417 return Literal(Float(Err(LexerError::NumberFormatError)));
418 }
419 has_point = true;
420 }
421 'e' | 'E' if base == Base::Decimal => {
422 if has_exponent {
423 return Literal(Float(Err(LexerError::NumberFormatError)));
424 }
425 has_exponent = true;
426 }
427 '0'..='1' if base == Base::Binary => {}
428 '0'..='7' if base == Base::Octal => {}
429 '0'..='9' if base == Base::Decimal => {}
430 '0'..='9' | 'a'..='f' | 'A'..='F' if base == Base::Hexadecimal => {}
431 _ => break,
432 }
433 value.push(t);
434 self.bump();
435 }
436
437 if has_point || has_exponent {
438 if base != Base::Decimal {
440 Literal(Float(Err(LexerError::NumberFormatError)))
441 } else {
442 match value.parse::<f64>() {
443 Ok(v) => Literal(Float(Ok(v))),
444 Err(e) => Literal(Float(Err(LexerError::ParseFloatError(e)))),
445 }
446 }
447 } else {
448 match i64::from_str_radix(
449 &value,
450 match base {
451 Base::Binary => 2,
452 Base::Octal => 8,
453 Base::Hexadecimal => 16,
454 Base::Decimal => 10,
455 },
456 ) {
457 Ok(v) => Literal(Int(Ok(v))),
458 Err(e) => Literal(Int(Err(LexerError::ParseIntError(e)))),
459 }
460 }
461 }
462
463 fn string(&mut self, quoted: char, is_raw: bool) -> TokenKind {
464 if is_raw {
465 debug_assert!(self.prev() == 'r');
466 self.bump();
467 }
468 debug_assert!(self.prev() == '"' || self.prev() == '\'');
469 let mut value = String::new();
470 loop {
471 if let Some(c) = self.bump() {
472 let t = match c {
473 _ if c == quoted => break,
474 '\\' if !is_raw => match self.first() {
475 '\n' => {
476 self.bump();
477 continue;
478 }
479 _ => self.scan_escape(),
480 },
481 '\r' => Err(EscapeError::BareCarriageReturn),
482 _ => Ok(c),
483 };
484 match t {
485 Ok(c) => value.push(c),
486 Err(e) => return Literal(Str(Err(LexerError::EscapeError(e)))),
487 }
488 } else {
489 return Literal(Str(Err(LexerError::UnterminatedStringError)));
490 }
491 }
492 Literal(Str(Ok(value)))
493 }
494
495 fn scan_escape(&mut self) -> std::result::Result<char, EscapeError> {
496 debug_assert!(self.prev() == '\\');
497 let res = match self.bump().unwrap_or(EOF_CHAR) {
499 '"' => '"',
500 'n' => '\n',
501 'r' => '\r',
502 't' => '\t',
503 '\\' => '\\',
504 '\'' => '\'',
505 '0' => '\0',
506
507 'x' => {
508 let hi = self.bump().ok_or(EscapeError::TooShortHexEscape)?;
511 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
512
513 let lo = self.bump().ok_or(EscapeError::TooShortHexEscape)?;
514 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
515
516 let value = hi * 16 + lo;
517
518 if value > 0x7F {
520 return Err(EscapeError::OutOfRangeHexEscape);
521 }
522 let value = value as u8;
523
524 value as char
525 }
526
527 'u' => {
528 if self.bump() != Some('{') {
531 return Err(EscapeError::NoBraceInUnicodeEscape);
532 }
533
534 let mut n_digits = 1;
536 let mut value: u32 = match self.bump().ok_or(EscapeError::UnclosedUnicodeEscape)? {
537 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
538 '}' => return Err(EscapeError::EmptyUnicodeEscape),
539 c => c
540 .to_digit(16)
541 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
542 };
543
544 loop {
547 match self.bump() {
548 None => return Err(EscapeError::UnclosedUnicodeEscape),
549 Some('_') => continue,
550 Some('}') => {
551 if n_digits > 6 {
552 return Err(EscapeError::OverlongUnicodeEscape);
553 }
554
555 break std::char::from_u32(value).ok_or({
556 if value > 0x10FFFF {
557 EscapeError::OutOfRangeUnicodeEscape
558 } else {
559 EscapeError::LoneSurrogateUnicodeEscape
560 }
561 })?;
562 }
563 Some(c) => {
564 let digit: u32 = c
565 .to_digit(16)
566 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
567 n_digits += 1;
568 if n_digits > 6 {
569 continue;
571 }
572 value = value * 16 + digit;
573 }
574 };
575 }
576 }
577 _ => return Err(EscapeError::InvalidEscape),
578 };
579 Ok(res)
580 }
581}
582
/// Errors carried inside the literal tokens produced by the lexer.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum LexerError {
    /// The digits of an integer literal were rejected by `i64::from_str_radix`.
    #[error("parse int error ({0})")]
    ParseIntError(#[from] ParseIntError),
    /// The text of a float literal was rejected by `str::parse::<f64>`.
    #[error("parse float error ({0})")]
    ParseFloatError(#[from] ParseFloatError),
    /// A numeric literal is structurally malformed, e.g. it contains a second
    /// decimal point or a second exponent marker.
    #[error("number format error")]
    NumberFormatError,
    /// The input ended before the closing quote of a string literal.
    #[error("unterminated string error")]
    UnterminatedStringError,
    /// A string literal contains an invalid escape sequence or a bare
    /// carriage return.
    #[error("escape error ({0})")]
    EscapeError(#[from] EscapeError),
}
597
/// Reasons why the contents of a string literal could not be decoded.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum EscapeError {
    /// The character after the backslash does not start a known escape.
    InvalidEscape,
    /// A raw carriage return (`'\r'`) appeared inside a string literal.
    BareCarriageReturn,

    /// The input ended before both digits of a `\x` escape were read.
    TooShortHexEscape,
    /// A `\x` escape contains a character that is not a hex digit.
    InvalidCharInHexEscape,
    /// The value of a `\x` escape is above `0x7F`.
    OutOfRangeHexEscape,

    /// `\u` was not followed by `{`.
    NoBraceInUnicodeEscape,
    /// A `\u{...}` escape contains a character that is not a hex digit.
    InvalidCharInUnicodeEscape,
    /// The escape is `\u{}`, without any digit.
    EmptyUnicodeEscape,
    /// The input ended before the closing `}` of a `\u{...}` escape.
    UnclosedUnicodeEscape,
    /// The first character after `\u{` is `_`.
    LeadingUnderscoreUnicodeEscape,
    /// A `\u{...}` escape has more than six hex digits.
    OverlongUnicodeEscape,
    /// The value of a `\u{...}` escape is not above `0x10FFFF` but is not a
    /// valid `char` either (i.e. it is a surrogate code point).
    LoneSurrogateUnicodeEscape,
    /// The value of a `\u{...}` escape is above `0x10FFFF`.
    OutOfRangeUnicodeEscape,
}
630
631impl fmt::Display for EscapeError {
632 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
633 fmt::Debug::fmt(self, f)
634 }
635}