1use std::{
2 char, fmt, i32, i64,
3 io::{self, Read},
4 str,
5};
6
7use gc_arena::Collect;
8use thiserror::Error;
9
10use super::StringInterner;
11
/// A single lexical token produced by [`Lexer::read_token`].
///
/// `S` is the string type carried by [`Token::Name`] and [`Token::String`]; the lexer
/// instantiates it with the string type of its interner.
#[derive(Clone)]
pub enum Token<S> {
    // Reserved words (recognized by `get_reserved_word_token`).
    /// `break`
    Break,
    /// `do`
    Do,
    /// `else`
    Else,
    /// `elseif`
    ElseIf,
    /// `end`
    End,
    /// `function`
    Function,
    /// `goto`
    Goto,
    /// `if`
    If,
    /// `in`
    In,
    /// `local`
    Local,
    /// `nil`
    Nil,
    /// `for`
    For,
    /// `while`
    While,
    /// `repeat`
    Repeat,
    /// `until`
    Until,
    /// `return`
    Return,
    /// `then`
    Then,
    /// `true`
    True,
    /// `false`
    False,
    /// `not`
    Not,
    /// `and`
    And,
    /// `or`
    Or,

    // Operators and punctuation.
    /// `-`
    Minus,
    /// `+`
    Add,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `//`
    IDiv,
    /// `^`
    Pow,
    /// `%`
    Mod,
    /// `#`
    Len,
    /// `~` on its own (the lexer emits the same token whether it is used as unary or binary).
    BitNotXor,
    /// `&`
    BitAnd,
    /// `|`
    BitOr,
    /// `>>`
    ShiftRight,
    /// `<<`
    ShiftLeft,
    /// `..`
    Concat,
    /// `...`
    Dots,
    /// `=`
    Assign,
    /// `<`
    LessThan,
    /// `<=`
    LessEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterEqual,
    /// `==`
    Equal,
    /// `~=`
    NotEqual,
    /// `.`
    Dot,
    /// `;`
    SemiColon,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `,`
    Comma,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,

    // Tokens that carry a payload.
    /// A numeral without radix point or exponent that fits in an `i64`.
    Integer(i64),
    /// Any other numeral, including integer literals too large for an `i64`.
    Float(f64),
    /// An identifier that is not a reserved word.
    Name(S),
    /// The decoded contents of a short (quoted) or long (bracketed) string literal.
    String(S),
}
76
77impl<S: AsRef<[u8]>> PartialEq for Token<S> {
78 fn eq(&self, other: &Self) -> bool {
79 match (self, other) {
80 (Token::Break, Token::Break) => true,
81 (Token::Do, Token::Do) => true,
82 (Token::Else, Token::Else) => true,
83 (Token::ElseIf, Token::ElseIf) => true,
84 (Token::End, Token::End) => true,
85 (Token::Function, Token::Function) => true,
86 (Token::Goto, Token::Goto) => true,
87 (Token::If, Token::If) => true,
88 (Token::In, Token::In) => true,
89 (Token::Local, Token::Local) => true,
90 (Token::Nil, Token::Nil) => true,
91 (Token::For, Token::For) => true,
92 (Token::While, Token::While) => true,
93 (Token::Repeat, Token::Repeat) => true,
94 (Token::Until, Token::Until) => true,
95 (Token::Return, Token::Return) => true,
96 (Token::Then, Token::Then) => true,
97 (Token::True, Token::True) => true,
98 (Token::False, Token::False) => true,
99 (Token::Not, Token::Not) => true,
100 (Token::And, Token::And) => true,
101 (Token::Or, Token::Or) => true,
102 (Token::Minus, Token::Minus) => true,
103 (Token::Add, Token::Add) => true,
104 (Token::Mul, Token::Mul) => true,
105 (Token::Div, Token::Div) => true,
106 (Token::IDiv, Token::IDiv) => true,
107 (Token::Pow, Token::Pow) => true,
108 (Token::Mod, Token::Mod) => true,
109 (Token::Len, Token::Len) => true,
110 (Token::BitNotXor, Token::BitNotXor) => true,
111 (Token::BitAnd, Token::BitAnd) => true,
112 (Token::BitOr, Token::BitOr) => true,
113 (Token::ShiftRight, Token::ShiftRight) => true,
114 (Token::ShiftLeft, Token::ShiftLeft) => true,
115 (Token::Concat, Token::Concat) => true,
116 (Token::Dots, Token::Dots) => true,
117 (Token::Assign, Token::Assign) => true,
118 (Token::LessThan, Token::LessThan) => true,
119 (Token::LessEqual, Token::LessEqual) => true,
120 (Token::GreaterThan, Token::GreaterThan) => true,
121 (Token::GreaterEqual, Token::GreaterEqual) => true,
122 (Token::Equal, Token::Equal) => true,
123 (Token::NotEqual, Token::NotEqual) => true,
124 (Token::Dot, Token::Dot) => true,
125 (Token::SemiColon, Token::SemiColon) => true,
126 (Token::Colon, Token::Colon) => true,
127 (Token::DoubleColon, Token::DoubleColon) => true,
128 (Token::Comma, Token::Comma) => true,
129 (Token::LeftParen, Token::LeftParen) => true,
130 (Token::RightParen, Token::RightParen) => true,
131 (Token::LeftBracket, Token::LeftBracket) => true,
132 (Token::RightBracket, Token::RightBracket) => true,
133 (Token::LeftBrace, Token::LeftBrace) => true,
134 (Token::RightBrace, Token::RightBrace) => true,
135 (Token::Integer(a), Token::Integer(b)) => a == b,
136 (Token::Float(a), Token::Float(b)) => a.total_cmp(b).is_eq(),
137 (Token::Name(a), Token::Name(b)) => a.as_ref() == b.as_ref(),
138 (Token::String(a), Token::String(b)) => a.as_ref() == b.as_ref(),
139 _ => false,
140 }
141 }
142}
143
144impl<S: AsRef<[u8]>> fmt::Debug for Token<S> {
145 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
146 match self {
147 Token::Break => write!(f, "Break"),
148 Token::Do => write!(f, "Do"),
149 Token::Else => write!(f, "Else"),
150 Token::ElseIf => write!(f, "ElseIf"),
151 Token::End => write!(f, "End"),
152 Token::Function => write!(f, "Function"),
153 Token::Goto => write!(f, "Goto"),
154 Token::If => write!(f, "If"),
155 Token::In => write!(f, "In"),
156 Token::Local => write!(f, "Local"),
157 Token::Nil => write!(f, "Nil"),
158 Token::For => write!(f, "For"),
159 Token::While => write!(f, "While"),
160 Token::Repeat => write!(f, "Repeat"),
161 Token::Until => write!(f, "Until"),
162 Token::Return => write!(f, "Return"),
163 Token::Then => write!(f, "Then"),
164 Token::True => write!(f, "True"),
165 Token::False => write!(f, "False"),
166 Token::Not => write!(f, "Not"),
167 Token::And => write!(f, "And"),
168 Token::Or => write!(f, "Or"),
169 Token::Minus => write!(f, "Minus"),
170 Token::Add => write!(f, "Add"),
171 Token::Mul => write!(f, "Mul"),
172 Token::Div => write!(f, "Div"),
173 Token::IDiv => write!(f, "IDiv"),
174 Token::Pow => write!(f, "Pow"),
175 Token::Mod => write!(f, "Mod"),
176 Token::Len => write!(f, "Len"),
177 Token::BitNotXor => write!(f, "BitNotXor"),
178 Token::BitAnd => write!(f, "BitAnd"),
179 Token::BitOr => write!(f, "BitOr"),
180 Token::ShiftRight => write!(f, "ShiftRight"),
181 Token::ShiftLeft => write!(f, "ShiftLeft"),
182 Token::Concat => write!(f, "Concat"),
183 Token::Dots => write!(f, "Dots"),
184 Token::Assign => write!(f, "Assign"),
185 Token::LessThan => write!(f, "LessThan"),
186 Token::LessEqual => write!(f, "LessEqual"),
187 Token::GreaterThan => write!(f, "GreaterThan"),
188 Token::GreaterEqual => write!(f, "GreaterEqual"),
189 Token::Equal => write!(f, "Equal"),
190 Token::NotEqual => write!(f, "NotEqual"),
191 Token::Dot => write!(f, "Dot"),
192 Token::SemiColon => write!(f, "SemiColon"),
193 Token::Colon => write!(f, "Colon"),
194 Token::DoubleColon => write!(f, "DoubleColon"),
195 Token::Comma => write!(f, "Comma"),
196 Token::LeftParen => write!(f, "LeftParen"),
197 Token::RightParen => write!(f, "RightParen"),
198 Token::LeftBracket => write!(f, "LeftBracket"),
199 Token::RightBracket => write!(f, "RightBracket"),
200 Token::LeftBrace => write!(f, "LeftBrace"),
201 Token::RightBrace => write!(f, "RightBrace"),
202 Token::Integer(i) => write!(f, "Integer({})", *i),
203 Token::Float(d) => write!(f, "Float({})", *d),
204 Token::Name(n) => write!(f, "Name({:?})", String::from_utf8_lossy(n.as_ref())),
205 Token::String(s) => write!(f, "String({:?})", String::from_utf8_lossy(s.as_ref())),
206 }
207 }
208}
209
/// Converts a raw source byte into the `char` with the same code point (U+0000..=U+00FF) so it
/// can be shown in an error message.
fn print_char(c: u8) -> char {
    // Every `u8` is a valid Unicode scalar value, so this conversion cannot fail.
    char::from(c)
}
213
/// Errors that can occur while turning source bytes into tokens.
#[derive(Debug, Error)]
pub enum LexError {
    /// A quoted string reached a line end or the end of input before its closing quote. The
    /// payload is the opening quote byte.
    #[error("short string not finished, expected matching {}", print_char(*.0))]
    UnfinishedShortString(u8),
    /// A byte that cannot start any token. The payload is the offending byte.
    #[error("unexpected character: {}", print_char(*.0))]
    UnexpectedCharacter(u8),
    /// A `\x` escape was not followed by two hexadecimal digits.
    #[error("hexadecimal digit expected")]
    HexDigitExpected,
    /// A `\u` escape was not immediately followed by `{`.
    #[error("missing '{{' in \\u{{xxxx}} escape")]
    EscapeUnicodeStart,
    /// A `\u{` escape hit a non-hexadecimal byte or the end of input before `}`.
    #[error("missing '}}' in \\u{{xxxx}} escape")]
    EscapeUnicodeEnd,
    /// The value inside `\u{...}` is not a valid `char`.
    #[error("invalid unicode value in \\u{{xxxx}} escape")]
    EscapeUnicodeInvalid,
    /// A decimal `\ddd` escape encodes a value above 255.
    #[error("\\ddd escape out of 0-255 range")]
    EscapeDecimalTooLarge,
    /// A backslash was followed by a byte that does not start any known escape.
    #[error("invalid escape sequence")]
    InvalidEscape,
    /// `[` followed by `=` signs was not followed by a second `[`.
    #[error("invalid long string delimiter")]
    InvalidLongStringDelimiter,
    /// The input ended inside a long string or long comment.
    #[error("unfinished long string")]
    UnfinishedLongString,
    /// The text of a numeral could not be parsed as an integer or a float.
    #[error("malformed number")]
    BadNumber,
    /// Reading from the underlying source failed.
    #[error("IO Error: {0}")]
    IOError(#[from] io::Error),
}
241
/// A line number within the source, stored zero-based (the lexer starts counting at 0).
///
/// The `Display` implementation prints it one-based.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Collect)]
#[collect(require_static)]
pub struct LineNumber(pub u64);
246
247impl fmt::Display for LineNumber {
248 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
249 write!(f, "{}", u128::from(self.0) + 1)
250 }
251}
252
/// Streaming lexer that reads bytes from `R` and produces [`Token`]s, turning names and string
/// literals into interned strings through `S`.
pub struct Lexer<R, S> {
    /// The byte source. Becomes `None` at end of input, after an I/O error, or after `reset`.
    source: Option<R>,
    /// Interner used by `take_string` to convert `string_buffer` into a token payload.
    interner: S,
    /// Bytes already read from `source` but not yet consumed by `advance`.
    peek_buffer: Vec<u8>,
    /// Scratch buffer accumulating the text of the name, string or numeral being read.
    string_buffer: Vec<u8>,
    /// Number of line ends consumed so far, so the first line is line 0.
    line_number: u64,
}
260
261impl<R, S> Lexer<R, S>
262where
263 R: Read,
264 S: StringInterner,
265{
266 pub fn new(source: R, interner: S) -> Lexer<R, S> {
267 Lexer {
268 source: Some(source),
269 interner,
270 peek_buffer: Vec::new(),
271 string_buffer: Vec::new(),
272 line_number: 0,
273 }
274 }
275
276 pub fn line_number(&self) -> LineNumber {
278 LineNumber(self.line_number)
279 }
280
281 pub fn skip_whitespace(&mut self) -> Result<(), LexError> {
282 let mut do_skip_whitespace = || {
283 while let Some(c) = self.peek(0)? {
284 match c {
285 b' ' | b'\t' | VERTICAL_TAB | FORM_FEED => {
286 self.advance(1);
287 }
288
289 b'\n' | b'\r' => {
290 self.read_line_end(false)?;
291 }
292
293 b'-' => {
294 if self.peek(1)? != Some(b'-') {
295 break;
296 } else {
297 self.advance(2);
298
299 match (self.peek(0)?, self.peek(1)?) {
300 (Some(b'['), Some(b'=')) | (Some(b'['), Some(b'[')) => {
301 self.read_long_string(false)?;
303 }
304 _ => {
305 while let Some(c) = self.peek(0)? {
307 if is_newline(c) {
308 break;
309 } else {
310 self.advance(1);
311 }
312 }
313 }
314 }
315 }
316 }
317
318 _ => break,
319 }
320 }
321
322 Ok(())
323 };
324
325 match do_skip_whitespace() {
326 Ok(()) => Ok(()),
327 Err(err) => {
328 self.reset();
329 Err(err)
330 }
331 }
332 }
333
334 pub fn read_token(&mut self) -> Result<Option<Token<S::String>>, LexError> {
336 self.skip_whitespace()?;
337
338 let mut do_read_token = || {
339 if let Some(c) = self.peek(0)? {
340 Ok(Some(match c {
341 b' ' | b'\t' | VERTICAL_TAB | FORM_FEED | b'\n' | b'\r' => {
342 unreachable!("whitespace should have been skipped");
343 }
344
345 b'-' => {
346 if self.peek(1)? != Some(b'-') {
347 self.advance(1);
348 Token::Minus
349 } else {
350 unreachable!("whitespace should have been skipped");
351 }
352 }
353
354 b'[' => {
355 let next = self.peek(1)?;
356 if next == Some(b'=') || next == Some(b'[') {
357 self.read_long_string(true)?;
358 Token::String(self.take_string())
359 } else {
360 self.advance(1);
361 Token::LeftBracket
362 }
363 }
364
365 b'=' => {
366 self.advance(1);
367 if self.peek(0)? == Some(b'=') {
368 self.advance(1);
369 Token::Equal
370 } else {
371 Token::Assign
372 }
373 }
374
375 b'<' => {
376 self.advance(1);
377 let next = self.peek(0)?;
378 if next == Some(b'=') {
379 self.advance(1);
380 Token::LessEqual
381 } else if next == Some(b'<') {
382 self.advance(1);
383 Token::ShiftLeft
384 } else {
385 Token::LessThan
386 }
387 }
388
389 b'>' => {
390 self.advance(1);
391 let next = self.peek(0)?;
392 if next == Some(b'=') {
393 self.advance(1);
394 Token::GreaterEqual
395 } else if next == Some(b'>') {
396 self.advance(1);
397 Token::ShiftRight
398 } else {
399 Token::GreaterThan
400 }
401 }
402
403 b'/' => {
404 self.advance(1);
405 if self.peek(0)? == Some(b'/') {
406 self.advance(1);
407 Token::IDiv
408 } else {
409 Token::Div
410 }
411 }
412
413 b'~' => {
414 self.advance(1);
415 if self.peek(0)? == Some(b'=') {
416 self.advance(1);
417 Token::NotEqual
418 } else {
419 Token::BitNotXor
420 }
421 }
422
423 b':' => {
424 self.advance(1);
425 if self.peek(0)? == Some(b':') {
426 self.advance(1);
427 Token::DoubleColon
428 } else {
429 Token::Colon
430 }
431 }
432
433 b'"' | b'\'' => {
434 self.read_short_string()?;
435 Token::String(self.take_string())
436 }
437
438 b'.' => {
439 if self.peek(1)? == Some(b'.') {
440 if self.peek(2)? == Some(b'.') {
441 self.advance(3);
442 Token::Dots
443 } else {
444 self.advance(2);
445 Token::Concat
446 }
447 } else if self.peek(1)?.map(is_digit).unwrap_or(false) {
448 self.read_numeral()?
449 } else {
450 self.advance(1);
451 Token::Dot
452 }
453 }
454
455 c => {
456 if is_digit(c) {
457 self.read_numeral()?
458 } else if let Some(t) = get_char_token(c) {
459 self.advance(1);
460 t
461 } else if is_alpha(c) {
462 self.string_buffer.clear();
463 self.string_buffer.push(c);
464 self.advance(1);
465
466 while let Some(c) = self.peek(0)? {
467 if is_alpha(c) || is_digit(c) {
468 self.string_buffer.push(c);
469 self.advance(1);
470 } else {
471 break;
472 }
473 }
474
475 if let Some(t) = get_reserved_word_token(self.string_buffer.as_slice())
476 {
477 t
478 } else {
479 Token::Name(self.take_string())
480 }
481 } else {
482 return Err(LexError::UnexpectedCharacter(c));
483 }
484 }
485 }))
486 } else {
487 Ok(None)
488 }
489 };
490
491 match do_read_token() {
492 Ok(Some(token)) => Ok(Some(token)),
493 res => {
494 self.reset();
495 res
496 }
497 }
498 }
499
500 fn reset(&mut self) {
502 self.source = None;
503 self.peek_buffer.clear();
504 self.string_buffer.clear();
505 }
506
507 fn read_line_end(&mut self, append_string: bool) -> Result<(), LexError> {
510 let newline = self.peek(0).unwrap().unwrap();
511 assert!(is_newline(newline));
512 self.advance(1);
513 if append_string {
516 self.string_buffer.push(b'\n');
517 }
518
519 if let Some(next_newline) = self.peek(0)? {
520 if is_newline(next_newline) && next_newline != newline {
521 self.advance(1);
522 }
523 }
524
525 self.line_number += 1;
526 Ok(())
527 }
528
529 fn read_short_string(&mut self) -> Result<(), LexError> {
532 let start_quote = self.peek(0).unwrap().unwrap();
533 assert!(start_quote == b'\'' || start_quote == b'"');
534 self.advance(1);
535
536 self.string_buffer.clear();
537
538 loop {
539 let c = if let Some(c) = self.peek(0)? {
540 c
541 } else {
542 return Err(LexError::UnfinishedShortString(start_quote));
543 };
544
545 if is_newline(c) {
546 return Err(LexError::UnfinishedShortString(start_quote));
547 }
548
549 self.advance(1);
550 if c == b'\\' {
551 match self
552 .peek(0)?
553 .ok_or_else(|| LexError::UnfinishedShortString(start_quote))?
554 {
555 b'a' => {
556 self.advance(1);
557 self.string_buffer.push(ALERT_BEEP);
558 }
559
560 b'b' => {
561 self.advance(1);
562 self.string_buffer.push(BACKSPACE);
563 }
564
565 b'f' => {
566 self.advance(1);
567 self.string_buffer.push(FORM_FEED);
568 }
569
570 b'n' => {
571 self.advance(1);
572 self.string_buffer.push(b'\n');
573 }
574
575 b'r' => {
576 self.advance(1);
577 self.string_buffer.push(b'\r');
578 }
579
580 b't' => {
581 self.advance(1);
582 self.string_buffer.push(b'\t');
583 }
584
585 b'v' => {
586 self.advance(1);
587 self.string_buffer.push(VERTICAL_TAB);
588 }
589
590 b'\\' => {
591 self.advance(1);
592 self.string_buffer.push(b'\\');
593 }
594
595 b'\'' => {
596 self.advance(1);
597 self.string_buffer.push(b'\'');
598 }
599
600 b'"' => {
601 self.advance(1);
602 self.string_buffer.push(b'"');
603 }
604
605 b'\n' | b'\r' => {
606 self.read_line_end(true)?;
607 }
608
609 b'x' => {
610 self.advance(1);
611 let first = self
612 .peek(0)?
613 .and_then(from_hex_digit)
614 .ok_or(LexError::HexDigitExpected)?;
615 let second = self
616 .peek(1)?
617 .and_then(from_hex_digit)
618 .ok_or(LexError::HexDigitExpected)?;
619 self.string_buffer.push(first << 4 | second);
620 self.advance(2);
621 }
622
623 b'u' => {
624 if self.peek(1)? != Some(b'{') {
625 return Err(LexError::EscapeUnicodeStart);
626 }
627 self.advance(2);
628
629 let mut u: u32 = 0;
630 loop {
631 if let Some(c) = self.peek(0)? {
632 if c == b'}' {
633 self.advance(1);
634 break;
635 } else if let Some(h) = from_hex_digit(c) {
636 u = (u << 4) | h as u32;
637 self.advance(1);
638 } else {
639 return Err(LexError::EscapeUnicodeEnd);
640 }
641 } else {
642 return Err(LexError::EscapeUnicodeEnd);
643 }
644 }
645
646 let c = char::from_u32(u).ok_or(LexError::EscapeUnicodeInvalid)?;
647 let mut buf = [0; 4];
648 for &b in c.encode_utf8(&mut buf).as_bytes() {
649 self.string_buffer.push(b);
650 }
651 }
652
653 b'z' => {
654 self.advance(1);
655 while let Some(c) = self.peek(0)? {
656 if is_newline(c) {
657 self.read_line_end(false)?;
658 } else if is_space(c) {
659 self.advance(1);
660 } else {
661 break;
662 }
663 }
664 }
665
666 c => {
667 if is_digit(c) {
668 let mut u: u16 = 0;
669 for _ in 0..3 {
670 if let Some(d) = self.peek(0)?.and_then(from_digit) {
671 u = 10 * u + d as u16;
672 self.advance(1);
673 } else {
674 break;
675 }
676 }
677 if u > 255 {
678 return Err(LexError::EscapeDecimalTooLarge);
679 }
680
681 self.string_buffer.push(u as u8);
682 } else {
683 return Err(LexError::InvalidEscape);
684 }
685 }
686 }
687 } else if c == start_quote {
688 break;
689 } else {
690 self.string_buffer.push(c);
691 }
692 }
693
694 Ok(())
695 }
696
697 fn read_long_string(&mut self, into_string: bool) -> Result<(), LexError> {
700 assert_eq!(self.peek(0).unwrap().unwrap(), b'[');
701 self.advance(1);
702
703 if into_string {
704 self.string_buffer.clear();
705 }
706
707 let mut open_sep_length = 0;
708 while self.peek(0)? == Some(b'=') {
709 self.advance(1);
710 open_sep_length += 1;
711 }
712
713 if self.peek(0)? != Some(b'[') {
714 return Err(LexError::InvalidLongStringDelimiter);
715 }
716 self.advance(1);
717
718 if matches!(self.peek(0)?, Some(b'\n' | b'\r')) {
719 self.read_line_end(false)?;
722 }
723
724 loop {
725 let c = if let Some(c) = self.peek(0)? {
726 c
727 } else {
728 return Err(LexError::UnfinishedLongString);
729 };
730
731 match c {
732 b'\n' | b'\r' => {
733 self.read_line_end(into_string)?;
734 }
735
736 b']' => {
737 let mut close_sep_length = 0;
738 self.advance(1);
739 while self.peek(0)? == Some(b'=') {
740 self.advance(1);
741 close_sep_length += 1;
742 }
743
744 if open_sep_length == close_sep_length && self.peek(0)? == Some(b']') {
745 self.advance(1);
746 break;
747 } else {
748 if into_string {
751 self.string_buffer.push(b']');
752 for _ in 0..close_sep_length {
753 self.string_buffer.push(b'=');
754 }
755 }
756 }
757 }
758
759 c => {
760 if into_string {
761 self.string_buffer.push(c);
762 }
763 self.advance(1);
764 }
765 }
766 }
767
768 Ok(())
769 }
770
771 fn read_numeral(&mut self) -> Result<Token<S::String>, LexError> {
775 let p1 = self.peek(0).unwrap().unwrap();
776 assert!(p1 == b'.' || is_digit(p1));
777
778 self.string_buffer.clear();
779
780 let p2 = self.peek(1)?;
781 let is_hex = p1 == b'0' && (p2 == Some(b'x') || p2 == Some(b'X'));
782 if is_hex {
783 self.string_buffer.push(p1);
784 self.string_buffer.push(p2.unwrap());
785 self.advance(2);
786 }
787
788 let mut has_radix = false;
789 while let Some(c) = self.peek(0)? {
790 if c == b'.' && !has_radix {
791 self.string_buffer.push(b'.');
792 has_radix = true;
793 self.advance(1);
794 } else if (!is_hex && is_digit(c)) || (is_hex && is_hex_digit(c)) {
795 self.string_buffer.push(c);
796 self.advance(1);
797 } else {
798 break;
799 }
800 }
801
802 let mut has_exp = false;
803 if let Some(exp_begin) = self.peek(0)? {
804 if (is_hex && (exp_begin == b'p' || exp_begin == b'P'))
805 || (!is_hex && (exp_begin == b'e' || exp_begin == b'E'))
806 {
807 self.string_buffer.push(exp_begin);
808 has_exp = true;
809 self.advance(1);
810
811 if let Some(sign) = self.peek(0)? {
812 if sign == b'+' || sign == b'-' {
813 self.string_buffer.push(sign);
814 self.advance(1);
815 }
816 }
817
818 while let Some(c) = self.peek(0)? {
819 if is_digit(c) {
820 self.string_buffer.push(c);
821 self.advance(1);
822 } else {
823 break;
824 }
825 }
826 }
827 }
828
829 if !has_exp && !has_radix {
830 if is_hex {
831 if let Some(i) = read_hex_integer(&self.string_buffer) {
832 return Ok(Token::Integer(i));
833 }
834 }
835 if let Some(i) = read_dec_integer(&self.string_buffer) {
836 return Ok(Token::Integer(i));
837 }
838 }
839
840 Ok(Token::Float(
841 if is_hex {
842 read_hex_float(&self.string_buffer)
843 } else {
844 read_dec_float(&self.string_buffer)
845 }
846 .ok_or(LexError::BadNumber)?,
847 ))
848 }
849
850 fn peek(&mut self, n: usize) -> Result<Option<u8>, LexError> {
851 if let Some(source) = self.source.as_mut() {
852 while self.peek_buffer.len() <= n {
853 let mut c = [0];
854 match source.read(&mut c) {
855 Ok(0) => {
856 self.source = None;
857 break;
858 }
859 Ok(_) => {
860 self.peek_buffer.push(c[0]);
861 }
862 Err(e) => {
863 if e.kind() != io::ErrorKind::Interrupted {
864 self.source = None;
865 return Err(LexError::IOError(e));
866 }
867 }
868 }
869 }
870 }
871
872 Ok(self.peek_buffer.get(n).copied())
873 }
874
875 fn advance(&mut self, n: usize) {
876 assert!(
877 n <= self.peek_buffer.len(),
878 "cannot advance over un-peeked characters"
879 );
880 self.peek_buffer.drain(0..n);
881 }
882
883 fn take_string(&mut self) -> S::String {
884 let s = self.interner.intern(&self.string_buffer);
885 self.string_buffer.clear();
886 s
887 }
888}
889
890pub fn read_integer(s: &[u8]) -> Option<i64> {
891 read_hex_integer(s).or_else(|| read_dec_integer(s))
892}
893
/// Reads an optionally signed decimal integer that spans all of `s`.
///
/// Returns `None` if `s` holds no digit (`""`, `"-"`, `"+"`), contains anything but decimal
/// digits after the optional sign, or does not fit in an `i64`. The full `i64` range is
/// accepted, including `i64::MIN`.
pub fn read_dec_integer(s: &[u8]) -> Option<i64> {
    let (is_neg, digits) = match s {
        [b'-', rest @ ..] => (true, rest),
        [b'+', rest @ ..] => (false, rest),
        _ => (false, s),
    };

    // A sign on its own, or nothing at all, is not a number.
    if digits.is_empty() {
        return None;
    }

    let mut value: i64 = 0;
    for &c in digits {
        let digit = i64::from((c as char).to_digit(10)?);
        value = value.checked_mul(10)?;
        // Accumulate towards the final sign: the magnitude of `i64::MIN` does not fit in a
        // positive `i64`, so negating at the end could never produce it.
        value = if is_neg {
            value.checked_sub(digit)?
        } else {
            value.checked_add(digit)?
        };
    }

    Some(value)
}
909
/// Reads an optionally signed hexadecimal integer (`0x` / `0X` prefix) that spans all of `s`.
///
/// Returns `None` if the prefix is missing, no digit follows the prefix, a non-hexadecimal byte
/// is present, or the value does not fit in an `i64`. Inputs shorter than the prefix (such as
/// `""`, `"-"` or `"0"`) are rejected rather than indexed out of bounds.
pub fn read_hex_integer(s: &[u8]) -> Option<i64> {
    let (is_neg, unsigned) = match s {
        [b'-', rest @ ..] => (true, rest),
        [b'+', rest @ ..] => (false, rest),
        _ => (false, s),
    };

    // Matching on the slice shape checks the length and the prefix in one step.
    let digits = match unsigned {
        [b'0', b'x' | b'X', digits @ ..] => digits,
        _ => return None,
    };

    // A bare `0x` is not a number.
    if digits.is_empty() {
        return None;
    }

    let mut value: i64 = 0;
    for &c in digits {
        let digit = i64::from((c as char).to_digit(16)?);
        value = value.checked_mul(16)?.checked_add(digit)?;
    }

    if is_neg {
        value.checked_neg()
    } else {
        Some(value)
    }
}
929
930pub fn read_float(s: &[u8]) -> Option<f64> {
931 read_hex_float(s).or_else(|| read_dec_float(s))
932}
933
/// Reads an optionally signed decimal float that spans all of `s`.
///
/// Accepts digits with an optional radix point and an optional `e` / `E` exponent. Returns
/// `None` for anything else, in particular for the spellings `inf`, `infinity` and `nan` that
/// Rust's own float parser would accept but that are not numerals.
pub fn read_dec_float(s: &[u8]) -> Option<f64> {
    // Restrict the alphabet before delegating to `str::parse`, whose grammar is wider than the
    // one wanted here.
    let only_numeral_bytes = s
        .iter()
        .all(|&b| matches!(b, b'0'..=b'9' | b'.' | b'e' | b'E' | b'+' | b'-'));
    if !only_numeral_bytes {
        return None;
    }

    str::from_utf8(s).ok()?.parse::<f64>().ok()
}
938
/// Reads an optionally signed hexadecimal float that spans all of `s`.
///
/// The accepted form is a `0x` / `0X` prefix, hexadecimal digits with at most one radix point
/// (at least one digit in total), and an optional binary exponent: `p` / `P`, an optional sign,
/// and at least one decimal digit. Returns `None` for anything else.
///
/// Only the first 30 significant hexadecimal digits contribute to the mantissa; further digits
/// only adjust the exponent. An exponent too large for an `i32` saturates, so such values come
/// out as infinity or zero rather than as an error.
pub fn read_hex_float(s: &[u8]) -> Option<f64> {
    const MAX_SIGNIFICANT_DIGITS: u32 = 30;

    let (is_neg, unsigned) = match s {
        [b'-', rest @ ..] => (true, rest),
        [b'+', rest @ ..] => (false, rest),
        _ => (false, s),
    };

    let body = match unsigned {
        [b'0', b'x' | b'X', body @ ..] => body,
        _ => return None,
    };

    let mut significant_digits: u32 = 0;
    let mut has_digits = false;
    let mut found_dot = false;
    let mut base: f64 = 0.0;
    let mut exp: i32 = 0;
    let mut pos = 0;

    while let Some(&c) = body.get(pos) {
        if c == b'.' {
            if found_dot {
                return None;
            }
            found_dot = true;
        } else if let Some(d) = (c as char).to_digit(16) {
            has_digits = true;
            if significant_digits == 0 && d == 0 {
                // Leading zeros carry no precision and are not counted as significant.
            } else if significant_digits < MAX_SIGNIFICANT_DIGITS {
                significant_digits += 1;
                base = (base * 16.0) + f64::from(d);
            } else {
                // The digit is dropped from the mantissa but still scales the value by 16.
                exp = exp.checked_add(4)?;
            }
            if found_dot {
                // Every digit after the radix point divides the value by 16.
                exp = exp.checked_sub(4)?;
            }
        } else {
            break;
        }
        pos += 1;
    }

    if !has_digits {
        return None;
    }

    match &body[pos..] {
        [] => {}
        [b'p' | b'P', exponent @ ..] => {
            let (exp_neg, exp_digits) = match exponent {
                [b'-', rest @ ..] => (true, rest),
                [b'+', rest @ ..] => (false, rest),
                _ => (false, exponent),
            };
            // The exponent marker must be followed by at least one digit; a marker alone or a
            // marker plus sign is malformed.
            if exp_digits.is_empty() {
                return None;
            }

            let mut exp1: i32 = 0;
            for &c in exp_digits {
                let d = (c as char).to_digit(10)?;
                exp1 = exp1.saturating_mul(10).saturating_add(d as i32);
            }
            if exp_neg {
                exp1 = -exp1;
            }
            exp = exp.saturating_add(exp1);
        }
        // Trailing bytes that are neither digits nor an exponent.
        _ => return None,
    }

    if is_neg {
        base = -base;
    }

    Some(base * f64::from(exp).exp2())
}
1011
/// Splits an optional leading sign off `s`.
///
/// Returns whether the sign was `-`, together with the rest of the input after the sign. Input
/// without a leading `-` or `+` is returned unchanged.
fn read_neg(s: &[u8]) -> (bool, &[u8]) {
    match s.split_first() {
        Some((b'-', rest)) => (true, rest),
        Some((b'+', rest)) => (false, rest),
        _ => (false, s),
    }
}
1025
// ASCII control bytes that have no byte-literal escape in Rust.

/// Byte produced by the `\a` string escape.
const ALERT_BEEP: u8 = 0x07;
/// Byte produced by the `\b` string escape.
const BACKSPACE: u8 = 0x08;
/// Byte produced by the `\v` string escape; also treated as whitespace.
const VERTICAL_TAB: u8 = 0x0b;
/// Byte produced by the `\f` string escape; also treated as whitespace.
const FORM_FEED: u8 = 0x0c;
1030
1031fn get_char_token<S>(c: u8) -> Option<Token<S>> {
1032 match c {
1033 b'-' => Some(Token::Minus),
1034 b'+' => Some(Token::Add),
1035 b'*' => Some(Token::Mul),
1036 b'^' => Some(Token::Pow),
1037 b'%' => Some(Token::Mod),
1038 b'&' => Some(Token::BitAnd),
1039 b'|' => Some(Token::BitOr),
1040 b',' => Some(Token::Comma),
1041 b';' => Some(Token::SemiColon),
1042 b'#' => Some(Token::Len),
1043 b'(' => Some(Token::LeftParen),
1044 b')' => Some(Token::RightParen),
1045 b']' => Some(Token::RightBracket),
1046 b'{' => Some(Token::LeftBrace),
1047 b'}' => Some(Token::RightBrace),
1048 _ => None,
1049 }
1050}
1051
1052fn get_reserved_word_token<S>(word: &[u8]) -> Option<Token<S>> {
1053 match word {
1054 b"break" => Some(Token::Break),
1055 b"do" => Some(Token::Do),
1056 b"else" => Some(Token::Else),
1057 b"elseif" => Some(Token::ElseIf),
1058 b"end" => Some(Token::End),
1059 b"function" => Some(Token::Function),
1060 b"goto" => Some(Token::Goto),
1061 b"if" => Some(Token::If),
1062 b"in" => Some(Token::In),
1063 b"local" => Some(Token::Local),
1064 b"nil" => Some(Token::Nil),
1065 b"for" => Some(Token::For),
1066 b"while" => Some(Token::While),
1067 b"repeat" => Some(Token::Repeat),
1068 b"until" => Some(Token::Until),
1069 b"return" => Some(Token::Return),
1070 b"then" => Some(Token::Then),
1071 b"true" => Some(Token::True),
1072 b"false" => Some(Token::False),
1073 b"not" => Some(Token::Not),
1074 b"and" => Some(Token::And),
1075 b"or" => Some(Token::Or),
1076 _ => None,
1077 }
1078}
1079
/// Returns true for the two bytes that can form a line end: `\n` and `\r`.
fn is_newline(c: u8) -> bool {
    matches!(c, b'\n' | b'\r')
}
1083
1084fn is_space(c: u8) -> bool {
1085 c == b' ' || c == b'\t' || c == VERTICAL_TAB || c == FORM_FEED || is_newline(c)
1086}
1087
/// Returns true for bytes that may start a name: ASCII letters and `_`.
fn is_alpha(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphabetic()
}
1092
/// Returns the value (0-9) of an ASCII decimal digit, or `None` for any other byte.
fn from_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        _ => None,
    }
}
1100
/// Returns true for ASCII decimal digits (`0`-`9`).
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
1104
/// Returns the value (0-15) of an ASCII hexadecimal digit in either case, or `None` for any
/// other byte.
fn from_hex_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        b'a'..=b'f' => Some(10 + c - b'a'),
        b'A'..=b'F' => Some(10 + c - b'A'),
        _ => None,
    }
}
1116
/// Returns true for ASCII hexadecimal digits (`0`-`9`, `a`-`f`, `A`-`F`).
fn is_hex_digit(c: u8) -> bool {
    c.is_ascii_hexdigit()
}
1120
1121#[cfg(test)]
1122mod tests {
1123 use std::rc::Rc;
1124
1125 use crate::compiler::interning::BasicInterner;
1126
1127 use super::*;
1128
1129 fn test_tokens(source: &str, tokens: &[Token<Rc<[u8]>>]) {
1130 let mut lexer = Lexer::new(source.as_bytes(), BasicInterner::default());
1131 let mut i = 0;
1132 while let Some(token) = lexer.read_token().unwrap() {
1133 assert!(i < tokens.len(), "too many tokens");
1134 assert_eq!(token, tokens[i], "tokens not equal");
1135 i += 1;
1136 }
1137 assert!(i == tokens.len(), "not enough tokens");
1138 }
1139
1140 fn test_tokens_lines(source: &str, tokens: &[(Token<Rc<[u8]>>, u64)]) {
1141 let mut lexer = Lexer::new(source.as_bytes(), BasicInterner::default());
1142 let mut i = 0;
1143 loop {
1144 lexer.skip_whitespace().unwrap();
1145 let line_number = lexer.line_number().0;
1146 if let Some(token) = lexer.read_token().unwrap() {
1147 assert!(i < tokens.len(), "too many tokens");
1148 assert_eq!(token, tokens[i].0, "tokens not equal");
1149 assert_eq!(line_number, tokens[i].1, "line numbers do not match");
1150 i += 1;
1151 } else {
1152 break;
1153 }
1154 }
1155 assert!(i == tokens.len(), "not enough tokens");
1156 }
1157
1158 fn str_token(s: &str) -> Token<Rc<[u8]>> {
1159 Token::String(s.as_bytes().to_vec().into_boxed_slice().into())
1160 }
1161
1162 fn name_token(s: &str) -> Token<Rc<[u8]>> {
1163 Token::Name(s.as_bytes().to_vec().into_boxed_slice().into())
1164 }
1165
// Short and long comments produce no tokens, but their line ends still count: the two real
// tokens must be reported on zero-based lines 7 and 15. The last long comment also checks that
// closing brackets of a different level do not end it.
#[test]
fn comments() {
    test_tokens_lines(
        r#"
        -- this is a comment
        -- this is also -- a comment
        --[[ long comment ]]
        --[==[ longer comment ]==]

        -- Real token
        -

        --[====[ longest comment
        these shouldn't trigger the end of comments
        ]=] ]==] ]===]
        ]====]

        -- Real token
        =
        "#,
        &[(Token::Minus, 7), (Token::Assign, 15)],
    );
}
1189
// Long strings end only at a closing bracket of the same level, keep brackets of other levels
// as content, drop a line end directly after the opening bracket, and store every kind of line
// end as a single `\n`.
#[test]
fn long_string() {
    test_tokens(
        r#"
        [====[ [==[ this is a [[]] long string ]== ]==] ]====]
        [[ [=] [==] another long string [==] [=] ]]
        "#,
        &[
            str_token(" [==[ this is a [[]] long string ]== ]==] "),
            str_token(" [=] [==] another long string [==] [=] "),
        ],
    );

    test_tokens(
        "[==[\nfoo\nbar\rbaz\r\nbaf\rquux]==]",
        &[str_token("foo\nbar\nbaz\nbaf\nquux")],
    );
}
1208
// Quoted strings: simple escapes, `\z` (which skips the following whitespace, including the line
// end, so the next token starts two lines later), `\u{...}`, `\xXX` and decimal `\ddd`.
#[test]
fn short_string() {
    test_tokens_lines(
        r#"
        "\\ \" '"
        '\n \t "'
        "begin \z
        end"
        'state\u{2e}'
        "question\x3f"
        "exclaim\33"
        "#,
        &[
            (str_token("\\ \" '"), 1),
            (str_token("\n \t \""), 2),
            (str_token("begin end"), 3),
            (str_token("state."), 5),
            (str_token("question?"), 6),
            (str_token("exclaim!"), 7),
        ],
    );
}
1231
// Numerals: hexadecimal and decimal integers, floats with radix point and/or exponent,
// hexadecimal floats with binary exponent, a hexadecimal float that overflows to infinity, and
// the largest `i64` next to the first decimal integer that no longer fits and becomes a float.
#[test]
fn numerals() {
    test_tokens(
        r#"
        0xdeadbeef
        12345
        12345.
        3.1415e-2
        0x22.4p+1
        0Xaa.8P-2
        0x8.4P0
        .123E-10
        0x99999999999999999999999999999999p999999999999999999999999999999
        9223372036854775807
        9223372036854775808
        "#,
        &[
            Token::Integer(0xdeadbeef),
            Token::Integer(12345),
            Token::Float(12345.0),
            Token::Float(3.1415e-2),
            Token::Float(68.5),
            Token::Float(42.625),
            Token::Float(8.25),
            Token::Float(0.123e-10),
            Token::Float(f64::INFINITY),
            Token::Integer(9223372036854775807),
            Token::Float(9223372036854775808.0),
        ],
    );
}
1263
// Every reserved word maps to its own token; any other word is a `Name`.
#[test]
fn words() {
    test_tokens(
        r#"
        break do else elseif end function goto if in local nil for while repeat until return
        then true false not and or
        custom names
        "#,
        &[
            Token::Break,
            Token::Do,
            Token::Else,
            Token::ElseIf,
            Token::End,
            Token::Function,
            Token::Goto,
            Token::If,
            Token::In,
            Token::Local,
            Token::Nil,
            Token::For,
            Token::While,
            Token::Repeat,
            Token::Until,
            Token::Return,
            Token::Then,
            Token::True,
            Token::False,
            Token::Not,
            Token::And,
            Token::Or,
            name_token("custom"),
            name_token("names"),
        ],
    );
}
1300
// Every operator and punctuation token, including those sharing a prefix (`/` and `//`, `.`,
// `..` and `...`, `<`, `<=` and `<<`, `~` and `~=`, `:` and `::`).
#[test]
fn ops() {
    test_tokens(
        r#"- + * / // ^ % & ~ | , ; >> << . .. ... = < <= > >= == ~= : :: # ( ) [ ] { }"#,
        &[
            Token::Minus,
            Token::Add,
            Token::Mul,
            Token::Div,
            Token::IDiv,
            Token::Pow,
            Token::Mod,
            Token::BitAnd,
            Token::BitNotXor,
            Token::BitOr,
            Token::Comma,
            Token::SemiColon,
            Token::ShiftRight,
            Token::ShiftLeft,
            Token::Dot,
            Token::Concat,
            Token::Dots,
            Token::Assign,
            Token::LessThan,
            Token::LessEqual,
            Token::GreaterThan,
            Token::GreaterEqual,
            Token::Equal,
            Token::NotEqual,
            Token::Colon,
            Token::DoubleColon,
            Token::Len,
            Token::LeftParen,
            Token::RightParen,
            Token::LeftBracket,
            Token::RightBracket,
            Token::LeftBrace,
            Token::RightBrace,
        ],
    );
}
1342}