//! A lazy tokenizer for Lua source code.
//!
//! ```rust
//! let source = " <source code here> ";
//!
//! let tokenizer = lua_tokenizer::Tokenizer::new(source);
//! // tokenizer itself is a lazy iterator.
//! for token in tokenizer {
//!     match token {
//!         Ok(token) => {
//!             // do something with token
//!         }
//!         Err(e) => {
//!             print!("Tokenize Error: {}", e);
//!         }
//!     }
//! }
//! ```
//!
19
20mod error;
21mod iorf;
22mod span;
23mod token;
24mod tokentype;
25
26#[cfg(test)]
27mod test;
28
29pub use error::TokenizeError;
30pub use iorf::IntOrFloat;
31pub use span::Span;
32pub use token::Token;
33pub use tokentype::TokenType;
34
/// Type alias for the Lua integer type (`i64` by default).
#[cfg(not(feature = "32bit"))]
pub type IntType = i64;
/// Type alias for the Lua float type (`f64` by default).
#[cfg(not(feature = "32bit"))]
pub type FloatType = f64;

/// Type alias for the Lua integer type (`i32` with the `32bit` feature).
#[cfg(feature = "32bit")]
pub type IntType = i32;
/// Type alias for the Lua float type (`f32` with the `32bit` feature).
#[cfg(feature = "32bit")]
pub type FloatType = f32;
48
49use core::str;
50use std::collections::HashMap;
51
/// Lazy tokenizing iterator over Lua source code.
///
/// Yields `Result<Token, TokenizeError>` items through its `Iterator` impl.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// source code to tokenize, as raw bytes
    pub(crate) source: &'a [u8],
    /// current byte offset (cursor) in source
    pub(crate) byte_offset: usize,

    /// maps Lua keyword spellings to their token types; built once in `from_bytes`
    pub(crate) keyword_map: HashMap<&'static str, TokenType>,
}
62
63impl<'a> Tokenizer<'a> {
64 /// create new tokenizer iterator from source code.
65 pub fn new(source: &'a str) -> Self {
66 Self::from_bytes(source.as_bytes())
67 }
68 pub fn from_bytes(source: &'a [u8]) -> Self {
69 let mut keyword_map = HashMap::with_capacity(25);
70 keyword_map.insert("and", TokenType::And);
71 keyword_map.insert("break", TokenType::Break);
72 keyword_map.insert("do", TokenType::Do);
73 keyword_map.insert("else", TokenType::Else);
74 keyword_map.insert("elseif", TokenType::Elseif);
75 keyword_map.insert("end", TokenType::End);
76 keyword_map.insert("false", TokenType::Bool(false));
77 keyword_map.insert("for", TokenType::For);
78 keyword_map.insert("function", TokenType::Function);
79 keyword_map.insert("goto", TokenType::Goto);
80 keyword_map.insert("if", TokenType::If);
81 keyword_map.insert("in", TokenType::In);
82 keyword_map.insert("local", TokenType::Local);
83 keyword_map.insert("nil", TokenType::Nil);
84 keyword_map.insert("not", TokenType::Not);
85 keyword_map.insert("or", TokenType::Or);
86 keyword_map.insert("repeat", TokenType::Repeat);
87 keyword_map.insert("return", TokenType::Return);
88 keyword_map.insert("then", TokenType::Then);
89 keyword_map.insert("true", TokenType::Bool(true));
90 keyword_map.insert("until", TokenType::Until);
91 keyword_map.insert("while", TokenType::While);
92
93 Self {
94 source,
95 byte_offset: 0,
96 keyword_map,
97 }
98 }
    /// Returns the current cursor position (byte offset into `source`).
    fn get_cursor(&self) -> usize {
        self.byte_offset
    }
    /// Moves the cursor to `cursor`; used to backtrack after a failed parse.
    fn set_cursor(&mut self, cursor: usize) {
        self.byte_offset = cursor;
    }

    /// Advances the cursor by one byte.
    fn advance(&mut self) {
        self.byte_offset += 1;
    }
    /// Advances the cursor by `bytes` bytes.
    fn advance_n(&mut self, bytes: usize) {
        self.byte_offset += bytes;
    }

    /// Returns the byte at the cursor without consuming it, or `None` at EOF.
    pub fn peek(&self) -> Option<u8> {
        self.source.get(self.byte_offset).copied()
    }
    /// Returns `true` if the cursor has reached the end of the source.
    pub fn is_end(&self) -> bool {
        self.byte_offset >= self.source.len()
    }
119
120 pub fn ignore_whitespace(&mut self) {
121 while let Some(ch) = self.peek() {
122 match ch {
123 b' ' | b'\t' | b'\r' | b'\n' => {
124 self.advance();
125 }
126 _ => break,
127 }
128 }
129 }
130
    /// parse identifier.
    /// returns `Some` if identifier is successfully parsed — either a keyword
    /// token (looked up in `keyword_map`) or a plain `Ident` token.
    /// Returns `None` without consuming anything if the cursor is not at an
    /// identifier start character.
    pub fn tokenize_ident(&mut self) -> Option<Token> {
        let i0 = self.byte_offset;
        if let Some(ch) = self.peek() {
            match ch {
                // identifier start: letter or underscore
                b'_' | b'a'..=b'z' | b'A'..=b'Z' => {
                    self.advance();
                    // identifier continuation: letter, digit, or underscore
                    while let Some(ch) = self.peek() {
                        match ch {
                            b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' => {
                                self.advance();
                            }
                            _ => break,
                        }
                    }

                    // checks for keyword
                    let i1 = self.byte_offset;
                    let slice = &self.source[i0..i1];
                    // SAFETY: every byte matched above is ASCII
                    // (`[_a-zA-Z0-9]`), so the slice is guaranteed valid
                    // UTF-8 even when the tokenizer was built via
                    // `from_bytes` from arbitrary bytes.
                    let s = unsafe { str::from_utf8_unchecked(slice) };
                    if let Some(keyword) = self.keyword_map.get(s) {
                        // reserved word: emit its dedicated token type
                        let token = Token {
                            token_type: keyword.clone(),
                            span: Span::new(i0, i1),
                        };
                        Some(token)
                    } else {
                        // plain identifier
                        let token = Token {
                            token_type: TokenType::Ident(s.to_string()),
                            span: Span::new(i0, i1),
                        };
                        Some(token)
                    }
                }
                _ => None,
            }
        } else {
            None
        }
    }
173 /// parse literal.
174 /// returns error if it is definitely literal but it contains invalid characters.
175 /// otherwise, Ok(true) if it is literal, Ok(false) if it is not literal.
176 pub fn tokenize_literal(&mut self) -> Result<Option<Token>, TokenizeError> {
177 if let Some(token) = self.tokenize_numeric()? {
178 Ok(Some(token))
179 } else if let Some(token) = self.tokenize_string()? {
180 Ok(Some(token))
181 } else {
182 Ok(None)
183 }
184 }
185
186 /// parse single hex
187 pub(crate) fn hex(ch: u8) -> Option<u32> {
188 match ch {
189 b'0'..=b'9' => Some((ch - b'0') as u32),
190 b'a'..=b'f' => Some((ch - b'a') as u32 + 10),
191 b'A'..=b'F' => Some((ch - b'A') as u32 + 10),
192 _ => None,
193 }
194 }
195
    /// Parse a numeric literal (decimal, or `0x`/`0X` hexadecimal, each with
    /// optional fraction and exponent) at the cursor.
    ///
    /// Returns `Ok(None)` with the cursor restored if no numeric literal
    /// starts here, and `Err` if the literal is definitely numeric but
    /// malformed (an exponent marker with no digits after it).
    pub fn tokenize_numeric(&mut self) -> Result<Option<Token>, TokenizeError> {
        let i0 = self.byte_offset;
        // check if it is hex
        if self.starts_with_and_advance(b"0x") || self.starts_with_and_advance(b"0X") {
            // hex

            let mut value = IntOrFloat::from(0);

            // integer part: zero or more hex digits
            let mut hexs_exist = false;
            while let Some(ch) = self.peek() {
                if let Some(hex) = Self::hex(ch) {
                    self.advance();
                    hexs_exist = true;
                    // NOTE(review): long hex numerals can overflow IntType
                    // here; the outcome depends on IntOrFloat's arithmetic
                    // impls (not visible in this file) — confirm intended
                    // wrapping / float-promotion behavior.
                    value *= 16 as IntType;
                    value += hex as IntType;
                } else {
                    break;
                }
            }

            if hexs_exist {
                // check fraction
                // dot
                if self.peek() == Some(b'.') {
                    self.advance();

                    // zero or more hex digits for the fraction
                    let base = (1.0 / 16.0) as FloatType;
                    let mut exp = base;
                    while let Some(ch) = self.peek() {
                        if let Some(hex) = Self::hex(ch) {
                            self.advance();

                            // add digit * 16^-k
                            let f = hex as FloatType * exp;
                            value += f;
                            exp *= base;
                        } else {
                            break;
                        }
                    }
                }
            } else {
                // hex integer part does not exist.

                // dot must exist (a "0x.8"-style literal); otherwise this is
                // not a numeric literal at all: backtrack.
                if self.peek() != Some(b'.') {
                    self.set_cursor(i0);
                    return Ok(None);
                }
                self.advance();

                // one or more hex digits for the fraction must exist
                let mut fraction_exist = false;
                let base = (1.0 / 16.0) as FloatType;
                let mut exp = base;
                while let Some(ch) = self.peek() {
                    if let Some(hex) = Self::hex(ch) {
                        fraction_exist = true;
                        self.advance();

                        // add digit * 16^-k
                        let f = hex as FloatType * exp;
                        value += f;
                        exp *= base;
                    } else {
                        break;
                    }
                }
                if fraction_exist == false {
                    self.set_cursor(i0);
                    return Ok(None);
                }
            }

            // check binary exponent
            // p or P
            if self.peek() == Some(b'p') || self.peek() == Some(b'P') {
                self.advance();

                // optional sign: '+' or '-'
                let is_neg = match self.peek() {
                    Some(b'+') => {
                        self.advance();
                        false
                    }
                    Some(b'-') => {
                        self.advance();
                        true
                    }
                    _ => false,
                };

                // one or more decimal digits for the exponent
                let mut exp_digit_exist = false;
                let mut binary_exp: u32 = 0;
                while let Some(ch) = self.peek() {
                    if ch >= b'0' && ch <= b'9' {
                        self.advance();
                        exp_digit_exist = true;
                        let d = (ch - b'0') as u32;
                        binary_exp = binary_exp.wrapping_mul(10).wrapping_add(d);
                    } else {
                        break;
                    }
                }
                if exp_digit_exist == false {
                    // 'p' marker with no digits after it: malformed numeral.
                    // (cursor is intentionally not restored: the input is
                    // definitely a broken numeric literal.)
                    return Err(TokenizeError::NumericEmpty {
                        start: i0,
                        pos: self.byte_offset,
                    });
                }

                // apply 2^exp by repeated multiplication
                if is_neg {
                    for _ in 0..binary_exp {
                        value *= 0.5 as FloatType;
                    }
                } else {
                    // NOTE(review): reference Lua treats every numeral with a
                    // 'p' exponent as a float, but multiplying by
                    // `2 as IntType` may keep an integer mantissa integral —
                    // depends on IntOrFloat semantics; confirm.
                    for _ in 0..binary_exp {
                        value *= 2 as IntType;
                    }
                }
            }

            let token = Token {
                token_type: TokenType::Numeric(value),
                span: Span::new(i0, self.byte_offset),
            };
            Ok(Some(token))
        } else {
            let mut value = IntOrFloat::from(0);

            // integer part: zero or more decimal digits
            let mut decimal_exist = false;
            while let Some(ch) = self.peek() {
                if ch >= b'0' && ch <= b'9' {
                    decimal_exist = true;
                    self.advance();
                    value *= 10 as IntType;
                    value += (ch - b'0') as IntType;
                } else {
                    break;
                }
            }

            if decimal_exist {
                // check fraction
                // dot
                if self.peek() == Some(b'.') {
                    self.advance();

                    // a dot forces the value to float
                    value = value.to_float().into();

                    // zero or more decimal digits for the fraction
                    let base = (1.0 / 10.0) as FloatType;
                    let mut exp = base;
                    while let Some(ch) = self.peek() {
                        if ch >= b'0' && ch <= b'9' {
                            self.advance();

                            // add digit * 10^-k
                            let f = (ch - b'0') as FloatType * exp;
                            value += f;
                            exp *= base;
                        } else {
                            break;
                        }
                    }
                }
            } else {
                // decimal integer part does not exist.

                // dot must exist (a ".5"-style literal); otherwise this is
                // not a numeric literal at all: backtrack.
                if self.peek() != Some(b'.') {
                    self.set_cursor(i0);
                    return Ok(None);
                }
                self.advance();

                // one or more digits must exist
                let mut digit_exist = false;
                let base = (1.0 / 10.0) as FloatType;
                let mut exp = base;
                while let Some(ch) = self.peek() {
                    if ch >= b'0' && ch <= b'9' {
                        digit_exist = true;
                        self.advance();

                        // add digit * 10^-k
                        let f = (ch - b'0') as FloatType * exp;
                        value += f;
                        exp *= base;
                    } else {
                        break;
                    }
                }
                if digit_exist == false {
                    self.set_cursor(i0);
                    return Ok(None);
                }
            }

            // check decimal exponent
            // e or E
            if self.peek() == Some(b'e') || self.peek() == Some(b'E') {
                self.advance();

                // optional sign: '+' or '-'
                let is_neg = match self.peek() {
                    Some(b'+') => {
                        self.advance();
                        false
                    }
                    Some(b'-') => {
                        self.advance();
                        true
                    }
                    _ => false,
                };

                // one or more decimal digits for the exponent
                let mut exp_digit_exist = false;
                let mut base10_exp: u32 = 0;
                while let Some(ch) = self.peek() {
                    if ch >= b'0' && ch <= b'9' {
                        self.advance();
                        exp_digit_exist = true;
                        let d = (ch - b'0') as u32;
                        base10_exp = base10_exp.wrapping_mul(10).wrapping_add(d);
                    } else {
                        break;
                    }
                }
                if exp_digit_exist == false {
                    // 'e' marker with no digits after it: malformed numeral.
                    return Err(TokenizeError::NumericEmpty {
                        start: i0,
                        pos: self.byte_offset,
                    });
                }

                // apply 10^exp by repeated multiplication
                if is_neg {
                    for _ in 0..base10_exp {
                        value *= 0.1 as FloatType;
                    }
                } else {
                    // NOTE(review): reference Lua treats every numeral with an
                    // 'e' exponent as a float ("1e2" == 100.0), but
                    // multiplying by `10 as IntType` may keep an integer
                    // mantissa integral — depends on IntOrFloat semantics;
                    // confirm.
                    for _ in 0..base10_exp {
                        value *= 10 as IntType;
                    }
                }
            }

            let token = Token {
                token_type: TokenType::Numeric(value),
                span: Span::new(i0, self.byte_offset),
            };
            Ok(Some(token))
        }
    }
    /// Parse the body of a short (quoted) string literal, starting just
    /// after the opening quote, up to and including the closing `delim`
    /// quote. Decodes every Lua escape sequence (`\n`, `\t`, `\xXX`, `\ddd`,
    /// `\u{XXX}`, `\z`, ...).
    ///
    /// `delim` is the quote byte (`'` or `"`); `start` is the byte offset of
    /// the opening quote, used only for error reporting. Returns the decoded
    /// bytes (without the surrounding quotes).
    pub fn short_string_literal(
        &mut self,
        delim: u8,
        start: usize,
    ) -> Result<Vec<u8>, TokenizeError> {
        let mut s = Vec::<u8>::new();
        while let Some(ch) = self.peek() {
            if ch == delim {
                // closing quote: consume it and finish
                self.advance();
                return Ok(s);
            }
            match ch {
                b'\\' => {
                    let escape_start = self.byte_offset;
                    // escape
                    // consume '\\'
                    self.advance();
                    match self.peek() {
                        // \z skips the following run of whitespace
                        // (including newlines)
                        Some(b'z') => {
                            self.advance();
                            self.ignore_whitespace();
                        }
                        // \a: bell
                        Some(b'a') => {
                            s.push(b'\x07');
                            self.advance();
                        }
                        // \b: backspace
                        Some(b'b') => {
                            s.push(b'\x08');
                            self.advance();
                        }
                        // \f: form feed
                        Some(b'f') => {
                            s.push(b'\x0c');
                            self.advance();
                        }
                        // \n, or a backslash immediately before a literal
                        // newline — both emit a newline
                        Some(b'n') | Some(b'\n') => {
                            s.push(b'\n');
                            self.advance();
                        }
                        // \r: carriage return
                        Some(b'r') => {
                            s.push(b'\r');
                            self.advance();
                        }
                        // \t: horizontal tab
                        Some(b't') => {
                            s.push(b'\t');
                            self.advance();
                        }
                        // \v: vertical tab
                        Some(b'v') => {
                            s.push(b'\x0b');
                            self.advance();
                        }
                        Some(b'\\') => {
                            s.push(b'\\');
                            self.advance();
                        }
                        Some(b'\"') => {
                            s.push(b'\"');
                            self.advance();
                        }
                        Some(b'\'') => {
                            s.push(b'\'');
                            self.advance();
                        }
                        Some(b'x') => {
                            // \xXX: exactly two hex digits
                            self.advance();

                            if let Some(first) = self.peek() {
                                if let Some(first) = Self::hex(first) {
                                    self.advance();
                                    if let Some(second) = self.peek() {
                                        if let Some(second) = Self::hex(second) {
                                            s.push((first * 16u32 + second) as u8);
                                            self.advance();
                                        } else {
                                            // second digit is not hex
                                            return Err(TokenizeError::ShortStringNotHex {
                                                start,
                                                pos: self.byte_offset,
                                            });
                                        }
                                    } else {
                                        // EOF: string not closed
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                } else {
                                    // first digit is not hex
                                    return Err(TokenizeError::ShortStringNotHex {
                                        start,
                                        pos: self.byte_offset,
                                    });
                                }
                            } else {
                                // EOF: string not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }
                        Some(b'0'..=b'9') => {
                            // \ddd: up to three decimal digits
                            let first: u32 = (self.peek().unwrap() - b'0') as u32;
                            self.advance();

                            if let Some(second) = self.peek() {
                                if second >= b'0' && second <= b'9' {
                                    let second: u32 = (second - b'0') as u32;
                                    self.advance();
                                    if let Some(third) = self.peek() {
                                        if third >= b'0' && third <= b'9' {
                                            let third: u32 = (third - b'0') as u32;
                                            self.advance();
                                            // NOTE(review): values above 255
                                            // are silently truncated by
                                            // `as u8`; reference Lua rejects
                                            // "\256" and larger — confirm
                                            // intended behavior.
                                            s.push((first * 100 + second * 10 + third) as u8);
                                        } else {
                                            // only two digits
                                            s.push((first * 10 + second) as u8);
                                        }
                                    } else {
                                        // EOF: string not closed
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                } else {
                                    // only one digit
                                    s.push(first as u8);
                                }
                            } else {
                                // EOF: string not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }
                        Some(b'u') => {
                            self.advance();
                            // \u{X+}: unicode codepoint, one or more hex digits

                            if let Some(open) = self.peek() {
                                if open == b'{' {
                                    self.advance();

                                    let mut codepoint = 0i32;
                                    let mut closed = false;
                                    let mut count = 0;
                                    while let Some(ch) = self.peek() {
                                        if ch == b'}' {
                                            closed = true;
                                            self.advance();
                                            break;
                                        }
                                        if let Some(digit) = Self::hex(ch) {
                                            count += 1;
                                            // accumulate with overflow checks,
                                            // capping the codepoint at
                                            // i32::MAX (0x7FFFFFFF)
                                            if let Some(mul) = codepoint.checked_mul(16i32) {
                                                codepoint = mul;
                                            } else {
                                                return Err(TokenizeError::ShortStringOverflow {
                                                    start,
                                                    pos: self.byte_offset,
                                                });
                                            }
                                            if let Some(add) = codepoint.checked_add(digit as i32) {
                                                codepoint = add;
                                            } else {
                                                return Err(TokenizeError::ShortStringOverflow {
                                                    start,
                                                    pos: self.byte_offset,
                                                });
                                            }
                                            self.advance();
                                        } else {
                                            // not a hex digit
                                            return Err(TokenizeError::ShortStringNotHex {
                                                start,
                                                pos: self.byte_offset,
                                            });
                                        }
                                    }

                                    if !closed {
                                        // '}' never appeared
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                    if count == 0 {
                                        // "\u{}": empty codepoint
                                        return Err(TokenizeError::ShortStringEmptyCodepoint {
                                            start,
                                            escape_start,
                                            escape_end: self.byte_offset,
                                        });
                                    }

                                    // Encode a codepoint in the "extended
                                    // UTF-8" scheme: up to 6 bytes, covering
                                    // values through 0x7FFFFFFF (beyond
                                    // standard UTF-8's 4-byte / 0x10FFFF
                                    // limit).
                                    fn encode_u32_to_extended_utf8(u: i32) -> Vec<u8> {
                                        if u < 0 {
                                            unreachable!("encode_u32_to_extended_utf8: u < 0");
                                        }
                                        let u = u as u32;
                                        // Determine how many bytes are needed based on the value
                                        let bytes_needed = match u {
                                            0x0000_0000..=0x0000_007F => 1,
                                            0x0000_0080..=0x0000_07FF => 2,
                                            0x0000_0800..=0x0000_FFFF => 3,
                                            0x0001_0000..=0x001F_FFFF => 4,
                                            0x0020_0000..=0x03FF_FFFF => 5,
                                            0x0400_0000..=0x7FFF_FFFF => 6,
                                            _ => unreachable!(),
                                        };

                                        let mut bytes = Vec::with_capacity(bytes_needed);

                                        match bytes_needed {
                                            1 => {
                                                // 0xxxxxxx
                                                bytes.push(u as u8);
                                            }
                                            2 => {
                                                // 110xxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1100_0000 | ((u >> 6) as u8 & 0b0001_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            3 => {
                                                // 1110xxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1110_0000 | ((u >> 12) as u8 & 0b0000_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            4 => {
                                                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_0000 | ((u >> 18) as u8 & 0b0000_0111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            5 => {
                                                // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_1000 | ((u >> 24) as u8 & 0b0000_0011),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 18) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            6 => {
                                                // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_1100 | ((u >> 30) as u8 & 0b0000_0001),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 24) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 18) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            _ => unreachable!(),
                                        }

                                        bytes
                                    }

                                    s.append(&mut encode_u32_to_extended_utf8(codepoint));
                                } else {
                                    // '{' not present after \u
                                    return Err(TokenizeError::ShortStringNoOpenBrace {
                                        start,
                                        pos: self.byte_offset,
                                    });
                                }
                            } else {
                                // EOF: string not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }

                        Some(other) => {
                            // unrecognized escape character
                            return Err(TokenizeError::ShortStringInvalidEscape {
                                start,
                                pos: self.byte_offset,
                                escape: other as char,
                            });
                        }
                        None => {
                            // EOF right after the backslash
                            return Err(TokenizeError::ShortStringNotClosed {
                                delim: delim as char,
                                start,
                                end: self.byte_offset,
                            });
                        }
                    }
                }
                b'\n' => {
                    // raw (unescaped) newlines are not allowed inside short
                    // strings
                    return Err(TokenizeError::ShortStringNewline {
                        start,
                        pos: self.byte_offset,
                    });
                }
                _ => {
                    // ordinary byte: copy verbatim
                    s.push(ch);
                    self.advance();
                }
            }
        }
        // EOF before the closing quote
        Err(TokenizeError::ShortStringNotClosed {
            delim: delim as char,
            start,
            end: self.byte_offset,
        })
    }
    /// Parse the body of a long string literal (`[==[ ... ]==]`), starting
    /// just after the opening bracket, up to and including a closing bracket
    /// with exactly `equal_count` `=` characters. Content is taken verbatim
    /// (no escape processing).
    ///
    /// `start` is the byte offset of the opening bracket, used only for
    /// error reporting.
    ///
    /// NOTE(review): reference Lua drops a newline that immediately follows
    /// the opening long bracket; that does not appear to happen here or in
    /// the caller — confirm intended behavior.
    pub fn long_string_literal(
        &mut self,
        equal_count: usize,
        start: usize,
    ) -> Result<Vec<u8>, TokenizeError> {
        let mut s = Vec::<u8>::new();
        while let Some(ch) = self.peek() {
            match ch {
                b']' => {
                    // check end of long string literal
                    let cursor0 = self.get_cursor();
                    if let Some(count) = self.long_bracket(b']') {
                        if count == equal_count {
                            // matching closing bracket: done
                            return Ok(s);
                        } else {
                            // closing bracket with a different level: treat
                            // this ']' as content and rescan from the next
                            // byte
                            self.set_cursor(cursor0);
                            self.advance();
                            s.push(ch);
                        }
                    } else {
                        // not a closing bracket at all
                        self.advance();
                        s.push(ch);
                    }
                }

                _ => {
                    // ordinary byte: copy verbatim
                    s.push(ch);
                    self.advance();
                }
            }
        }
        // EOF before the closing bracket
        Err(TokenizeError::LongStringNotClosed {
            start,
            end: self.byte_offset,
            equal_count,
        })
    }
839 pub fn tokenize_string(&mut self) -> Result<Option<Token>, TokenizeError> {
840 match self.peek() {
841 Some(b'\'') | Some(b'"') => {
842 // since ' or " is consumed, it is definitely short string literal.
843 let i0 = self.get_cursor();
844 let quote = self.peek().unwrap();
845 self.advance();
846
847 let s = self.short_string_literal(quote, i0)?;
848
849 let token = Token {
850 token_type: TokenType::String(s),
851 span: Span::new(i0, self.byte_offset),
852 };
853 Ok(Some(token))
854 }
855 Some(b'[') => {
856 // long string literal
857 let i0 = self.get_cursor();
858 if let Some(open_count) = self.long_bracket(b'[') {
859 // since long bracket '[[' is consumed, it is definitely long string literal.
860 let s = self.long_string_literal(open_count, i0)?;
861
862 let token = Token {
863 token_type: TokenType::String(s),
864 span: Span::new(i0, self.byte_offset),
865 };
866 Ok(Some(token))
867 } else {
868 self.set_cursor(i0);
869 Ok(None)
870 }
871 }
872 _ => Ok(None),
873 }
874 }
875
876 /// consume long bracket and return the number of '='.
877 /// `bracket` must be either b'[' or b']'.
878 pub(crate) fn long_bracket(&mut self, bracket: u8) -> Option<usize> {
879 debug_assert!(bracket == b'[' || bracket == b']');
880 let cursor0 = self.get_cursor();
881 if self.peek() == Some(bracket) {
882 // consume '['
883 self.advance();
884
885 // the number of '='
886 let mut count = 0;
887 while let Some(ch) = self.peek() {
888 if ch == bracket {
889 // consume '['
890 self.advance();
891 return Some(count);
892 } else if ch == b'=' {
893 // consume '='
894 self.advance();
895 count += 1;
896 } else {
897 self.set_cursor(cursor0);
898 return None;
899 }
900 }
901 None
902 } else {
903 return None;
904 }
905 }
906 pub(crate) fn starts_with_and_advance(&mut self, prefix: &[u8]) -> bool {
907 let slice = &self.source[self.byte_offset..];
908 if slice.starts_with(prefix) {
909 self.advance_n(prefix.len());
910 true
911 } else {
912 false
913 }
914 }
915
    /// try tokenize single token.
    ///
    /// Skips leading whitespace and comments. Returns `Ok(None)` at EOF,
    /// `Ok(Some(token))` on success, and `Err` on a malformed token.
    pub fn try_tokenize(&mut self) -> Result<Option<Token>, TokenizeError> {
        self.ignore_whitespace();
        // check eof
        if self.byte_offset >= self.source.len() {
            return Ok(None);
        }

        if let Some(token) = self.tokenize_ident() {
            // try identifier / keyword
            Ok(Some(token))
        } else if let Some(token) = self.tokenize_literal()? {
            // try literal
            Ok(Some(token))
        } else {
            // try punctuator

            // consume one byte and emit the given single-byte punctuator
            // token
            macro_rules! advance_and_return {
                ($token_type:ident) => {{
                    self.advance();
                    Ok(Some(Token {
                        token_type: TokenType::$token_type,
                        span: Span::new(self.byte_offset - 1, self.byte_offset),
                    }))
                }};
            }

            // safe to unwrap: EOF was ruled out above and nothing matched yet
            let ch = self.peek().unwrap();
            match ch {
                b'+' => {
                    advance_and_return!(Plus)
                }
                b'*' => {
                    advance_and_return!(Asterisk)
                }
                b'/' => {
                    // check for SlashSlash
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'/') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::SlashSlash,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Slash,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'%' => {
                    advance_and_return!(Percent)
                }
                b'^' => {
                    advance_and_return!(Caret)
                }
                b'#' => {
                    advance_and_return!(Hash)
                }
                b'&' => {
                    advance_and_return!(Ampersand)
                }
                b'~' => {
                    // check for TildeEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'=') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::TildeEqual,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Tilde,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'|' => {
                    advance_and_return!(Pipe)
                }
                b'<' => {
                    // check for LessLess / LessEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    match self.peek() {
                        Some(b'<') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::LessLess,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }
                        Some(b'=') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::LessEqual,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }

                        _ => Ok(Some(Token {
                            token_type: TokenType::Less,
                            span: Span::new(i0, i0 + 1),
                        })),
                    }
                }
                b'>' => {
                    // check for GreaterGreater / GreaterEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    match self.peek() {
                        Some(b'>') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::GreaterGreater,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }
                        Some(b'=') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::GreaterEqual,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }

                        _ => Ok(Some(Token {
                            token_type: TokenType::Greater,
                            span: Span::new(i0, i0 + 1),
                        })),
                    }
                }
                b'=' => {
                    // check for EqualEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'=') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::EqualEqual,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Equal,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }

                b'(' => {
                    advance_and_return!(LParen)
                }
                b')' => {
                    advance_and_return!(RParen)
                }
                b'{' => {
                    advance_and_return!(LBrace)
                }
                b'}' => {
                    advance_and_return!(RBrace)
                }
                b'[' => {
                    // a long string literal would have been consumed by
                    // `tokenize_literal` above, so this is a plain bracket
                    advance_and_return!(LBracket)
                }
                b']' => {
                    advance_and_return!(RBracket)
                }
                b':' => {
                    // check for ColonColon
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b':') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::ColonColon,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Colon,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b';' => {
                    advance_and_return!(Semicolon)
                }
                b',' => {
                    advance_and_return!(Comma)
                }
                b'.' => {
                    // '.', '..' or '...'
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'.') {
                        let i1 = self.byte_offset;
                        self.advance();

                        if self.peek() == Some(b'.') {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::DotDotDot,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        } else {
                            Ok(Some(Token {
                                token_type: TokenType::DotDot,
                                span: Span::new(i0, i1),
                            }))
                        }
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Dot,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'-' => {
                    let i0 = self.byte_offset;
                    // check start of comment
                    // NOTE(review): comments are skipped by recursing into
                    // `try_tokenize`, one stack frame per consecutive
                    // comment — very long comment runs deepen the stack
                    // accordingly.
                    if self.starts_with_and_advance(b"--") {
                        // check start of multi-line comment
                        if let Some(open_equal_count) = self.long_bracket(b'[') {
                            let multiline_comment_begin = (i0, self.byte_offset);

                            while self.byte_offset < self.source.len() {
                                if let Some(close_equal_count) = self.long_bracket(b']') {
                                    if close_equal_count == open_equal_count {
                                        return self.try_tokenize();
                                    }
                                    // since `long_bracket` is parsed, the cursor is currently at the next position of ']'.
                                    // ]====]
                                    //       ^ here
                                    // move back cursor so that it points to the last ']'.
                                    // so we can test other long-closing-brackets.
                                    self.byte_offset -= 1;
                                } else {
                                    self.advance()
                                }
                            }
                            // eof reached
                            // multi-line comment not closed
                            Err(TokenizeError::MultilineCommentNotClosed {
                                start: multiline_comment_begin.0,
                                end: multiline_comment_begin.1,
                            })
                        } else {
                            // it is a line comment: skip to end of line
                            while let Some(ch) = self.peek() {
                                self.advance();
                                if ch == b'\n' {
                                    break;
                                }
                            }
                            self.try_tokenize()
                        }
                    } else {
                        // it is not a comment, emit '-'
                        advance_and_return!(Minus)
                    }
                }

                _ => {
                    // invalid punctuator
                    Err(TokenizeError::InvalidPunct {
                        pos: self.byte_offset,
                        punct: ch as char,
                    })
                }
            }
        }
    }
1199}
1200
1201impl<'a> Iterator for Tokenizer<'a> {
1202 type Item = Result<Token, TokenizeError>;
1203
1204 fn next(&mut self) -> Option<Self::Item> {
1205 match self.try_tokenize() {
1206 Ok(Some(token)) => Some(Ok(token)),
1207 Ok(None) => None,
1208 Err(e) => Some(Err(e)),
1209 }
1210 }
1211}