lua_tokenizer/
lib.rs

//! A lazy tokenizer for Lua source code.
//!
//! ```rust
//! let source = " <source code here> ";
//!
//! let tokenizer = lua_tokenizer::Tokenizer::new(source);
//! // The tokenizer itself is a lazy iterator.
//! for token in tokenizer {
//!     match token {
//!         Ok(token) => {
//!             // do something with the token
//!         }
//!         Err(e) => {
//!             println!("Tokenize Error: {}", e);
//!         }
//!     }
//! }
//! ```

mod error;
mod iorf;
mod span;
mod token;
mod tokentype;

#[cfg(test)]
mod test;

pub use error::TokenizeError;
pub use iorf::IntOrFloat;
pub use span::Span;
pub use token::Token;
pub use tokentype::TokenType;

/// Type alias for the Lua integer type.
#[cfg(not(feature = "32bit"))]
pub type IntType = i64;
/// Type alias for the Lua float type.
#[cfg(not(feature = "32bit"))]
pub type FloatType = f64;

/// Type alias for the Lua integer type.
#[cfg(feature = "32bit")]
pub type IntType = i32;
/// Type alias for the Lua float type.
#[cfg(feature = "32bit")]
pub type FloatType = f32;

use core::str;
use std::collections::HashMap;

/// Lazy tokenizing iterator over Lua source code.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// source code to tokenize
    pub(crate) source: &'a [u8],
    /// current byte offset in source
    pub(crate) byte_offset: usize,

    /// keyword string to token type mapping
    pub(crate) keyword_map: HashMap<&'static str, TokenType>,
}

impl<'a> Tokenizer<'a> {
    /// Create a new tokenizer iterator from source code.
    pub fn new(source: &'a str) -> Self {
        Self::from_bytes(source.as_bytes())
    }
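    /// Create a new tokenizer iterator from a raw byte slice.
    ///
    /// A minimal usage sketch (illustrative only; it uses nothing beyond the API
    /// defined in this file):
    /// ```rust
    /// let tokenizer = lua_tokenizer::Tokenizer::from_bytes(b"local x = 1");
    /// for token in tokenizer {
    ///     assert!(token.is_ok());
    /// }
    /// ```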
    pub fn from_bytes(source: &'a [u8]) -> Self {
        let mut keyword_map = HashMap::with_capacity(25);
        keyword_map.insert("and", TokenType::And);
        keyword_map.insert("break", TokenType::Break);
        keyword_map.insert("do", TokenType::Do);
        keyword_map.insert("else", TokenType::Else);
        keyword_map.insert("elseif", TokenType::Elseif);
        keyword_map.insert("end", TokenType::End);
        keyword_map.insert("false", TokenType::Bool(false));
        keyword_map.insert("for", TokenType::For);
        keyword_map.insert("function", TokenType::Function);
        keyword_map.insert("goto", TokenType::Goto);
        keyword_map.insert("if", TokenType::If);
        keyword_map.insert("in", TokenType::In);
        keyword_map.insert("local", TokenType::Local);
        keyword_map.insert("nil", TokenType::Nil);
        keyword_map.insert("not", TokenType::Not);
        keyword_map.insert("or", TokenType::Or);
        keyword_map.insert("repeat", TokenType::Repeat);
        keyword_map.insert("return", TokenType::Return);
        keyword_map.insert("then", TokenType::Then);
        keyword_map.insert("true", TokenType::Bool(true));
        keyword_map.insert("until", TokenType::Until);
        keyword_map.insert("while", TokenType::While);

        Self {
            source,
            byte_offset: 0,
            keyword_map,
        }
    }
    fn get_cursor(&self) -> usize {
        self.byte_offset
    }
    fn set_cursor(&mut self, cursor: usize) {
        self.byte_offset = cursor;
    }

    fn advance(&mut self) {
        self.byte_offset += 1;
    }
    fn advance_n(&mut self, bytes: usize) {
        self.byte_offset += bytes;
    }

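    /// Peek at the byte at the current cursor position, if any.
    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let tokenizer = lua_tokenizer::Tokenizer::new("x");
    /// assert_eq!(tokenizer.peek(), Some(b'x'));
    /// ```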
    pub fn peek(&self) -> Option<u8> {
        self.source.get(self.byte_offset).copied()
    }
    /// Check whether the cursor has reached the end of the source.
    pub fn is_end(&self) -> bool {
        self.byte_offset >= self.source.len()
    }

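    /// Skip over whitespace (spaces, tabs, carriage returns and newlines).
    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("  \t\n x");
    /// tokenizer.ignore_whitespace();
    /// assert_eq!(tokenizer.peek(), Some(b'x'));
    /// ```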
    pub fn ignore_whitespace(&mut self) {
        while let Some(ch) = self.peek() {
            match ch {
                b' ' | b'\t' | b'\r' | b'\n' => {
                    self.advance();
                }
                _ => break,
            }
        }
    }

    /// Parse an identifier or keyword.
    /// Returns `Some` if an identifier (or keyword) was successfully parsed.
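    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("foo_bar = 1");
    /// // "foo_bar" starts with a letter/underscore, so this returns `Some`.
    /// assert!(tokenizer.tokenize_ident().is_some());
    /// ```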
    pub fn tokenize_ident(&mut self) -> Option<Token> {
        let i0 = self.byte_offset;
        if let Some(ch) = self.peek() {
            match ch {
                b'_' | b'a'..=b'z' | b'A'..=b'Z' => {
                    self.advance();
                    while let Some(ch) = self.peek() {
                        match ch {
                            b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' => {
                                self.advance();
                            }
                            _ => break,
                        }
                    }

                    // check for keyword
                    let i1 = self.byte_offset;
                    let slice = &self.source[i0..i1];
                    // the slice contains only ASCII bytes, so it is always valid utf-8.
                    let s = unsafe { str::from_utf8_unchecked(slice) };
                    if let Some(keyword) = self.keyword_map.get(s) {
                        let token = Token {
                            token_type: keyword.clone(),
                            span: Span::new(i0, i1),
                        };
                        Some(token)
                    } else {
                        let token = Token {
                            token_type: TokenType::Ident(s.to_string()),
                            span: Span::new(i0, i1),
                        };
                        Some(token)
                    }
                }
                _ => None,
            }
        } else {
            None
        }
    }
    /// Parse a literal (numeric or string).
    /// Returns an error if it is definitely a literal but contains invalid characters;
    /// otherwise returns `Ok(Some(token))` if a literal was parsed, or `Ok(None)` if it is not a literal.
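    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("42");
    /// assert!(tokenizer.tokenize_literal().unwrap().is_some());
    /// ```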
    pub fn tokenize_literal(&mut self) -> Result<Option<Token>, TokenizeError> {
        if let Some(token) = self.tokenize_numeric()? {
            Ok(Some(token))
        } else if let Some(token) = self.tokenize_string()? {
            Ok(Some(token))
        } else {
            Ok(None)
        }
    }

    /// Parse a single hexadecimal digit.
    pub(crate) fn hex(ch: u8) -> Option<u32> {
        match ch {
            b'0'..=b'9' => Some((ch - b'0') as u32),
            b'a'..=b'f' => Some((ch - b'a') as u32 + 10),
            b'A'..=b'F' => Some((ch - b'A') as u32 + 10),
            _ => None,
        }
    }

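    /// Parse a numeric literal: a decimal or hexadecimal (`0x`/`0X`) numeral with an
    /// optional fraction and an optional exponent (`e`/`E` for decimal, `p`/`P` for hex).
    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("0x1F");
    /// assert!(tokenizer.tokenize_numeric().unwrap().is_some());
    ///
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("3.14");
    /// assert!(tokenizer.tokenize_numeric().unwrap().is_some());
    /// ```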
    pub fn tokenize_numeric(&mut self) -> Result<Option<Token>, TokenizeError> {
        let i0 = self.byte_offset;
        // check if it is hex
        if self.starts_with_and_advance(b"0x") || self.starts_with_and_advance(b"0X") {
            // hex

            let mut value = IntOrFloat::from(0);

            // hex digits of the integer part
            let mut hexs_exist = false;
            while let Some(ch) = self.peek() {
                if let Some(hex) = Self::hex(ch) {
                    self.advance();
                    hexs_exist = true;
                    value *= 16 as IntType;
                    value += hex as IntType;
                } else {
                    break;
                }
            }

            if hexs_exist {
                // check fraction
                // dot
                if self.peek() == Some(b'.') {
                    self.advance();

                    // zero or more hex digits for the fraction
                    let base = (1.0 / 16.0) as FloatType;
                    let mut exp = base;
                    while let Some(ch) = self.peek() {
                        if let Some(hex) = Self::hex(ch) {
                            self.advance();

                            let f = hex as FloatType * exp;
                            value += f;
                            exp *= base;
                        } else {
                            break;
                        }
                    }
                }
            } else {
                // no hex digits in the integer part.

                // dot must exist.
                if self.peek() != Some(b'.') {
                    self.set_cursor(i0);
                    return Ok(None);
                }
                self.advance();

                // one or more hex digits for the fraction must exist
                let mut fraction_exist = false;
                let base = (1.0 / 16.0) as FloatType;
                let mut exp = base;
                while let Some(ch) = self.peek() {
                    if let Some(hex) = Self::hex(ch) {
                        fraction_exist = true;
                        self.advance();

                        let f = hex as FloatType * exp;
                        value += f;
                        exp *= base;
                    } else {
                        break;
                    }
                }
                if !fraction_exist {
                    self.set_cursor(i0);
                    return Ok(None);
                }
            }

            // check exponent
            // p or P
            if self.peek() == Some(b'p') || self.peek() == Some(b'P') {
                self.advance();

                // '+' or '-'
                let is_neg = match self.peek() {
                    Some(b'+') => {
                        self.advance();
                        false
                    }
                    Some(b'-') => {
                        self.advance();
                        true
                    }
                    _ => false,
                };

                // one or more digits for the exponent
                let mut exp_digit_exist = false;
                let mut binary_exp: u32 = 0;
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        self.advance();
                        exp_digit_exist = true;
                        let d = (ch - b'0') as u32;
                        binary_exp = binary_exp.wrapping_mul(10).wrapping_add(d);
                    } else {
                        break;
                    }
                }
                if !exp_digit_exist {
                    return Err(TokenizeError::NumericEmpty {
                        start: i0,
                        pos: self.byte_offset,
                    });
                }

                if is_neg {
                    for _ in 0..binary_exp {
                        value *= 0.5 as FloatType;
                    }
                } else {
                    for _ in 0..binary_exp {
                        value *= 2 as IntType;
                    }
                }
            }

            let token = Token {
                token_type: TokenType::Numeric(value),
                span: Span::new(i0, self.byte_offset),
            };
            Ok(Some(token))
        } else {
            let mut value = IntOrFloat::from(0);

            // decimal digits of the integer part
            let mut decimal_exist = false;
            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    decimal_exist = true;
                    self.advance();
                    value *= 10 as IntType;
                    value += (ch - b'0') as IntType;
                } else {
                    break;
                }
            }

            if decimal_exist {
                // check fraction
                // dot
                if self.peek() == Some(b'.') {
                    self.advance();

                    value = value.to_float().into();

                    // zero or more digits for the fraction
                    let base = (1.0 / 10.0) as FloatType;
                    let mut exp = base;
                    while let Some(ch) = self.peek() {
                        if ch.is_ascii_digit() {
                            self.advance();

                            let f = (ch - b'0') as FloatType * exp;
                            value += f;
                            exp *= base;
                        } else {
                            break;
                        }
                    }
                }
            } else {
                // no digits in the integer part.

                // dot must exist.
                if self.peek() != Some(b'.') {
                    self.set_cursor(i0);
                    return Ok(None);
                }
                self.advance();

                // one or more digits for the fraction must exist
                let mut digit_exist = false;
                let base = (1.0 / 10.0) as FloatType;
                let mut exp = base;
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        digit_exist = true;
                        self.advance();

                        let f = (ch - b'0') as FloatType * exp;
                        value += f;
                        exp *= base;
                    } else {
                        break;
                    }
                }
                if !digit_exist {
                    self.set_cursor(i0);
                    return Ok(None);
                }
            }

            // check exponent
            // e or E
            if self.peek() == Some(b'e') || self.peek() == Some(b'E') {
                self.advance();

                // '+' or '-'
                let is_neg = match self.peek() {
                    Some(b'+') => {
                        self.advance();
                        false
                    }
                    Some(b'-') => {
                        self.advance();
                        true
                    }
                    _ => false,
                };

                // one or more digits for the exponent
                let mut exp_digit_exist = false;
                let mut base10_exp: u32 = 0;
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        self.advance();
                        exp_digit_exist = true;
                        let d = (ch - b'0') as u32;
                        base10_exp = base10_exp.wrapping_mul(10).wrapping_add(d);
                    } else {
                        break;
                    }
                }
                if !exp_digit_exist {
                    return Err(TokenizeError::NumericEmpty {
                        start: i0,
                        pos: self.byte_offset,
                    });
                }

                if is_neg {
                    for _ in 0..base10_exp {
                        value *= 0.1 as FloatType;
                    }
                } else {
                    for _ in 0..base10_exp {
                        value *= 10 as IntType;
                    }
                }
            }

            let token = Token {
                token_type: TokenType::Numeric(value),
                span: Span::new(i0, self.byte_offset),
            };
            Ok(Some(token))
        }
    }
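    /// Parse the body of a short string literal, assuming the opening quote has
    /// already been consumed. `delim` is the quote character (`'` or `"`) and
    /// `start` is the byte offset of the opening quote (used for error reporting).
    ///
    /// A small illustrative sketch:
    /// ```rust
    /// // the opening quote is consumed by the caller, so it is not part of the input here.
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new(r#"a\nb" tail"#);
    /// let bytes = tokenizer.short_string_literal(b'"', 0).unwrap();
    /// assert_eq!(bytes, b"a\nb".to_vec());
    /// ```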
    pub fn short_string_literal(
        &mut self,
        delim: u8,
        start: usize,
    ) -> Result<Vec<u8>, TokenizeError> {
        let mut s = Vec::<u8>::new();
        while let Some(ch) = self.peek() {
            if ch == delim {
                self.advance();
                return Ok(s);
            }
            match ch {
                b'\\' => {
                    let escape_start = self.byte_offset;
                    // escape
                    // consume '\\'
                    self.advance();
                    match self.peek() {
                        Some(b'z') => {
                            self.advance();
                            self.ignore_whitespace();
                        }
                        Some(b'a') => {
                            s.push(b'\x07');
                            self.advance();
                        }
                        Some(b'b') => {
                            s.push(b'\x08');
                            self.advance();
                        }
                        Some(b'f') => {
                            s.push(b'\x0c');
                            self.advance();
                        }
                        Some(b'n') | Some(b'\n') => {
                            s.push(b'\n');
                            self.advance();
                        }
                        Some(b'r') => {
                            s.push(b'\r');
                            self.advance();
                        }
                        Some(b't') => {
                            s.push(b'\t');
                            self.advance();
                        }
                        Some(b'v') => {
                            s.push(b'\x0b');
                            self.advance();
                        }
                        Some(b'\\') => {
                            s.push(b'\\');
                            self.advance();
                        }
                        Some(b'\"') => {
                            s.push(b'\"');
                            self.advance();
                        }
                        Some(b'\'') => {
                            s.push(b'\'');
                            self.advance();
                        }
                        Some(b'x') => {
                            // two hex digits
                            self.advance();

                            if let Some(first) = self.peek() {
                                if let Some(first) = Self::hex(first) {
                                    self.advance();
                                    if let Some(second) = self.peek() {
                                        if let Some(second) = Self::hex(second) {
                                            s.push((first * 16u32 + second) as u8);
                                            self.advance();
                                        } else {
                                            // not hex
                                            return Err(TokenizeError::ShortStringNotHex {
                                                start,
                                                pos: self.byte_offset,
                                            });
                                        }
                                    } else {
                                        // not closed
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                } else {
                                    // not hex
                                    return Err(TokenizeError::ShortStringNotHex {
                                        start,
                                        pos: self.byte_offset,
                                    });
                                }
                            } else {
                                // not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }
                        Some(b'0'..=b'9') => {
                            // up to three decimal digits
                            let first: u32 = (self.peek().unwrap() - b'0') as u32;
                            self.advance();

                            if let Some(second) = self.peek() {
                                if second.is_ascii_digit() {
                                    let second: u32 = (second - b'0') as u32;
                                    self.advance();
                                    if let Some(third) = self.peek() {
                                        if third.is_ascii_digit() {
                                            let third: u32 = (third - b'0') as u32;
                                            self.advance();
                                            let value = first * 100 + second * 10 + third;
                                            // a decimal escape larger than 255 does not fit in a single byte
                                            if value > 255 {
                                                return Err(TokenizeError::ShortStringOverflow {
                                                    start,
                                                    pos: self.byte_offset,
                                                });
                                            }
                                            s.push(value as u8);
                                        } else {
                                            s.push((first * 10 + second) as u8);
                                        }
                                    } else {
                                        // not closed
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                } else {
                                    s.push(first as u8);
                                }
                            } else {
                                // not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }
                        Some(b'u') => {
                            self.advance();
                            // \u{X+}

                            if let Some(open) = self.peek() {
                                if open == b'{' {
                                    self.advance();

                                    let mut codepoint = 0i32;
                                    let mut closed = false;
                                    let mut count = 0;
                                    while let Some(ch) = self.peek() {
                                        if ch == b'}' {
                                            closed = true;
                                            self.advance();
                                            break;
                                        }
                                        if let Some(digit) = Self::hex(ch) {
                                            count += 1;
                                            if let Some(mul) = codepoint.checked_mul(16i32) {
                                                codepoint = mul;
                                            } else {
                                                return Err(TokenizeError::ShortStringOverflow {
                                                    start,
                                                    pos: self.byte_offset,
                                                });
                                            }
                                            if let Some(add) = codepoint.checked_add(digit as i32) {
                                                codepoint = add;
                                            } else {
                                                return Err(TokenizeError::ShortStringOverflow {
                                                    start,
                                                    pos: self.byte_offset,
                                                });
                                            }
                                            self.advance();
                                        } else {
                                            // not hex
                                            return Err(TokenizeError::ShortStringNotHex {
                                                start,
                                                pos: self.byte_offset,
                                            });
                                        }
                                    }

                                    if !closed {
                                        // not closed
                                        return Err(TokenizeError::ShortStringNotClosed {
                                            delim: delim as char,
                                            start,
                                            end: self.byte_offset,
                                        });
                                    }
                                    if count == 0 {
                                        // empty codepoint
                                        return Err(TokenizeError::ShortStringEmptyCodepoint {
                                            start,
                                            escape_start,
                                            escape_end: self.byte_offset,
                                        });
                                    }

                                    // encode the codepoint as (extended) UTF-8; the `\u{}` escape
                                    // allows values up to 0x7FFF_FFFF, beyond the Unicode range.
                                    fn encode_u32_to_extended_utf8(u: i32) -> Vec<u8> {
                                        if u < 0 {
                                            unreachable!("encode_u32_to_extended_utf8: u < 0");
                                        }
                                        let u = u as u32;
                                        // Determine how many bytes are needed based on the value
                                        let bytes_needed = match u {
                                            0x0000_0000..=0x0000_007F => 1,
                                            0x0000_0080..=0x0000_07FF => 2,
                                            0x0000_0800..=0x0000_FFFF => 3,
                                            0x0001_0000..=0x001F_FFFF => 4,
                                            0x0020_0000..=0x03FF_FFFF => 5,
                                            0x0400_0000..=0x7FFF_FFFF => 6,
                                            _ => unreachable!(),
                                        };

                                        let mut bytes = Vec::with_capacity(bytes_needed);

                                        match bytes_needed {
                                            1 => {
                                                // 0xxxxxxx
                                                bytes.push(u as u8);
                                            }
                                            2 => {
                                                // 110xxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1100_0000 | ((u >> 6) as u8 & 0b0001_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            3 => {
                                                // 1110xxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1110_0000 | ((u >> 12) as u8 & 0b0000_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            4 => {
                                                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_0000 | ((u >> 18) as u8 & 0b0000_0111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            5 => {
                                                // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_1000 | ((u >> 24) as u8 & 0b0000_0011),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 18) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            6 => {
                                                // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                                                bytes.push(
                                                    0b1111_1100 | ((u >> 30) as u8 & 0b0000_0001),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 24) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 18) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 12) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(
                                                    0b1000_0000 | ((u >> 6) as u8 & 0b0011_1111),
                                                );
                                                bytes.push(0b1000_0000 | (u as u8 & 0b0011_1111));
                                            }
                                            _ => unreachable!(),
                                        }

                                        bytes
                                    }

                                    s.append(&mut encode_u32_to_extended_utf8(codepoint));
                                } else {
                                    // '{' not present
                                    return Err(TokenizeError::ShortStringNoOpenBrace {
                                        start,
                                        pos: self.byte_offset,
                                    });
                                }
                            } else {
                                // not closed
                                return Err(TokenizeError::ShortStringNotClosed {
                                    delim: delim as char,
                                    start,
                                    end: self.byte_offset,
                                });
                            }
                        }

                        Some(other) => {
                            return Err(TokenizeError::ShortStringInvalidEscape {
                                start,
                                pos: self.byte_offset,
                                escape: other as char,
                            });
                        }
                        None => {
                            return Err(TokenizeError::ShortStringNotClosed {
                                delim: delim as char,
                                start,
                                end: self.byte_offset,
                            });
                        }
                    }
                }
                b'\n' => {
                    return Err(TokenizeError::ShortStringNewline {
                        start,
                        pos: self.byte_offset,
                    });
                }
                _ => {
                    s.push(ch);
                    self.advance();
                }
            }
        }
        // not closed
        Err(TokenizeError::ShortStringNotClosed {
            delim: delim as char,
            start,
            end: self.byte_offset,
        })
    }
    /// Parse the body of a long string literal, assuming the opening long bracket
    /// (`[[`, `[=[`, ...) has already been consumed. `equal_count` is the number of
    /// `=` signs in the opening bracket.
    pub fn long_string_literal(
        &mut self,
        equal_count: usize,
        start: usize,
    ) -> Result<Vec<u8>, TokenizeError> {
        let mut s = Vec::<u8>::new();
        while let Some(ch) = self.peek() {
            match ch {
                b']' => {
                    // check for the end of the long string literal
                    let cursor0 = self.get_cursor();
                    if let Some(count) = self.long_bracket(b']') {
                        if count == equal_count {
                            return Ok(s);
                        } else {
                            self.set_cursor(cursor0);
                            self.advance();
                            s.push(ch);
                        }
                    } else {
                        self.advance();
                        s.push(ch);
                    }
                }

                _ => {
                    s.push(ch);
                    self.advance();
                }
            }
        }
        // not closed
        Err(TokenizeError::LongStringNotClosed {
            start,
            end: self.byte_offset,
            equal_count,
        })
    }
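    /// Tokenize a short (`'...'` / `"..."`) or long (`[[...]]`) string literal starting
    /// at the current cursor position.
    ///
    /// A small illustrative sketch:
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("'hello'");
    /// assert!(tokenizer.tokenize_string().unwrap().is_some());
    ///
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("[==[ long string ]==]");
    /// assert!(tokenizer.tokenize_string().unwrap().is_some());
    /// ```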
    pub fn tokenize_string(&mut self) -> Result<Option<Token>, TokenizeError> {
        match self.peek() {
            Some(b'\'') | Some(b'"') => {
                // a leading ' or " means this is definitely a short string literal.
                let i0 = self.get_cursor();
                let quote = self.peek().unwrap();
                self.advance();

                let s = self.short_string_literal(quote, i0)?;

                let token = Token {
                    token_type: TokenType::String(s),
                    span: Span::new(i0, self.byte_offset),
                };
                Ok(Some(token))
            }
            Some(b'[') => {
                // long string literal
                let i0 = self.get_cursor();
                if let Some(open_count) = self.long_bracket(b'[') {
                    // since the long bracket `[[` is consumed, it is definitely a long string literal.
                    let s = self.long_string_literal(open_count, i0)?;

                    let token = Token {
                        token_type: TokenType::String(s),
                        span: Span::new(i0, self.byte_offset),
                    };
                    Ok(Some(token))
                } else {
                    self.set_cursor(i0);
                    Ok(None)
                }
            }
            _ => Ok(None),
        }
    }

    /// Consume a long bracket and return the number of `=` signs in it.
    /// `bracket` must be either b'[' or b']'.
    /// On failure the cursor is restored and `None` is returned.
    pub(crate) fn long_bracket(&mut self, bracket: u8) -> Option<usize> {
        debug_assert!(bracket == b'[' || bracket == b']');
        let cursor0 = self.get_cursor();
        if self.peek() == Some(bracket) {
            // consume the first bracket
            self.advance();

            // the number of '='
            let mut count = 0;
            while let Some(ch) = self.peek() {
                if ch == bracket {
                    // consume the second bracket
                    self.advance();
                    return Some(count);
                } else if ch == b'=' {
                    // consume '='
                    self.advance();
                    count += 1;
                } else {
                    self.set_cursor(cursor0);
                    return None;
                }
            }
            // eof reached before the second bracket
            self.set_cursor(cursor0);
            None
        } else {
            None
        }
    }
    pub(crate) fn starts_with_and_advance(&mut self, prefix: &[u8]) -> bool {
        let slice = &self.source[self.byte_offset..];
        if slice.starts_with(prefix) {
            self.advance_n(prefix.len());
            true
        } else {
            false
        }
    }

    /// Try to tokenize a single token.
    /// Returns `Ok(None)` when the end of the source is reached.
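    ///
    /// A small illustrative sketch (comments are skipped, so only four tokens remain):
    /// ```rust
    /// let mut tokenizer = lua_tokenizer::Tokenizer::new("-- a comment\nreturn 1 + 2");
    /// let mut count = 0;
    /// while let Some(_token) = tokenizer.try_tokenize().unwrap() {
    ///     count += 1;
    /// }
    /// assert_eq!(count, 4); // `return`, `1`, `+`, `2`
    /// ```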
    pub fn try_tokenize(&mut self) -> Result<Option<Token>, TokenizeError> {
        self.ignore_whitespace();
        // check eof
        if self.byte_offset >= self.source.len() {
            return Ok(None);
        }

        if let Some(token) = self.tokenize_ident() {
            // try ident
            Ok(Some(token))
        } else if let Some(token) = self.tokenize_literal()? {
            // try literal
            Ok(Some(token))
        } else {
            // try punctuator

            macro_rules! advance_and_return {
                ($token_type:ident) => {{
                    self.advance();
                    Ok(Some(Token {
                        token_type: TokenType::$token_type,
                        span: Span::new(self.byte_offset - 1, self.byte_offset),
                    }))
                }};
            }

            let ch = self.peek().unwrap();
            match ch {
                b'+' => {
                    advance_and_return!(Plus)
                }
                b'*' => {
                    advance_and_return!(Asterisk)
                }
                b'/' => {
                    // check for SlashSlash
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'/') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::SlashSlash,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Slash,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'%' => {
                    advance_and_return!(Percent)
                }
                b'^' => {
                    advance_and_return!(Caret)
                }
                b'#' => {
                    advance_and_return!(Hash)
                }
                b'&' => {
                    advance_and_return!(Ampersand)
                }
                b'~' => {
                    // check for TildeEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'=') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::TildeEqual,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Tilde,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'|' => {
                    advance_and_return!(Pipe)
                }
                b'<' => {
                    // check for LessLess / LessEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    match self.peek() {
                        Some(b'<') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::LessLess,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }
                        Some(b'=') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::LessEqual,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }

                        _ => Ok(Some(Token {
                            token_type: TokenType::Less,
                            span: Span::new(i0, i0 + 1),
                        })),
                    }
                }
                b'>' => {
                    // check for GreaterGreater / GreaterEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    match self.peek() {
                        Some(b'>') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::GreaterGreater,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }
                        Some(b'=') => {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::GreaterEqual,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }

                        _ => Ok(Some(Token {
                            token_type: TokenType::Greater,
                            span: Span::new(i0, i0 + 1),
                        })),
                    }
                }
                b'=' => {
                    // check for EqualEqual
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'=') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::EqualEqual,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Equal,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }

                b'(' => {
                    advance_and_return!(LParen)
                }
                b')' => {
                    advance_and_return!(RParen)
                }
                b'{' => {
                    advance_and_return!(LBrace)
                }
                b'}' => {
                    advance_and_return!(RBrace)
                }
                b'[' => {
                    advance_and_return!(LBracket)
                }
                b']' => {
                    advance_and_return!(RBracket)
                }
                b':' => {
                    // check for ColonColon
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b':') {
                        self.advance();
                        Ok(Some(Token {
                            token_type: TokenType::ColonColon,
                            span: Span::new(i0, self.byte_offset),
                        }))
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Colon,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b';' => {
                    advance_and_return!(Semicolon)
                }
                b',' => {
                    advance_and_return!(Comma)
                }
                b'.' => {
                    // check for DotDot / DotDotDot
                    let i0 = self.byte_offset;
                    self.advance();

                    if self.peek() == Some(b'.') {
                        self.advance();

                        if self.peek() == Some(b'.') {
                            self.advance();
                            Ok(Some(Token {
                                token_type: TokenType::DotDotDot,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        } else {
                            Ok(Some(Token {
                                token_type: TokenType::DotDot,
                                span: Span::new(i0, self.byte_offset),
                            }))
                        }
                    } else {
                        Ok(Some(Token {
                            token_type: TokenType::Dot,
                            span: Span::new(i0, i0 + 1),
                        }))
                    }
                }
                b'-' => {
                    let i0 = self.byte_offset;
                    // check for the start of a comment
                    if self.starts_with_and_advance(b"--") {
                        // check for the start of a multi-line comment
                        if let Some(open_equal_count) = self.long_bracket(b'[') {
                            let multiline_comment_begin = (i0, self.byte_offset);

                            while self.byte_offset < self.source.len() {
                                if let Some(close_equal_count) = self.long_bracket(b']') {
                                    if close_equal_count == open_equal_count {
                                        return self.try_tokenize();
                                    }
                                    // since `long_bracket` is parsed, the cursor is currently at the next position of ']'.
                                    // ]====]
                                    //       ^ here
                                    // move the cursor back so that it points to the last ']',
                                    // so we can test other long closing brackets.
                                    self.byte_offset -= 1;
                                } else {
                                    self.advance()
                                }
                            }
                            // eof reached
                            // multi-line comment not closed
                            Err(TokenizeError::MultilineCommentNotClosed {
                                start: multiline_comment_begin.0,
                                end: multiline_comment_begin.1,
                            })
                        } else {
                            // it is a line comment
                            while let Some(ch) = self.peek() {
                                self.advance();
                                if ch == b'\n' {
                                    break;
                                }
                            }
                            self.try_tokenize()
                        }
                    } else {
                        // not a comment, so emit '-'
                        advance_and_return!(Minus)
                    }
                }

                _ => {
                    // invalid punctuator
                    Err(TokenizeError::InvalidPunct {
                        pos: self.byte_offset,
                        punct: ch as char,
                    })
                }
            }
        }
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Result<Token, TokenizeError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.try_tokenize() {
            Ok(Some(token)) => Some(Ok(token)),
            Ok(None) => None,
            Err(e) => Some(Err(e)),
        }
    }
}