rye_grain/
tokenize.rs

1use crate::tokens::{Token, TokenType};
2use std::cmp;
3use std::collections::VecDeque;
4
5use unicode_categories::UnicodeCategories;
6
/// Character buffer with a committed cursor and a speculative lookahead.
///
/// Callers `peek` ahead of the cursor, `hide` characters they decide not to
/// take, and finally either `commit` (cursor catches up with the lookahead) or
/// `revert` (lookahead falls back to the cursor).
struct RawSource {
    /// The whole input decoded into chars, so every position is a char index.
    input: Vec<char>,
    /// Index of the first char that has not been committed yet.
    cursor: usize,
    /// Index just past the last peeked char; `peek` may push it past the end of `input`.
    lookahead: usize,
}

impl RawSource {
    /// Create a source over `input` with cursor and lookahead at the start.
    pub fn new(input: &str) -> RawSource {
        RawSource {
            input: input.chars().collect(),
            cursor: 0,
            lookahead: 0,
        }
    }

    /// True once every char of the input has been committed.
    fn at_end(&self) -> bool {
        self.cursor >= self.input.len()
    }

    /// Index of the first uncommitted char.
    fn committed_index(&self) -> usize {
        self.cursor
    }

    /// Index just past the last peeked char (not clamped to the input length).
    fn peeked_index(&self) -> usize {
        self.lookahead
    }

    /// Return a slice of the input of size `window`, or the rest of the input
    /// if fewer chars remain, starting from the last peek.
    ///
    /// The lookahead always advances by the full `window`, even when the
    /// returned slice is shorter, so callers can `hide` a fixed amount.
    fn peek(&mut self, window: usize) -> &[char] {
        let window_open = cmp::min(self.lookahead, self.input.len());
        let window_close = cmp::min(self.lookahead + window, self.input.len());
        self.lookahead += window;
        &self.input[window_open..window_close]
    }

    /// Move the lookahead back by `window` chars.
    ///
    /// # Panics
    ///
    /// Panics if that would move the lookahead before the committed cursor.
    fn hide(&mut self, window: usize) {
        let cursor = self.cursor;
        // Validate before mutating: a plain `-=` followed by an assert wraps
        // around in release builds and lets the assert pass on garbage.
        self.lookahead = self
            .lookahead
            .checked_sub(window)
            .filter(|&position| position >= cursor)
            .expect("Cannot hide what was already consumed");
    }

    /// Accept everything peeked so far; the cursor never moves past the input end.
    fn commit(&mut self) {
        self.cursor = cmp::min(self.lookahead, self.input.len());
    }

    /// Discard everything peeked since the last commit.
    fn revert(&mut self) {
        self.lookahead = self.cursor;
    }

    /// The text between the cursor and the lookahead.
    ///
    /// The end is clamped to the input length, matching `commit`, because
    /// `peek` is allowed to leave the lookahead past the end of the input.
    fn peeked_string(&self) -> String {
        let end = cmp::min(self.lookahead, self.input.len());
        self.input[self.cursor..end].iter().collect()
    }
}
63
/// Tokenizer state: the character source plus the bookkeeping carried between tokens.
64pub struct TokenStream {
    /// Character buffer being tokenized, with its committed cursor and lookahead.
65    source: RawSource,
    /// Set once a token other than NL/COMMENT is committed and cleared by NEWLINE.
66    within_statement: bool,
    /// Count of "(" seen minus ")" seen (saturating at zero).
67    parenthesis_level: usize,
    /// Count of "[" seen minus "]" seen (saturating at zero).
68    bracket_level: usize,
    /// Count of "{" seen minus "}" seen (saturating at zero).
69    brace_level: usize,
    /// Stack of indentation widths of the enclosing blocks; starts as `[0]`.
70    indents_seen: Vec<usize>,
    /// Tokens produced so far, appended at the back.
71    tokens: VecDeque<Token>,
    /// Starts `false`; NOTE(review): presumably marks end of input, but its use is not visible here.
72    ended: bool,
73}
74
75impl TokenStream {
76    pub fn new(input: &str) -> TokenStream {
77        TokenStream {
78            source: RawSource::new(input),
79            within_statement: false,
80            parenthesis_level: 0,
81            bracket_level: 0,
82            brace_level: 0,
83            indents_seen: vec![0],
84            tokens: VecDeque::new(),
85            ended: false,
86        }
87    }
88
89    fn add_token(
90        &mut self,
91        token_type: TokenType,
92        exact_token_type: TokenType,
93        token_contents: String,
94        col_start: usize,
95        col_end: usize,
96    ) {
97        self.tokens.push_back(Token {
98            token_type,
99            exact_token_type,
100            token_contents,
101            col_start,
102            col_end,
103        });
104    }
105
106    fn commit_to_token(&mut self, token_type: TokenType) {
107        self.commit_to_exact_token(token_type, token_type)
108    }
109
110    fn commit_to_exact_token(&mut self, token_type: TokenType, exact_token_type: TokenType) {
111        self.add_token(
112            token_type,
113            exact_token_type,
114            self.source.peeked_string(),
115            self.source.committed_index(),
116            self.source.peeked_index(),
117        );
118        self.source.commit();
119        if token_type == TokenType::NEWLINE {
120            self.within_statement = false;
121        } else if !(token_type == TokenType::NL || token_type == TokenType::COMMENT) {
122            self.within_statement = true;
123        };
124        // else NL and COMMENT don't either start or end a statement
125    }
126
127    /// Attempt to consume the longest valid op token from the source
128    /// advance the cursor if any op is found
129    fn consume_next_op_token(&mut self) -> bool {
130        let exact_token_type: TokenType;
131        match self.source.peek(3) {
132            ['*', '*', '='] => exact_token_type = TokenType::DOUBLESTAREQUAL,
133            ['.', '.', '.'] => exact_token_type = TokenType::ELLIPSIS,
134            ['/', '/', '='] => exact_token_type = TokenType::DOUBLESLASHEQUAL,
135            ['<', '<', '='] => exact_token_type = TokenType::LEFTSHIFTEQUAL,
136            ['>', '>', '='] => exact_token_type = TokenType::RIGHTSHIFTEQUAL,
137            ['!', '=', ..] => {
138                exact_token_type = TokenType::NOTEQUAL;
139                self.source.hide(1);
140            }
141            ['%', '=', ..] => {
142                exact_token_type = TokenType::PERCENTEQUAL;
143                self.source.hide(1);
144            }
145            ['&', '=', ..] => {
146                exact_token_type = TokenType::AMPEREQUAL;
147                self.source.hide(1);
148            }
149            ['*', '*', ..] => {
150                exact_token_type = TokenType::DOUBLESTAR;
151                self.source.hide(1);
152            }
153            ['*', '=', ..] => {
154                exact_token_type = TokenType::STAREQUAL;
155                self.source.hide(1);
156            }
157            ['+', '=', ..] => {
158                exact_token_type = TokenType::PLUSEQUAL;
159                self.source.hide(1);
160            }
161            ['-', '=', ..] => {
162                exact_token_type = TokenType::MINEQUAL;
163                self.source.hide(1);
164            }
165            ['-', '>', ..] => {
166                exact_token_type = TokenType::RARROW;
167                self.source.hide(1);
168            }
169            ['/', '/', ..] => {
170                exact_token_type = TokenType::DOUBLESLASH;
171                self.source.hide(1);
172            }
173            ['/', '=', ..] => {
174                exact_token_type = TokenType::SLASHEQUAL;
175                self.source.hide(1);
176            }
177            [':', '=', ..] => {
178                exact_token_type = TokenType::COLONEQUAL;
179                self.source.hide(1);
180            }
181            ['<', '<', ..] => {
182                exact_token_type = TokenType::LEFTSHIFT;
183                self.source.hide(1);
184            }
185            ['<', '=', ..] => {
186                exact_token_type = TokenType::LESSEQUAL;
187                self.source.hide(1);
188            }
189            ['<', '>', ..] => {
190                exact_token_type = TokenType::NOTEQUAL;
191                self.source.hide(1);
192            }
193            ['=', '=', ..] => {
194                exact_token_type = TokenType::EQEQUAL;
195                self.source.hide(1);
196            }
197            ['>', '=', ..] => {
198                exact_token_type = TokenType::GREATEREQUAL;
199                self.source.hide(1);
200            }
201            ['>', '>', ..] => {
202                exact_token_type = TokenType::RIGHTSHIFT;
203                self.source.hide(1);
204            }
205            ['@', '=', ..] => {
206                exact_token_type = TokenType::ATEQUAL;
207                self.source.hide(1);
208            }
209            ['^', '=', ..] => {
210                exact_token_type = TokenType::CIRCUMFLEXEQUAL;
211                self.source.hide(1);
212            }
213            ['|', '=', ..] => {
214                exact_token_type = TokenType::VBAREQUAL;
215                self.source.hide(1);
216            }
217            ['%', ..] => {
218                exact_token_type = TokenType::PERCENT;
219                self.source.hide(2);
220            }
221            ['&', ..] => {
222                exact_token_type = TokenType::AMPER;
223                self.source.hide(2);
224            }
225            ['(', ..] => {
226                exact_token_type = TokenType::LPAR;
227                self.source.hide(2);
228                self.parenthesis_level += 1;
229            }
230            [')', ..] => {
231                exact_token_type = TokenType::RPAR;
232                self.source.hide(2);
233                self.parenthesis_level = self.parenthesis_level.saturating_sub(1);
234            }
235            ['*', ..] => {
236                exact_token_type = TokenType::STAR;
237                self.source.hide(2);
238            }
239            ['+', ..] => {
240                exact_token_type = TokenType::PLUS;
241                self.source.hide(2);
242            }
243            [',', ..] => {
244                exact_token_type = TokenType::COMMA;
245                self.source.hide(2);
246            }
247            ['-', ..] => {
248                exact_token_type = TokenType::MINUS;
249                self.source.hide(2);
250            }
251            ['.', ..] => {
252                exact_token_type = TokenType::DOT;
253                self.source.hide(2);
254            }
255            ['/', ..] => {
256                exact_token_type = TokenType::SLASH;
257                self.source.hide(2);
258            }
259            [':', ..] => {
260                exact_token_type = TokenType::COLON;
261                self.source.hide(2);
262            }
263            [';', ..] => {
264                exact_token_type = TokenType::SEMI;
265                self.source.hide(2);
266            }
267            ['<', ..] => {
268                exact_token_type = TokenType::LESS;
269                self.source.hide(2);
270            }
271            ['=', ..] => {
272                exact_token_type = TokenType::EQUAL;
273                self.source.hide(2);
274            }
275            ['>', ..] => {
276                exact_token_type = TokenType::GREATER;
277                self.source.hide(2);
278            }
279            ['@', ..] => {
280                exact_token_type = TokenType::AT;
281                self.source.hide(2);
282            }
283            ['[', ..] => {
284                exact_token_type = TokenType::LSQB;
285                self.source.hide(2);
286                self.bracket_level += 1;
287            }
288            [']', ..] => {
289                exact_token_type = TokenType::RSQB;
290                self.source.hide(2);
291                self.bracket_level = self.bracket_level.saturating_sub(1);
292            }
293            ['^', ..] => {
294                exact_token_type = TokenType::CIRCUMFLEX;
295                self.source.hide(2);
296            }
297            ['{', ..] => {
298                exact_token_type = TokenType::LBRACE;
299                self.source.hide(2);
300                self.brace_level += 1;
301            }
302            ['|', ..] => {
303                exact_token_type = TokenType::VBAR;
304                self.source.hide(2);
305            }
306            ['}', ..] => {
307                exact_token_type = TokenType::RBRACE;
308                self.source.hide(2);
309                self.brace_level = self.brace_level.saturating_sub(1);
310            }
311            ['~', ..] => {
312                exact_token_type = TokenType::TILDE;
313                self.source.hide(2);
314            }
315            _ => {
316                self.source.revert();
317                return false;
318            }
319        }
320        self.commit_to_exact_token(TokenType::OP, exact_token_type);
321        true
322    }
323
324    fn is_start_of_name(c: &char) -> bool {
325        // XXX: also chars with Other_ID_Start property
326        c.is_letter_uppercase()
327            || c.is_letter_lowercase()
328            || c.is_letter_titlecase()
329            || c.is_letter_modifier()
330            || c.is_letter_other()
331            || c.is_number_letter()
332            || *c == '_'
333    }
334
335    fn is_part_of_name(c: &char) -> bool {
336        // XXX: also chars with Other_ID_Continue
337        Self::is_start_of_name(c)
338            || c.is_mark_nonspacing()
339            || c.is_mark_spacing_combining()
340            || c.is_number_decimal_digit()
341            || c.is_punctuation_connector()
342    }
343
344    /// Attempt to consume the longest valid name token from the source
345    /// advance the cursor if any name is found
346    fn consume_next_name_token(&mut self) -> bool {
347        if let [next] = self.source.peek(1) {
348            if !Self::is_start_of_name(next) {
349                self.source.hide(1);
350                return false;
351            }
352        } else {
353            self.source.hide(1);
354            return false;
355        };
356
357        loop {
358            if let [next] = self.source.peek(1) {
359                if !Self::is_part_of_name(next) {
360                    self.source.hide(1);
361                    break;
362                }
363            } else {
364                self.source.hide(1);
365                break;
366            };
367        }
368
369        self.commit_to_token(TokenType::NAME);
370        true
371    }
372
373    fn is_bin_digit(c: &char) -> bool {
374        *c == '0' || *c == '1'
375    }
376
377    fn is_oct_digit(c: &char) -> bool {
378        Self::is_bin_digit(c)
379            || *c == '2'
380            || *c == '3'
381            || *c == '4'
382            || *c == '5'
383            || *c == '6'
384            || *c == '7'
385    }
386
387    fn is_dec_digit(c: &char) -> bool {
388        Self::is_oct_digit(c) || *c == '8' || *c == '9'
389    }
390
391    fn is_hex_digit(c: &char) -> bool {
392        Self::is_dec_digit(c)
393            || *c == 'a'
394            || *c == 'b'
395            || *c == 'c'
396            || *c == 'd'
397            || *c == 'e'
398            || *c == 'f'
399            || *c == 'A'
400            || *c == 'B'
401            || *c == 'C'
402            || *c == 'D'
403            || *c == 'E'
404            || *c == 'F'
405    }
406
407    /// only to be called after the smallest matching sequence of characters has been found
408    /// for non-decimal integers this means "0[bBoOxX][0-9a-fA-F]" has already been matched
409    /// for decimal integers just the first digit has already been matched
410    fn find_end_of_integer(&mut self, valid_digit: fn(&char) -> bool) {
411        let mut last_under = false;
412        loop {
413            match self.source.peek(1) {
414                ['_'] => {
415                    if last_under {
416                        // neither of these "_" are part of a number
417                        self.source.hide(2);
418                        return;
419                    } else {
420                        last_under = true;
421                    }
422                }
423                [next] if valid_digit(next) => {
424                    last_under = false;
425                }
426                _ => {
427                    if last_under {
428                        self.source.hide(2);
429                    } else {
430                        self.source.hide(1);
431                    };
432                    return;
433                }
434            }
435        }
436    }
437
438    /// only to be called after a potential exponent was found after an already valid number token
439    /// this means that "[0-9][.]?[eE]" has already been matched
440    fn find_end_of_exponent(&mut self) -> bool {
441        match self.source.peek(2) {
442            [next, ..] if Self::is_dec_digit(next) => {
443                self.source.hide(1); // last peek was unchecked, leave it to next call
444                self.find_end_of_integer(Self::is_dec_digit);
445                true
446            }
447            ['-' | '+', next] if Self::is_dec_digit(next) => {
448                self.find_end_of_integer(Self::is_dec_digit);
449                true
450            }
451            _ => {
452                // also hide the "e" that was already matched
453                // this returns the caller's view of the token to a valid end of number
454                self.source.hide(3);
455                false
456            }
457        }
458    }
459
460    /// Attempt to consume the longest valid number token from the source
461    /// advance the cursor if any name is found
462    fn consume_next_number_token(&mut self) -> bool {
463        let number_type: TokenType;
464
465        match self.source.peek(1) {
466            ['0'] => {
467                match self.source.peek(2) {
468                    ['b' | 'B', next] if Self::is_bin_digit(next) => {
469                        number_type = TokenType::BININT;
470                        self.find_end_of_integer(Self::is_bin_digit);
471                    }
472                    ['o' | 'O', next] if Self::is_oct_digit(next) => {
473                        number_type = TokenType::OCTINT;
474                        self.find_end_of_integer(Self::is_oct_digit);
475                    }
476                    ['x' | 'X', next] if Self::is_hex_digit(next) => {
477                        number_type = TokenType::HEXINT;
478                        self.find_end_of_integer(Self::is_hex_digit);
479                    }
480                    [next, ..] if Self::is_dec_digit(next) || *next == '_' => {
481                        // at this point the longest possible integer token is a zero as only zero
482                        // can have leading 0s
483                        let last_zero: usize;
484                        if *next == '_' {
485                            // put non-digit char back
486                            self.source.hide(2);
487                            self.find_end_of_integer(|c| *c == '0');
488                            last_zero = self.source.peeked_index();
489                        } else if *next == '0' {
490                            // put unchecked char back
491                            self.source.hide(1);
492                            self.find_end_of_integer(|c| *c == '0');
493                            last_zero = self.source.peeked_index();
494                        } else {
495                            // put unchecked char back
496                            self.source.hide(1);
497                            last_zero = self.source.peeked_index() - 1;
498                        };
499                        match self.source.peek(1) {
500                            ['.'] => {
501                                number_type = TokenType::FLOAT;
502                                self.find_end_of_integer(Self::is_dec_digit);
503                                if !matches!(self.source.peek(1), ['e' | 'E'])
504                                    || !self.find_end_of_exponent()
505                                {
506                                    self.source.hide(1);
507                                };
508                            }
509                            ['e' | 'E'] => {
510                                if self.find_end_of_exponent() {
511                                    // found exponent with base part zero
512                                    number_type = TokenType::FLOAT;
513                                } else {
514                                    // found decimal number zero spelled with multiple 0s
515                                    self.source.hide(self.source.peeked_index() - last_zero);
516                                    number_type = TokenType::INTEGER;
517                                };
518                            }
519                            [next] if Self::is_dec_digit(next) || *next == '_' => {
520                                // 0 digits are certain to be part of one token but non-0 digits
521                                // are only part of the same token if it ends up being a float or
522                                // imaginary
523                                self.find_end_of_integer(Self::is_dec_digit);
524                                match self.source.peek(1) {
525                                    ['.'] => {
526                                        // found fraction with integer part non-zero but leading 0s
527                                        number_type = TokenType::FLOAT;
528                                        self.find_end_of_integer(Self::is_dec_digit);
529                                        if let ['e' | 'E'] = self.source.peek(1) {
530                                            self.find_end_of_exponent();
531                                        } else {
532                                            self.source.hide(1);
533                                        };
534                                    }
535                                    ['e' | 'E'] => {
536                                        if self.find_end_of_exponent() {
537                                            // found exponent with base part non-zero but with
538                                            // leading 0s
539                                            number_type = TokenType::FLOAT;
540                                        } else {
541                                            // found decimal number zero spelled with multiple 0s
542                                            number_type = TokenType::INTEGER;
543                                            self.source
544                                                .hide(self.source.peeked_index() - last_zero);
545                                        };
546                                    }
547                                    ['j' | 'J'] => {
548                                        // found imaginary whole number with leading zeros
549                                        // identification of imaginary tokens is normally done at
550                                        // the end of this function, but intigers and imaginary
551                                        // whole number tokens have different lexing rules around 0
552                                        number_type = TokenType::IMAGINARY;
553                                    }
554                                    _ => {
555                                        // found decimal number zero spelled with multiple 0s
556                                        number_type = TokenType::INTEGER;
557                                        self.source.hide(self.source.peeked_index() - last_zero);
558                                    }
559                                };
560                            }
561                            _ => {
562                                // found decimal number zero spelled with multiple 0s
563                                number_type = TokenType::INTEGER;
564                                self.source.hide(1);
565                            }
566                        };
567                    }
568                    ['.', ..] => {
569                        // put unchecked char back
570                        self.source.hide(1);
571                        number_type = TokenType::FLOAT;
572                        self.find_end_of_integer(Self::is_dec_digit);
573                        if let ['e' | 'E'] = self.source.peek(1) {
574                            self.find_end_of_exponent();
575                        } else {
576                            self.source.hide(1);
577                        };
578                    }
579                    ['e' | 'E', ..] => {
580                        // put unchecked char back
581                        self.source.hide(1);
582                        if self.find_end_of_exponent() {
583                            // found exponent with base part zero
584                            number_type = TokenType::FLOAT;
585                        } else {
586                            // found decimal number zero spelled with one 0
587                            number_type = TokenType::INTEGER;
588                        };
589                    }
590                    _ => {
591                        // found decimal number zero spelled with one 0
592                        number_type = TokenType::INTEGER;
593                        self.source.hide(2);
594                    }
595                };
596            }
597            ['1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'] => {
598                self.find_end_of_integer(Self::is_dec_digit);
599                match self.source.peek(1) {
600                    ['.'] => {
601                        // found fraction with integer part
602                        number_type = TokenType::FLOAT;
603                        self.find_end_of_integer(Self::is_dec_digit);
604                        if let ['e' | 'E'] = self.source.peek(1) {
605                            self.find_end_of_exponent();
606                        } else {
607                            self.source.hide(1);
608                        };
609                    }
610                    ['e' | 'E'] => {
611                        if self.find_end_of_exponent() {
612                            // found exponent with base part zero
613                            number_type = TokenType::FLOAT;
614                        } else {
615                            // found decimal number zero spelled with multiple 0s
616                            number_type = TokenType::INTEGER;
617                        };
618                    }
619                    _ => {
620                        // found decimal number non-zero
621                        number_type = TokenType::INTEGER;
622                        self.source.hide(1);
623                    }
624                };
625            }
626            ['.'] => {
627                match self.source.peek(1) {
628                    [next] if Self::is_dec_digit(next) => {
629                        // found float with no integer part
630                        number_type = TokenType::FLOAT;
631                        self.find_end_of_integer(Self::is_dec_digit);
632                        if let ['e' | 'E'] = self.source.peek(1) {
633                            self.find_end_of_exponent();
634                        } else {
635                            self.source.hide(1);
636                        };
637                    }
638                    _ => {
639                        // just a dot
640                        self.source.revert();
641                        return false;
642                    }
643                }
644            }
645            _ => {
646                // no number here
647                self.source.revert();
648                return false;
649            }
650        };
651
652        let exact_token_type: TokenType;
653        if let TokenType::INTEGER | TokenType::FLOAT = number_type {
654            if let ['j' | 'J'] = self.source.peek(1) {
655                exact_token_type = TokenType::IMAGINARY;
656            } else {
657                self.source.hide(1);
658                exact_token_type = number_type;
659            };
660        } else {
661            exact_token_type = number_type;
662        };
663
664        self.commit_to_exact_token(TokenType::NUMBER, exact_token_type);
665        true
666    }
667
668    /// Attempt to consume a newline
669    /// advance the cursor if a newline is detected
670    fn consume_next_newline(&mut self) -> Option<bool> {
671        return match self.source.peek(2) {
672            ['\\', '\n'] => {
673                // no tokens produced when newline escaped
674                self.source.commit();
675                Some(false)
676            }
677            ['\n', ..] => {
678                self.source.hide(1);
679                if self.within_statement
680                    && self.parenthesis_level == 0
681                    && self.bracket_level == 0
682                    && self.brace_level == 0
683                {
684                    self.commit_to_token(TokenType::NEWLINE);
685                    Some(true)
686                } else {
687                    self.commit_to_token(TokenType::NL);
688                    Some(true)
689                }
690            }
691            _ => {
692                self.source.revert();
693                None
694            }
695        };
696    }
697
698    /// Attempt to consume leading whitespace from a logical line in the source
699    /// advance the cursor if any name is found
700    /// This must be called first after every NEWLINE but not after other tokens
701    fn consume_next_dent(&mut self) -> Result<bool, String> {
702        let mut spaces: usize = 0;
703        let mut no_more_source = true;
704        while let [next] = self.source.peek(1) {
705            if *next == ' ' {
706                spaces += 1;
707            } else if *next == '\t' {
708                // round up to the next multiple of 8 spaces
709                spaces += 8 - (spaces % 8);
710            } else if *next == '\u{000C}' {
711                // formfeeds don't count toward indentation but may be interspersed
712                continue;
713            } else if *next == '\n' || *next == '\\' || *next == '#' {
714                // there is no code on this line and no tokens are produced from any indent
715                // any indent does not have to line up with any other line and has no significance
716                self.source.hide(1);
717                self.source.commit();
718                return Ok(false);
719            } else {
720                no_more_source = false;
721                break;
722            };
723        }
724        self.source.hide(1);
725        if no_more_source {
726            // there is no code on this line
727            self.source.commit();
728            return Ok(false);
729        };
730
731        match *self.indents_seen.last().unwrap() {
732            s if s == spaces => {
733                // this line is the same indentation level as the current block
734                self.source.commit();
735                Ok(false)
736            }
737            s if s < spaces => {
738                self.indents_seen.push(spaces);
739                self.commit_to_token(TokenType::INDENT);
740                Ok(true)
741            }
742            _ => {
743                // DEDENT size must match a previously seen INDENT size
744                // one or more DEDENTs may be produced until such a match is found
745                self.source.commit();
746                loop {
747                    self.indents_seen.pop();
748                    match *self.indents_seen.last().unwrap() {
749                        s if s == spaces => {
750                            self.add_token(
751                                TokenType::DEDENT,
752                                TokenType::DEDENT,
753                                String::from(""),
754                                self.source.committed_index(),
755                                self.source.committed_index(),
756                            );
757                            return Ok(true);
758                        }
759                        s if s < spaces || self.indents_seen.len() == 1 => {
760                            return Err(String::from(
761                                "dedent does not match any outer indentation level",
762                            ));
763                        }
764                        _ => {
765                            self.add_token(
766                                TokenType::DEDENT,
767                                TokenType::DEDENT,
768                                String::from(""),
769                                self.source.committed_index(),
770                                self.source.committed_index(),
771                            );
772                        }
773                    };
774                }
775            }
776        }
777    }
778
779    /// Attempt to consume any whitespace from the source
780    /// advance the cursor if any whitespace if found
781    /// This is NOT for finding INDENT/DEDENT or NL/NEWLINE tokens
782    fn consume_next_whitespace(&mut self) {
783        while let [next] = self.source.peek(1) {
784            // space, tab, and formfeed are valid inter-token whitespace
785            if *next != ' ' && *next != '\t' && *next != '\u{000C}' {
786                break;
787            };
788        }
789        // everything but the last peek was whitespace
790        self.source.hide(1);
791        self.source.commit();
792    }
793
794    fn consume_next_comment(&mut self) -> bool {
795        if let [next] = self.source.peek(1) {
796            if *next == '#' {
797                while let [next] = self.source.peek(1) {
798                    if *next == '\n' {
799                        break;
800                    };
801                }
802                self.source.hide(1);
803                self.commit_to_token(TokenType::COMMENT);
804                return true;
805            };
806        };
807        self.source.revert();
808        false
809    }
810
811    fn find_end_tripple_quote(&mut self, end_match: [char; 3]) -> Result<(), String> {
812        let mut last_escape = false;
813        while let [a, b, c] = self.source.peek(3) {
814            if [*a, *b, *c] == end_match && !last_escape {
815                return Ok(());
816            } else if [*a, *b, *c] == ['\\', '\\', '\\'] && !last_escape
817                || *c == '\\' && *b != '\\'
818                || *a == '\\' && !last_escape
819            {
820                last_escape = true;
821            } else if last_escape {
822                last_escape = false;
823            };
824            self.source.hide(2);
825        }
826        Err(String::from("EOF in multi-line string"))
827    }
828
829    fn find_end_quote(&mut self, end_match: [char; 1]) -> bool {
830        let mut last_escape = false;
831        while let [a] = self.source.peek(1) {
832            if [*a] == end_match && !last_escape {
833                return true;
834            } else if [*a] == ['\n'] {
835                return false;
836            } else if [*a] == ['\\'] && !last_escape {
837                last_escape = true;
838            } else if last_escape {
839                last_escape = false;
840            };
841        }
842        false
843    }
844
845    fn consume_next_string_token(&mut self) -> Result<bool, String> {
846        let qt: char;
847        match self.source.peek(3) {
848            [q, ..] if q == &'\'' || q == &'"' => {
849                qt = *q;
850                self.source.hide(2);
851            }
852            ['b' | 'B', q, ..] | ['f' | 'F', q, ..] | ['r' | 'R', q, ..] | ['u' | 'U', q, ..]
853                if q == &'\'' || q == &'"' =>
854            {
855                qt = *q;
856                self.source.hide(1);
857            }
858            ['r' | 'R', 'b' | 'B' | 'f' | 'F', q] | ['b' | 'B' | 'f' | 'F', 'r' | 'R', q]
859                if q == &'\'' || q == &'"' =>
860            {
861                qt = *q;
862            }
863            _ => {
864                self.source.revert();
865                return Ok(false);
866            }
867        };
868        match self.source.peek(2) {
869            [a, b] if [*a, *b] == [qt, qt] => {
870                self.find_end_tripple_quote([qt, qt, qt])?;
871            }
872            _ => {
873                self.source.hide(2);
874                if !self.find_end_quote([qt]) {
875                    self.source.revert();
876                    self.source.peek(1);
877                    self.commit_to_token(TokenType::ERRORTOKEN);
878                    return Ok(true);
879                };
880            }
881        };
882        self.commit_to_token(TokenType::STRING);
883        Ok(true)
884    }
885
886    fn finalize_stream(&mut self) -> Result<(), String> {
887        if self.parenthesis_level != 0 || self.brace_level != 0 || self.bracket_level != 0 {
888            return Err(String::from("EOF in multi-line statement"));
889        }
890        if self.within_statement {
891            // all statements must end in a newline, even if not present in the source
892            self.add_token(
893                TokenType::NEWLINE,
894                TokenType::NEWLINE,
895                String::from(""),
896                self.source.committed_index(),
897                self.source.committed_index() + 1,
898            );
899        };
900        while self.indents_seen.len() > 1 {
901            // bottom of the stack is indent of size 0 and does not need a DEDENT
902            self.indents_seen.pop();
903            self.add_token(
904                TokenType::DEDENT,
905                TokenType::DEDENT,
906                String::from(""),
907                self.source.committed_index() + 1,
908                self.source.committed_index() + 1,
909            );
910        }
911        self.add_token(
912            TokenType::ENDMARKER,
913            TokenType::ENDMARKER,
914            String::from(""),
915            self.source.committed_index() + 1,
916            self.source.committed_index() + 1,
917        );
918        self.ended = true;
919        Ok(())
920    }
921
922    fn consume_next_token(&mut self) -> Result<(), String> {
923        if self.ended {
924            return Ok(());
925        };
926        if self.source.at_end() {
927            return self.finalize_stream();
928        };
929
930        // consume any significant whitespace
931        // may not produce a token, even if the cursor is advanced
932        if !self.within_statement {
933            match self.consume_next_dent() {
934                Ok(true) => {
935                    return Ok(());
936                }
937                Ok(false) => (),
938                Err(e) => return Err(e),
939            };
940        };
941        // non-dent whitespace does not produce tokens
942        self.consume_next_whitespace();
943        if let Some(produced_token) = self.consume_next_newline() {
944            if produced_token {
945                return Ok(());
946            } else {
947                // re-enter; escaped newline is insignificant whitespace
948                return self.consume_next_token();
949            };
950        };
951        // number must come before op to correctly capture a leading decimal point
952        if self.consume_next_number_token() {
953            return Ok(());
954        };
955        if self.consume_next_op_token() {
956            return Ok(());
957        };
958        // string must come before name to correctly capture prefix directives
959        match self.consume_next_string_token() {
960            Ok(true) => {
961                return Ok(());
962            }
963            Ok(false) => (),
964            Err(e) => return Err(e),
965        };
966        if self.consume_next_name_token() {
967            return Ok(());
968        };
969        if self.consume_next_comment() {
970            return Ok(());
971        };
972
973        // no tokens found
974        if self.source.at_end() {
975            return self.consume_next_token();
976        } else {
977            self.source.peek(1);
978            self.commit_to_token(TokenType::ERRORTOKEN);
979            return Ok(());
980        };
981    }
982}
983
984impl Iterator for TokenStream {
985    type Item = Result<Token, String>;
986
987    fn next(&mut self) -> Option<Self::Item> {
988        if self.tokens.is_empty() {
989            match self.consume_next_token() {
990                Ok(_) => (),
991                Err(e) => return Some(Err(e)),
992            }
993        };
994        if self.tokens.is_empty() {
995            None
996        } else {
997            Ok(self.tokens.pop_front()).transpose()
998        }
999    }
1000}
rye_grain/tokenize.rs

rye_grain/
tokenize.rs