blueprint_starlark_syntax/
lexer.rs

1/*
2 * Copyright 2018 The Starlark in Rust Authors.
3 * Copyright (c) Facebook, Inc. and its affiliates.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18use std::char;
19use std::collections::VecDeque;
20use std::fmt;
21use std::fmt::Display;
22
23use logos::Logos;
24use num_bigint::BigInt;
25use num_traits::Num;
26use thiserror::Error;
27
28use crate::codemap::CodeMap;
29use crate::codemap::Pos;
30use crate::codemap::Span;
31use crate::cursors::CursorBytes;
32use crate::cursors::CursorChars;
33use crate::dialect::Dialect;
34use crate::eval_exception::EvalException;
35
/// Errors that can occur during lexing. Each is rendered as a parse error
/// and converted into the crate's error type via the `From` impl below.
#[derive(Error, Debug)]
pub enum LexemeError {
    #[error("Parse error: incorrect indentation")]
    Indentation,
    #[error("Parse error: invalid input `{0}`")]
    InvalidInput(String),
    #[error("Parse error: tabs are not allowed")]
    InvalidTab,
    #[error("Parse error: unfinished string literal")]
    UnfinishedStringLiteral,
    #[error("Parse error: invalid string escape sequence `{0}`")]
    InvalidEscapeSequence(String),
    #[error("Parse error: missing string escape sequence, only saw `\\`")]
    EmptyEscapeSequence,
    #[error("Parse error: cannot use reserved keyword `{0}`")]
    ReservedKeyword(String),
    #[error("Parse error: integer cannot have leading 0, got `{0}`")]
    StartsZero(String),
    #[error("Parse error: failed to parse integer: `{0}`")]
    IntParse(String),
    #[error("Comment span is computed incorrectly (internal error)")]
    CommentSpanComputedIncorrectly,
    #[error("Cannot parse `{0}` as an integer in base {1}")]
    CannotParse(String, u32),
}
61
62impl From<LexemeError> for crate::error::Error {
63    fn from(e: LexemeError) -> Self {
64        crate::error::Error::new_kind(crate::error::ErrorKind::Parser(anyhow::Error::new(e)))
65    }
66}
67
68type LexemeT<T> = Result<(usize, T, usize), EvalException>;
69type Lexeme = LexemeT<Token>;
70
71fn map_lexeme_t<T1, T2>(lexeme: LexemeT<T1>, f: impl FnOnce(T1) -> T2) -> LexemeT<T2> {
72    lexeme.map(|(l, t, r)| (l, f(t), r))
73}
74
/// Wraps the logos-generated tokenizer, adding indentation tracking,
/// buffered synthetic tokens (indent/dedent/comment), and bracket counting.
pub struct Lexer<'a> {
    // Information for spans
    codemap: CodeMap,
    // Other info
    /// Stack of currently open indentation widths; empty means column 0.
    indent_levels: Vec<usize>,
    /// Lexemes that have been generated but not yet returned
    buffer: VecDeque<Lexeme>,
    parens: isize, // Number of parens we have seen
    lexer: logos::Lexer<'a, Token>,
    /// Set once the underlying lexer has run out of input.
    done: bool,
}
86
87impl<'a> Lexer<'a> {
88    pub fn new(input: &'a str, _dialect: &Dialect, codemap: CodeMap) -> Self {
89        let lexer = Token::lexer(input);
90        let mut lexer2 = Self {
91            codemap,
92            // Aim to size all the buffers such that they never resize
93            indent_levels: Vec::with_capacity(20),
94            buffer: VecDeque::with_capacity(10),
95            lexer,
96            parens: 0,
97            done: false,
98        };
99        if let Err(e) = lexer2.calculate_indent() {
100            lexer2.buffer.push_back(Err(e));
101        }
102        lexer2
103    }
104
    /// Report `msg` at a single byte offset (a zero-width span at `pos`).
    fn err_pos<T>(&self, msg: LexemeError, pos: usize) -> Result<T, EvalException> {
        self.err_span(msg, pos, pos)
    }
108
109    fn err_span<T>(&self, msg: LexemeError, start: usize, end: usize) -> Result<T, EvalException> {
110        Err(EvalException::new(
111            msg.into(),
112            Span::new(Pos::new(start as u32), Pos::new(end as u32)),
113            &self.codemap,
114        ))
115    }
116
117    fn err_now<T>(&self, msg: fn(String) -> LexemeError) -> Result<T, EvalException> {
118        self.err_span(
119            msg(self.lexer.slice().to_owned()),
120            self.lexer.span().start,
121            self.lexer.span().end,
122        )
123    }
124
125    /// Comment tokens are produced by either logos for comments after code,
126    /// or explicitly on lines which are only comments. This functions is used in the latter case.
127    #[allow(clippy::manual_strip)]
128    fn make_comment(&self, start: usize, end: usize) -> Lexeme {
129        let comment = &self.codemap.source()[start..end];
130        if !comment.starts_with('#') {
131            return self.err_pos(LexemeError::CommentSpanComputedIncorrectly, start);
132        }
133        // Remove the `#`.
134        let comment = &comment[1..];
135        // Remove the trailing `\r` if it exists.
136        // Note comments do not contain `\n`.
137        if comment.ends_with('\r') {
138            let end = end - 1;
139            let comment = &comment[..comment.len() - 1];
140            Ok((start, Token::Comment(comment.to_owned()), end))
141        } else {
142            Ok((start, Token::Comment(comment.to_owned()), end))
143        }
144    }
145
    /// We have just seen a newline, read how many indents we have
    /// and then set self.indent properly
    fn calculate_indent(&mut self) -> Result<(), EvalException> {
        // consume tabs and spaces, output the indentation levels
        // Note: `it.pos()` is relative to `remainder()`; adding
        // `self.lexer.span().end` converts it to an absolute file offset.
        let mut it = CursorBytes::new(self.lexer.remainder());
        let mut spaces = 0;
        let mut tabs = 0;
        let mut indent_start = self.lexer.span().end;
        loop {
            match it.next_char() {
                None => {
                    // End of input: consume the whitespace and stop.
                    self.lexer.bump(it.pos());
                    return Ok(());
                }
                Some(' ') => {
                    spaces += 1;
                }
                Some('\t') => {
                    tabs += 1;
                }
                Some('\n') => {
                    // A line that is entirely blank gets emitted as a newline, and then
                    // we don't consume the subsequent newline character.
                    self.lexer.bump(it.pos() - 1);
                    return Ok(());
                }
                Some('\r') => {
                    // We just ignore these entirely
                }
                Some('#') => {
                    // A line that is all comments, only emits comment tokens.
                    // Skip until the next newline
                    // Remove skip now, so we can freely add it on later
                    spaces = 0;
                    tabs = 0;
                    // `- 1` because `it.pos()` is already past the `#`.
                    let start = self.lexer.span().end + it.pos() - 1;
                    loop {
                        match it.next_char() {
                            None => {
                                // Comment runs to end of input: emit it and finish.
                                let end = self.lexer.span().end + it.pos();
                                self.buffer.push_back(self.make_comment(start, end));
                                self.lexer.bump(it.pos());
                                return Ok(());
                            }
                            Some('\n') => break, // only the inner loop
                            Some(_) => {}
                        }
                    }
                    // Comment ended at a newline: emit it (excluding the `\n`)
                    // and continue scanning the next line's indentation.
                    let end = self.lexer.span().end + it.pos() - 1;
                    self.buffer.push_back(self.make_comment(start, end));
                    indent_start = self.lexer.span().end + it.pos();
                }
                _ => break,
            }
        }
        self.lexer.bump(it.pos() - 1); // last character broke us out the loop
        // Tabs notionally count as 8 columns, but any tab is rejected just below.
        let indent = spaces + tabs * 8;
        if tabs > 0 {
            return self.err_pos(LexemeError::InvalidTab, self.lexer.span().start);
        }
        let now = self.indent_levels.last().copied().unwrap_or(0);

        if indent > now {
            // Deeper than before: open exactly one new indentation block.
            self.indent_levels.push(indent);
            let span = self.lexer.span();
            self.buffer
                .push_back(Ok((indent_start, Token::Indent, span.end)));
        } else if indent < now {
            // Shallower: pop levels until we land exactly on `indent`.
            // Landing between two recorded levels is an indentation error.
            let mut dedents = 1;
            self.indent_levels.pop().unwrap();
            loop {
                let now = self.indent_levels.last().copied().unwrap_or(0);
                if now == indent {
                    break;
                } else if now > indent {
                    dedents += 1;
                    self.indent_levels.pop().unwrap();
                } else {
                    let pos = self.lexer.span();
                    return self.err_span(LexemeError::Indentation, pos.start, pos.end);
                }
            }
            for _ in 0..dedents {
                // We must declare each dedent is only a position, so multiple adjacent dedents don't overlap
                self.buffer
                    .push_back(Ok((indent_start, Token::Dedent, indent_start)))
            }
        }
        Ok(())
    }
236
237    fn wrap(&mut self, token: Token) -> Option<Lexeme> {
238        let span = self.lexer.span();
239        Some(Ok((span.start, token, span.end)))
240    }
241
    // We've potentially seen one character, now consume between min and max elements of iterator
    // and treat it as an int in base radix
    fn escape_char(it: &mut CursorChars, min: usize, max: usize, radix: u32) -> Result<char, ()> {
        // Accumulated code point value so far.
        let mut value = 0u32;
        // Number of digit characters consumed so far.
        let mut count = 0;
        while count < max {
            match it.next() {
                None => {
                    // Input ended: acceptable only if we already have `min` digits.
                    if count >= min {
                        break;
                    } else {
                        return Err(());
                    }
                }
                Some(c) => match c.to_digit(radix) {
                    None => {
                        // Not a digit in this base: stop here if we have enough
                        // digits, pushing the character back for the caller.
                        if count >= min {
                            it.unnext(c);
                            break;
                        } else {
                            return Err(());
                        }
                    }
                    Some(v) => {
                        count += 1;
                        value = (value * radix) + v;
                    }
                },
            }
        }
        // Reject values that are not valid Unicode scalar values
        // (e.g. surrogates or out-of-range code points).
        char::from_u32(value).ok_or(())
    }
274
    // We have seen a '\' character, now parse what comes next
    fn escape(it: &mut CursorChars, res: &mut String) -> Result<(), ()> {
        match it.next() {
            Some('n') => res.push('\n'),
            Some('r') => res.push('\r'),
            Some('t') => res.push('\t'),
            // Bell, backspace, form feed, vertical tab: no Rust escape exists,
            // so the code points are spelled out.
            Some('a') => res.push('\x07'),
            Some('b') => res.push('\x08'),
            Some('f') => res.push('\x0C'),
            Some('v') => res.push('\x0B'),
            // Escaped newline is a line continuation: contributes nothing.
            Some('\n') => {}
            Some('\r') => {
                // Windows newline incoming, we expect a \n next, which we can ignore
                if it.next() != Some('\n') {
                    // A random \r character happened, let's declare an error, but we're just confused here
                    return Err(());
                }
            }
            // \xHH (exactly 2 hex), \uHHHH (exactly 4 hex), \UHHHHHHHH (exactly 8 hex).
            Some('x') => res.push(Self::escape_char(it, 2, 2, 16)?),
            Some('u') => res.push(Self::escape_char(it, 4, 4, 16)?),
            Some('U') => res.push(Self::escape_char(it, 8, 8, 16)?),
            Some(c) => match c {
                '0'..='7' => {
                    // Octal escape: the digit just read is part of it, so push
                    // it back and read 1 to 3 octal digits.
                    it.unnext(c);
                    res.push(Self::escape_char(it, 1, 3, 8)?)
                }
                '"' | '\'' | '\\' => res.push(c),
                _ => {
                    // Unknown escape: keep the backslash and character verbatim.
                    res.push('\\');
                    res.push(c);
                }
            },
            None => {
                return Err(());
            }
        };
        Ok(())
    }
313
    /// Parse a String. Return the String, and the offset where it starts.
    // String parsing is a hot-spot, so parameterise by a `stop` function which gets
    // specialised for each variant
    fn string(
        &mut self,
        triple: bool,
        raw: bool,
        mut stop: impl FnMut(char) -> bool,
    ) -> LexemeT<(String, usize)> {
        // We have seen an opening quote, which is either ' or "
        // If triple is true, it was a triple quote
        // stop lets us know when a string ends.

        // Before the first quote character
        let string_start = self.lexer.span().start;
        // After the first quote character, but before any contents or it tracked stuff
        let mut string_end = self.lexer.span().end;

        let mut it = CursorBytes::new(self.lexer.remainder());
        // Slow-path cursor, initialised only if we bail out of the fast path.
        let it2;

        if triple {
            // The logos token covered only the first quote; the remainder
            // starts with the other two, so skip them.
            it.next();
            it.next();
        }
        let contents_start = it.pos();

        // Take the fast path as long as the result is a slice of the original, with no changes.
        let mut res;
        loop {
            match it.next_char() {
                None => {
                    return self.err_span(
                        LexemeError::UnfinishedStringLiteral,
                        string_start,
                        string_end + it.pos(),
                    );
                }
                Some(c) => {
                    if stop(c) {
                        // Exclude the closing quote(s) from the contents.
                        let contents_end = it.pos() - if triple { 3 } else { 1 };
                        let contents = &self.lexer.remainder()[contents_start..contents_end];
                        self.lexer.bump(it.pos());
                        return Ok((
                            string_start,
                            (contents.to_owned(), contents_start),
                            string_end + it.pos(),
                        ));
                    } else if c == '\\' || c == '\r' || (c == '\n' && !triple) {
                        // Escape, CR, or (outside triple quotes) newline means the
                        // result is no longer a verbatim slice: switch to slow path.
                        res = String::with_capacity(it.pos() + 10);
                        res.push_str(&self.lexer.remainder()[contents_start..it.pos() - 1]);
                        it2 = CursorChars::new_offset(self.lexer.remainder(), it.pos() - 1);
                        break;
                    }
                }
            }
        }

        // We bailed out of the fast path, that means we now accumulate character by character,
        // might have an error or be dealing with escape characters.
        let mut it = it2;
        while let Some(c) = it.next() {
            if stop(c) {
                self.lexer.bump(it.pos());
                if triple {
                    // `stop` only fires on the third consecutive quote, so the
                    // first two quotes were already pushed into `res`: drop them.
                    res.truncate(res.len() - 2);
                }
                return Ok((string_start, (res, contents_start), string_end + it.pos()));
            }
            match c {
                '\n' if !triple => {
                    // Will raise an error about out of chars.
                    // But don't include the final \n in the count.
                    string_end -= 1;
                    break;
                }
                '\r' => {
                    // We just ignore these in all modes
                }
                '\\' => {
                    if raw {
                        // In a raw string a backslash only escapes quotes;
                        // otherwise both characters are kept verbatim.
                        match it.next() {
                            Some(c) => {
                                if c != '\'' && c != '"' {
                                    res.push('\\');
                                }
                                res.push(c);
                            }
                            _ => break, // Out of chars
                        }
                    } else {
                        let pos = it.pos();
                        if Self::escape(&mut it, &mut res).is_err() {
                            // Whatever `escape` consumed is the offending sequence.
                            let bad = self.lexer.remainder()[pos..it.pos()].to_owned();
                            return self.err_span(
                                if bad.is_empty() {
                                    LexemeError::EmptyEscapeSequence
                                } else {
                                    LexemeError::InvalidEscapeSequence(bad)
                                },
                                string_end + pos - 1,
                                string_end + it.pos(),
                            );
                        }
                    }
                }
                c => res.push(c),
            }
        }

        // We ran out of characters
        self.err_span(
            LexemeError::UnfinishedStringLiteral,
            string_start,
            string_end + it.pos(),
        )
    }
431
432    fn int(&self, s: &str, radix: u32) -> Lexeme {
433        let span = self.lexer.span();
434        match TokenInt::from_str_radix(s, radix) {
435            Ok(i) => Ok((span.start, Token::Int(i), span.end)),
436            Err(_) => self.err_now(LexemeError::IntParse),
437        }
438    }
439
    /// Produce the next lexeme, or `None` once the input (plus any trailing
    /// dedents and final newline) has been exhausted.
    pub fn next(&mut self) -> Option<Lexeme> {
        loop {
            // Note that this function doesn't always return - a few branches use `continue`
            // to always go round the loop again.
            return match self.buffer.pop_front() {
                // Previously queued lexemes (indents/dedents/comments/errors)
                // are drained before asking the underlying lexer for more.
                Some(x) => Some(x),
                _ => {
                    if self.done {
                        None
                    } else {
                        match self.lexer.next() {
                            None => {
                                // End of input: close every open indentation
                                // block, then emit one final newline.
                                self.done = true;
                                let pos = self.lexer.span().end;
                                for _ in 0..self.indent_levels.len() {
                                    self.buffer.push_back(Ok((pos, Token::Dedent, pos)))
                                }
                                self.indent_levels.clear();
                                self.wrap(Token::Newline)
                            }
                            Some(Ok(token)) => match token {
                                Token::Tabs => {
                                    // Queue the tab error and keep lexing.
                                    self.buffer.push_back(
                                        self.err_pos(
                                            LexemeError::InvalidTab,
                                            self.lexer.span().start,
                                        ),
                                    );
                                    continue;
                                }
                                Token::Newline => {
                                    // Newlines only matter outside brackets.
                                    if self.parens == 0 {
                                        let span = self.lexer.span();
                                        if let Err(e) = self.calculate_indent() {
                                            return Some(Err(e));
                                        }
                                        Some(Ok((span.start, Token::Newline, span.end)))
                                    } else {
                                        continue;
                                    }
                                }
                                Token::Reserved | Token::Match | Token::Case => {
                                    // Allow reserved keywords as identifiers (e.g., obj.del(), obj.match())
                                    self.wrap(Token::Identifier(self.lexer.slice().to_owned()))
                                }
                                Token::RawDecInt => {
                                    let s = self.lexer.slice();
                                    // Leading zeros are rejected (e.g. `0755` must be `0o755`).
                                    if s.len() > 1 && &s[0..1] == "0" {
                                        return Some(self.err_now(LexemeError::StartsZero));
                                    }
                                    Some(self.int(s, 10))
                                }
                                Token::RawOctInt => {
                                    let s = self.lexer.slice();
                                    assert!(s.starts_with("0o") || s.starts_with("0O"));
                                    Some(self.int(&s[2..], 8))
                                }
                                Token::RawHexInt => {
                                    let s = self.lexer.slice();
                                    assert!(s.starts_with("0x") || s.starts_with("0X"));
                                    Some(self.int(&s[2..], 16))
                                }
                                Token::RawBinInt => {
                                    let s = self.lexer.slice();
                                    assert!(s.starts_with("0b") || s.starts_with("0B"));
                                    Some(self.int(&s[2..], 2))
                                }
                                Token::Int(..) => unreachable!("Lexer does not produce Int tokens"),
                                Token::RawDoubleQuote => {
                                    // Span length 2 means the token was `r"`, i.e. raw.
                                    let raw = self.lexer.span().len() == 2;
                                    self.parse_double_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(s, _offset)| Token::String(s))
                                    })
                                }
                                Token::RawSingleQuote => {
                                    // Span length 2 means the token was `r'`, i.e. raw.
                                    let raw = self.lexer.span().len() == 2;
                                    self.parse_single_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(s, _offset)| Token::String(s))
                                    })
                                }
                                Token::String(_) => {
                                    unreachable!("The lexer does not produce String")
                                }
                                Token::RawFStringDoubleQuote => {
                                    // Span length 3 means the token was `fr"`, i.e. raw.
                                    let span_len = self.lexer.span().len();
                                    let raw = span_len == 3;
                                    self.parse_double_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(content, content_start_offset)| {
                                            Token::FString(TokenFString {
                                                content,
                                                content_start_offset: content_start_offset
                                                    + span_len,
                                            })
                                        })
                                    })
                                }
                                Token::RawFStringSingleQuote => {
                                    let span_len = self.lexer.span().len();
                                    let raw = span_len == 3;
                                    self.parse_single_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(content, content_start_offset)| {
                                            Token::FString(TokenFString {
                                                content,
                                                content_start_offset: content_start_offset
                                                    + span_len,
                                            })
                                        })
                                    })
                                }
                                Token::FString(_) => {
                                    unreachable!("The lexer does not produce FString")
                                }
                                Token::RawByteDoubleQuote => {
                                    // Span length 3 means the token was `br"`, i.e. raw.
                                    let raw = self.lexer.span().len() == 3;
                                    self.parse_double_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(s, _offset)| {
                                            Token::ByteString(s.into_bytes())
                                        })
                                    })
                                }
                                Token::RawByteSingleQuote => {
                                    let raw = self.lexer.span().len() == 3;
                                    self.parse_single_quoted_string(raw).map(|lex| {
                                        map_lexeme_t(lex, |(s, _offset)| {
                                            Token::ByteString(s.into_bytes())
                                        })
                                    })
                                }
                                Token::ByteString(_) => {
                                    unreachable!("The lexer does not produce ByteString")
                                }
                                Token::OpeningCurly
                                | Token::OpeningRound
                                | Token::OpeningSquare => {
                                    // Track bracket depth so newlines inside
                                    // brackets can be suppressed above.
                                    self.parens += 1;
                                    self.wrap(token)
                                }
                                Token::ClosingCurly
                                | Token::ClosingRound
                                | Token::ClosingSquare => {
                                    self.parens -= 1;
                                    self.wrap(token)
                                }
                                _ => self.wrap(token),
                            },
                            Some(Err(_)) => Some(self.err_now(LexemeError::InvalidInput)),
                        }
                    }
                }
            };
        }
    }
592
593    fn parse_double_quoted_string(&mut self, raw: bool) -> Option<LexemeT<(String, usize)>> {
594        if self.lexer.remainder().starts_with("\"\"") {
595            let mut qs = 0;
596            Some(self.string(true, raw, |c| {
597                if c == '\"' {
598                    qs += 1;
599                    qs == 3
600                } else {
601                    qs = 0;
602                    false
603                }
604            }))
605        } else {
606            Some(self.string(false, raw, |c| c == '\"'))
607        }
608    }
609
610    fn parse_single_quoted_string(&mut self, raw: bool) -> Option<LexemeT<(String, usize)>> {
611        if self.lexer.remainder().starts_with("''") {
612            let mut qs = 0;
613            Some(self.string(true, raw, |c| {
614                if c == '\'' {
615                    qs += 1;
616                    qs == 3
617                } else {
618                    qs = 0;
619                    false
620                }
621            }))
622        } else {
623            Some(self.string(false, raw, |c| c == '\''))
624        }
625    }
626}
627
/// The value of an integer literal, stored compactly when it fits in `i32`.
#[derive(Debug, Clone, Eq, PartialEq, derive_more::Display)]
pub enum TokenInt {
    /// Fits in a 32-bit signed integer.
    I32(i32),
    /// Only if larger than `i32`.
    BigInt(BigInt),
}
634
635impl TokenInt {
636    pub fn from_str_radix(s: &str, base: u32) -> crate::Result<TokenInt> {
637        if let Ok(i) = i32::from_str_radix(s, base) {
638            Ok(TokenInt::I32(i))
639        } else {
640            match BigInt::from_str_radix(s, base) {
641                Ok(i) => Ok(TokenInt::BigInt(i)),
642                Err(_) => Err(LexemeError::CannotParse(s.to_owned(), base).into()),
643            }
644        }
645    }
646}
647
/// The raw text of an f-string token, before any interpolation is parsed.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenFString {
    /// The content of this TokenFString
    pub content: String,
    /// Relative to the token, where does the actual string content start?
    /// (i.e. the length of the opening `f"` / `fr'` prefix plus any skipped quotes)
    pub content_start_offset: usize,
}
655
656/// All token that can be generated by the lexer
657#[derive(Logos, Debug, Clone, PartialEq)]
658#[logos(skip r" +")] // whitespace
659#[logos(skip r"\\\n")] // Escaped newline
660#[logos(skip r"\\\r\n")] // Escaped newline (Windows line ending)
661pub enum Token {
662    /// Comment as token.
663    /// Span includes the leading `#`, but the content does not.
664    #[regex(r#"#[^\r\n]*"#, |lex| lex.slice()[1..].to_owned())]
665    Comment(String),
666
667    #[regex("\t+")] // Tabs (might be an error)
668    Tabs,
669
670    // Indentation block & meaningful spaces
671    Indent, // New indentation block
672    Dedent, // Leaving an indentation block
673    #[regex(r"(\r)?\n")]
674    Newline, // Newline outside a string
675
676    // Some things the lexer can't deal with well, so we step in and generate
677    // things ourselves
678    #[token("'")]
679    #[token("r'")]
680    RawSingleQuote,
681    #[token("\"")]
682    #[token("r\"")]
683    RawDoubleQuote,
684
685    /// The start of a single-quoted f-string.
686    #[token("f'")]
687    #[token("fr'")]
688    RawFStringSingleQuote,
689    /// The start of a double-quoted f-string.
690    #[token("f\"")]
691    #[token("fr\"")]
692    RawFStringDoubleQuote,
693
694    /// The start of a single-quoted byte string.
695    #[token("b'")]
696    #[token("br'")]
697    RawByteSingleQuote,
698    /// The start of a double-quoted byte string.
699    #[token("b\"")]
700    #[token("br\"")]
701    RawByteDoubleQuote,
702
703    #[regex(
704        "as|\
705        async|\
706        await|\
707        class|\
708        del|\
709        except|\
710        finally|\
711        from|\
712        global|\
713        import|\
714        is|\
715        nonlocal|\
716        raise|\
717        try|\
718        while|\
719        with"
720    )]
721    Reserved, // One of the reserved keywords
722
723    #[regex(
724        "[a-zA-Z_][a-zA-Z0-9_]*"
725    , |lex| lex.slice().to_owned())]
726    Identifier(String), // An identifier
727
728    #[regex("[0-9]+")]
729    RawDecInt,
730    #[regex("0[xX][A-Fa-f0-9]+")]
731    RawHexInt,
732    #[regex("0[bB][01]+")]
733    RawBinInt,
734    #[regex("0[oO][0-7]+")]
735    RawOctInt,
736
737    Int(TokenInt), // An integer literal (123, 0x1, 0b1011, 0o755, ...)
738
739    // Returns closest f64. https://doc.rust-lang.org/std/primitive.f64.html#method.from_str
740    #[regex("[0-9]+\\.[0-9]*([eE][-+]?[0-9]+)?", |lex| lex.slice().parse::<f64>().ok())]
741    #[regex("[0-9]+[eE][-+]?[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
742    #[regex("\\.[0-9]+([eE][-+]?[0-9]+)?", |lex| lex.slice().parse::<f64>().ok())]
743    Float(f64), // A float literal (3.14, .3, 1e6, 0.)
744
745    String(String), // A string literal
746    /// The raw text of a f-string
747    FString(TokenFString),
748    /// A byte string literal (b"..." or b'...')
749    ByteString(Vec<u8>),
750
751    // Keywords
752    #[token("and")]
753    And,
754    #[token("break")]
755    Break,
756    #[token("continue")]
757    Continue,
758    #[token("def")]
759    Def,
760    #[token("elif")]
761    Elif,
762    #[token("else")]
763    Else,
764    #[token("for")]
765    For,
766    #[token("if")]
767    If,
768    #[token("in")]
769    In,
770    #[token("lambda")]
771    Lambda,
772    #[token("load")]
773    Load,
774    #[token("not")]
775    Not,
776    #[token("or")]
777    Or,
778    #[token("pass")]
779    Pass,
780    #[token("return")]
781    Return,
782    #[token("struct")]
783    Struct,
784    #[token("yield")]
785    Yield,
786    #[token("match")]
787    Match,
788    #[token("case")]
789    Case,
790    // Symbols
791    #[token(",")]
792    Comma,
793    #[token(";")]
794    Semicolon,
795    #[token(":")]
796    Colon,
797    #[token("+=")]
798    PlusEqual,
799    #[token("-=")]
800    MinusEqual,
801    #[token("*=")]
802    StarEqual,
803    #[token("/=")]
804    SlashEqual,
805    #[token("//=")]
806    SlashSlashEqual,
807    #[token("%=")]
808    PercentEqual,
809    #[token("==")]
810    EqualEqual,
811    #[token("!=")]
812    BangEqual,
813    #[token("<=")]
814    LessEqual,
815    #[token(">=")]
816    GreaterEqual,
817    #[token("**")]
818    StarStar,
819    #[token("->")]
820    MinusGreater,
821    #[token("=")]
822    Equal,
823    #[token("<")]
824    LessThan,
825    #[token(">")]
826    GreaterThan,
827    #[token("-")]
828    Minus,
829    #[token("+")]
830    Plus,
831    #[token("*")]
832    Star,
833    #[token("%")]
834    Percent,
835    #[token("/")]
836    Slash,
837    #[token("//")]
838    SlashSlash,
839    #[token(".")]
840    Dot,
841    #[token("&")]
842    Ampersand,
843    #[token("|")]
844    Pipe,
845    #[token("^")]
846    Caret,
847    #[token("<<")]
848    LessLess,
849    #[token(">>")]
850    GreaterGreater,
851    #[token("~")]
852    Tilde,
853    #[token("&=")]
854    AmpersandEqual,
855    #[token("|=")]
856    PipeEqual,
857    #[token("^=")]
858    CaretEqual,
859    #[token("<<=")]
860    LessLessEqual,
861    #[token(">>=")]
862    GreaterGreaterEqual,
863    #[token("...")]
864    Ellipsis,
865
866    // Brackets
867    #[token("[")]
868    OpeningSquare,
869    #[token("{")]
870    OpeningCurly,
871    #[token("(")]
872    OpeningRound,
873    #[token("]")]
874    ClosingSquare,
875    #[token("}")]
876    ClosingCurly,
877    #[token(")")]
878    ClosingRound,
879}
880
881impl Token {
882    /// Used for testing
883    #[cfg(test)]
884    pub fn unlex(&self) -> String {
885        use std::io::Write;
886        match self {
887            Token::Indent => "\t".to_owned(),
888            Token::Newline => "\n".to_owned(),
889            Token::Dedent => "#dedent".to_owned(),
890            Token::String(x) => {
891                // The Rust {:?} is unstable, so changes between versions,
892                // instead use the JSON standard for string escapes.
893                // Reuse the StarlarkValue implementation since it's close to hand.
894                serde_json::to_string(x).unwrap()
895            }
896            Token::FString(x) => {
897                let mut buff = Vec::new();
898                write!(&mut buff, "f").unwrap();
899                serde_json::to_writer(&mut buff, &x.content).unwrap();
900                String::from_utf8(buff).unwrap()
901            }
902            Token::ByteString(b) => {
903                let mut buff = Vec::new();
904                write!(&mut buff, "b").unwrap();
905                serde_json::to_writer(&mut buff, &String::from_utf8_lossy(b).to_string()).unwrap();
906                String::from_utf8(buff).unwrap()
907            }
908            _ => {
909                let s = self.to_string();
910                // Out display is often: keyword 'lambda'
911                // so strip out the bit in single quotes
912                let first = s.find('\'');
913                match first {
914                    Some(first) if s.ends_with('\'') && first != s.len() - 1 => {
915                        s[first + 1..s.len() - 1].to_owned()
916                    }
917                    _ => s,
918                }
919            }
920        }
921    }
922}
923
924impl Display for Token {
925    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
926        match self {
927            Token::Indent => write!(f, "new indentation block"),
928            Token::Dedent => write!(f, "end of indentation block"),
929            Token::Newline => write!(f, "new line"),
930            Token::And => write!(f, "keyword 'and'"),
931            Token::Else => write!(f, "keyword 'else'"),
932            Token::Load => write!(f, "keyword 'load'"),
933            Token::Break => write!(f, "keyword 'break'"),
934            Token::For => write!(f, "keyword 'for'"),
935            Token::Not => write!(f, "keyword 'not'"),
936            Token::Continue => write!(f, "keyword 'continue'"),
937            Token::If => write!(f, "keyword 'if'"),
938            Token::Or => write!(f, "keyword 'or'"),
939            Token::Def => write!(f, "keyword 'def'"),
940            Token::In => write!(f, "keyword 'in'"),
941            Token::Pass => write!(f, "keyword 'pass'"),
942            Token::Elif => write!(f, "keyword 'elif'"),
943            Token::Return => write!(f, "keyword 'return'"),
944            Token::Struct => write!(f, "keyword 'struct'"),
945            Token::Yield => write!(f, "keyword 'yield'"),
946            Token::Match => write!(f, "keyword 'match'"),
947            Token::Case => write!(f, "keyword 'case'"),
948            Token::Lambda => write!(f, "keyword 'lambda'"),
949            Token::Comma => write!(f, "symbol ','"),
950            Token::Semicolon => write!(f, "symbol ';'"),
951            Token::Colon => write!(f, "symbol ':'"),
952            Token::PlusEqual => write!(f, "symbol '+='"),
953            Token::MinusEqual => write!(f, "symbol '-='"),
954            Token::StarEqual => write!(f, "symbol '*='"),
955            Token::SlashEqual => write!(f, "symbol '/='"),
956            Token::SlashSlashEqual => write!(f, "symbol '//='"),
957            Token::PercentEqual => write!(f, "symbol '%='"),
958            Token::EqualEqual => write!(f, "symbol '=='"),
959            Token::BangEqual => write!(f, "symbol '!='"),
960            Token::LessEqual => write!(f, "symbol '<='"),
961            Token::GreaterEqual => write!(f, "symbol '>='"),
962            Token::StarStar => write!(f, "symbol '**'"),
963            Token::MinusGreater => write!(f, "symbol '->'"),
964            Token::Equal => write!(f, "symbol '='"),
965            Token::LessThan => write!(f, "symbol '<'"),
966            Token::GreaterThan => write!(f, "symbol '>'"),
967            Token::Minus => write!(f, "symbol '-'"),
968            Token::Plus => write!(f, "symbol '+'"),
969            Token::Star => write!(f, "symbol '*'"),
970            Token::Percent => write!(f, "symbol '%'"),
971            Token::Slash => write!(f, "symbol '/'"),
972            Token::SlashSlash => write!(f, "symbol '//'"),
973            Token::Dot => write!(f, "symbol '.'"),
974            Token::Ampersand => write!(f, "symbol '&'"),
975            Token::Pipe => write!(f, "symbol '|'"),
976            Token::Caret => write!(f, "symbol '^'"),
977            Token::LessLess => write!(f, "symbol '<<'"),
978            Token::GreaterGreater => write!(f, "symbol '>>'"),
979            Token::Tilde => write!(f, "symbol '~'"),
980            Token::AmpersandEqual => write!(f, "symbol '&='"),
981            Token::PipeEqual => write!(f, "symbol '|='"),
982            Token::CaretEqual => write!(f, "symbol '^='"),
983            Token::LessLessEqual => write!(f, "symbol '<<='"),
984            Token::GreaterGreaterEqual => write!(f, "symbol '>>='"),
985            Token::Ellipsis => write!(f, "symbol '...'"),
986            Token::OpeningSquare => write!(f, "symbol '['"),
987            Token::OpeningCurly => write!(f, "symbol '{{'"),
988            Token::OpeningRound => write!(f, "symbol '('"),
989            Token::ClosingSquare => write!(f, "symbol ']'"),
990            Token::ClosingCurly => write!(f, "symbol '}}'"),
991            Token::ClosingRound => write!(f, "symbol ')'"),
992            Token::Reserved => write!(f, "reserved keyword"),
993            Token::Identifier(s) => write!(f, "identifier '{s}'"),
994            Token::Int(i) => write!(f, "integer literal '{i}'"),
995            Token::RawDecInt => write!(f, "decimal integer literal"),
996            Token::RawHexInt => write!(f, "hexadecimal integer literal"),
997            Token::RawOctInt => write!(f, "octal integer literal"),
998            Token::RawBinInt => write!(f, "binary integer literal"),
999            Token::Float(n) => write!(f, "float literal '{n}'"),
1000            Token::String(s) => write!(f, "string literal {s:?}"),
1001            Token::RawSingleQuote => write!(f, "starting '"),
1002            Token::RawDoubleQuote => write!(f, "starting \""),
1003            Token::RawFStringDoubleQuote => write!(f, "starting f'"),
1004            Token::RawFStringSingleQuote => write!(f, "starting f\""),
1005            Token::FString(s) => write!(f, "f-string {:?}", &s.content),
1006            Token::RawByteSingleQuote => write!(f, "starting b'"),
1007            Token::RawByteDoubleQuote => write!(f, "starting b\""),
1008            Token::ByteString(b) => write!(f, "byte string literal ({} bytes)", b.len()),
1009            Token::Comment(c) => write!(f, "comment '{c}'"),
1010            Token::Tabs => Ok(()),
1011        }
1012    }
1013}
1014
impl<'a> Iterator for Lexer<'a> {
    type Item = Lexeme;

    fn next(&mut self) -> Option<Self::Item> {
        // Delegates to the inherent `Lexer::next` method: inherent methods
        // take precedence over trait methods during method resolution, so
        // this call is NOT self-recursive.
        // NOTE(review): the inherent `next` is defined elsewhere in this
        // file — confirm it exists; without it this would loop forever.
        self.next()
    }
}
1022
1023pub fn lex_exactly_one_identifier(s: &str) -> Option<String> {
1024    let mut lexer = Token::lexer(s);
1025    match (lexer.next(), lexer.next()) {
1026        (Some(Ok(Token::Identifier(ident))), None) => Some(ident),
1027        _ => None,
1028    }
1029}
1030
#[cfg(test)]
mod tests {
    use crate::lexer::lex_exactly_one_identifier;

    #[test]
    fn test_is_valid_identifier() {
        // (input, expected) pairs: plain identifiers pass (with or without
        // surrounding whitespace); multiple tokens, keywords, and number
        // literals are rejected.
        let cases = [
            ("foo", Some("foo")),
            (" foo ", Some("foo")),
            ("foo bar", None),
            ("not", None),
            ("123", None),
        ];
        for (input, expected) in cases {
            assert_eq!(lex_exactly_one_identifier(input).as_deref(), expected);
        }
    }
}