// starlark_syntax/lexer.rs

1/*
2 * Copyright 2018 The Starlark in Rust Authors.
3 * Copyright (c) Facebook, Inc. and its affiliates.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18use std::char;
19use std::collections::VecDeque;
20use std::fmt;
21use std::fmt::Display;
22
23use logos::Logos;
24use num_bigint::BigInt;
25use num_traits::Num;
26use thiserror::Error;
27
28use crate::codemap::CodeMap;
29use crate::codemap::Pos;
30use crate::codemap::Span;
31use crate::cursors::CursorBytes;
32use crate::cursors::CursorChars;
33use crate::dialect::Dialect;
34use crate::eval_exception::EvalException;
35
/// The ways lexing can fail. Each variant carries a user-facing message;
/// values are converted into the crate-wide error type via `From` below.
#[derive(Error, Debug)]
pub enum LexemeError {
    #[error("Parse error: incorrect indentation")]
    Indentation,
    #[error("Parse error: invalid input `{0}`")]
    InvalidInput(String),
    #[error("Parse error: tabs are not allowed")]
    InvalidTab,
    #[error("Parse error: unfinished string literal")]
    UnfinishedStringLiteral,
    #[error("Parse error: invalid string escape sequence `{0}`")]
    InvalidEscapeSequence(String),
    #[error("Parse error: missing string escape sequence, only saw `\\`")]
    EmptyEscapeSequence,
    #[error("Parse error: cannot use reserved keyword `{0}`")]
    ReservedKeyword(String),
    #[error("Parse error: integer cannot have leading 0, got `{0}`")]
    StartsZero(String),
    #[error("Parse error: failed to parse integer: `{0}`")]
    IntParse(String),
    // Internal invariant violation rather than a user error.
    #[error("Comment span is computed incorrectly (internal error)")]
    CommentSpanComputedIncorrectly,
    #[error("Cannot parse `{0}` as an integer in base {1}")]
    CannotParse(String, u32),
}
61
62impl From<LexemeError> for crate::error::Error {
63    fn from(e: LexemeError) -> Self {
64        crate::error::Error::new_kind(crate::error::ErrorKind::Parser(anyhow::Error::new(e)))
65    }
66}
67
68type LexemeT<T> = Result<(usize, T, usize), EvalException>;
69type Lexeme = LexemeT<Token>;
70
71fn map_lexeme_t<T1, T2>(lexeme: LexemeT<T1>, f: impl FnOnce(T1) -> T2) -> LexemeT<T2> {
72    lexeme.map(|(l, t, r)| (l, f(t), r))
73}
74
/// Wrapper around the logos-generated lexer that adds what logos cannot
/// express: indentation tracking (Indent/Dedent), string/comment parsing,
/// and buffering of generated lexemes.
pub struct Lexer<'a> {
    // Information for spans
    codemap: CodeMap,
    // Other info
    /// Stack of currently open indentation widths, innermost last.
    indent_levels: Vec<usize>,
    /// Lexemes that have been generated but not yet returned
    buffer: VecDeque<Lexeme>,
    parens: isize, // Number of parens we have seen
    lexer: logos::Lexer<'a, Token>,
    /// Set once the underlying lexer is exhausted and trailing dedents are queued.
    done: bool,
}
86
87impl<'a> Lexer<'a> {
88    pub fn new(input: &'a str, _dialect: &Dialect, codemap: CodeMap) -> Self {
89        let lexer = Token::lexer(input);
90        let mut lexer2 = Self {
91            codemap,
92            // Aim to size all the buffers such that they never resize
93            indent_levels: Vec::with_capacity(20),
94            buffer: VecDeque::with_capacity(10),
95            lexer,
96            parens: 0,
97            done: false,
98        };
99        if let Err(e) = lexer2.calculate_indent() {
100            lexer2.buffer.push_back(Err(e));
101        }
102        lexer2
103    }
104
    /// Report `msg` at a zero-width span anchored at byte offset `pos`.
    fn err_pos<T>(&self, msg: LexemeError, pos: usize) -> Result<T, EvalException> {
        self.err_span(msg, pos, pos)
    }
108
109    fn err_span<T>(&self, msg: LexemeError, start: usize, end: usize) -> Result<T, EvalException> {
110        Err(EvalException::new(
111            msg.into(),
112            Span::new(Pos::new(start as u32), Pos::new(end as u32)),
113            &self.codemap,
114        ))
115    }
116
117    fn err_now<T>(&self, msg: fn(String) -> LexemeError) -> Result<T, EvalException> {
118        self.err_span(
119            msg(self.lexer.slice().to_owned()),
120            self.lexer.span().start,
121            self.lexer.span().end,
122        )
123    }
124
125    /// Comment tokens are produced by either logos for comments after code,
126    /// or explicitly on lines which are only comments. This functions is used in the latter case.
127    #[allow(clippy::manual_strip)]
128    fn make_comment(&self, start: usize, end: usize) -> Lexeme {
129        let comment = &self.codemap.source()[start..end];
130        if !comment.starts_with('#') {
131            return self.err_pos(LexemeError::CommentSpanComputedIncorrectly, start);
132        }
133        // Remove the `#`.
134        let comment = &comment[1..];
135        // Remove the trailing `\r` if it exists.
136        // Note comments do not contain `\n`.
137        if comment.ends_with('\r') {
138            let end = end - 1;
139            let comment = &comment[..comment.len() - 1];
140            Ok((start, Token::Comment(comment.to_owned()), end))
141        } else {
142            Ok((start, Token::Comment(comment.to_owned()), end))
143        }
144    }
145
    /// We have just seen a newline, read how many indents we have
    /// and then set self.indent properly, queueing Indent/Dedent/Comment
    /// lexemes into `self.buffer` as needed.
    fn calculate_indent(&mut self) -> Result<(), EvalException> {
        // consume tabs and spaces, output the indentation levels
        let mut it = CursorBytes::new(self.lexer.remainder());
        let mut spaces = 0;
        let mut tabs = 0;
        // Byte offset where the indentation (for span purposes) begins.
        let mut indent_start = self.lexer.span().end;
        loop {
            match it.next_char() {
                None => {
                    // End of input: consume what we scanned and emit nothing.
                    self.lexer.bump(it.pos());
                    return Ok(());
                }
                Some(' ') => {
                    spaces += 1;
                }
                Some('\t') => {
                    tabs += 1;
                }
                Some('\n') => {
                    // A line that is entirely blank gets emitted as a newline, and then
                    // we don't consume the subsequent newline character.
                    self.lexer.bump(it.pos() - 1);
                    return Ok(());
                }
                Some('\r') => {
                    // We just ignore these entirely
                }
                Some('#') => {
                    // A line that is all comments, only emits comment tokens.
                    // Skip until the next newline
                    // Remove skip now, so we can freely add it on later
                    spaces = 0;
                    tabs = 0;
                    // `it.pos()` is past the `#`, so step back one for the span start.
                    let start = self.lexer.span().end + it.pos() - 1;
                    loop {
                        match it.next_char() {
                            None => {
                                // Comment runs to end of input.
                                let end = self.lexer.span().end + it.pos();
                                self.buffer.push_back(self.make_comment(start, end));
                                self.lexer.bump(it.pos());
                                return Ok(());
                            }
                            Some('\n') => break, // only the inner loop
                            Some(_) => {}
                        }
                    }
                    // Comment ended at a newline; exclude the `\n` from its span.
                    let end = self.lexer.span().end + it.pos() - 1;
                    self.buffer.push_back(self.make_comment(start, end));
                    indent_start = self.lexer.span().end + it.pos();
                }
                _ => break,
            }
        }
        self.lexer.bump(it.pos() - 1); // last character broke us out the loop
        let indent = spaces + tabs * 8;
        // Tabs in indentation are rejected outright.
        if tabs > 0 {
            return self.err_pos(LexemeError::InvalidTab, self.lexer.span().start);
        }
        let now = self.indent_levels.last().copied().unwrap_or(0);

        if indent > now {
            // Deeper than before: open a new indentation block.
            self.indent_levels.push(indent);
            let span = self.lexer.span();
            self.buffer
                .push_back(Ok((indent_start, Token::Indent, span.end)));
        } else if indent < now {
            // Shallower: pop levels until we land exactly on a previous level,
            // emitting one Dedent per popped level. Landing between levels is an error.
            let mut dedents = 1;
            self.indent_levels.pop().unwrap();
            loop {
                let now = self.indent_levels.last().copied().unwrap_or(0);
                if now == indent {
                    break;
                } else if now > indent {
                    dedents += 1;
                    self.indent_levels.pop().unwrap();
                } else {
                    let pos = self.lexer.span();
                    return self.err_span(LexemeError::Indentation, pos.start, pos.end);
                }
            }
            for _ in 0..dedents {
                // We must declare each dedent is only a position, so multiple adjacent dedents don't overlap
                self.buffer
                    .push_back(Ok((indent_start, Token::Dedent, indent_start)))
            }
        }
        Ok(())
    }
236
237    fn wrap(&mut self, token: Token) -> Option<Lexeme> {
238        let span = self.lexer.span();
239        Some(Ok((span.start, token, span.end)))
240    }
241
    // We've potentially seen one character, now consume between min and max elements of iterator
    // and treat it as an int in base radix
    fn escape_char(it: &mut CursorChars, min: usize, max: usize, radix: u32) -> Result<char, ()> {
        // Accumulated numeric value of the digits consumed so far.
        let mut value = 0u32;
        let mut count = 0;
        while count < max {
            match it.next() {
                None => {
                    // Out of input: acceptable only if we already have `min` digits.
                    if count >= min {
                        break;
                    } else {
                        return Err(());
                    }
                }
                Some(c) => match c.to_digit(radix) {
                    None => {
                        if count >= min {
                            // Not a digit, but we have enough: push it back for the caller.
                            it.unnext(c);
                            break;
                        } else {
                            return Err(());
                        }
                    }
                    Some(v) => {
                        count += 1;
                        value = (value * radix) + v;
                    }
                },
            }
        }
        // Fails for values that are not valid Unicode scalar values.
        char::from_u32(value).ok_or(())
    }
274
    // We have seen a '\' character, now parse what comes next,
    // appending the decoded result to `res`.
    fn escape(it: &mut CursorChars, res: &mut String) -> Result<(), ()> {
        match it.next() {
            Some('n') => res.push('\n'),
            Some('r') => res.push('\r'),
            Some('t') => res.push('\t'),
            Some('a') => res.push('\x07'), // bell
            Some('b') => res.push('\x08'), // backspace
            Some('f') => res.push('\x0C'), // form feed
            Some('v') => res.push('\x0B'), // vertical tab
            // An escaped newline contributes nothing to the string.
            Some('\n') => {}
            Some('\r') => {
                // Windows newline incoming, we expect a \n next, which we can ignore
                if it.next() != Some('\n') {
                    // A random \r character happened, let's declare an error, but we're just confused here
                    return Err(());
                }
            }
            // \xHH (exactly 2 hex digits), \uHHHH (4), \UHHHHHHHH (8).
            Some('x') => res.push(Self::escape_char(it, 2, 2, 16)?),
            Some('u') => res.push(Self::escape_char(it, 4, 4, 16)?),
            Some('U') => res.push(Self::escape_char(it, 8, 8, 16)?),
            Some(c) => match c {
                '0'..='7' => {
                    // Octal escape: 1 to 3 octal digits, starting with this one.
                    it.unnext(c);
                    res.push(Self::escape_char(it, 1, 3, 8)?)
                }
                '"' | '\'' | '\\' => res.push(c),
                _ => {
                    // Unknown escapes are kept verbatim, backslash included.
                    res.push('\\');
                    res.push(c);
                }
            },
            None => {
                return Err(());
            }
        };
        Ok(())
    }
313
    /// Parse a String. Return the String, and the offset where it starts.
    // String parsing is a hot-spot, so parameterise by a `stop` function which gets
    // specialised for each variant
    fn string(
        &mut self,
        triple: bool,
        raw: bool,
        mut stop: impl FnMut(char) -> bool,
    ) -> LexemeT<(String, usize)> {
        // We have seen an opening quote, which is either ' or "
        // If triple is true, it was a triple quote
        // stop lets us know when a string ends.

        // Before the first quote character
        let string_start = self.lexer.span().start;
        // After the first quote character, but before any contents or it tracked stuff
        let mut string_end = self.lexer.span().end;

        let mut it = CursorBytes::new(self.lexer.remainder());
        // Slow-path cursor, initialised only if we bail out of the fast path below.
        let it2;

        if triple {
            // Skip the second and third opening quote characters.
            it.next();
            it.next();
        }
        let contents_start = it.pos();

        // Take the fast path as long as the result is a slice of the original, with no changes.
        let mut res;
        loop {
            match it.next_char() {
                None => {
                    // Input ended before the closing quote.
                    return self.err_span(
                        LexemeError::UnfinishedStringLiteral,
                        string_start,
                        string_end + it.pos(),
                    );
                }
                Some(c) => {
                    if stop(c) {
                        // `stop` has consumed the closing quote(s); exclude them from the contents.
                        let contents_end = it.pos() - if triple { 3 } else { 1 };
                        let contents = &self.lexer.remainder()[contents_start..contents_end];
                        self.lexer.bump(it.pos());
                        return Ok((
                            string_start,
                            (contents.to_owned(), contents_start),
                            string_end + it.pos(),
                        ));
                    } else if c == '\\' || c == '\r' || (c == '\n' && !triple) {
                        // Escapes / line-ending handling mean the result is no longer
                        // a plain slice: copy what we have and switch to the slow path.
                        res = String::with_capacity(it.pos() + 10);
                        res.push_str(&self.lexer.remainder()[contents_start..it.pos() - 1]);
                        it2 = CursorChars::new_offset(self.lexer.remainder(), it.pos() - 1);
                        break;
                    }
                }
            }
        }

        // We bailed out of the fast path, that means we now accumulate character by character,
        // might have an error or be dealing with escape characters.
        let mut it = it2;
        while let Some(c) = it.next() {
            if stop(c) {
                self.lexer.bump(it.pos());
                if triple {
                    // The first two closing quotes were pushed as ordinary chars; drop them.
                    res.truncate(res.len() - 2);
                }
                return Ok((string_start, (res, contents_start), string_end + it.pos()));
            }
            match c {
                '\n' if !triple => {
                    // Will raise an error about out of chars.
                    // But don't include the final \n in the count.
                    string_end -= 1;
                    break;
                }
                '\r' => {
                    // We just ignore these in all modes
                }
                '\\' => {
                    if raw {
                        // In raw strings a backslash is literal, except that
                        // \' and \" produce the bare quote character.
                        match it.next() {
                            Some(c) => {
                                if c != '\'' && c != '"' {
                                    res.push('\\');
                                }
                                res.push(c);
                            }
                            _ => break, // Out of chars
                        }
                    } else {
                        let pos = it.pos();
                        if Self::escape(&mut it, &mut res).is_err() {
                            // The characters between `pos` and the cursor are the bad escape.
                            let bad = self.lexer.remainder()[pos..it.pos()].to_owned();
                            return self.err_span(
                                if bad.is_empty() {
                                    LexemeError::EmptyEscapeSequence
                                } else {
                                    LexemeError::InvalidEscapeSequence(bad)
                                },
                                string_end + pos - 1,
                                string_end + it.pos(),
                            );
                        }
                    }
                }
                c => res.push(c),
            }
        }

        // We ran out of characters
        self.err_span(
            LexemeError::UnfinishedStringLiteral,
            string_start,
            string_end + it.pos(),
        )
    }
431
432    fn int(&self, s: &str, radix: u32) -> Lexeme {
433        let span = self.lexer.span();
434        match TokenInt::from_str_radix(s, radix) {
435            Ok(i) => Ok((span.start, Token::Int(i), span.end)),
436            Err(_) => self.err_now(LexemeError::IntParse),
437        }
438    }
439
    /// Produce the next lexeme: buffered lexemes first, then raw tokens from
    /// logos, post-processed (indentation, strings, ints). Returns `None`
    /// once the input is exhausted and all trailing dedents have been emitted.
    pub fn next(&mut self) -> Option<Lexeme> {
        loop {
            // Note that this function doesn't always return - a few branches use `continue`
            // to always go round the loop again.
            return if let Some(x) = self.buffer.pop_front() {
                Some(x)
            } else if self.done {
                None
            } else {
                match self.lexer.next() {
                    None => {
                        // End of input: emit a final Newline now, and queue one
                        // Dedent per still-open indentation level.
                        self.done = true;
                        let pos = self.lexer.span().end;
                        for _ in 0..self.indent_levels.len() {
                            self.buffer.push_back(Ok((pos, Token::Dedent, pos)))
                        }
                        self.indent_levels.clear();
                        self.wrap(Token::Newline)
                    }
                    Some(token) => match token {
                        Token::Tabs => {
                            self.buffer.push_back(
                                self.err_pos(LexemeError::InvalidTab, self.lexer.span().start),
                            );
                            continue;
                        }
                        Token::Newline => {
                            // Newlines inside brackets are not significant.
                            if self.parens == 0 {
                                let span = self.lexer.span();
                                if let Err(e) = self.calculate_indent() {
                                    return Some(Err(e));
                                }
                                Some(Ok((span.start, Token::Newline, span.end)))
                            } else {
                                continue;
                            }
                        }
                        Token::Reserved => Some(self.err_now(LexemeError::ReservedKeyword)),
                        Token::Error => Some(self.err_now(LexemeError::InvalidInput)),
                        Token::RawDecInt => {
                            let s = self.lexer.slice();
                            // `0` alone is fine; `0123` is rejected.
                            if s.len() > 1 && &s[0..1] == "0" {
                                return Some(self.err_now(LexemeError::StartsZero));
                            }
                            Some(self.int(s, 10))
                        }
                        Token::RawOctInt => {
                            let s = self.lexer.slice();
                            assert!(s.starts_with("0o") || s.starts_with("0O"));
                            Some(self.int(&s[2..], 8))
                        }
                        Token::RawHexInt => {
                            let s = self.lexer.slice();
                            assert!(s.starts_with("0x") || s.starts_with("0X"));
                            Some(self.int(&s[2..], 16))
                        }
                        Token::RawBinInt => {
                            let s = self.lexer.slice();
                            assert!(s.starts_with("0b") || s.starts_with("0B"));
                            Some(self.int(&s[2..], 2))
                        }
                        Token::Int(..) => unreachable!("Lexer does not produce Int tokens"),
                        Token::RawDoubleQuote => {
                            // The lexeme is `"` or `r"`; length 2 means the r-prefix.
                            let raw = self.lexer.span().len() == 2;
                            self.parse_double_quoted_string(raw)
                                .map(|lex| map_lexeme_t(lex, |(s, _offset)| Token::String(s)))
                        }
                        Token::RawSingleQuote => {
                            // The lexeme is `'` or `r'`; length 2 means the r-prefix.
                            let raw = self.lexer.span().len() == 2;
                            self.parse_single_quoted_string(raw)
                                .map(|lex| map_lexeme_t(lex, |(s, _offset)| Token::String(s)))
                        }
                        Token::String(_) => {
                            unreachable!("The lexer does not produce String")
                        }
                        Token::RawFStringDoubleQuote => {
                            // The lexeme is `f"` or `fr"`; length 3 means the r-prefix.
                            let span_len = self.lexer.span().len();
                            let raw = span_len == 3;
                            self.parse_double_quoted_string(raw).map(|lex| {
                                map_lexeme_t(lex, |(content, content_start_offset)| {
                                    Token::FString(TokenFString {
                                        content,
                                        content_start_offset: content_start_offset + span_len,
                                    })
                                })
                            })
                        }
                        Token::RawFStringSingleQuote => {
                            let span_len = self.lexer.span().len();
                            let raw = span_len == 3;
                            self.parse_single_quoted_string(raw).map(|lex| {
                                map_lexeme_t(lex, |(content, content_start_offset)| {
                                    Token::FString(TokenFString {
                                        content,
                                        content_start_offset: content_start_offset + span_len,
                                    })
                                })
                            })
                        }
                        Token::FString(_) => {
                            unreachable!("The lexer does not produce FString")
                        }
                        Token::OpeningCurly | Token::OpeningRound | Token::OpeningSquare => {
                            self.parens += 1;
                            self.wrap(token)
                        }
                        Token::ClosingCurly | Token::ClosingRound | Token::ClosingSquare => {
                            self.parens -= 1;
                            self.wrap(token)
                        }
                        _ => self.wrap(token),
                    },
                }
            };
        }
    }
556
557    fn parse_double_quoted_string(&mut self, raw: bool) -> Option<LexemeT<(String, usize)>> {
558        if self.lexer.remainder().starts_with("\"\"") {
559            let mut qs = 0;
560            Some(self.string(true, raw, |c| {
561                if c == '\"' {
562                    qs += 1;
563                    qs == 3
564                } else {
565                    qs = 0;
566                    false
567                }
568            }))
569        } else {
570            Some(self.string(false, raw, |c| c == '\"'))
571        }
572    }
573
574    fn parse_single_quoted_string(&mut self, raw: bool) -> Option<LexemeT<(String, usize)>> {
575        if self.lexer.remainder().starts_with("''") {
576            let mut qs = 0;
577            Some(self.string(true, raw, |c| {
578                if c == '\'' {
579                    qs += 1;
580                    qs == 3
581                } else {
582                    qs = 0;
583                    false
584                }
585            }))
586        } else {
587            Some(self.string(false, raw, |c| c == '\''))
588        }
589    }
590}
591
/// An integer literal, stored in the smallest representation that fits.
#[derive(Debug, Clone, Eq, PartialEq, derive_more::Display)]
pub enum TokenInt {
    /// Fits in an `i32`.
    I32(i32),
    /// Only if it does not fit in `i32`.
    BigInt(BigInt),
}
598
599impl TokenInt {
600    pub fn from_str_radix(s: &str, base: u32) -> crate::Result<TokenInt> {
601        if let Ok(i) = i32::from_str_radix(s, base) {
602            Ok(TokenInt::I32(i))
603        } else {
604            match BigInt::from_str_radix(s, base) {
605                Ok(i) => Ok(TokenInt::BigInt(i)),
606                Err(_) => Err(LexemeError::CannotParse(s.to_owned(), base).into()),
607            }
608        }
609    }
610}
611
/// The raw text of an f-string, captured before any interpolation parsing.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenFString {
    /// The content of this TokenFString
    pub content: String,
    /// Relative to the token, where does the actual string content start?
    /// (i.e. past the `f"`/`fr"`-style prefix.)
    pub content_start_offset: usize,
}
619
/// All token that can be generated by the lexer.
/// `Raw*` variants are intermediate forms that `Lexer::next` post-processes
/// (string contents, integer bases) and never reach the parser.
#[derive(Logos, Debug, Clone, PartialEq)]
pub enum Token {
    #[regex(" +", logos::skip)] // Whitespace
    #[token("\\\n", logos::skip)] // Escaped newline
    #[token("\\\r\n", logos::skip)] // Escaped newline (Windows line ending)
    #[error]
    Error,

    /// Comment as token.
    /// Span includes the leading `#`, but the content does not.
    #[regex(r#"#[^\r\n]*"#, |lex| lex.slice()[1..].to_owned())]
    Comment(String),

    #[regex("\t+")] // Tabs (might be an error)
    Tabs,

    // Indentation block & meaningful spaces
    Indent, // New indentation block
    Dedent, // Leaving an indentation block
    #[regex(r"(\r)?\n")]
    Newline, // Newline outside a string

    // Some things the lexer can't deal with well, so we step in and generate
    // things ourselves
    #[token("'")]
    #[token("r'")]
    RawSingleQuote,
    #[token("\"")]
    #[token("r\"")]
    RawDoubleQuote,

    /// The start of a single-quoted f-string.
    #[token("f'")]
    #[token("fr'")]
    RawFStringSingleQuote,
    /// The start of a double-quoted f-string.
    #[token("f\"")]
    #[token("fr\"")]
    RawFStringDoubleQuote,

    #[regex(
        "as|\
        assert|\
        async|\
        await|\
        class|\
        del|\
        except|\
        finally|\
        from|\
        global|\
        import|\
        is|\
        nonlocal|\
        raise|\
        try|\
        while|\
        with|\
        yield"
    )]
    Reserved, // One of the reserved keywords

    #[regex(
        "[a-zA-Z_][a-zA-Z0-9_]*"
    , |lex| lex.slice().to_owned())]
    Identifier(String), // An identifier

    // Raw integer literals; the base prefix is stripped in `Lexer::next`
    // before conversion into `Int`.
    #[regex("[0-9]+")]
    RawDecInt,
    #[regex("0[xX][A-Fa-f0-9]+")]
    RawHexInt,
    #[regex("0[bB][01]+")]
    RawBinInt,
    #[regex("0[oO][0-7]+")]
    RawOctInt,

    Int(TokenInt), // An integer literal (123, 0x1, 0b1011, 0o755, ...)

    #[regex("[0-9]+\\.[0-9]*([eE][-+]?[0-9]+)?", |lex| lex.slice().parse::<f64>())]
    #[regex("[0-9]+[eE][-+]?[0-9]+", |lex| lex.slice().parse::<f64>())]
    #[regex("\\.[0-9]+([eE][-+]?[0-9]+)?", |lex| lex.slice().parse::<f64>())]
    Float(f64), // A float literal (3.14, .3, 1e6, 0.)

    String(String), // A string literal
    /// The raw text of a f-string
    FString(TokenFString),

    // Keywords
    #[token("and")]
    And,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("def")]
    Def,
    #[token("elif")]
    Elif,
    #[token("else")]
    Else,
    #[token("for")]
    For,
    #[token("if")]
    If,
    #[token("in")]
    In,
    #[token("lambda")]
    Lambda,
    #[token("load")]
    Load,
    #[token("not")]
    Not,
    #[token("or")]
    Or,
    #[token("pass")]
    Pass,
    #[token("return")]
    Return,
    // Symbols
    #[token(",")]
    Comma,
    #[token(";")]
    Semicolon,
    #[token(":")]
    Colon,
    #[token("+=")]
    PlusEqual,
    #[token("-=")]
    MinusEqual,
    #[token("*=")]
    StarEqual,
    #[token("/=")]
    SlashEqual,
    #[token("//=")]
    SlashSlashEqual,
    #[token("%=")]
    PercentEqual,
    #[token("==")]
    EqualEqual,
    #[token("!=")]
    BangEqual,
    #[token("<=")]
    LessEqual,
    #[token(">=")]
    GreaterEqual,
    #[token("**")]
    StarStar,
    #[token("->")]
    MinusGreater,
    #[token("=")]
    Equal,
    #[token("<")]
    LessThan,
    #[token(">")]
    GreaterThan,
    #[token("-")]
    Minus,
    #[token("+")]
    Plus,
    #[token("*")]
    Star,
    #[token("%")]
    Percent,
    #[token("/")]
    Slash,
    #[token("//")]
    SlashSlash,
    #[token(".")]
    Dot,
    #[token("&")]
    Ampersand,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("<<")]
    LessLess,
    #[token(">>")]
    GreaterGreater,
    #[token("~")]
    Tilde,
    #[token("&=")]
    AmpersandEqual,
    #[token("|=")]
    PipeEqual,
    #[token("^=")]
    CaretEqual,
    #[token("<<=")]
    LessLessEqual,
    #[token(">>=")]
    GreaterGreaterEqual,
    #[token("...")]
    Ellipsis,

    // Brackets
    #[token("[")]
    OpeningSquare,
    #[token("{")]
    OpeningCurly,
    #[token("(")]
    OpeningRound,
    #[token("]")]
    ClosingSquare,
    #[token("}")]
    ClosingCurly,
    #[token(")")]
    ClosingRound,
}
829
830impl Token {
831    /// Used for testing
832    #[cfg(test)]
833    pub fn unlex(&self) -> String {
834        use std::io::Write;
835        match self {
836            Token::Indent => "\t".to_owned(),
837            Token::Newline => "\n".to_owned(),
838            Token::Dedent => "#dedent".to_owned(),
839            Token::String(x) => {
840                // The Rust {:?} is unstable, so changes between versions,
841                // instead use the JSON standard for string escapes.
842                // Reuse the StarlarkValue implementation since it's close to hand.
843                serde_json::to_string(x).unwrap()
844            }
845            Token::FString(x) => {
846                let mut buff = Vec::new();
847                write!(&mut buff, "f").unwrap();
848                serde_json::to_writer(&mut buff, &x.content).unwrap();
849                String::from_utf8(buff).unwrap()
850            }
851            _ => {
852                let s = self.to_string();
853                // Out display is often: keyword 'lambda'
854                // so strip out the bit in single quotes
855                let first = s.find('\'');
856                match first {
857                    Some(first) if s.ends_with('\'') && first != s.len() - 1 => {
858                        s[first + 1..s.len() - 1].to_owned()
859                    }
860                    _ => s,
861                }
862            }
863        }
864    }
865}
866
867impl Display for Token {
868    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
869        match self {
870            Token::Error => write!(f, "lexical error"),
871            Token::Indent => write!(f, "new indentation block"),
872            Token::Dedent => write!(f, "end of indentation block"),
873            Token::Newline => write!(f, "new line"),
874            Token::And => write!(f, "keyword 'and'"),
875            Token::Else => write!(f, "keyword 'else'"),
876            Token::Load => write!(f, "keyword 'load'"),
877            Token::Break => write!(f, "keyword 'break'"),
878            Token::For => write!(f, "keyword 'for'"),
879            Token::Not => write!(f, "keyword 'not'"),
880            Token::Continue => write!(f, "keyword 'continue'"),
881            Token::If => write!(f, "keyword 'if'"),
882            Token::Or => write!(f, "keyword 'or'"),
883            Token::Def => write!(f, "keyword 'def'"),
884            Token::In => write!(f, "keyword 'in'"),
885            Token::Pass => write!(f, "keyword 'pass'"),
886            Token::Elif => write!(f, "keyword 'elif'"),
887            Token::Return => write!(f, "keyword 'return'"),
888            Token::Lambda => write!(f, "keyword 'lambda'"),
889            Token::Comma => write!(f, "symbol ','"),
890            Token::Semicolon => write!(f, "symbol ';'"),
891            Token::Colon => write!(f, "symbol ':'"),
892            Token::PlusEqual => write!(f, "symbol '+='"),
893            Token::MinusEqual => write!(f, "symbol '-='"),
894            Token::StarEqual => write!(f, "symbol '*='"),
895            Token::SlashEqual => write!(f, "symbol '/='"),
896            Token::SlashSlashEqual => write!(f, "symbol '//='"),
897            Token::PercentEqual => write!(f, "symbol '%='"),
898            Token::EqualEqual => write!(f, "symbol '=='"),
899            Token::BangEqual => write!(f, "symbol '!='"),
900            Token::LessEqual => write!(f, "symbol '<='"),
901            Token::GreaterEqual => write!(f, "symbol '>='"),
902            Token::StarStar => write!(f, "symbol '**'"),
903            Token::MinusGreater => write!(f, "symbol '->'"),
904            Token::Equal => write!(f, "symbol '='"),
905            Token::LessThan => write!(f, "symbol '<'"),
906            Token::GreaterThan => write!(f, "symbol '>'"),
907            Token::Minus => write!(f, "symbol '-'"),
908            Token::Plus => write!(f, "symbol '+'"),
909            Token::Star => write!(f, "symbol '*'"),
910            Token::Percent => write!(f, "symbol '%'"),
911            Token::Slash => write!(f, "symbol '/'"),
912            Token::SlashSlash => write!(f, "symbol '//'"),
913            Token::Dot => write!(f, "symbol '.'"),
914            Token::Ampersand => write!(f, "symbol '&'"),
915            Token::Pipe => write!(f, "symbol '|'"),
916            Token::Caret => write!(f, "symbol '^'"),
917            Token::LessLess => write!(f, "symbol '<<'"),
918            Token::GreaterGreater => write!(f, "symbol '>>'"),
919            Token::Tilde => write!(f, "symbol '~'"),
920            Token::AmpersandEqual => write!(f, "symbol '&='"),
921            Token::PipeEqual => write!(f, "symbol '|='"),
922            Token::CaretEqual => write!(f, "symbol '^='"),
923            Token::LessLessEqual => write!(f, "symbol '<<='"),
924            Token::GreaterGreaterEqual => write!(f, "symbol '>>='"),
925            Token::Ellipsis => write!(f, "symbol '...'"),
926            Token::OpeningSquare => write!(f, "symbol '['"),
927            Token::OpeningCurly => write!(f, "symbol '{{'"),
928            Token::OpeningRound => write!(f, "symbol '('"),
929            Token::ClosingSquare => write!(f, "symbol ']'"),
930            Token::ClosingCurly => write!(f, "symbol '}}'"),
931            Token::ClosingRound => write!(f, "symbol ')'"),
932            Token::Reserved => write!(f, "reserved keyword"),
933            Token::Identifier(s) => write!(f, "identifier '{}'", s),
934            Token::Int(i) => write!(f, "integer literal '{}'", i),
935            Token::RawDecInt => write!(f, "decimal integer literal"),
936            Token::RawHexInt => write!(f, "hexadecimal integer literal"),
937            Token::RawOctInt => write!(f, "octal integer literal"),
938            Token::RawBinInt => write!(f, "binary integer literal"),
939            Token::Float(n) => write!(f, "float literal '{}'", n),
940            Token::String(s) => write!(f, "string literal {:?}", s),
941            Token::RawSingleQuote => write!(f, "starting '"),
942            Token::RawDoubleQuote => write!(f, "starting \""),
943            Token::RawFStringDoubleQuote => write!(f, "starting f'"),
944            Token::RawFStringSingleQuote => write!(f, "starting f\""),
945            Token::FString(s) => write!(f, "f-string {:?}", &s.content),
946            Token::Comment(c) => write!(f, "comment '{}'", c),
947            Token::Tabs => Ok(()),
948        }
949    }
950}
951
952impl<'a> Iterator for Lexer<'a> {
953    type Item = Lexeme;
954
955    fn next(&mut self) -> Option<Self::Item> {
956        self.next()
957    }
958}
959
960pub fn lex_exactly_one_identifier(s: &str) -> Option<String> {
961    let mut lexer = Token::lexer(s);
962    match (lexer.next(), lexer.next()) {
963        (Some(Token::Identifier(ident)), None) => Some(ident),
964        _ => None,
965    }
966}
967
#[cfg(test)]
mod tests {
    use crate::lexer::lex_exactly_one_identifier;

    /// `lex_exactly_one_identifier` accepts exactly one identifier token
    /// (surrounding whitespace is tolerated) and rejects everything else.
    /// Renamed from `test_is_valid_identifier` to match the function under
    /// test; added the empty-input edge case.
    #[test]
    fn test_lex_exactly_one_identifier() {
        assert_eq!(lex_exactly_one_identifier("foo").as_deref(), Some("foo"));
        assert_eq!(lex_exactly_one_identifier(" foo ").as_deref(), Some("foo"));
        // Multiple tokens.
        assert_eq!(lex_exactly_one_identifier("foo bar"), None);
        // A keyword is not an identifier.
        assert_eq!(lex_exactly_one_identifier("not"), None);
        // A literal is not an identifier.
        assert_eq!(lex_exactly_one_identifier("123"), None);
        // Empty input yields no tokens at all.
        assert_eq!(lex_exactly_one_identifier(""), None);
    }
}