// sway_parse/token.rs

1use core::mem;
2use extension_trait::extension_trait;
3use num_bigint::BigUint;
4use sway_ast::literal::{LitChar, LitInt, LitIntType, LitString, Literal};
5use sway_ast::token::{
6    Comment, CommentKind, CommentedGroup, CommentedTokenStream, CommentedTokenTree, DocComment,
7    DocStyle, Punct, Spacing, TokenStream,
8};
9use sway_error::error::CompileError;
10use sway_error::handler::{ErrorEmitted, Handler};
11use sway_error::lex_error::{LexError, LexErrorKind};
12use sway_types::span::Source;
13use sway_types::{
14    ast::{Delimiter, PunctKind},
15    Ident, SourceId, Span, Spanned,
16};
17use unicode_bidi::format_chars::{ALM, FSI, LRE, LRI, LRM, LRO, PDF, PDI, RLE, RLI, RLM, RLO};
18use unicode_xid::UnicodeXID;
19
20#[extension_trait]
21impl CharExt for char {
22    /// Converts the character into an opening delimiter, if any.
23    fn as_open_delimiter(self) -> Option<Delimiter> {
24        match self {
25            '(' => Some(Delimiter::Parenthesis),
26            '{' => Some(Delimiter::Brace),
27            '[' => Some(Delimiter::Bracket),
28            _ => None,
29        }
30    }
31
32    /// Converts the character into a closing delimiter, if any.
33    fn as_close_delimiter(self) -> Option<Delimiter> {
34        match self {
35            ')' => Some(Delimiter::Parenthesis),
36            '}' => Some(Delimiter::Brace),
37            ']' => Some(Delimiter::Bracket),
38            _ => None,
39        }
40    }
41
42    /// Determines what sort of punctuation this character is, if any.
43    fn as_punct_kind(self) -> Option<PunctKind> {
44        match self {
45            ';' => Some(PunctKind::Semicolon),
46            ':' => Some(PunctKind::Colon),
47            '/' => Some(PunctKind::ForwardSlash),
48            ',' => Some(PunctKind::Comma),
49            '*' => Some(PunctKind::Star),
50            '+' => Some(PunctKind::Add),
51            '-' => Some(PunctKind::Sub),
52            '<' => Some(PunctKind::LessThan),
53            '>' => Some(PunctKind::GreaterThan),
54            '=' => Some(PunctKind::Equals),
55            '.' => Some(PunctKind::Dot),
56            '!' => Some(PunctKind::Bang),
57            '%' => Some(PunctKind::Percent),
58            '&' => Some(PunctKind::Ampersand),
59            '^' => Some(PunctKind::Caret),
60            '|' => Some(PunctKind::Pipe),
61            '_' => Some(PunctKind::Underscore),
62            '#' => Some(PunctKind::Sharp),
63            _ => None,
64        }
65    }
66}
67
/// Like [`str::char_indices`], but starts yielding byte positions from an
/// arbitrary `position` offset into `src` rather than always from zero.
struct CharIndicesInner<'a> {
    src: &'a str,
    position: usize,
}

impl Iterator for CharIndicesInner<'_> {
    type Item = (usize, char);

    fn next(&mut self) -> Option<(usize, char)> {
        // Decode the character at the current byte position, if any remains.
        let c = self.src[self.position..].chars().next()?;
        let item = (self.position, c);
        // Advance to the start of the following character; when `c` is the
        // last character this lands exactly on `src.len()`.
        self.position += c.len_utf8();
        Some(item)
    }
}
87
88type CharIndices<'a> = std::iter::Peekable<CharIndicesInner<'a>>;
89type Result<T> = core::result::Result<T, ErrorEmitted>;
90
/// Shared state threaded through all the lexing helper functions.
struct Lexer<'l> {
    // Sink for lex errors; lexing emits and recovers rather than aborting.
    handler: &'l Handler,
    // The source being lexed; used for span construction and text slicing.
    src: &'l Source,
    // Source file id, if any, recorded in every produced span.
    source_id: &'l Option<SourceId>,
    // Peekable character stream over the region being lexed.
    stream: &'l mut CharIndices<'l>,
}
97
98pub fn lex(
99    handler: &Handler,
100    src: Source,
101    start: usize,
102    end: usize,
103    source_id: Option<SourceId>,
104) -> Result<TokenStream> {
105    lex_commented(handler, src, start, end, &source_id).map(|stream| stream.strip_comments())
106}
107
108/// Identifier & path validation.
109///
110/// A *path* may optionally start with `::` and is otherwise a `::`‑separated
111/// list of identifiers.  Identifiers follow these rules:
112///
113/// * Must not be empty.
114/// * Must not be just `_`.
115/// * Must not start with two underscores (`__`).
116/// * First char: Unicode XID_Start or `_`.
117/// * Remaining chars: Unicode XID_Continue.
118///
119/// Any colon that is *not* part of a `::` token is rejected, and empty path
120/// segments such as `foo::` or `foo:::bar` are invalid.
121pub fn is_valid_identifier_or_path(s: &str) -> bool {
122    // Reject empty string early.
123    if s.is_empty() {
124        return false;
125    }
126
127    // Handle an optional leading `::`.
128    let mut input = s;
129    if let Some(rest) = input.strip_prefix("::") {
130        input = rest;
131        // Bare `::` is invalid.
132        if input.is_empty() {
133            return false;
134        }
135    }
136
137    // Split on *exactly* two consecutive colons.  Any single `:` or triple
138    // `:::` will leave stray `:` characters inside a segment and fail below.
139    for segment in input.split("::") {
140        if !is_valid_identifier(segment) {
141            return false;
142        }
143    }
144
145    true
146}
147
148/// Check a single identifier segment.
149fn is_valid_identifier(ident: &str) -> bool {
150    // Reject empty, bare underscore, or double‑underscore prefix.
151    if ident.is_empty() || ident == "_" || ident.starts_with("__") {
152        return false;
153    }
154
155    let mut chars = ident.chars();
156    let first = chars.next().unwrap();
157
158    // First char: Unicode XID_Start or underscore.
159    if !(first.is_xid_start() || first == '_') {
160        return false;
161    }
162
163    // Remaining chars: Unicode XID_Continue.
164    chars.all(|c| c.is_xid_continue())
165}
166
/// Lexes `src[start..end]` into a `CommentedTokenStream`, a token-tree stream
/// that retains comments and doc comments as trees.
///
/// Lexing recovers from most errors (invalid characters, mismatched, unexpected,
/// or unclosed delimiters) by emitting an error via `handler` and continuing,
/// so a stream is produced even for malformed input; `Err` is only returned
/// for errors propagated from literal lexing.
pub fn lex_commented(
    handler: &Handler,
    src: Source,
    start: usize,
    end: usize,
    source_id: &Option<SourceId>,
) -> Result<CommentedTokenStream> {
    let stream = &mut CharIndicesInner {
        src: &src.text[..end],
        position: start,
    }
    .peekable();
    let mut l = Lexer {
        handler,
        src: &src,
        source_id,
        stream,
    };
    let mut file_start_offset: usize = 0;

    // `token_trees` collects trees for the innermost open group; each entry of
    // `parent_token_trees` is a suspended outer list together with the byte
    // index and kind of the delimiter that opened the current group.
    let mut parent_token_trees = Vec::new();
    let mut token_trees = Vec::new();
    while let Some((mut index, mut character)) = l.stream.next() {
        if character.is_whitespace() {
            // if the beginning of a file starts with whitespace
            // we must keep track to ensure that the module level docs
            // will get inserted into the tree correctly
            if index - file_start_offset == 0 {
                file_start_offset += character.len_utf8();
            }
            continue;
        }
        if character == '/' {
            match l.stream.peek() {
                Some((_, '/')) => {
                    // search_end is the index at which we stop looking backwards for
                    // a newline
                    let search_end = token_trees
                        .last()
                        .map(|tt| {
                            if let CommentedTokenTree::Tree(t) = tt {
                                t.span().end()
                            } else {
                                0
                            }
                        })
                        .unwrap_or_default();

                    // A `//` comment with a newline between it and the previous
                    // token is `Newlined`; one on the same line is `Trailing`.
                    let has_newline = src.text[search_end..index]
                        .chars()
                        .rev()
                        .take_while(|c| c.is_whitespace())
                        .filter(|&c| c == '\n')
                        .count()
                        > 0;
                    // We found a comment at the start of file, which should be accounted for as a Newlined comment.
                    let start_of_file_found = search_end == 0 && index == 0;

                    let comment_kind = if has_newline || start_of_file_found {
                        CommentKind::Newlined
                    } else {
                        CommentKind::Trailing
                    };

                    let ctt = lex_line_comment(&mut l, end, index, comment_kind);
                    token_trees.push(ctt);
                    continue;
                }
                Some((_, '*')) => {
                    // Block comment; `None` means it was unclosed (error emitted).
                    if let Some(token) = lex_block_comment(&mut l, index) {
                        token_trees.push(token);
                    }
                    continue;
                }
                Some(_) | None => {}
            }
        }

        if character.is_xid_start() || character == '_' {
            // Raw identifier, e.g., `r#foo`? Then mark as such, stripping the prefix `r#`.
            let is_raw_ident = character == 'r' && matches!(l.stream.peek(), Some((_, '#')));
            if is_raw_ident {
                l.stream.next();
                if let Some((next_index, next_character)) = l.stream.next() {
                    character = next_character;
                    index = next_index;
                }
                // The char after `r#` must itself start an identifier.
                if !(character.is_xid_start() || character == '_') {
                    let kind = LexErrorKind::InvalidCharacter {
                        position: index,
                        character,
                    };
                    let span = span_one(&l, index, character);
                    error(l.handler, LexError { kind, span });
                    continue;
                }
            }

            // Don't accept just `_` as an identifier.
            let not_is_single_underscore = character != '_'
                || l.stream
                    .peek()
                    .is_some_and(|(_, next)| next.is_xid_continue());
            if not_is_single_underscore {
                // Consume until we hit other than `XID_CONTINUE`.
                while l.stream.next_if(|(_, c)| c.is_xid_continue()).is_some() {}
                let ident = Ident::new_with_raw(span_until(&mut l, index), is_raw_ident);
                token_trees.push(CommentedTokenTree::Tree(ident.into()));
                continue;
            }
            // A lone `_` falls through and is lexed as punctuation below.
        }
        if let Some(delimiter) = character.as_open_delimiter() {
            // Suspend the current list and start collecting the group's contents.
            let token_trees = mem::take(&mut token_trees);
            parent_token_trees.push((token_trees, index, delimiter));
            continue;
        }
        if let Some(close_delimiter) = character.as_close_delimiter() {
            match parent_token_trees.pop() {
                None => {
                    // Recover by ignoring the unexpected closing delim,
                    // giving the parser opportunities to realize the need for an opening delim
                    // in e.g., this example:
                    //
                    // fn foo() // <-- Parser expects grouped tokens in `{ ... }` here.
                    //     let x = 0;
                    // } // <- This recovery.
                    let kind = LexErrorKind::UnexpectedCloseDelimiter {
                        position: index,
                        close_delimiter,
                    };
                    let span = span_one(&l, index, character);
                    error(l.handler, LexError { kind, span });
                }
                Some((parent, open_index, open_delimiter)) => {
                    if open_delimiter != close_delimiter {
                        // Recover on e.g., a `{ )` mismatch by having `)` interpreted as `}`.
                        let kind = LexErrorKind::MismatchedDelimiters {
                            open_position: open_index,
                            close_position: index,
                            open_delimiter,
                            close_delimiter,
                        };
                        let span = span_one(&l, index, character);
                        error(l.handler, LexError { kind, span });
                    }
                    // Wrap the collected trees into a group and resume the parent list.
                    token_trees = lex_close_delimiter(
                        &mut l,
                        index,
                        parent,
                        token_trees,
                        open_index,
                        open_delimiter,
                    );
                }
            }
            continue;
        }
        // Literals and punctuation; each helper returns `None` when the
        // character does not start that kind of token.
        if let Some(token) = lex_string(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_char(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_int_lit(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_punctuation(&mut l, index, character) {
            token_trees.push(token);
            continue;
        }

        // Recover by simply ignoring the character.
        // NOTE(Centril): I'm not sure how good of an idea this is... time will tell.
        let kind = LexErrorKind::InvalidCharacter {
            position: index,
            character,
        };
        let span = span_one(&l, index, character);
        error(l.handler, LexError { kind, span });
        continue;
    }

    // Recover all unclosed delimiters.
    while let Some((parent, open_index, open_delimiter)) = parent_token_trees.pop() {
        let kind = LexErrorKind::UnclosedDelimiter {
            open_position: open_index,
            open_delimiter,
        };
        let span = span_one(&l, open_index, open_delimiter.as_open_char());
        error(l.handler, LexError { kind, span });

        // Close the group at end-of-input so the stream is still well-formed.
        token_trees = lex_close_delimiter(
            &mut l,
            src.text.len(),
            parent,
            token_trees,
            open_index,
            open_delimiter,
        );
    }
    Ok(CommentedTokenStream {
        token_trees,
        full_span: span(&l, start, end),
    })
}
375
376fn lex_close_delimiter(
377    l: &mut Lexer<'_>,
378    index: usize,
379    mut parent: Vec<CommentedTokenTree>,
380    token_trees: Vec<CommentedTokenTree>,
381    open_index: usize,
382    delimiter: Delimiter,
383) -> Vec<CommentedTokenTree> {
384    let start_index = open_index + delimiter.as_open_char().len_utf8();
385    let full_span = span(l, start_index, index);
386    let group = CommentedGroup {
387        token_stream: CommentedTokenStream {
388            token_trees,
389            full_span,
390        },
391        delimiter,
392        span: span_until(l, open_index),
393    };
394    parent.push(CommentedTokenTree::Tree(group.into()));
395    parent
396}
397
398fn lex_line_comment(
399    l: &mut Lexer<'_>,
400    end: usize,
401    index: usize,
402    comment_kind: CommentKind,
403) -> CommentedTokenTree {
404    let _ = l.stream.next();
405
406    // Find end; either at EOF or at `\n`.
407    let end = l
408        .stream
409        .find(|(_, character)| *character == '\n')
410        .map_or(end, |(end, _)| end);
411    let sp = span(l, index, end);
412
413    let doc_style = match (sp.as_str().chars().nth(2), sp.as_str().chars().nth(3)) {
414        // `//!` is an inner line doc comment.
415        (Some('!'), _) => Some(DocStyle::Inner),
416        // `////` (more than 3 slashes) is not considered a doc comment.
417        (Some('/'), Some('/')) => None,
418        // `///` is an outer line doc comment.
419        (Some('/'), _) => Some(DocStyle::Outer),
420        _ => None,
421    };
422
423    if let Some(doc_style) = doc_style {
424        let doc_comment = DocComment {
425            span: sp,
426            doc_style,
427            content_span: span(l, index + 3, end),
428        };
429        CommentedTokenTree::Tree(doc_comment.into())
430    } else {
431        Comment {
432            span: sp,
433            comment_kind,
434        }
435        .into()
436    }
437}
438
/// Lexes a `/* ... */` block comment starting at `index`, supporting nesting.
///
/// Returns `None` (after emitting an `UnclosedMultilineComment` error) when the
/// input ends before all nesting levels are closed. Nested comments are
/// represented as a single `Comment` spanning the outermost pair.
fn lex_block_comment(l: &mut Lexer<'_>, index: usize) -> Option<CommentedTokenTree> {
    // Lexing a multi-line comment.
    let _ = l.stream.next();
    // Stack of start indices of all currently-open `/*` markers.
    let mut unclosed_indices = vec![index];

    let unclosed_multiline_comment = |l: &Lexer<'_>, unclosed_indices: Vec<_>| {
        let span = span(l, *unclosed_indices.last().unwrap(), l.src.text.len() - 1);
        let kind = LexErrorKind::UnclosedMultilineComment { unclosed_indices };
        error(l.handler, LexError { kind, span });
        None
    };

    // We first start by assuming that block comments are inlined.
    let mut comment_kind = CommentKind::Inlined;

    loop {
        match l.stream.next() {
            None => return unclosed_multiline_comment(l, unclosed_indices),
            Some((_, '*')) => match l.stream.next() {
                None => return unclosed_multiline_comment(l, unclosed_indices),
                // Matched `*/`, so we're closing some multi-line comment. It could be nested.
                Some((slash_ix, '/')) => {
                    let start = unclosed_indices.pop().unwrap();
                    if unclosed_indices.is_empty() {
                        // For the purposes of lexing,
                        // nested multi-line comments constitute a single multi-line comment.
                        // We could represent them as several ones, but that's unnecessary.
                        let end = slash_ix + '/'.len_utf8();
                        let span = span(l, start, end);
                        return Some(Comment { span, comment_kind }.into());
                    }
                }
                Some(_) => {}
            },
            // Found nested multi-line comment.
            Some((next_index, '/')) => match l.stream.next() {
                None => return unclosed_multiline_comment(l, unclosed_indices),
                Some((_, '*')) => unclosed_indices.push(next_index),
                Some(_) => {}
            },
            Some((_, '\n')) => {
                // If we find a newline character while lexing, this means that the block comment is multiline.
                // Example:
                // /* this is a
                //    multilined block comment */
                comment_kind = CommentKind::Multilined;
            }
            Some(_) => {}
        }
    }
}
490
/// Lexes a string literal, where `character` at `index` has just been consumed.
///
/// Returns `Ok(None)` when `character` is not `"` (not a string start).
/// Emits an error and returns `Err` on an unterminated literal; Unicode
/// text-direction codepoints are reported but skipped, keeping the literal.
fn lex_string(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    if character != '"' {
        return Ok(None);
    }
    let mut parsed = String::new();
    loop {
        let unclosed_string_lit = |l: &Lexer<'_>, end| {
            error(
                l.handler,
                LexError {
                    kind: LexErrorKind::UnclosedStringLiteral { position: index },
                    span: span(l, index, end),
                },
            )
        };
        let (next_index, next_character) = l.stream.next().ok_or_else(|| {
            // last character may not be a unicode boundary
            let mut end = l.src.text.len() - 1;
            while !l.src.text.is_char_boundary(end) {
                end -= 1;
            }
            unclosed_string_lit(l, end)
        })?;
        parsed.push(match next_character {
            // `Err(None)` from the escape parser means EOF mid-escape.
            '\\' => parse_escape_code(l)
                .map_err(|e| e.unwrap_or_else(|| unclosed_string_lit(l, l.src.text.len())))?,
            '"' => break,
            // do not allow text direction codepoints
            ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO => {
                let kind = LexErrorKind::UnicodeTextDirInLiteral {
                    position: next_index,
                    character: next_character,
                };
                let span = span_one(l, next_index, next_character);
                error(l.handler, LexError { span, kind });
                // Skip the disallowed codepoint and keep lexing the literal.
                continue;
            }
            _ => next_character,
        });
    }
    let span = span_until(l, index);
    let literal = Literal::String(LitString { span, parsed });
    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
539
/// Lexes a char literal, where `character` at `index` has just been consumed.
///
/// Returns `Ok(None)` when `character` is not `'`. A multi-character literal
/// such as `'ab'` is recovered as a *string* literal (with an
/// `ExpectedCloseQuote` error emitted), since that is the likely intent.
fn lex_char(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    let is_quote = |c| c == '\'';
    if !is_quote(character) {
        return Ok(None);
    }

    let unclosed_char_lit = |l: &Lexer<'_>| {
        let err = LexError {
            kind: LexErrorKind::UnclosedCharLiteral { position: index },
            span: span(l, index, l.src.text.len()),
        };
        error(l.handler, err)
    };
    // Consume one char or emit an "unclosed" error at end of input.
    let next = |l: &mut Lexer<'_>| l.stream.next().ok_or_else(|| unclosed_char_lit(l));
    // Resolve `\`-escapes; other characters pass through unchanged.
    let escape = |l: &mut Lexer<'_>, next_char| {
        if next_char == '\\' {
            parse_escape_code(l).map_err(|e| e.unwrap_or_else(|| unclosed_char_lit(l)))
        } else {
            Ok(next_char)
        }
    };

    let (next_index, next_char) = next(l)?;
    // do not allow text direction codepoints
    if let ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO = next_char {
        let kind = LexErrorKind::UnicodeTextDirInLiteral {
            position: next_index,
            character: next_char,
        };
        let span = span_one(l, next_index, next_char);
        error(l.handler, LexError { span, kind });
    }

    let parsed = escape(l, next_char)?;

    // Consume the closing `'`.
    let (next_index, next_char) = next(l)?;
    let sp = span_until(l, index);

    // Not a closing quote? Then this is e.g., 'ab'.
    // Most likely the user meant a string literal, so recover as that instead.
    let literal = if !is_quote(next_char) {
        let mut string = String::new();
        string.push(parsed);
        string.push(escape(l, next_char)?);
        // Collect the rest of the would-be string up to the closing quote.
        loop {
            let (_, next_char) = next(l)?;
            if is_quote(next_char) {
                break;
            }
            string.push(next_char);
        }

        // Emit the expected closing quote error.
        error(
            l.handler,
            LexError {
                kind: LexErrorKind::ExpectedCloseQuote {
                    position: next_index,
                },
                span: span(l, next_index, next_index + string.len()),
            },
        );

        Literal::String(LitString {
            span: sp,
            parsed: string,
        })
    } else {
        Literal::Char(LitChar { span: sp, parsed })
    };

    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
618
/// Parses the escape sequence following a consumed `\` in a string or char
/// literal, returning the character it denotes.
///
/// Error protocol: `Err(None)` means the input ended mid-escape (the caller
/// reports the unclosed literal); `Err(Some(_))` means a specific escape error
/// was already emitted here. Supported escapes: `\"`, `\'`, `\n`, `\r`, `\t`,
/// `\\`, `\0`, `\xHH`, and `\u{H...}`.
fn parse_escape_code(l: &mut Lexer<'_>) -> core::result::Result<char, Option<ErrorEmitted>> {
    // Local shorthand: emit a lex error and wrap it in `Err(Some(..))`.
    let error = |kind, span| Err(Some(error(l.handler, LexError { kind, span })));

    match l.stream.next() {
        None => Err(None),
        Some((_, '"')) => Ok('"'),
        Some((_, '\'')) => Ok('\''),
        Some((_, 'n')) => Ok('\n'),
        Some((_, 'r')) => Ok('\r'),
        Some((_, 't')) => Ok('\t'),
        Some((_, '\\')) => Ok('\\'),
        Some((_, '0')) => Ok('\0'),
        Some((index, 'x')) => {
            // `\xHH`: exactly two hex digits.
            let (high, low) = match (l.stream.next(), l.stream.next()) {
                (Some((_, high)), Some((_, low))) => (high, low),
                _ => return Err(None),
            };
            let (high, low) = match (high.to_digit(16), low.to_digit(16)) {
                (Some(high), Some(low)) => (high, low),
                _ => return error(LexErrorKind::InvalidHexEscape, span_until(l, index)),
            };
            // Value is at most 0xFF, which is always a valid `char`.
            let parsed_character = char::from_u32((high << 4) | low).unwrap();
            Ok(parsed_character)
        }
        Some((index, 'u')) => {
            // `\u{H...}`: brace-delimited hex digits.
            match l.stream.next() {
                None => return Err(None),
                Some((_, '{')) => (),
                Some((_, unexpected_char)) => {
                    let span = span_one(l, index, unexpected_char);
                    let kind = LexErrorKind::UnicodeEscapeMissingBrace { position: index };
                    return error(kind, span);
                }
            }
            let mut digits_start_position_opt = None;
            // Accumulate in a BigUint so overly long escapes don't overflow
            // before the range check below.
            let mut char_value = BigUint::from(0u32);
            let digits_end_position = loop {
                let (position, digit) = match l.stream.next() {
                    None => return Err(None),
                    Some((position, '}')) => break position,
                    Some((position, digit)) => (position, digit),
                };
                if digits_start_position_opt.is_none() {
                    digits_start_position_opt = Some(position);
                };
                let digit = match digit.to_digit(16) {
                    None => {
                        let span = span_one(l, position, digit);
                        let kind = LexErrorKind::InvalidUnicodeEscapeDigit { position };
                        return error(kind, span);
                    }
                    Some(digit) => digit,
                };
                char_value *= 16u32;
                char_value += digit;
            };
            // `\u{}` (no digits): point the error spans at the closing brace.
            let digits_start_position = digits_start_position_opt.unwrap_or(digits_end_position);
            let char_value = match u32::try_from(char_value) {
                Err(..) => {
                    let span = span(l, digits_start_position, digits_end_position);
                    let kind = LexErrorKind::UnicodeEscapeOutOfRange { position: index };
                    return error(kind, span);
                }
                Ok(char_value) => char_value,
            };
            // Fits in u32 but may still be a surrogate or above U+10FFFF.
            let parsed_character = match char::from_u32(char_value) {
                None => {
                    let span_all = span_until(l, index);
                    let kind = LexErrorKind::UnicodeEscapeInvalidCharValue { span: span_all };
                    let span = span(l, digits_start_position, digits_end_position);
                    return error(kind, span);
                }
                Some(parsed_character) => parsed_character,
            };
            Ok(parsed_character)
        }
        Some((index, unexpected_char)) => error(
            LexErrorKind::InvalidEscapeCode { position: index },
            span_one(l, index, unexpected_char),
        ),
    }
}
701
/// Lexes an integer literal, where `character` at `index` has just been
/// consumed.
///
/// Returns `Ok(None)` when `character` is not a decimal digit. Handles
/// decimal literals as well as `0x` (hex), `0o` (octal), and `0b` (binary)
/// prefixed literals, plus an optional type suffix such as `u64`.
fn lex_int_lit(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    let digit = match character.to_digit(10) {
        None => return Ok(None),
        Some(d) => d,
    };

    // Continue consuming decimal digits after the first one.
    let decimal_int_lit = |l, digit: u32| {
        let mut big_uint = BigUint::from(digit);
        let end_opt = parse_digits(&mut big_uint, l, 10);
        (big_uint, end_opt)
    };
    let (big_uint, end_opt) = if digit == 0 {
        // A leading `0` may introduce a radix prefix (`0x`, `0o`, `0b`).
        let prefixed_int_lit = |l: &mut Lexer<'_>, radix| {
            // Consume the radix letter, then require at least one digit.
            let _ = l.stream.next();
            let d = l.stream.next();
            let incomplete_int_lit = |end| {
                let kind = match radix {
                    16 => LexErrorKind::IncompleteHexIntLiteral { position: index },
                    8 => LexErrorKind::IncompleteOctalIntLiteral { position: index },
                    2 => LexErrorKind::IncompleteBinaryIntLiteral { position: index },
                    _ => unreachable!(),
                };
                let span = span(l, index, end);
                error(l.handler, LexError { kind, span })
            };
            let (digit_pos, digit) = d.ok_or_else(|| incomplete_int_lit(l.src.text.len()))?;
            let radix_digit = digit
                .to_digit(radix)
                .ok_or_else(|| incomplete_int_lit(digit_pos))?;
            let mut big_uint = BigUint::from(radix_digit);
            let end_opt = parse_digits(&mut big_uint, l, radix);
            Ok((big_uint, end_opt))
        };

        match l.stream.peek() {
            Some((_, 'x')) => prefixed_int_lit(l, 16)?,
            Some((_, 'o')) => prefixed_int_lit(l, 8)?,
            Some((_, 'b')) => prefixed_int_lit(l, 2)?,
            // Plain decimal starting with `0`, e.g. `0_1` or `007`.
            Some((_, '_' | '0'..='9')) => decimal_int_lit(l, 0),
            // Bare `0` followed by something else: the literal ends here.
            Some(&(next_index, _)) => (BigUint::from(0u32), Some(next_index)),
            None => (BigUint::from(0u32), None),
        }
    } else {
        decimal_int_lit(l, digit)
    };

    // Optional type suffix, e.g. `42u8`; `None` if absent or invalid.
    let ty_opt = lex_int_ty_opt(l)?;

    let literal = Literal::Int(LitInt {
        span: span(l, index, end_opt.unwrap_or(l.src.text.len())),
        parsed: big_uint,
        ty_opt,
        is_generated_b256: false,
    });

    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
763
/// Lexes an optional integer type suffix (e.g. `u8`, `i64`) immediately
/// following an integer literal.
///
/// Returns `Ok(None)` when no suffix follows. An unknown suffix emits an
/// `InvalidIntSuffix` error and is discarded (also yielding `Ok(None)`),
/// recovering by treating the literal as unsuffixed.
fn lex_int_ty_opt(l: &mut Lexer<'_>) -> Result<Option<(LitIntType, Span)>> {
    // A suffix starts only if the very next char continues an identifier.
    let (suffix_start_position, c) = match l.stream.next_if(|(_, c)| c.is_xid_continue()) {
        None => return Ok(None),
        Some(x) => x,
    };
    let mut suffix = String::from(c);
    // Consume the rest of the identifier-like run, recording where it ends.
    let suffix_end_position = loop {
        match l.stream.peek() {
            Some((_, c)) if c.is_xid_continue() => {
                suffix.push(*c);
                let _ = l.stream.next();
            }
            Some((pos, _)) => break *pos,
            None => break l.src.text.len(),
        }
    };
    // Parse the suffix to a known one, or if unknown, recover by throwing it away.
    let ty = match parse_int_suffix(&suffix) {
        Some(s) => s,
        None => {
            let span = span(l, suffix_start_position, suffix_end_position);
            let kind = LexErrorKind::InvalidIntSuffix {
                suffix: Ident::new(span.clone()),
            };
            error(l.handler, LexError { kind, span });
            return Ok(None);
        }
    };
    let span = span_until(l, suffix_start_position);
    Ok(Some((ty, span)))
}
795
796/// Interpret the given `suffix` string as a `LitIntType`.
797pub fn parse_int_suffix(suffix: &str) -> Option<LitIntType> {
798    Some(match suffix {
799        "u8" => LitIntType::U8,
800        "u16" => LitIntType::U16,
801        "u32" => LitIntType::U32,
802        "u64" => LitIntType::U64,
803        "u256" => LitIntType::U256,
804        "i8" => LitIntType::I8,
805        "i16" => LitIntType::I16,
806        "i32" => LitIntType::I32,
807        "i64" => LitIntType::I64,
808        _ => return None,
809    })
810}
811
812fn parse_digits(big_uint: &mut BigUint, l: &mut Lexer<'_>, radix: u32) -> Option<usize> {
813    loop {
814        match l.stream.peek() {
815            None => break None,
816            Some((_, '_')) => {
817                let _ = l.stream.next();
818            }
819            Some(&(index, character)) => match character.to_digit(radix) {
820                None => break Some(index),
821                Some(digit) => {
822                    let _ = l.stream.next();
823                    *big_uint *= radix;
824                    *big_uint += digit;
825                }
826            },
827        };
828    }
829}
830
831fn lex_punctuation(l: &mut Lexer<'_>, index: usize, character: char) -> Option<CommentedTokenTree> {
832    let punct = Punct {
833        kind: character.as_punct_kind()?,
834        spacing: match l.stream.peek() {
835            Some((_, next_character)) if next_character.as_punct_kind().is_some() => Spacing::Joint,
836            _ => Spacing::Alone,
837        },
838        span: span_until(l, index),
839    };
840    Some(CommentedTokenTree::Tree(punct.into()))
841}
842
843fn span_until(l: &mut Lexer<'_>, start: usize) -> Span {
844    let end = l.stream.peek().map_or(l.src.text.len(), |(end, _)| *end);
845    span(l, start, end)
846}
847
848fn span_one(l: &Lexer<'_>, start: usize, c: char) -> Span {
849    span(l, start, start + c.len_utf8())
850}
851
852fn span(l: &Lexer<'_>, start: usize, end: usize) -> Span {
853    Span::new(l.src.clone(), start, end, *l.source_id).unwrap()
854}
855
856/// Emit a lexer error.
857fn error(handler: &Handler, error: LexError) -> ErrorEmitted {
858    handler.emit_err(CompileError::Lex { error })
859}
860
// Unit tests for the lexer: rejection of Unicode bidirectional-control
// characters in literals, preservation of comments and doc comments as
// token trees, char-literal escapes, and identifier/path validation.
#[cfg(test)]
mod tests {
    use super::*;
    use assert_matches::assert_matches;
    use sway_ast::{
        literal::{LitChar, Literal},
        token::{
            Comment, CommentKind, CommentedTokenTree, CommentedTree, DocComment, DocStyle,
            TokenTree,
        },
    };
    use sway_error::{
        error::CompileError,
        handler::Handler,
        lex_error::{LexError, LexErrorKind},
    };

    // Each Unicode text-direction control codepoint appearing inside a string
    // or char literal must produce a `UnicodeTextDirInLiteral` error. The
    // input embeds five such codepoints, so exactly five errors are expected.
    #[test]
    fn lex_bidi() {
        let input = "
            script;
            use std::string::String;
            fn main() {
                let a = String::from_ascii_str(\"fuel\");
                let b = String::from_ascii_str(\"fuel\u{202E}\u{2066}// Same string again\u{2069}\u{2066}\");
                if a.as_bytes() == b.as_bytes() {
                    log(\"same\");
                } else {
                    log(\"different\");
                }
                let lrm = '\u{202E}';
                log(lrm);
            }
        ";
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let _stream = lex_commented(&handler, input.into(), start, end, &path).unwrap();
        let (errors, warnings, infos) = handler.consume();
        assert_eq!(infos.len(), 0);
        assert_eq!(warnings.len(), 0);
        assert_eq!(errors.len(), 5);
        for err in errors {
            assert_matches!(
                err,
                CompileError::Lex {
                    error: LexError {
                        span: _,
                        kind: LexErrorKind::UnicodeTextDirInLiteral {
                            position: _,
                            character: _
                        }
                    }
                }
            );
        }
    }

    // Line and block comments must survive lexing as token trees with spans
    // covering the full comment text, interleaved with the ordinary tokens,
    // including comments nested inside a delimited group.
    #[test]
    fn lex_commented_token_stream() {
        let input = r#"
        //
        // Single-line comment.
        struct Foo {
            /* multi-
             * line-
             * comment */
            bar: i32, // trailing comment
        }
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, input.into(), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        assert_eq!(tts.next().unwrap().span().as_str(), "//");
        assert_eq!(
            tts.next().unwrap().span().as_str(),
            "// Single-line comment."
        );
        assert_eq!(tts.next().unwrap().span().as_str(), "struct");
        assert_eq!(tts.next().unwrap().span().as_str(), "Foo");
        {
            let group = match tts.next() {
                Some(CommentedTokenTree::Tree(CommentedTree::Group(group))) => group,
                _ => panic!("expected group"),
            };
            let mut tts = group.token_stream.token_trees().iter();
            assert_eq!(
                tts.next().unwrap().span().as_str(),
                "/* multi-\n             * line-\n             * comment */",
            );
            assert_eq!(tts.next().unwrap().span().as_str(), "bar");
            assert_eq!(tts.next().unwrap().span().as_str(), ":");
            assert_eq!(tts.next().unwrap().span().as_str(), "i32");
            assert_eq!(tts.next().unwrap().span().as_str(), ",");
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Trailing,
                })) if span.as_str() ==  "// trailing comment"
            );
            assert!(tts.next().is_none());
        }
        assert!(tts.next().is_none());
    }

    // A comment on its own line must be classified as `Newlined`, while a
    // comment following other tokens on the same line must be `Trailing`.
    #[test]
    fn lex_comments_check_comment_kind() {
        let input = r#"
        // CommentKind::Newlined
        abi Foo {
            // CommentKind::Newlined
            fn bar(); // CommentKind::Trailing
            // CommentKind::Newlined
        }
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, input.into(), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();

        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "// CommentKind::Newlined"
        );
        assert_eq!(tts.next().unwrap().span().as_str(), "abi");
        assert_eq!(tts.next().unwrap().span().as_str(), "Foo");

        {
            let group = match tts.next() {
                Some(CommentedTokenTree::Tree(CommentedTree::Group(group))) => group,
                _ => panic!("expected group"),
            };
            let mut tts = group.token_stream.token_trees().iter();

            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Newlined,
                })) if span.as_str() ==  "// CommentKind::Newlined"
            );
            assert_eq!(tts.next().unwrap().span().as_str(), "fn");
            assert_eq!(tts.next().unwrap().span().as_str(), "bar");
            assert_eq!(tts.next().unwrap().span().as_str(), "()");
            assert_eq!(tts.next().unwrap().span().as_str(), ";");
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Trailing,
                })) if span.as_str() ==  "// CommentKind::Trailing"
            );
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Newlined,
                })) if span.as_str() ==  "// CommentKind::Newlined"
            );
            assert!(tts.next().is_none());
        }
    }

    // `//` and `////` lex as plain comments; `//!` lexes as an inner doc
    // comment and `///` as an outer doc comment, with `content_span`
    // excluding the comment markers themselves.
    #[test]
    fn lex_doc_comments() {
        let input = r#"
        //none
        ////none
        //!inner
        //! inner
        ///outer
        /// outer
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, input.into(), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "//none"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "////none"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Inner,
                span,
                content_span
            }))) if span.as_str() ==  "//!inner" && content_span.as_str() == "inner"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Inner,
                span,
                content_span
            }))) if span.as_str() ==  "//! inner" && content_span.as_str() == " inner"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Outer,
                span,
                content_span
            }))) if span.as_str() ==  "///outer" && content_span.as_str() == "outer"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Outer,
                span,
                content_span
            }))) if span.as_str() ==  "/// outer" && content_span.as_str() == " outer"
        );
        assert_eq!(tts.next(), None);
    }

    // An escaped quote inside a char literal must parse to the quote
    // character itself rather than terminating the literal early.
    #[test]
    fn lex_char_escaped_quote() {
        let input = r"
        '\''
        ";
        let handler = Handler::default();
        let stream = lex(&handler, input.into(), 0, input.len(), None).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        assert_matches!(
            tts.next(),
            Some(TokenTree::Literal(Literal::Char(LitChar {
                parsed: '\'',
                ..
            })))
        );
        assert_eq!(tts.next(), None);
    }

    use super::is_valid_identifier_or_path as valid;

    // Plain identifiers, including leading underscore and non-ASCII
    // XID characters, are accepted.
    #[test]
    fn accepts_simple_identifiers() {
        assert!(valid("foo"));
        assert!(valid("Foo"));
        assert!(valid("_foo"));
        assert!(valid("foo123"));
        assert!(valid("føø"));
    }

    // Bare/double underscores, double-underscore prefixes, and stray
    // colons are rejected.
    #[test]
    fn rejects_invalid_identifiers() {
        assert!(!valid(""));
        assert!(!valid("_"));
        assert!(!valid("__"));
        assert!(!valid("__invalid"));
        assert!(!valid(":foo"));
        assert!(!valid("foo:bar"));
    }

    // `::`-separated paths, optionally starting with a leading `::`,
    // are accepted when every segment is a valid identifier.
    #[test]
    fn accepts_paths() {
        assert!(valid("foo::bar"));
        assert!(valid("_foo::_bar"));
        assert!(valid("foo_bar::baz123"));
        assert!(valid("::some_module::in_the_same::package"));
    }

    // Single/triple colons, trailing separators, and segments with a
    // double-underscore prefix are rejected.
    #[test]
    fn rejects_malformed_paths() {
        assert!(!valid("foo:bar:baz"));
        assert!(!valid("foo::"));
        assert!(!valid("::"));
        assert!(!valid("foo:::bar"));
        assert!(!valid("foo::__bad"));
    }
}