rsjsonnet_lang/lexer/mod.rs

//! A lexer for the Jsonnet language.
//!
//! # Example
//!
//! ```
//! let source = b"local add_one(x) = x + 1; add_one(2)";
//!
//! let arena = rsjsonnet_lang::arena::Arena::new();
//! let ast_arena = rsjsonnet_lang::arena::Arena::new();
//! let str_interner = rsjsonnet_lang::interner::StrInterner::new();
//! let mut span_mgr = rsjsonnet_lang::span::SpanManager::new();
//! let (span_ctx, _) = span_mgr.insert_source_context(source.len());
//!
//! // Create the lexer
//! let lexer = rsjsonnet_lang::lexer::Lexer::new(
//!     &arena,
//!     &ast_arena,
//!     &str_interner,
//!     &mut span_mgr,
//!     span_ctx,
//!     source,
//! );
//!
//! // Lex the whole input.
//! let tokens = lexer.lex_to_eof(false).unwrap();
//! ```

use crate::arena::Arena;
use crate::interner::StrInterner;
use crate::span::{SpanContextId, SpanId, SpanManager};
use crate::token::{Number, STokenKind, Token, TokenKind};

mod error;

pub use error::LexError;

/// The lexer. See the [module-level documentation](self) for more details.
pub struct Lexer<'a, 'p, 'ast> {
    arena: &'p Arena,
    ast_arena: &'ast Arena,
    str_interner: &'a StrInterner<'p>,
    span_mgr: &'a mut SpanManager,
    span_ctx: SpanContextId,
    input: &'a [u8],
    start_pos: usize,
    end_pos: usize,
}

impl<'a, 'p, 'ast> Lexer<'a, 'p, 'ast> {
    /// Creates a new lexer that operates on `input`.
    pub fn new(
        arena: &'p Arena,
        ast_arena: &'ast Arena,
        str_interner: &'a StrInterner<'p>,
        span_mgr: &'a mut SpanManager,
        span_ctx: SpanContextId,
        input: &'a [u8],
    ) -> Self {
        Self {
            arena,
            ast_arena,
            str_interner,
            span_mgr,
            span_ctx,
            input,
            start_pos: 0,
            end_pos: 0,
        }
    }

    /// Lexes all the remaining input into a token `Vec`.
    ///
    /// The last token will be an [end-of-file token](TokenKind::EndOfFile).
    ///
    /// If `whitespaces_and_comments` is `false`,
    /// [whitespace](TokenKind::Whitespace) and [comment](TokenKind::Comment)
    /// tokens will be omitted from the output.
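    ///
    /// # Example
    ///
    /// A sketch building on the module-level example: with `true`, whitespace
    /// and comment tokens are kept, so the output has at least as many tokens
    /// as with `false`.
    ///
    /// ```
    /// # let source = b"local x = 1; /* a comment */ x";
    /// # let arena = rsjsonnet_lang::arena::Arena::new();
    /// # let ast_arena = rsjsonnet_lang::arena::Arena::new();
    /// # let str_interner = rsjsonnet_lang::interner::StrInterner::new();
    /// # let mut span_mgr = rsjsonnet_lang::span::SpanManager::new();
    /// # let (span_ctx, _) = span_mgr.insert_source_context(source.len());
    /// let with_trivia = rsjsonnet_lang::lexer::Lexer::new(
    ///     &arena, &ast_arena, &str_interner, &mut span_mgr, span_ctx, source,
    /// )
    /// .lex_to_eof(true)
    /// .unwrap();
    /// let without_trivia = rsjsonnet_lang::lexer::Lexer::new(
    ///     &arena, &ast_arena, &str_interner, &mut span_mgr, span_ctx, source,
    /// )
    /// .lex_to_eof(false)
    /// .unwrap();
    /// assert!(with_trivia.len() > without_trivia.len());
    /// ```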
    pub fn lex_to_eof(
        mut self,
        whitespaces_and_comments: bool,
    ) -> Result<Vec<Token<'p, 'ast>>, LexError> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::EndOfFile;
            if whitespaces_and_comments
                || !matches!(token.kind, TokenKind::Whitespace | TokenKind::Comment)
            {
                tokens.push(token);
            }
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    /// Lexes the next token.
    ///
    /// Once the input is exhausted, every call returns an
    /// [end-of-file token](TokenKind::EndOfFile).
    pub fn next_token(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        match self.eat_any_byte() {
            None => Ok(self.commit_token(TokenKind::EndOfFile)),
            Some(b'{') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBrace))),
            Some(b'}') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBrace))),
            Some(b'[') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBracket))),
            Some(b']') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBracket))),
            Some(b',') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Comma))),
            Some(b'.') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Dot))),
            Some(b'(') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftParen))),
            Some(b')') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightParen))),
            Some(b';') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Semicolon))),
            Some(b'/') => {
                if self.eat_byte(b'/') {
                    self.lex_single_line_comment()
                } else if self.eat_byte(b'*') {
                    self.lex_multi_line_comment()
                } else {
                    Ok(self.lex_operator())
                }
            }
            Some(b'|') => {
                if self.eat_slice(b"||") {
                    self.lex_text_block()
                } else {
                    Ok(self.lex_operator())
                }
            }
            Some(
                b'!' | b'$' | b':' | b'~' | b'+' | b'-' | b'&' | b'^' | b'=' | b'<' | b'>' | b'*'
                | b'%',
            ) => Ok(self.lex_operator()),
            Some(b' ' | b'\t' | b'\n' | b'\r') => {
                while self.eat_byte_if(|byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r')) {}
                Ok(self.commit_token(TokenKind::Whitespace))
            }
            Some(b'#') => self.lex_single_line_comment(),
            Some(chr @ b'0'..=b'9') => self.lex_number(chr),
            Some(b'_' | b'a'..=b'z' | b'A'..=b'Z') => Ok(self.lex_ident()),
            Some(b'@') => {
                if self.eat_byte(b'\'') {
                    self.lex_verbatim_string(b'\'')
                } else if self.eat_byte(b'"') {
                    self.lex_verbatim_string(b'"')
                } else {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    return Err(LexError::InvalidChar { span, chr: '@' });
                }
            }
            Some(b'\'') => self.lex_quoted_string(b'\''),
            Some(b'"') => self.lex_quoted_string(b'"'),
            Some(byte0) => match self.eat_cont_any_char(byte0) {
                Ok(chr) => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    Err(LexError::InvalidChar { span, chr })
                }
                Err(_) => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    Err(LexError::InvalidUtf8 {
                        span,
                        seq: self.input[self.start_pos..self.end_pos].to_vec(),
                    })
                }
            },
        }
    }

    fn lex_single_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        while !matches!(self.eat_any_byte(), None | Some(b'\n')) {}
        Ok(self.commit_token(TokenKind::Comment))
    }

    fn lex_multi_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        loop {
            if self.eat_slice(b"*/") {
                break;
            } else if self.eat_any_byte().is_none() {
                let span = self.make_span(self.start_pos, self.end_pos);
                return Err(LexError::UnfinishedMultilineComment { span });
            }
        }
        Ok(self.commit_token(TokenKind::Comment))
    }

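    /// Lexes an operator token. Operator characters are consumed greedily, but
    /// the token is trimmed back so that it does not end with `+`, `-`, `~`,
    /// `!` or `$` and does not run into `|||`, `//` or `/*`. Known operators
    /// map to [`STokenKind`] variants; any other sequence becomes
    /// [`TokenKind::OtherOp`].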
    #[must_use]
    fn lex_operator(&mut self) -> Token<'p, 'ast> {
        let mut sure_end_pos = self.end_pos;
        loop {
            if self.eat_slice(b"|||") || self.eat_slice(b"//") || self.eat_slice(b"/*") {
                // `|||`, `//` and `/*` cannot appear within an operator
                break;
            }

            let Some(next_byte) = self.eat_any_byte() else {
                break;
            };
            // A multi-byte operator cannot end with '+', '-', '~', '!' or '$'
            if matches!(
                next_byte,
                b':' | b'&' | b'|' | b'^' | b'=' | b'<' | b'>' | b'*' | b'/' | b'%'
            ) {
                sure_end_pos = self.end_pos;
            } else if !matches!(next_byte, b'+' | b'-' | b'~' | b'!' | b'$') {
                break;
            }
        }
        self.end_pos = sure_end_pos;
        let op = &self.input[self.start_pos..self.end_pos];
        match op {
            b":" => self.commit_token(TokenKind::Simple(STokenKind::Colon)),
            b"::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColon)),
            b":::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColonColon)),
            b"+:" => self.commit_token(TokenKind::Simple(STokenKind::PlusColon)),
            b"+::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColon)),
            b"+:::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColonColon)),
            b"=" => self.commit_token(TokenKind::Simple(STokenKind::Eq)),
            b"$" => self.commit_token(TokenKind::Simple(STokenKind::Dollar)),
            b"*" => self.commit_token(TokenKind::Simple(STokenKind::Asterisk)),
            b"/" => self.commit_token(TokenKind::Simple(STokenKind::Slash)),
            b"%" => self.commit_token(TokenKind::Simple(STokenKind::Percent)),
            b"+" => self.commit_token(TokenKind::Simple(STokenKind::Plus)),
            b"-" => self.commit_token(TokenKind::Simple(STokenKind::Minus)),
            b"<<" => self.commit_token(TokenKind::Simple(STokenKind::LtLt)),
            b">>" => self.commit_token(TokenKind::Simple(STokenKind::GtGt)),
            b"<" => self.commit_token(TokenKind::Simple(STokenKind::Lt)),
            b"<=" => self.commit_token(TokenKind::Simple(STokenKind::LtEq)),
            b">" => self.commit_token(TokenKind::Simple(STokenKind::Gt)),
            b">=" => self.commit_token(TokenKind::Simple(STokenKind::GtEq)),
            b"==" => self.commit_token(TokenKind::Simple(STokenKind::EqEq)),
            b"!=" => self.commit_token(TokenKind::Simple(STokenKind::ExclamEq)),
            b"&" => self.commit_token(TokenKind::Simple(STokenKind::Amp)),
            b"^" => self.commit_token(TokenKind::Simple(STokenKind::Hat)),
            b"|" => self.commit_token(TokenKind::Simple(STokenKind::Pipe)),
            b"&&" => self.commit_token(TokenKind::Simple(STokenKind::AmpAmp)),
            b"||" => self.commit_token(TokenKind::Simple(STokenKind::PipePipe)),
            b"!" => self.commit_token(TokenKind::Simple(STokenKind::Exclam)),
            b"~" => self.commit_token(TokenKind::Simple(STokenKind::Tilde)),
            _ => self.commit_token(TokenKind::OtherOp(
                self.ast_arena.alloc_str(std::str::from_utf8(op).unwrap()),
            )),
        }
    }

    #[must_use]
    fn lex_ident(&mut self) -> Token<'p, 'ast> {
        while self.eat_byte_if(|b| b.is_ascii_alphanumeric() || b == b'_') {}
        let ident_bytes = &self.input[self.start_pos..self.end_pos];
        match ident_bytes {
            b"assert" => self.commit_token(TokenKind::Simple(STokenKind::Assert)),
            b"else" => self.commit_token(TokenKind::Simple(STokenKind::Else)),
            b"error" => self.commit_token(TokenKind::Simple(STokenKind::Error)),
            b"false" => self.commit_token(TokenKind::Simple(STokenKind::False)),
            b"for" => self.commit_token(TokenKind::Simple(STokenKind::For)),
            b"function" => self.commit_token(TokenKind::Simple(STokenKind::Function)),
            b"if" => self.commit_token(TokenKind::Simple(STokenKind::If)),
            b"import" => self.commit_token(TokenKind::Simple(STokenKind::Import)),
            b"importstr" => self.commit_token(TokenKind::Simple(STokenKind::Importstr)),
            b"importbin" => self.commit_token(TokenKind::Simple(STokenKind::Importbin)),
            b"in" => self.commit_token(TokenKind::Simple(STokenKind::In)),
            b"local" => self.commit_token(TokenKind::Simple(STokenKind::Local)),
            b"null" => self.commit_token(TokenKind::Simple(STokenKind::Null)),
            b"tailstrict" => self.commit_token(TokenKind::Simple(STokenKind::Tailstrict)),
            b"then" => self.commit_token(TokenKind::Simple(STokenKind::Then)),
            b"self" => self.commit_token(TokenKind::Simple(STokenKind::Self_)),
            b"super" => self.commit_token(TokenKind::Simple(STokenKind::Super)),
            b"true" => self.commit_token(TokenKind::Simple(STokenKind::True)),
            _ => self.commit_token(TokenKind::Ident(
                self.str_interner
                    .intern(self.arena, std::str::from_utf8(ident_bytes).unwrap()),
            )),
        }
    }

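    /// Lexes a number literal into a [`Number`]: the significant digits are
    /// collected into a string, and the decimal point and any explicit
    /// exponent are folded into a single decimal exponent. For example,
    /// `1.25e3` yields `digits = "125"` and `exp = 1`, i.e. 125 × 10^1.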
    fn lex_number(&mut self, chr0: u8) -> Result<Token<'p, 'ast>, LexError> {
        let leading_zero = chr0 == b'0';
        let mut digits = String::new();
        digits.push(char::from(chr0));

        while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
            if digits.len() == 1 && leading_zero {
                let span = self.make_span(self.end_pos - 2, self.end_pos - 1);
                return Err(LexError::LeadingZeroInNumber { span });
            }
            digits.push(char::from(chr));
        }

        let mut implicit_exp = 0i64;
        if self.eat_byte(b'.') {
            while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
                digits.push(char::from(chr));
                implicit_exp -= 1;
            }
            if implicit_exp == 0 {
                let span = self.make_span(self.end_pos - 1, self.end_pos);
                return Err(LexError::MissingFracDigits { span });
            }
        }

        let eff_exp;
        if self.eat_byte_if(|b| matches!(b, b'e' | b'E')) {
            let mut explicit_exp_sign = false;
            let mut explicit_exp = Some(0u64);

            let exp_start = self.end_pos - 1;

            if self.eat_byte(b'+') {
                // explicit +
            } else if self.eat_byte(b'-') {
                explicit_exp_sign = true;
            }

            let mut num_exp_digits = 0;
            while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
                explicit_exp = explicit_exp
                    .and_then(|e| e.checked_mul(10))
                    .and_then(|e| e.checked_add(u64::from(chr - b'0')));
                num_exp_digits += 1;
            }

            if num_exp_digits == 0 {
                let span = self.make_span(exp_start, self.end_pos);
                return Err(LexError::MissingExpDigits { span });
            }

            eff_exp = explicit_exp
                .and_then(|e| i64::try_from(e).ok())
                .and_then(|e| {
                    if explicit_exp_sign {
                        implicit_exp.checked_sub(e)
                    } else {
                        implicit_exp.checked_add(e)
                    }
                })
                .ok_or_else(|| {
                    let span = self.make_span(exp_start, self.end_pos);
                    LexError::ExpOverflow { span }
                })?;
        } else {
            eff_exp = implicit_exp;
        }

        Ok(self.commit_token(TokenKind::Number(Number {
            digits: self.ast_arena.alloc_str(&digits),
            exp: eff_exp,
        })))
    }

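    /// Lexes a quoted string whose opening `delim` (`'` or `"`) has already
    /// been consumed. The escapes `\"`, `\'`, `\\`, `\/`, `\b`, `\f`, `\n`,
    /// `\r`, `\t` and `\uXXXX` (including UTF-16 surrogate pairs) are decoded;
    /// invalid UTF-8 in the input is replaced with U+FFFD.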
    fn lex_quoted_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        loop {
            if self.eat_byte(delim) {
                break;
            } else if self.eat_byte(b'\\') {
                let escape_start = self.end_pos - 1;
                if self.eat_byte(b'"') {
                    string.push('"');
                } else if self.eat_byte(b'\'') {
                    string.push('\'');
                } else if self.eat_byte(b'\\') {
                    string.push('\\');
                } else if self.eat_byte(b'/') {
                    string.push('/');
                } else if self.eat_byte(b'b') {
                    string.push('\u{8}');
                } else if self.eat_byte(b'f') {
                    string.push('\u{C}');
                } else if self.eat_byte(b'n') {
                    string.push('\n');
                } else if self.eat_byte(b'r') {
                    string.push('\r');
                } else if self.eat_byte(b't') {
                    string.push('\t');
                } else if self.eat_byte(b'u') {
                    let hex_from_digit = |b: u8| match b {
                        b'0'..=b'9' => Some(b - b'0'),
                        b'a'..=b'f' => Some(b - b'a' + 10),
                        b'A'..=b'F' => Some(b - b'A' + 10),
                        _ => None,
                    };

                    let eat_codeunit = |this: &mut Self| -> Option<u16> {
                        let d0 = this.eat_map_byte(hex_from_digit)?;
                        let d1 = this.eat_map_byte(hex_from_digit)?;
                        let d2 = this.eat_map_byte(hex_from_digit)?;
                        let d3 = this.eat_map_byte(hex_from_digit)?;
                        Some(
                            (u16::from(d0) << 12)
                                | (u16::from(d1) << 8)
                                | (u16::from(d2) << 4)
                                | u16::from(d3),
                        )
                    };

                    let Some(cu1) = eat_codeunit(self) else {
                        let span = self.make_span(escape_start, self.end_pos);
                        return Err(LexError::IncompleteUnicodeEscape { span });
                    };

                    if matches!(cu1, 0xD800..=0xDFFF) && self.eat_slice(b"\\u") {
                        let Some(cu2) = eat_codeunit(self) else {
                            let span = self.make_span(escape_start + 6, self.end_pos);
                            return Err(LexError::IncompleteUnicodeEscape { span });
                        };
                        if let Ok(chr) = char::decode_utf16([cu1, cu2]).next().unwrap() {
                            string.push(chr);
                        } else {
                            let span = self.make_span(escape_start, self.end_pos);
                            return Err(LexError::InvalidUtf16EscapeSequence {
                                span,
                                cu1,
                                cu2: Some(cu2),
                            });
                        }
                    } else if let Some(chr) = char::from_u32(cu1.into()) {
                        string.push(chr);
                    } else {
                        let span = self.make_span(escape_start, self.end_pos);
                        return Err(LexError::InvalidUtf16EscapeSequence {
                            span,
                            cu1,
                            cu2: None,
                        });
                    }
                } else {
                    match self.eat_any_char() {
                        None => {
                            let span = self.make_span(self.start_pos, self.end_pos);
                            return Err(LexError::UnfinishedString { span });
                        }
                        Some(chr) => {
                            let span = self.make_span(escape_start, self.end_pos);
                            return Err(LexError::InvalidEscapeInString {
                                span,
                                chr: chr.unwrap_or('\u{FFFD}'),
                            });
                        }
                    }
                }
            } else {
                match self.eat_any_char() {
                    None => {
                        let span = self.make_span(self.start_pos, self.end_pos);
                        return Err(LexError::UnfinishedString { span });
                    }
                    Some(Ok(chr)) => string.push(chr),
                    Some(Err(_)) => string.push('\u{FFFD}'),
                }
            }
        }
        Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
    }

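    /// Lexes a verbatim string (`@'...'` or `@"..."`) whose opening delimiter
    /// has already been consumed. There are no escape sequences; a doubled
    /// delimiter stands for a single literal delimiter character.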
    fn lex_verbatim_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        loop {
            if self.eat_byte(delim) {
                if self.eat_byte(delim) {
                    string.push(char::from(delim));
                } else {
                    break;
                }
            } else {
                match self.eat_any_char() {
                    None => {
                        let span = self.make_span(self.start_pos, self.end_pos);
                        return Err(LexError::UnfinishedString { span });
                    }
                    Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
                }
            }
        }
        Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
    }

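    /// Lexes a text block, assuming the opening `|||` has already been
    /// consumed. The leading whitespace of the first content line becomes the
    /// required prefix that is stripped from each following line. For example,
    ///
    /// ```text
    /// |||
    ///   foo
    ///   bar
    /// |||
    /// ```
    ///
    /// produces a [`TokenKind::TextBlock`] whose contents are `"foo\nbar\n"`.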
    #[inline]
    fn lex_text_block(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        let mut prefix;
        while self.eat_byte_if(|b| matches!(b, b' ' | b'\t' | b'\r')) {}
        if self.eat_byte(b'\n') {
            loop {
                let prefix_start = self.end_pos;
                while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                let prefix_end = self.end_pos;
                prefix = &self.input[prefix_start..prefix_end];
                if self.eat_byte(b'\r') {
                    string.push('\r');
                }
                if prefix.is_empty() {
                    if self.eat_byte(b'\n') {
                        // handle fully empty lines at the beginning:
                        // |||
                        //
                        // →→...
                        // |||
                        string.push('\n');
                        continue;
                    } else {
                        let span = self.make_span(prefix_start, prefix_end);
                        return Err(LexError::MissingWhitespaceTextBlockStart { span });
                    }
                }
                break;
            }
        } else {
            let span = self.make_span(self.start_pos, self.end_pos);
            return Err(LexError::MissingLineBreakAfterTextBlockStart { span });
        }

        'outer: loop {
            while self.eat_byte(b'\n') {
                string.push('\n');
                loop {
                    // Handle fully empty lines
                    if self.eat_byte(b'\n') {
                        string.push('\n');
                    } else if self.eat_slice(b"\r\n") {
                        string.push_str("\r\n");
                    } else {
                        break;
                    }
                }
                if !self.eat_slice(prefix) {
                    let line_start = self.end_pos;
                    while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                    if self.eat_slice(b"|||") {
                        break 'outer;
                    } else {
                        let span = self.make_span(line_start, self.end_pos);
                        return Err(LexError::InvalidTextBlockTermination { span });
                    }
                }
            }

            match self.eat_any_char() {
                None => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    return Err(LexError::UnfinishedString { span });
                }
                Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
            }
        }

        Ok(self.commit_token(TokenKind::TextBlock(self.ast_arena.alloc_str(&string))))
    }

    #[must_use]
    #[inline]
    fn eat_byte(&mut self, byte: u8) -> bool {
        if matches!(self.input.get(self.end_pos), Some(&b) if b == byte) {
            self.end_pos += 1;
            true
        } else {
            false
        }
    }

    #[must_use]
    #[inline]
    fn eat_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> bool {
        if matches!(self.input.get(self.end_pos), Some(&b) if pred(b)) {
            self.end_pos += 1;
            true
        } else {
            false
        }
    }

    #[must_use]
    #[inline]
    fn eat_get_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> Option<u8> {
        if let Some(&b) = self.input.get(self.end_pos).filter(|&&b| pred(b)) {
            self.end_pos += 1;
            Some(b)
        } else {
            None
        }
    }

    #[must_use]
    #[inline]
    fn eat_map_byte<R>(&mut self, f: impl FnOnce(u8) -> Option<R>) -> Option<R> {
        if let Some(r) = self.input.get(self.end_pos).and_then(|&b| f(b)) {
            self.end_pos += 1;
            Some(r)
        } else {
            None
        }
    }

    #[must_use]
    #[inline]
    fn eat_slice(&mut self, s: &[u8]) -> bool {
        if self
            .input
            .get(self.end_pos..)
            .is_some_and(|rem| rem.starts_with(s))
        {
            self.end_pos += s.len();
            true
        } else {
            false
        }
    }

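    /// Decodes the UTF-8 character that starts with `byte0`, whose remaining
    /// bytes (if any) begin at `self.end_pos`. Returns the new end position
    /// together with the decoded character, or `None` if the bytes do not form
    /// a valid UTF-8 sequence.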
    #[must_use]
    #[inline]
    fn decode_cont_char(&self, byte0: u8) -> (usize, Option<char>) {
        // Based on `<Utf8Chunks as Iterator>::next` from Rust libcore
        const TAG_CONT_U8: u8 = 128;
        fn safe_get(xs: &[u8], i: usize) -> u8 {
            *xs.get(i).unwrap_or(&0)
        }

        let mut i = self.end_pos;
        match byte0 {
            0..=0x7F => (i, Some(char::from(byte0))),
            0b11000000..=0b11011111 => {
                let byte1 = safe_get(self.input, i);
                if byte1 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b11111) << 6) | u32::from(byte1 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            0b11100000..=0b11101111 => {
                let byte1 = safe_get(self.input, i);
                match (byte0, byte1) {
                    (0xE0, 0xA0..=0xBF) => (),
                    (0xE1..=0xEC, 0x80..=0xBF) => (),
                    (0xED, 0x80..=0x9F) => (),
                    (0xEE..=0xEF, 0x80..=0xBF) => (),
                    _ => return (i, None),
                }
                i += 1;
                let byte2 = safe_get(self.input, i);
                if byte2 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b1111) << 12)
                    | (u32::from(byte1 & 0b111111) << 6)
                    | u32::from(byte2 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            0b11110000..=0b11110111 => {
                let byte1 = safe_get(self.input, i);
                match (byte0, byte1) {
                    (0xF0, 0x90..=0xBF) => (),
                    (0xF1..=0xF3, 0x80..=0xBF) => (),
                    (0xF4, 0x80..=0x8F) => (),
                    _ => return (i, None),
                }
                i += 1;
                let byte2 = safe_get(self.input, i);
                if byte2 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;
                let byte3 = safe_get(self.input, i);
                if byte3 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b111) << 18)
                    | (u32::from(byte1 & 0b111111) << 12)
                    | (u32::from(byte2 & 0b111111) << 6)
                    | u32::from(byte3 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            _ => (i, None),
        }
    }

    #[must_use]
    #[inline]
    fn eat_any_byte(&mut self) -> Option<u8> {
        if let Some(&byte) = self.input.get(self.end_pos) {
            self.end_pos += 1;
            Some(byte)
        } else {
            None
        }
    }

    #[inline]
    fn eat_cont_any_char(&mut self, byte0: u8) -> Result<char, usize> {
        let (end_pos, chr) = self.decode_cont_char(byte0);
        if let Some(chr) = chr {
            self.end_pos = end_pos;
            Ok(chr)
        } else {
            let error_len = end_pos - self.end_pos + 1;
            self.end_pos = end_pos;
            Err(error_len)
        }
    }

    #[must_use]
    #[inline]
    fn eat_any_char(&mut self) -> Option<Result<char, usize>> {
        self.eat_any_byte()
            .map(|byte0| self.eat_cont_any_char(byte0))
    }

    #[must_use]
    #[inline]
    fn commit_token(&mut self, kind: TokenKind<'p, 'ast>) -> Token<'p, 'ast> {
        let start_pos = self.start_pos;
        self.start_pos = self.end_pos;
        Token {
            span: self.make_span(start_pos, self.end_pos),
            kind,
        }
    }

    #[must_use]
    #[inline]
    fn make_span(&mut self, start_pos: usize, end_pos: usize) -> SpanId {
        self.span_mgr.intern_span(self.span_ctx, start_pos, end_pos)
    }
}

#[cfg(test)]
mod tests {
    use super::Lexer;
    use crate::arena::Arena;
    use crate::interner::StrInterner;
    use crate::span::SpanManager;

    #[test]
    fn test_decode_valid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();
        let (span_ctx, _) = span_mgr.insert_source_context(4);

        for chr in '\u{0}'..=char::MAX {
            let mut buf = [0; 4];
            let encoded = chr.encode_utf8(&mut buf);

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                encoded.as_bytes(),
            );
            assert_eq!(lexer.eat_any_char(), Some(Ok(chr)));
        }
    }

    #[test]
    fn test_decode_invalid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();

        let tests: &[&[u8]] = &[
            // Taken from `from_utf8_error` test in Rust's library/alloc/tests/str.rs
            b"\xFF ",
            b"\x80 ",
            b"\xC1 ",
            b"\xC1",
            b"\xC2",
            b"\xC2 ",
            b"\xC2\xC0",
            b"\xE0",
            b"\xE0\x9F",
            b"\xE0\xA0",
            b"\xE0\xA0\xC0",
            b"\xE0\xA0 ",
            b"\xED\xA0\x80 ",
            b"\xF1",
            b"\xF1\x80",
            b"\xF1\x80\x80",
            b"\xF1 ",
            b"\xF1\x80 ",
            b"\xF1\x80\x80 ",
        ];

        let (span_ctx, _) =
            span_mgr.insert_source_context(tests.iter().fold(0, |l, s| l.max(s.len())));

        for &source in tests.iter() {
            let utf8_error = std::str::from_utf8(source).unwrap_err();
            assert_eq!(utf8_error.valid_up_to(), 0);
            let error_len = utf8_error.error_len().unwrap_or(source.len());

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                source,
            );
            assert_eq!(lexer.eat_any_char(), Some(Err(error_len)));
        }
    }
}