1use crate::arena::Arena;
29use crate::interner::StrInterner;
30use crate::span::{SpanContextId, SpanId, SpanManager};
31use crate::token::{Number, STokenKind, Token, TokenKind};
32
33mod error;
34
35pub use error::LexError;
36
/// Lexer that turns a byte buffer into [`Token`]s.
///
/// Tokens are produced one at a time with [`Lexer::next_token`] or all at
/// once with [`Lexer::lex_to_eof`].
pub struct Lexer<'a, 'p, 'ast> {
    /// Arena handed to the string interner when identifiers are interned.
    arena: &'p Arena,
    /// Arena that receives the text of operators, numbers, strings and text
    /// blocks stored inside tokens.
    ast_arena: &'ast Arena,
    /// Interner used for identifier names.
    str_interner: &'a StrInterner<'p>,
    /// Span manager in which every span created by the lexer is interned.
    span_mgr: &'a mut SpanManager,
    /// Span context passed along with every interned span.
    span_ctx: SpanContextId,
    /// Raw bytes being lexed.
    input: &'a [u8],
    /// Offset into `input` where the token currently being lexed starts.
    start_pos: usize,
    /// Offset into `input` of the next byte to be consumed.
    end_pos: usize,
}
48
49impl<'a, 'p, 'ast> Lexer<'a, 'p, 'ast> {
50 pub fn new(
52 arena: &'p Arena,
53 ast_arena: &'ast Arena,
54 str_interner: &'a StrInterner<'p>,
55 span_mgr: &'a mut SpanManager,
56 span_ctx: SpanContextId,
57 input: &'a [u8],
58 ) -> Self {
59 Self {
60 arena,
61 ast_arena,
62 str_interner,
63 span_mgr,
64 span_ctx,
65 input,
66 start_pos: 0,
67 end_pos: 0,
68 }
69 }
70
71 pub fn lex_to_eof(
79 mut self,
80 whitespaces_and_comments: bool,
81 ) -> Result<Vec<Token<'p, 'ast>>, LexError> {
82 let mut tokens = Vec::new();
83 loop {
84 let token = self.next_token()?;
85 let is_eof = token.kind == TokenKind::EndOfFile;
86 if whitespaces_and_comments
87 || !matches!(token.kind, TokenKind::Whitespace | TokenKind::Comment)
88 {
89 tokens.push(token);
90 }
91 if is_eof {
92 break;
93 }
94 }
95 Ok(tokens)
96 }
97
98 pub fn next_token(&mut self) -> Result<Token<'p, 'ast>, LexError> {
100 match self.eat_any_byte() {
101 None => Ok(self.commit_token(TokenKind::EndOfFile)),
102 Some(b'{') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBrace))),
103 Some(b'}') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBrace))),
104 Some(b'[') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBracket))),
105 Some(b']') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBracket))),
106 Some(b',') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Comma))),
107 Some(b'.') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Dot))),
108 Some(b'(') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftParen))),
109 Some(b')') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightParen))),
110 Some(b';') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Semicolon))),
111 Some(b'/') => {
112 if self.eat_byte(b'/') {
113 self.lex_single_line_comment()
114 } else if self.eat_byte(b'*') {
115 self.lex_multi_line_comment()
116 } else {
117 Ok(self.lex_operator())
118 }
119 }
120 Some(b'|') => {
121 if self.eat_slice(b"||") {
122 self.lex_text_block()
123 } else {
124 Ok(self.lex_operator())
125 }
126 }
127 Some(
128 b'!' | b'$' | b':' | b'~' | b'+' | b'-' | b'&' | b'^' | b'=' | b'<' | b'>' | b'*'
129 | b'%',
130 ) => Ok(self.lex_operator()),
131 Some(b' ' | b'\t' | b'\n' | b'\r') => {
132 while self.eat_byte_if(|byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r')) {}
133 Ok(self.commit_token(TokenKind::Whitespace))
134 }
135 Some(b'#') => self.lex_single_line_comment(),
136 Some(chr @ b'0'..=b'9') => self.lex_number(chr),
137 Some(b'_' | b'a'..=b'z' | b'A'..=b'Z') => Ok(self.lex_ident()),
138 Some(b'@') => {
139 if self.eat_byte(b'\'') {
140 self.lex_verbatim_string(b'\'')
141 } else if self.eat_byte(b'"') {
142 self.lex_verbatim_string(b'"')
143 } else {
144 let span = self.make_span(self.start_pos, self.end_pos);
145 return Err(LexError::InvalidChar { span, chr: '@' });
146 }
147 }
148 Some(b'\'') => self.lex_quoted_string(b'\''),
149 Some(b'"') => self.lex_quoted_string(b'"'),
150 Some(byte0) => match self.eat_cont_any_char(byte0) {
151 Ok(chr) => {
152 let span = self.make_span(self.start_pos, self.end_pos);
153 Err(LexError::InvalidChar { span, chr })
154 }
155 Err(_) => {
156 let span = self.make_span(self.start_pos, self.end_pos);
157 Err(LexError::InvalidUtf8 {
158 span,
159 seq: self.input[self.start_pos..self.end_pos].to_vec(),
160 })
161 }
162 },
163 }
164 }
165
166 fn lex_single_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
167 while !matches!(self.eat_any_byte(), None | Some(b'\n')) {}
168 Ok(self.commit_token(TokenKind::Comment))
169 }
170
171 fn lex_multi_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
172 loop {
173 if self.eat_slice(b"*/") {
174 break;
175 } else if self.eat_any_byte().is_none() {
176 let span = self.make_span(self.start_pos, self.end_pos);
177 return Err(LexError::UnfinishedMultilineComment { span });
178 }
179 }
180 Ok(self.commit_token(TokenKind::Comment))
181 }
182
183 #[must_use]
184 fn lex_operator(&mut self) -> Token<'p, 'ast> {
185 let mut sure_end_pos = self.end_pos;
186 loop {
187 if self.eat_slice(b"|||") || self.eat_slice(b"//") || self.eat_slice(b"/*") {
188 break;
190 }
191
192 let Some(next_byte) = self.eat_any_byte() else {
193 break;
194 };
195 if matches!(
197 next_byte,
198 b':' | b'&' | b'|' | b'^' | b'=' | b'<' | b'>' | b'*' | b'/' | b'%'
199 ) {
200 sure_end_pos = self.end_pos;
201 } else if !matches!(next_byte, b'+' | b'-' | b'~' | b'!' | b'$') {
202 break;
203 }
204 }
205 self.end_pos = sure_end_pos;
206 let op = &self.input[self.start_pos..self.end_pos];
207 match op {
208 b":" => self.commit_token(TokenKind::Simple(STokenKind::Colon)),
209 b"::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColon)),
210 b":::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColonColon)),
211 b"+:" => self.commit_token(TokenKind::Simple(STokenKind::PlusColon)),
212 b"+::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColon)),
213 b"+:::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColonColon)),
214 b"=" => self.commit_token(TokenKind::Simple(STokenKind::Eq)),
215 b"$" => self.commit_token(TokenKind::Simple(STokenKind::Dollar)),
216 b"*" => self.commit_token(TokenKind::Simple(STokenKind::Asterisk)),
217 b"/" => self.commit_token(TokenKind::Simple(STokenKind::Slash)),
218 b"%" => self.commit_token(TokenKind::Simple(STokenKind::Percent)),
219 b"+" => self.commit_token(TokenKind::Simple(STokenKind::Plus)),
220 b"-" => self.commit_token(TokenKind::Simple(STokenKind::Minus)),
221 b"<<" => self.commit_token(TokenKind::Simple(STokenKind::LtLt)),
222 b">>" => self.commit_token(TokenKind::Simple(STokenKind::GtGt)),
223 b"<" => self.commit_token(TokenKind::Simple(STokenKind::Lt)),
224 b"<=" => self.commit_token(TokenKind::Simple(STokenKind::LtEq)),
225 b">" => self.commit_token(TokenKind::Simple(STokenKind::Gt)),
226 b">=" => self.commit_token(TokenKind::Simple(STokenKind::GtEq)),
227 b"==" => self.commit_token(TokenKind::Simple(STokenKind::EqEq)),
228 b"!=" => self.commit_token(TokenKind::Simple(STokenKind::ExclamEq)),
229 b"&" => self.commit_token(TokenKind::Simple(STokenKind::Amp)),
230 b"^" => self.commit_token(TokenKind::Simple(STokenKind::Hat)),
231 b"|" => self.commit_token(TokenKind::Simple(STokenKind::Pipe)),
232 b"&&" => self.commit_token(TokenKind::Simple(STokenKind::AmpAmp)),
233 b"||" => self.commit_token(TokenKind::Simple(STokenKind::PipePipe)),
234 b"!" => self.commit_token(TokenKind::Simple(STokenKind::Exclam)),
235 b"~" => self.commit_token(TokenKind::Simple(STokenKind::Tilde)),
236 _ => self.commit_token(TokenKind::OtherOp(
237 self.ast_arena.alloc_str(std::str::from_utf8(op).unwrap()),
238 )),
239 }
240 }
241
242 #[must_use]
243 fn lex_ident(&mut self) -> Token<'p, 'ast> {
244 while self.eat_byte_if(|b| b.is_ascii_alphanumeric() || b == b'_') {}
245 let ident_bytes = &self.input[self.start_pos..self.end_pos];
246 match ident_bytes {
247 b"assert" => self.commit_token(TokenKind::Simple(STokenKind::Assert)),
248 b"else" => self.commit_token(TokenKind::Simple(STokenKind::Else)),
249 b"error" => self.commit_token(TokenKind::Simple(STokenKind::Error)),
250 b"false" => self.commit_token(TokenKind::Simple(STokenKind::False)),
251 b"for" => self.commit_token(TokenKind::Simple(STokenKind::For)),
252 b"function" => self.commit_token(TokenKind::Simple(STokenKind::Function)),
253 b"if" => self.commit_token(TokenKind::Simple(STokenKind::If)),
254 b"import" => self.commit_token(TokenKind::Simple(STokenKind::Import)),
255 b"importstr" => self.commit_token(TokenKind::Simple(STokenKind::Importstr)),
256 b"importbin" => self.commit_token(TokenKind::Simple(STokenKind::Importbin)),
257 b"in" => self.commit_token(TokenKind::Simple(STokenKind::In)),
258 b"local" => self.commit_token(TokenKind::Simple(STokenKind::Local)),
259 b"null" => self.commit_token(TokenKind::Simple(STokenKind::Null)),
260 b"tailstrict" => self.commit_token(TokenKind::Simple(STokenKind::Tailstrict)),
261 b"then" => self.commit_token(TokenKind::Simple(STokenKind::Then)),
262 b"self" => self.commit_token(TokenKind::Simple(STokenKind::Self_)),
263 b"super" => self.commit_token(TokenKind::Simple(STokenKind::Super)),
264 b"true" => self.commit_token(TokenKind::Simple(STokenKind::True)),
265 _ => self.commit_token(TokenKind::Ident(
266 self.str_interner
267 .intern(self.arena, std::str::from_utf8(ident_bytes).unwrap()),
268 )),
269 }
270 }
271
272 fn lex_number(&mut self, chr0: u8) -> Result<Token<'p, 'ast>, LexError> {
273 let leading_zero = chr0 == b'0';
274 let mut digits = String::new();
275 digits.push(char::from(chr0));
276
277 while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
278 if digits.len() == 1 && leading_zero {
279 let span = self.make_span(self.end_pos - 2, self.end_pos - 1);
280 return Err(LexError::LeadingZeroInNumber { span });
281 }
282 digits.push(char::from(chr));
283 }
284
285 let mut implicit_exp = 0i64;
286 if self.eat_byte(b'.') {
287 while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
288 digits.push(char::from(chr));
289 implicit_exp -= 1;
290 }
291 if implicit_exp == 0 {
292 let span = self.make_span(self.end_pos - 1, self.end_pos);
293 return Err(LexError::MissingFracDigits { span });
294 }
295 }
296
297 let eff_exp;
298 if self.eat_byte_if(|b| matches!(b, b'e' | b'E')) {
299 let mut explicit_exp_sign = false;
300 let mut explicit_exp = Some(0u64);
301
302 let exp_start = self.end_pos - 1;
303
304 if self.eat_byte(b'+') {
305 } else if self.eat_byte(b'-') {
307 explicit_exp_sign = true;
308 }
309
310 let mut num_exp_digits = 0;
311 while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
312 explicit_exp = explicit_exp
313 .and_then(|e| e.checked_mul(10))
314 .and_then(|e| e.checked_add(u64::from(chr - b'0')));
315 num_exp_digits += 1;
316 }
317
318 if num_exp_digits == 0 {
319 let span = self.make_span(exp_start, self.end_pos);
320 return Err(LexError::MissingExpDigits { span });
321 }
322
323 eff_exp = explicit_exp
324 .and_then(|e| i64::try_from(e).ok())
325 .and_then(|e| {
326 if explicit_exp_sign {
327 implicit_exp.checked_sub(e)
328 } else {
329 implicit_exp.checked_add(e)
330 }
331 })
332 .ok_or_else(|| {
333 let span = self.make_span(exp_start, self.end_pos);
334 LexError::ExpOverflow { span }
335 })?;
336 } else {
337 eff_exp = implicit_exp;
338 }
339
340 Ok(self.commit_token(TokenKind::Number(Number {
341 digits: self.ast_arena.alloc_str(&digits),
342 exp: eff_exp,
343 })))
344 }
345
346 fn lex_quoted_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
347 let mut string = String::new();
348 loop {
349 if self.eat_byte(delim) {
350 break;
351 } else if self.eat_byte(b'\\') {
352 let escape_start = self.end_pos - 1;
353 if self.eat_byte(b'"') {
354 string.push('"');
355 } else if self.eat_byte(b'\'') {
356 string.push('\'');
357 } else if self.eat_byte(b'\\') {
358 string.push('\\');
359 } else if self.eat_byte(b'/') {
360 string.push('/');
361 } else if self.eat_byte(b'b') {
362 string.push('\u{8}');
363 } else if self.eat_byte(b'f') {
364 string.push('\u{C}');
365 } else if self.eat_byte(b'n') {
366 string.push('\n');
367 } else if self.eat_byte(b'r') {
368 string.push('\r');
369 } else if self.eat_byte(b't') {
370 string.push('\t');
371 } else if self.eat_byte(b'u') {
372 let hex_from_digit = |b: u8| match b {
373 b'0'..=b'9' => Some(b - b'0'),
374 b'a'..=b'f' => Some(b - b'a' + 10),
375 b'A'..=b'F' => Some(b - b'A' + 10),
376 _ => None,
377 };
378
379 let eat_codeunit = |this: &mut Self| -> Option<u16> {
380 let d0 = this.eat_map_byte(hex_from_digit)?;
381 let d1 = this.eat_map_byte(hex_from_digit)?;
382 let d2 = this.eat_map_byte(hex_from_digit)?;
383 let d3 = this.eat_map_byte(hex_from_digit)?;
384 Some(
385 (u16::from(d0) << 12)
386 | (u16::from(d1) << 8)
387 | (u16::from(d2) << 4)
388 | u16::from(d3),
389 )
390 };
391
392 let Some(cu1) = eat_codeunit(self) else {
393 let span = self.make_span(escape_start, self.end_pos);
394 return Err(LexError::IncompleteUnicodeEscape { span });
395 };
396
397 if matches!(cu1, 0xD800..=0xDFFF) && self.eat_slice(b"\\u") {
398 let Some(cu2) = eat_codeunit(self) else {
399 let span = self.make_span(escape_start + 6, self.end_pos);
400 return Err(LexError::IncompleteUnicodeEscape { span });
401 };
402 if let Ok(chr) = char::decode_utf16([cu1, cu2]).next().unwrap() {
403 string.push(chr);
404 } else {
405 let span = self.make_span(escape_start, self.end_pos);
406 return Err(LexError::InvalidUtf16EscapeSequence {
407 span,
408 cu1,
409 cu2: Some(cu2),
410 });
411 }
412 } else if let Some(chr) = char::from_u32(cu1.into()) {
413 string.push(chr);
414 } else {
415 let span = self.make_span(escape_start, self.end_pos);
416 return Err(LexError::InvalidUtf16EscapeSequence {
417 span,
418 cu1,
419 cu2: None,
420 });
421 }
422 } else {
423 match self.eat_any_char() {
424 None => {
425 let span = self.make_span(self.start_pos, self.end_pos);
426 return Err(LexError::UnfinishedString { span });
427 }
428 Some(chr) => {
429 let span = self.make_span(escape_start, self.end_pos);
430 return Err(LexError::InvalidEscapeInString {
431 span,
432 chr: chr.unwrap_or('\u{FFFD}'),
433 });
434 }
435 }
436 }
437 } else {
438 match self.eat_any_char() {
439 None => {
440 let span = self.make_span(self.start_pos, self.end_pos);
441 return Err(LexError::UnfinishedString { span });
442 }
443 Some(Ok(chr)) => string.push(chr),
444 Some(Err(_)) => string.push('\u{FFFD}'),
445 }
446 }
447 }
448 Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
449 }
450
451 fn lex_verbatim_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
452 let mut string = String::new();
453 loop {
454 if self.eat_byte(delim) {
455 if self.eat_byte(delim) {
456 string.push(char::from(delim));
457 } else {
458 break;
459 }
460 } else {
461 match self.eat_any_char() {
462 None => {
463 let span = self.make_span(self.start_pos, self.end_pos);
464 return Err(LexError::UnfinishedString { span });
465 }
466 Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
467 }
468 }
469 }
470 Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
471 }
472
    /// Lexes a text block whose opening `|||` has already been consumed.
    ///
    /// The opener must be followed (after optional spaces, tabs and carriage
    /// returns) by a line break. The leading spaces/tabs of the first
    /// non-empty line become the prefix that every following content line
    /// must start with; the prefix is removed from the resulting text. A
    /// line that does not start with the prefix must consist of optional
    /// spaces/tabs followed by the closing `|||`.
    ///
    /// # Errors
    ///
    /// Returns an error when the line break after the opener is missing,
    /// when the first content line has no leading whitespace, when a line
    /// without the prefix is not the terminator, or when the input ends
    /// before the block is closed.
    #[inline]
    fn lex_text_block(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        let mut prefix;
        // Skip trailing blanks on the opening line.
        while self.eat_byte_if(|b| matches!(b, b' ' | b'\t' | b'\r')) {}
        if self.eat_byte(b'\n') {
            // Find the indentation prefix, copying leading blank lines as-is.
            loop {
                let prefix_start = self.end_pos;
                while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                let prefix_end = self.end_pos;
                prefix = &self.input[prefix_start..prefix_end];
                if self.eat_byte(b'\r') {
                    string.push('\r');
                }
                if prefix.is_empty() {
                    // A line without indentation is only accepted if it is empty.
                    if self.eat_byte(b'\n') {
                        string.push('\n');
                        continue;
                    } else {
                        let span = self.make_span(prefix_start, prefix_end);
                        return Err(LexError::MissingWhitespaceTextBlockStart { span });
                    }
                }
                break;
            }
        } else {
            let span = self.make_span(self.start_pos, self.end_pos);
            return Err(LexError::MissingLineBreakAfterTextBlockStart { span });
        }

        'outer: loop {
            // At each line break, check how the next line starts.
            while self.eat_byte(b'\n') {
                string.push('\n');
                // Fully empty lines do not need the prefix.
                loop {
                    if self.eat_byte(b'\n') {
                        string.push('\n');
                    } else if self.eat_slice(b"\r\n") {
                        string.push_str("\r\n");
                    } else {
                        break;
                    }
                }
                if !self.eat_slice(prefix) {
                    // Without the prefix, the line must be the terminator.
                    let line_start = self.end_pos;
                    while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                    if self.eat_slice(b"|||") {
                        break 'outer;
                    } else {
                        let span = self.make_span(line_start, self.end_pos);
                        return Err(LexError::InvalidTextBlockTermination { span });
                    }
                }
            }

            // Copy one character of line content; invalid UTF-8 becomes U+FFFD.
            match self.eat_any_char() {
                None => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    return Err(LexError::UnfinishedString { span });
                }
                Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
            }
        }

        Ok(self.commit_token(TokenKind::TextBlock(self.ast_arena.alloc_str(&string))))
    }
544
545 #[must_use]
546 #[inline]
547 fn eat_byte(&mut self, byte: u8) -> bool {
548 if matches!(self.input.get(self.end_pos), Some(&b) if b == byte) {
549 self.end_pos += 1;
550 true
551 } else {
552 false
553 }
554 }
555
556 #[must_use]
557 #[inline]
558 fn eat_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> bool {
559 if matches!(self.input.get(self.end_pos), Some(&b) if pred(b)) {
560 self.end_pos += 1;
561 true
562 } else {
563 false
564 }
565 }
566
567 #[must_use]
568 #[inline]
569 fn eat_get_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> Option<u8> {
570 if let Some(&b) = self.input.get(self.end_pos).filter(|&&b| pred(b)) {
571 self.end_pos += 1;
572 Some(b)
573 } else {
574 None
575 }
576 }
577
578 #[must_use]
579 #[inline]
580 fn eat_map_byte<R>(&mut self, f: impl FnOnce(u8) -> Option<R>) -> Option<R> {
581 if let Some(r) = self.input.get(self.end_pos).and_then(|&b| f(b)) {
582 self.end_pos += 1;
583 Some(r)
584 } else {
585 None
586 }
587 }
588
589 #[must_use]
590 #[inline]
591 fn eat_slice(&mut self, s: &[u8]) -> bool {
592 if self
593 .input
594 .get(self.end_pos..)
595 .is_some_and(|rem| rem.starts_with(s))
596 {
597 self.end_pos += s.len();
598 true
599 } else {
600 false
601 }
602 }
603
604 #[must_use]
605 #[inline]
606 fn decode_cont_char(&self, byte0: u8) -> (usize, Option<char>) {
607 const TAG_CONT_U8: u8 = 128;
609 fn safe_get(xs: &[u8], i: usize) -> u8 {
610 *xs.get(i).unwrap_or(&0)
611 }
612
613 let mut i = self.end_pos;
614 match byte0 {
615 0..=0x7F => (i, Some(char::from(byte0))),
616 0b11000000..=0b11011111 => {
617 let byte1 = safe_get(self.input, i);
618 if byte1 & 192 != TAG_CONT_U8 {
619 return (i, None);
620 }
621 i += 1;
622
623 let cp = (u32::from(byte0 & 0b11111) << 6) | u32::from(byte1 & 0b111111);
624 (i, Some(char::from_u32(cp).unwrap()))
625 }
626 0b11100000..=0b11101111 => {
627 let byte1 = safe_get(self.input, i);
628 match (byte0, byte1) {
629 (0xE0, 0xA0..=0xBF) => (),
630 (0xE1..=0xEC, 0x80..=0xBF) => (),
631 (0xED, 0x80..=0x9F) => (),
632 (0xEE..=0xEF, 0x80..=0xBF) => (),
633 _ => return (i, None),
634 }
635 i += 1;
636 let byte2 = safe_get(self.input, i);
637 if byte2 & 192 != TAG_CONT_U8 {
638 return (i, None);
639 }
640 i += 1;
641
642 let cp = (u32::from(byte0 & 0b1111) << 12)
643 | (u32::from(byte1 & 0b111111) << 6)
644 | u32::from(byte2 & 0b111111);
645 (i, Some(char::from_u32(cp).unwrap()))
646 }
647 0b11110000..=0b11110111 => {
648 let byte1 = safe_get(self.input, i);
649 match (byte0, byte1) {
650 (0xF0, 0x90..=0xBF) => (),
651 (0xF1..=0xF3, 0x80..=0xBF) => (),
652 (0xF4, 0x80..=0x8F) => (),
653 _ => return (i, None),
654 }
655 i += 1;
656 let byte2 = safe_get(self.input, i);
657 if byte2 & 192 != TAG_CONT_U8 {
658 return (i, None);
659 }
660 i += 1;
661 let byte3 = safe_get(self.input, i);
662 if byte3 & 192 != TAG_CONT_U8 {
663 return (i, None);
664 }
665 i += 1;
666
667 let cp = (u32::from(byte0 & 0b111) << 18)
668 | (u32::from(byte1 & 0b111111) << 12)
669 | (u32::from(byte2 & 0b111111) << 6)
670 | u32::from(byte3 & 0b111111);
671 (i, Some(char::from_u32(cp).unwrap()))
672 }
673 _ => (i, None),
674 }
675 }
676
677 #[must_use]
678 #[inline]
679 fn eat_any_byte(&mut self) -> Option<u8> {
680 if let Some(&byte) = self.input.get(self.end_pos) {
681 self.end_pos += 1;
682 Some(byte)
683 } else {
684 None
685 }
686 }
687
688 #[inline]
689 fn eat_cont_any_char(&mut self, byte0: u8) -> Result<char, usize> {
690 let (end_pos, chr) = self.decode_cont_char(byte0);
691 if let Some(chr) = chr {
692 self.end_pos = end_pos;
693 Ok(chr)
694 } else {
695 let error_len = end_pos - self.end_pos + 1;
696 self.end_pos = end_pos;
697 Err(error_len)
698 }
699 }
700
701 #[must_use]
702 #[inline]
703 fn eat_any_char(&mut self) -> Option<Result<char, usize>> {
704 self.eat_any_byte()
705 .map(|byte0| self.eat_cont_any_char(byte0))
706 }
707
708 #[must_use]
709 #[inline]
710 fn commit_token(&mut self, kind: TokenKind<'p, 'ast>) -> Token<'p, 'ast> {
711 let start_pos = self.start_pos;
712 self.start_pos = self.end_pos;
713 Token {
714 span: self.make_span(start_pos, self.end_pos),
715 kind,
716 }
717 }
718
719 #[must_use]
720 #[inline]
721 fn make_span(&mut self, start_pos: usize, end_pos: usize) -> SpanId {
722 self.span_mgr.intern_span(self.span_ctx, start_pos, end_pos)
723 }
724}
725
#[cfg(test)]
mod tests {
    use super::Lexer;
    use crate::arena::Arena;
    use crate::interner::StrInterner;
    use crate::span::SpanManager;

    /// Every Unicode scalar value, encoded as UTF-8, must decode back to
    /// itself.
    #[test]
    fn test_decode_valid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();
        // 4 is the length of the longest UTF-8 encoding.
        let (span_ctx, _) = span_mgr.insert_source_context(4);

        for chr in '\u{0}'..=char::MAX {
            let mut buf = [0; 4];
            let encoded = chr.encode_utf8(&mut buf);

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                encoded.as_bytes(),
            );
            assert_eq!(lexer.eat_any_char(), Some(Ok(chr)));
        }
    }

    /// Invalid sequences must be reported with the same error length as the
    /// standard library's UTF-8 validation.
    #[test]
    fn test_decode_invalid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();

        let tests: &[&[u8]] = &[
            b"\xFF ",
            b"\x80 ",
            b"\xC1 ",
            b"\xC1",
            b"\xC2",
            b"\xC2 ",
            b"\xC2\xC0",
            b"\xE0",
            b"\xE0\x9F",
            b"\xE0\xA0",
            b"\xE0\xA0\xC0",
            b"\xE0\xA0 ",
            b"\xED\xA0\x80 ",
            b"\xF1",
            b"\xF1\x80",
            b"\xF1\x80\x80",
            b"\xF1 ",
            b"\xF1\x80 ",
            b"\xF1\x80\x80 ",
        ];

        // The source context must be able to hold the longest test input.
        let (span_ctx, _) =
            span_mgr.insert_source_context(tests.iter().fold(0, |l, s| l.max(s.len())));

        for &source in tests.iter() {
            let utf8_error = std::str::from_utf8(source).unwrap_err();
            assert_eq!(utf8_error.valid_up_to(), 0);
            // `error_len()` is `None` when the input ends mid-sequence; the
            // lexer then reports the whole truncated input as invalid.
            let error_len = utf8_error.error_len().unwrap_or(source.len());

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                source,
            );
            assert_eq!(lexer.eat_any_char(), Some(Err(error_len)));
        }
    }
}