use crate::arena::Arena;
use crate::interner::StrInterner;
use crate::span::{SpanContextId, SpanId, SpanManager};
use crate::token::{Number, STokenKind, Token, TokenKind};

mod error;

pub use error::LexError;

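/// Byte-oriented lexer that produces [`Token`]s from `input`.
///
/// `start_pos`/`end_pos` delimit the token currently being scanned;
/// `commit_token` turns that range into a span and resets `start_pos`.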
pub struct Lexer<'a, 'p, 'ast> {
    arena: &'p Arena,
    ast_arena: &'ast Arena,
    str_interner: &'a StrInterner<'p>,
    span_mgr: &'a mut SpanManager,
    span_ctx: SpanContextId,
    input: &'a [u8],
    start_pos: usize,
    end_pos: usize,
}

impl<'a, 'p, 'ast> Lexer<'a, 'p, 'ast> {
    pub fn new(
        arena: &'p Arena,
        ast_arena: &'ast Arena,
        str_interner: &'a StrInterner<'p>,
        span_mgr: &'a mut SpanManager,
        span_ctx: SpanContextId,
        input: &'a [u8],
    ) -> Self {
        Self {
            arena,
            ast_arena,
            str_interner,
            span_mgr,
            span_ctx,
            input,
            start_pos: 0,
            end_pos: 0,
        }
    }

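    /// Lexes the whole input, returning all tokens up to and including
    /// `EndOfFile`. Whitespace and comment tokens are kept only when
    /// `whitespaces_and_comments` is `true`.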
    pub fn lex_to_eof(
        mut self,
        whitespaces_and_comments: bool,
    ) -> Result<Vec<Token<'p, 'ast>>, LexError> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::EndOfFile;
            if whitespaces_and_comments
                || !matches!(token.kind, TokenKind::Whitespace | TokenKind::Comment)
            {
                tokens.push(token);
            }
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

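    /// Lexes and returns the next token, dispatching on the first byte.
    /// Returns an `EndOfFile` token once the input is exhausted.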
    pub fn next_token(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        match self.eat_any_byte() {
            None => Ok(self.commit_token(TokenKind::EndOfFile)),
            Some(b'{') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBrace))),
            Some(b'}') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBrace))),
            Some(b'[') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftBracket))),
            Some(b']') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightBracket))),
            Some(b',') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Comma))),
            Some(b'.') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Dot))),
            Some(b'(') => Ok(self.commit_token(TokenKind::Simple(STokenKind::LeftParen))),
            Some(b')') => Ok(self.commit_token(TokenKind::Simple(STokenKind::RightParen))),
            Some(b';') => Ok(self.commit_token(TokenKind::Simple(STokenKind::Semicolon))),
            Some(b'/') => {
                if self.eat_byte(b'/') {
                    self.lex_single_line_comment()
                } else if self.eat_byte(b'*') {
                    self.lex_multi_line_comment()
                } else {
                    Ok(self.lex_operator())
                }
            }
            Some(b'|') => {
                if self.eat_slice(b"||") {
                    self.lex_text_block()
                } else {
                    Ok(self.lex_operator())
                }
            }
            Some(
                b'!' | b'$' | b':' | b'~' | b'+' | b'-' | b'&' | b'^' | b'=' | b'<' | b'>' | b'*'
                | b'%',
            ) => Ok(self.lex_operator()),
            Some(b' ' | b'\t' | b'\n' | b'\r') => {
                while self.eat_byte_if(|byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r')) {}
                Ok(self.commit_token(TokenKind::Whitespace))
            }
            Some(b'#') => self.lex_single_line_comment(),
            Some(chr @ b'0'..=b'9') => self.lex_number(chr),
            Some(b'_' | b'a'..=b'z' | b'A'..=b'Z') => Ok(self.lex_ident()),
            Some(b'@') => {
                if self.eat_byte(b'\'') {
                    self.lex_verbatim_string(b'\'')
                } else if self.eat_byte(b'"') {
                    self.lex_verbatim_string(b'"')
                } else {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    return Err(LexError::InvalidChar { span, chr: '@' });
                }
            }
            Some(b'\'') => self.lex_quoted_string(b'\''),
            Some(b'"') => self.lex_quoted_string(b'"'),
            Some(byte0) => match self.eat_cont_any_char(byte0) {
                Ok(chr) => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    Err(LexError::InvalidChar { span, chr })
                }
                Err(_) => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    Err(LexError::InvalidUtf8 {
                        span,
                        seq: self.input[self.start_pos..self.end_pos].to_vec(),
                    })
                }
            },
        }
    }

    fn lex_single_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        while !matches!(self.eat_any_byte(), None | Some(b'\n')) {}
        Ok(self.commit_token(TokenKind::Comment))
    }

    fn lex_multi_line_comment(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        loop {
            if self.eat_slice(b"*/") {
                break;
            } else if self.eat_any_byte().is_none() {
                let span = self.make_span(self.start_pos, self.end_pos);
                return Err(LexError::UnfinishedMultilineComment { span });
            }
        }
        Ok(self.commit_token(TokenKind::Comment))
    }

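    /// Lexes an operator token: operator characters are consumed greedily,
    /// but the lexer backtracks to the last position that can legally end an
    /// operator (`sure_end_pos`), leaving sequences such as `|||`, `//` and
    /// `/*` for the text-block and comment lexers.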
    #[must_use]
    fn lex_operator(&mut self) -> Token<'p, 'ast> {
        let mut sure_end_pos = self.end_pos;
        loop {
            if self.eat_slice(b"|||") || self.eat_slice(b"//") || self.eat_slice(b"/*") {
                break;
            }

            let Some(next_byte) = self.eat_any_byte() else {
                break;
            };
            if matches!(
                next_byte,
                b':' | b'&' | b'|' | b'^' | b'=' | b'<' | b'>' | b'*' | b'/' | b'%'
            ) {
                sure_end_pos = self.end_pos;
            } else if !matches!(next_byte, b'+' | b'-' | b'~' | b'!' | b'$') {
                break;
            }
        }
        self.end_pos = sure_end_pos;
        let op = &self.input[self.start_pos..self.end_pos];
        match op {
            b":" => self.commit_token(TokenKind::Simple(STokenKind::Colon)),
            b"::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColon)),
            b":::" => self.commit_token(TokenKind::Simple(STokenKind::ColonColonColon)),
            b"+:" => self.commit_token(TokenKind::Simple(STokenKind::PlusColon)),
            b"+::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColon)),
            b"+:::" => self.commit_token(TokenKind::Simple(STokenKind::PlusColonColonColon)),
            b"=" => self.commit_token(TokenKind::Simple(STokenKind::Eq)),
            b"$" => self.commit_token(TokenKind::Simple(STokenKind::Dollar)),
            b"*" => self.commit_token(TokenKind::Simple(STokenKind::Asterisk)),
            b"/" => self.commit_token(TokenKind::Simple(STokenKind::Slash)),
            b"%" => self.commit_token(TokenKind::Simple(STokenKind::Percent)),
            b"+" => self.commit_token(TokenKind::Simple(STokenKind::Plus)),
            b"-" => self.commit_token(TokenKind::Simple(STokenKind::Minus)),
            b"<<" => self.commit_token(TokenKind::Simple(STokenKind::LtLt)),
            b">>" => self.commit_token(TokenKind::Simple(STokenKind::GtGt)),
            b"<" => self.commit_token(TokenKind::Simple(STokenKind::Lt)),
            b"<=" => self.commit_token(TokenKind::Simple(STokenKind::LtEq)),
            b">" => self.commit_token(TokenKind::Simple(STokenKind::Gt)),
            b">=" => self.commit_token(TokenKind::Simple(STokenKind::GtEq)),
            b"==" => self.commit_token(TokenKind::Simple(STokenKind::EqEq)),
            b"!=" => self.commit_token(TokenKind::Simple(STokenKind::ExclamEq)),
            b"&" => self.commit_token(TokenKind::Simple(STokenKind::Amp)),
            b"^" => self.commit_token(TokenKind::Simple(STokenKind::Hat)),
            b"|" => self.commit_token(TokenKind::Simple(STokenKind::Pipe)),
            b"&&" => self.commit_token(TokenKind::Simple(STokenKind::AmpAmp)),
            b"||" => self.commit_token(TokenKind::Simple(STokenKind::PipePipe)),
            b"!" => self.commit_token(TokenKind::Simple(STokenKind::Exclam)),
            b"~" => self.commit_token(TokenKind::Simple(STokenKind::Tilde)),
            _ => self.commit_token(TokenKind::OtherOp(
                self.ast_arena.alloc_str(std::str::from_utf8(op).unwrap()),
            )),
        }
    }

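    /// Lexes an identifier or keyword; keywords map to their dedicated
    /// `STokenKind`, everything else becomes an interned `Ident`.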
    #[must_use]
    fn lex_ident(&mut self) -> Token<'p, 'ast> {
        while self.eat_byte_if(|b| b.is_ascii_alphanumeric() || b == b'_') {}
        let ident_bytes = &self.input[self.start_pos..self.end_pos];
        match ident_bytes {
            b"assert" => self.commit_token(TokenKind::Simple(STokenKind::Assert)),
            b"else" => self.commit_token(TokenKind::Simple(STokenKind::Else)),
            b"error" => self.commit_token(TokenKind::Simple(STokenKind::Error)),
            b"false" => self.commit_token(TokenKind::Simple(STokenKind::False)),
            b"for" => self.commit_token(TokenKind::Simple(STokenKind::For)),
            b"function" => self.commit_token(TokenKind::Simple(STokenKind::Function)),
            b"if" => self.commit_token(TokenKind::Simple(STokenKind::If)),
            b"import" => self.commit_token(TokenKind::Simple(STokenKind::Import)),
            b"importstr" => self.commit_token(TokenKind::Simple(STokenKind::Importstr)),
            b"importbin" => self.commit_token(TokenKind::Simple(STokenKind::Importbin)),
            b"in" => self.commit_token(TokenKind::Simple(STokenKind::In)),
            b"local" => self.commit_token(TokenKind::Simple(STokenKind::Local)),
            b"null" => self.commit_token(TokenKind::Simple(STokenKind::Null)),
            b"tailstrict" => self.commit_token(TokenKind::Simple(STokenKind::Tailstrict)),
            b"then" => self.commit_token(TokenKind::Simple(STokenKind::Then)),
            b"self" => self.commit_token(TokenKind::Simple(STokenKind::Self_)),
            b"super" => self.commit_token(TokenKind::Simple(STokenKind::Super)),
            b"true" => self.commit_token(TokenKind::Simple(STokenKind::True)),
            _ => self.commit_token(TokenKind::Ident(
                self.str_interner
                    .intern(self.arena, std::str::from_utf8(ident_bytes).unwrap()),
            )),
        }
    }

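    /// Lexes a number literal into a decimal digit string plus a base-10
    /// exponent: each fractional digit lowers the exponent by one, and an
    /// explicit `e`/`E` exponent is then added on top, with leading-zero,
    /// missing-digit and overflow checks along the way.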
    fn lex_number(&mut self, chr0: u8) -> Result<Token<'p, 'ast>, LexError> {
        let leading_zero = chr0 == b'0';
        let mut digits = String::new();
        digits.push(char::from(chr0));

        while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
            if digits.len() == 1 && leading_zero {
                let span = self.make_span(self.end_pos - 2, self.end_pos - 1);
                return Err(LexError::LeadingZeroInNumber { span });
            }
            digits.push(char::from(chr));
        }

        let mut implicit_exp = 0i64;
        if self.eat_byte(b'.') {
            while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
                digits.push(char::from(chr));
                implicit_exp -= 1;
            }
            if implicit_exp == 0 {
                let span = self.make_span(self.end_pos - 1, self.end_pos);
                return Err(LexError::MissingFracDigits { span });
            }
        }

        let eff_exp;
        if self.eat_byte_if(|b| matches!(b, b'e' | b'E')) {
            let mut explicit_exp_sign = false;
            let mut explicit_exp = Some(0u64);

            let exp_start = self.end_pos - 1;

            if self.eat_byte(b'+') {
            } else if self.eat_byte(b'-') {
                explicit_exp_sign = true;
            }

            let mut num_exp_digits = 0;
            while let Some(chr) = self.eat_get_byte_if(|b| b.is_ascii_digit()) {
                explicit_exp = explicit_exp
                    .and_then(|e| e.checked_mul(10))
                    .and_then(|e| e.checked_add(u64::from(chr - b'0')));
                num_exp_digits += 1;
            }

            if num_exp_digits == 0 {
                let span = self.make_span(exp_start, self.end_pos);
                return Err(LexError::MissingExpDigits { span });
            }

            eff_exp = explicit_exp
                .and_then(|e| i64::try_from(e).ok())
                .and_then(|e| {
                    if explicit_exp_sign {
                        implicit_exp.checked_sub(e)
                    } else {
                        implicit_exp.checked_add(e)
                    }
                })
                .ok_or_else(|| {
                    let span = self.make_span(exp_start, self.end_pos);
                    LexError::ExpOverflow { span }
                })?;
        } else {
            eff_exp = implicit_exp;
        }

        Ok(self.commit_token(TokenKind::Number(Number {
            digits: self.ast_arena.alloc_str(&digits),
            exp: eff_exp,
        })))
    }

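    /// Lexes a single- or double-quoted string, handling the standard
    /// escapes plus `\uXXXX` escapes (including UTF-16 surrogate pairs).
    /// Invalid UTF-8 in the source text is replaced with U+FFFD.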
    fn lex_quoted_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        loop {
            if self.eat_byte(delim) {
                break;
            } else if self.eat_byte(b'\\') {
                let escape_start = self.end_pos - 1;
                if self.eat_byte(b'"') {
                    string.push('"');
                } else if self.eat_byte(b'\'') {
                    string.push('\'');
                } else if self.eat_byte(b'\\') {
                    string.push('\\');
                } else if self.eat_byte(b'/') {
                    string.push('/');
                } else if self.eat_byte(b'b') {
                    string.push('\u{8}');
                } else if self.eat_byte(b'f') {
                    string.push('\u{C}');
                } else if self.eat_byte(b'n') {
                    string.push('\n');
                } else if self.eat_byte(b'r') {
                    string.push('\r');
                } else if self.eat_byte(b't') {
                    string.push('\t');
                } else if self.eat_byte(b'u') {
                    let hex_from_digit = |b: u8| match b {
                        b'0'..=b'9' => Some(b - b'0'),
                        b'a'..=b'f' => Some(b - b'a' + 10),
                        b'A'..=b'F' => Some(b - b'A' + 10),
                        _ => None,
                    };

                    let eat_codeunit = |this: &mut Self| -> Option<u16> {
                        let d0 = this.eat_map_byte(hex_from_digit)?;
                        let d1 = this.eat_map_byte(hex_from_digit)?;
                        let d2 = this.eat_map_byte(hex_from_digit)?;
                        let d3 = this.eat_map_byte(hex_from_digit)?;
                        Some(
                            (u16::from(d0) << 12)
                                | (u16::from(d1) << 8)
                                | (u16::from(d2) << 4)
                                | u16::from(d3),
                        )
                    };

                    let Some(cu1) = eat_codeunit(self) else {
                        let span = self.make_span(escape_start, self.end_pos);
                        return Err(LexError::IncompleteUnicodeEscape { span });
                    };

                    if matches!(cu1, 0xD800..=0xDFFF) && self.eat_slice(b"\\u") {
                        let Some(cu2) = eat_codeunit(self) else {
                            let span = self.make_span(escape_start + 6, self.end_pos);
                            return Err(LexError::IncompleteUnicodeEscape { span });
                        };
                        if let Ok(chr) = char::decode_utf16([cu1, cu2]).next().unwrap() {
                            string.push(chr);
                        } else {
                            let span = self.make_span(escape_start, self.end_pos);
                            return Err(LexError::InvalidUtf16EscapeSequence {
                                span,
                                cu1,
                                cu2: Some(cu2),
                            });
                        }
                    } else if let Some(chr) = char::from_u32(cu1.into()) {
                        string.push(chr);
                    } else {
                        let span = self.make_span(escape_start, self.end_pos);
                        return Err(LexError::InvalidUtf16EscapeSequence {
                            span,
                            cu1,
                            cu2: None,
                        });
                    }
                } else {
                    match self.eat_any_char() {
                        None => {
                            let span = self.make_span(self.start_pos, self.end_pos);
                            return Err(LexError::UnfinishedString { span });
                        }
                        Some(chr) => {
                            let span = self.make_span(escape_start, self.end_pos);
                            return Err(LexError::InvalidEscapeInString {
                                span,
                                chr: chr.unwrap_or('\u{FFFD}'),
                            });
                        }
                    }
                }
            } else {
                match self.eat_any_char() {
                    None => {
                        let span = self.make_span(self.start_pos, self.end_pos);
                        return Err(LexError::UnfinishedString { span });
                    }
                    Some(Ok(chr)) => string.push(chr),
                    Some(Err(_)) => string.push('\u{FFFD}'),
                }
            }
        }
        Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
    }

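    /// Lexes an `@'...'` or `@"..."` verbatim string, where a doubled
    /// delimiter stands for a single literal delimiter character.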
    fn lex_verbatim_string(&mut self, delim: u8) -> Result<Token<'p, 'ast>, LexError> {
        let mut string = String::new();
        loop {
            if self.eat_byte(delim) {
                if self.eat_byte(delim) {
                    string.push(char::from(delim));
                } else {
                    break;
                }
            } else {
                match self.eat_any_char() {
                    None => {
                        let span = self.make_span(self.start_pos, self.end_pos);
                        return Err(LexError::UnfinishedString { span });
                    }
                    Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
                }
            }
        }
        Ok(self.commit_token(TokenKind::String(self.ast_arena.alloc_str(&string))))
    }

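    /// Lexes a `|||` text block: a line break must follow the opening
    /// delimiter, the whitespace prefix of the first content line is
    /// stripped from every following line, and a line that does not start
    /// with that prefix must close the block with optional whitespace and
    /// `|||`. The `|||-` form drops the final newline.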
    #[inline]
    fn lex_text_block(&mut self) -> Result<Token<'p, 'ast>, LexError> {
        let strip_last_lf = self.eat_byte(b'-');

        let mut string = String::new();
        let mut prefix;
        while self.eat_byte_if(|b| matches!(b, b' ' | b'\t' | b'\r')) {}
        if self.eat_byte(b'\n') {
            loop {
                let prefix_start = self.end_pos;
                while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                let prefix_end = self.end_pos;
                prefix = &self.input[prefix_start..prefix_end];
                if self.eat_byte(b'\r') {
                    string.push('\r');
                }
                if prefix.is_empty() {
                    if self.eat_byte(b'\n') {
                        string.push('\n');
                        continue;
                    } else {
                        let span = self.make_span(prefix_start, prefix_end);
                        return Err(LexError::MissingWhitespaceTextBlockStart { span });
                    }
                }
                break;
            }
        } else {
            let span = self.make_span(self.start_pos, self.end_pos);
            return Err(LexError::MissingLineBreakAfterTextBlockStart { span });
        }

        'outer: loop {
            while self.eat_byte(b'\n') {
                string.push('\n');
                loop {
                    if self.eat_byte(b'\n') {
                        string.push('\n');
                    } else if self.eat_slice(b"\r\n") {
                        string.push_str("\r\n");
                    } else {
                        break;
                    }
                }
                if !self.eat_slice(prefix) {
                    let line_start = self.end_pos;
                    while self.eat_byte_if(|b| matches!(b, b' ' | b'\t')) {}
                    if self.eat_slice(b"|||") {
                        break 'outer;
                    } else {
                        let span = self.make_span(line_start, self.end_pos);
                        return Err(LexError::InvalidTextBlockTermination { span });
                    }
                }
            }

            match self.eat_any_char() {
                None => {
                    let span = self.make_span(self.start_pos, self.end_pos);
                    return Err(LexError::UnfinishedString { span });
                }
                Some(chr) => string.push(chr.unwrap_or('\u{FFFD}')),
            }
        }

        let actual_string = if strip_last_lf {
            string.strip_suffix('\n').unwrap()
        } else {
            string.as_str()
        };

        Ok(self.commit_token(TokenKind::TextBlock(
            self.ast_arena.alloc_str(actual_string),
        )))
    }

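    // Low-level cursor helpers: each `eat_*` method advances `end_pos` only
    // when the next byte(s) match, and reports what (if anything) was consumed.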
    #[must_use]
    #[inline]
    fn eat_byte(&mut self, byte: u8) -> bool {
        if matches!(self.input.get(self.end_pos), Some(&b) if b == byte) {
            self.end_pos += 1;
            true
        } else {
            false
        }
    }

    #[must_use]
    #[inline]
    fn eat_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> bool {
        if matches!(self.input.get(self.end_pos), Some(&b) if pred(b)) {
            self.end_pos += 1;
            true
        } else {
            false
        }
    }

    #[must_use]
    #[inline]
    fn eat_get_byte_if(&mut self, pred: impl FnOnce(u8) -> bool) -> Option<u8> {
        if let Some(&b) = self.input.get(self.end_pos).filter(|&&b| pred(b)) {
            self.end_pos += 1;
            Some(b)
        } else {
            None
        }
    }

    #[must_use]
    #[inline]
    fn eat_map_byte<R>(&mut self, f: impl FnOnce(u8) -> Option<R>) -> Option<R> {
        if let Some(r) = self.input.get(self.end_pos).and_then(|&b| f(b)) {
            self.end_pos += 1;
            Some(r)
        } else {
            None
        }
    }

    #[must_use]
    #[inline]
    fn eat_slice(&mut self, s: &[u8]) -> bool {
        if self
            .input
            .get(self.end_pos..)
            .is_some_and(|rem| rem.starts_with(s))
        {
            self.end_pos += s.len();
            true
        } else {
            false
        }
    }

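    /// Decodes the rest of a UTF-8 sequence whose first byte `byte0` has
    /// already been consumed, without mutating `end_pos`. Returns the
    /// position just past the bytes it accepted and the decoded character,
    /// or `None` when the sequence is invalid (overlong encodings,
    /// surrogates and out-of-range code points are rejected).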
    #[must_use]
    #[inline]
    fn decode_cont_char(&self, byte0: u8) -> (usize, Option<char>) {
        const TAG_CONT_U8: u8 = 128;

        fn safe_get(xs: &[u8], i: usize) -> u8 {
            *xs.get(i).unwrap_or(&0)
        }

        let mut i = self.end_pos;
        match byte0 {
            0..=0x7F => (i, Some(char::from(byte0))),
            0b11000000..=0b11011111 => {
                let byte1 = safe_get(self.input, i);
                if byte1 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b11111) << 6) | u32::from(byte1 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            0b11100000..=0b11101111 => {
                let byte1 = safe_get(self.input, i);
                match (byte0, byte1) {
                    (0xE0, 0xA0..=0xBF) => (),
                    (0xE1..=0xEC, 0x80..=0xBF) => (),
                    (0xED, 0x80..=0x9F) => (),
                    (0xEE..=0xEF, 0x80..=0xBF) => (),
                    _ => return (i, None),
                }
                i += 1;
                let byte2 = safe_get(self.input, i);
                if byte2 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b1111) << 12)
                    | (u32::from(byte1 & 0b111111) << 6)
                    | u32::from(byte2 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            0b11110000..=0b11110111 => {
                let byte1 = safe_get(self.input, i);
                match (byte0, byte1) {
                    (0xF0, 0x90..=0xBF) => (),
                    (0xF1..=0xF3, 0x80..=0xBF) => (),
                    (0xF4, 0x80..=0x8F) => (),
                    _ => return (i, None),
                }
                i += 1;
                let byte2 = safe_get(self.input, i);
                if byte2 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;
                let byte3 = safe_get(self.input, i);
                if byte3 & 192 != TAG_CONT_U8 {
                    return (i, None);
                }
                i += 1;

                let cp = (u32::from(byte0 & 0b111) << 18)
                    | (u32::from(byte1 & 0b111111) << 12)
                    | (u32::from(byte2 & 0b111111) << 6)
                    | u32::from(byte3 & 0b111111);
                (i, Some(char::from_u32(cp).unwrap()))
            }
            _ => (i, None),
        }
    }

    #[must_use]
    #[inline]
    fn eat_any_byte(&mut self) -> Option<u8> {
        if let Some(&byte) = self.input.get(self.end_pos) {
            self.end_pos += 1;
            Some(byte)
        } else {
            None
        }
    }

    #[inline]
    fn eat_cont_any_char(&mut self, byte0: u8) -> Result<char, usize> {
        let (end_pos, chr) = self.decode_cont_char(byte0);
        if let Some(chr) = chr {
            self.end_pos = end_pos;
            Ok(chr)
        } else {
            let error_len = end_pos - self.end_pos + 1;
            self.end_pos = end_pos;
            Err(error_len)
        }
    }

    #[must_use]
    #[inline]
    fn eat_any_char(&mut self) -> Option<Result<char, usize>> {
        self.eat_any_byte()
            .map(|byte0| self.eat_cont_any_char(byte0))
    }

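    /// Finishes the token currently being scanned: the span covers
    /// `start_pos..end_pos`, and `start_pos` is advanced to `end_pos` so the
    /// next token starts where this one ended.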
    #[must_use]
    #[inline]
    fn commit_token(&mut self, kind: TokenKind<'p, 'ast>) -> Token<'p, 'ast> {
        let start_pos = self.start_pos;
        self.start_pos = self.end_pos;
        Token {
            span: self.make_span(start_pos, self.end_pos),
            kind,
        }
    }

    #[must_use]
    #[inline]
    fn make_span(&mut self, start_pos: usize, end_pos: usize) -> SpanId {
        self.span_mgr.intern_span(self.span_ctx, start_pos, end_pos)
    }
}

#[cfg(test)]
mod tests {
    use super::Lexer;
    use crate::arena::Arena;
    use crate::interner::StrInterner;
    use crate::span::SpanManager;

    #[test]
    fn test_decode_valid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();
        let (span_ctx, _) = span_mgr.insert_source_context(4);

        for chr in '\u{0}'..=char::MAX {
            let mut buf = [0; 4];
            let encoded = chr.encode_utf8(&mut buf);

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                encoded.as_bytes(),
            );
            assert_eq!(lexer.eat_any_char(), Some(Ok(chr)));
        }
    }

    #[test]
    fn test_decode_invalid_utf8() {
        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();

        let tests: &[&[u8]] = &[
            b"\xFF ",
            b"\x80 ",
            b"\xC1 ",
            b"\xC1",
            b"\xC2",
            b"\xC2 ",
            b"\xC2\xC0",
            b"\xE0",
            b"\xE0\x9F",
            b"\xE0\xA0",
            b"\xE0\xA0\xC0",
            b"\xE0\xA0 ",
            b"\xED\xA0\x80 ",
            b"\xF1",
            b"\xF1\x80",
            b"\xF1\x80\x80",
            b"\xF1 ",
            b"\xF1\x80 ",
            b"\xF1\x80\x80 ",
        ];

        let (span_ctx, _) =
            span_mgr.insert_source_context(tests.iter().fold(0, |l, s| l.max(s.len())));

        for &source in tests.iter() {
            let utf8_error = std::str::from_utf8(source).unwrap_err();
            assert_eq!(utf8_error.valid_up_to(), 0);
            let error_len = utf8_error.error_len().unwrap_or(source.len());

            let mut lexer = Lexer::new(
                &arena,
                &ast_arena,
                &str_interner,
                &mut span_mgr,
                span_ctx,
                source,
            );
            assert_eq!(lexer.eat_any_char(), Some(Err(error_len)));
        }
    }
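
    // A minimal smoke test sketched for the token-level API above; it assumes
    // only the `TokenKind`/`STokenKind` variants already used by this lexer.
    #[test]
    fn test_lex_simple_local() {
        use crate::token::{STokenKind, TokenKind};

        let arena = Arena::new();
        let ast_arena = Arena::new();
        let str_interner = StrInterner::new();
        let mut span_mgr = SpanManager::new();
        let source: &[u8] = b"local x = 1;";
        let (span_ctx, _) = span_mgr.insert_source_context(source.len());

        let lexer = Lexer::new(
            &arena,
            &ast_arena,
            &str_interner,
            &mut span_mgr,
            span_ctx,
            source,
        );
        // Whitespace is dropped, so we expect: `local`, `x`, `=`, `1`, `;`, end-of-file.
        let tokens = lexer.lex_to_eof(false).unwrap();
        assert_eq!(tokens.len(), 6);
        assert!(matches!(tokens[0].kind, TokenKind::Simple(STokenKind::Local)));
        assert!(matches!(tokens[1].kind, TokenKind::Ident(_)));
        assert!(matches!(tokens[2].kind, TokenKind::Simple(STokenKind::Eq)));
        assert!(matches!(tokens[3].kind, TokenKind::Number(_)));
        assert!(matches!(tokens[4].kind, TokenKind::Simple(STokenKind::Semicolon)));
        assert!(matches!(tokens[5].kind, TokenKind::EndOfFile));
    }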
}