1use alloc::string::String;
9use alloc::vec::Vec;
10use core::fmt;
11use core::iter::repeat_n;
12use core::mem::discriminant;
13use core::num::NonZeroUsize;
14use core::ops::Range;
15
16use smol_str::SmolStr;
17use thiserror::Error;
18
19use crate::dom::Number;
20use crate::dom::number::NumberBuilder;
21use crate::ssb2::SmolStrBuilder2;
22
23#[cfg(test)]
24mod tests;
25
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// The UTF-8 byte order mark (only recognized at offset 0).
    Bom,
    /// End of input.
    Eof,
    /// A run of newlines, possibly mixed with whitespace and comments.
    Lines,
    /// A run of inline whitespace, esclines, or block comments.
    Spaces,
    /// A string or bare identifier, with its decoded contents.
    String(SmolStr),
    /// A numeric literal, already parsed.
    Number(Number),
    /// A string whose contents were validated but not captured (skip mode).
    SkippedString,
    /// A number whose value was validated but not captured (skip mode).
    SkippedNumber,
    /// The `/-` slash-dash operator.
    SlashDash,
    /// `;`.
    SemiColon,
    /// `=`.
    Equals,
    /// `(`.
    OpenParen,
    /// `)`.
    CloseParen,
    /// `{`.
    OpenCurly,
    /// `}`.
    CloseCurly,
    /// The `#true` / `#false` keywords.
    Bool(bool),
    /// The `#null` keyword.
    Null,
}
64
65impl fmt::Display for Token {
66 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
67 match self {
68 Token::Bom => f.write_str("byte order mark"),
69 Token::Eof => f.write_str("end of file"),
70 Token::Lines => f.write_str("'\\n'"),
71 Token::Spaces => f.write_str("' '"),
72 Token::String(value) => fmt::Debug::fmt(value, f),
73 Token::Number(value) => fmt::Display::fmt(value, f),
74 Token::SkippedString => f.write_str("a string"),
75 Token::SkippedNumber => f.write_str("a number"),
76 Token::SlashDash => f.write_str("'/-'"),
77 Token::SemiColon => f.write_str("';'"),
78 Token::Equals => f.write_str("'='"),
79 Token::OpenParen => f.write_str("'('"),
80 Token::CloseParen => f.write_str("')'"),
81 Token::OpenCurly => f.write_str("'{'"),
82 Token::CloseCurly => f.write_str("'}'"),
83 &Token::Bool(value) => f.write_str(if value { "#true" } else { "#false" }),
84 Token::Null => f.write_str("#null"),
85 }
86 }
87}
88
/// Errors produced while tokenizing.
///
/// Most variants carry the byte offset where the problem was found.
/// Note that the manual [`PartialEq`] impl compares variants only and
/// ignores these payloads.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum LexerError {
    /// Failure in the underlying reader (std builds).
    #[cfg(feature = "std")]
    #[error(transparent)]
    #[expect(clippy::absolute_paths, reason = "feature-gated")]
    Io(std::io::Error),
    /// Failure in the underlying reader (no payload without std).
    #[cfg(not(feature = "std"))]
    #[error("IO error")]
    Io(()),
    #[error("Invalid UTF-8 text at {0}")]
    #[doc = "Invalid UTF-8 text at {0}"]
    InvalidUtf8(usize),
    #[error("invalid document character at {0}")]
    #[doc = "invalid document character at {0}"]
    InvalidCharacter(usize),
    #[error("Unexpected end-of-file at {0}")]
    #[doc = "Unexpected end-of-file at {0}"]
    UnexpectedEof(usize),
    #[error("Bad escline body at {0}")]
    #[doc = "Bad escline body at {0}"]
    BadEscline(usize),
    #[error("Unexpected plain keyword")]
    #[doc = "Unexpected plain keyword"]
    UnexpectedKeyword,
    #[error("Invalid string escape at {0}")]
    #[doc = "Invalid string escape at {0}"]
    InvalidEscape(usize),
    #[error("Invalid number value")]
    #[doc = "Invalid number value"]
    InvalidNumber,
    #[error("Bad unicode string escape at {0}")]
    #[doc = "Bad unicode string escape at {0}"]
    BadUnicodeEscape(usize),
    #[error("Unexpected newline in single-line string at {0}")]
    #[doc = "Unexpected newline in single-line string at {0}"]
    UnexpectedStringNewline(usize),
    #[error("Bad raw string start")]
    #[doc = "Bad raw string start"]
    BadRawString,
    #[error("Missing newline after multi-line string start")]
    #[doc = "Missing newline after multi-line string start"]
    MissingStringNewline,
    #[error("Text before multi-line string end at {0}")]
    #[doc = "Text before multi-line string end at {0}"]
    BadEndString(usize),
    #[error("Bad multi-line string indent at {0:?}")]
    #[doc = "Bad multi-line string indent at {0:?}"]
    BadIndent(Option<usize>),
    #[error("Invalid operator")]
    #[doc = "Invalid operator"]
    InvalidOperator,
    #[error("Missing expected text")]
    #[doc = "Missing expected text"]
    MissingText,
}
154
impl PartialEq for LexerError {
    /// Two errors are equal when they are the same variant; payloads
    /// (byte offsets, IO errors) are deliberately ignored.
    fn eq(&self, other: &Self) -> bool { discriminant(self) == discriminant(other) }
}
159
/// Shorthand for results whose error type is [`LexerError`].
type LexerResult<T> = Result<T, LexerError>;
161
/// Pending work the lexer must perform before producing the next token.
///
/// The `Recover*` variants remember what construct was being lexed when an
/// error occurred, so the lexer can skip past its end and resume.
#[derive(Debug, Clone, Copy)]
enum NextSkip {
    /// Nothing pending.
    None,
    /// Consume inline whitespace/comments (follows a `Spaces` token).
    Spaces,
    /// Consume newlines plus whitespace/comments (follows a `Lines` token).
    Lines,
    /// Recovery: skip to the end of a line comment.
    RecoverLineComment,
    /// Recovery: skip to the end of a block comment; payload is the
    /// nesting depth reached so far.
    RecoverBlockComment(usize),
    /// Recovery: skip to the closing delimiter of a string.
    RecoverString {
        multiline: bool,
        hashes: Option<NonZeroUsize>,
    },
    /// Input failed irrecoverably; report EOF from now on.
    IrrecoverableError,
}
178
/// Byte-oriented input source for [`Lexer`].
pub trait Input {
    /// Returns at least `n` buffered bytes, or fewer only near end of
    /// input. Implementations may return more than `n` bytes (the slice
    /// impl returns everything remaining); callers clamp as needed.
    fn peek(&mut self, n: usize) -> LexerResult<&[u8]>;
    /// Discards `n` bytes. `n` should not exceed what the preceding
    /// `peek` returned ([`ReadInput`] asserts this).
    fn advance(&mut self, n: usize);
}
194
195impl Input for &[u8] {
196 fn peek(&mut self, _n: usize) -> LexerResult<&[u8]> { Ok(self) }
197 fn advance(&mut self, n: usize) { *self = &self[n..]; }
198}
199
/// Largest lookahead [`ReadInput`] supports: one UTF-8 character (4 bytes).
#[cfg(feature = "std")]
const MAX_PEEK: usize = char::MAX_LEN_UTF8;
202
/// [`Input`] adapter over any [`std::io::Read`], buffering up to
/// [`MAX_PEEK`] bytes of lookahead.
#[cfg(feature = "std")]
#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
#[derive(Debug)]
pub struct ReadInput<T> {
    // The underlying reader.
    reader: T,
    // Lookahead storage; only the first `buffer_len` bytes are valid.
    buffer: [u8; MAX_PEEK],
    // Count of valid bytes in `buffer` (<= MAX_PEEK, so it fits a u8).
    buffer_len: u8,
}
#[cfg(feature = "std")]
impl<T> ReadInput<T> {
    /// Wraps `reader` with an empty lookahead buffer.
    pub fn new(reader: T) -> Self {
        Self {
            reader,
            buffer: [0; MAX_PEEK],
            buffer_len: 0,
        }
    }
}
224
#[cfg(feature = "std")]
#[expect(clippy::absolute_paths, reason = "feature-gated")]
#[expect(clippy::panic_in_result_fn, reason = "precondition validation")]
#[expect(
    clippy::cast_possible_truncation,
    reason = "start <= request <= MAX_PEEK"
)]
impl<T: std::io::Read> Input for ReadInput<T> {
    fn peek(&mut self, request: usize) -> LexerResult<&[u8]> {
        assert!(request <= MAX_PEEK, "target length too long");
        let mut start = usize::from(self.buffer_len);
        // Top up the buffer until `request` bytes are available or the
        // reader reports EOF. The whole buffer tail is offered, so a read
        // may fetch more than `request` bytes; they stay buffered.
        while start < request {
            match self.reader.read(&mut self.buffer[start..]) {
                // EOF: return however much we have.
                Ok(0) => break,
                Ok(n) => start += n,
                // Interrupted reads are transient per the Read contract; retry.
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(LexerError::Io(e)),
            }
        }
        self.buffer_len = start as u8;
        Ok(&self.buffer[..start])
    }
    fn advance(&mut self, request: usize) {
        assert!(
            request <= usize::from(self.buffer_len),
            "target length larger than buffer"
        );
        // Shift the 4-byte buffer left by `request` bytes in one step by
        // treating it as a little-endian u32. `unbounded_shr` keeps the
        // request == MAX_PEEK case (a full 32-bit shift) well-defined.
        self.buffer =
            (u32::from_le_bytes(self.buffer).unbounded_shr(8 * request as u32)).to_le_bytes();
        self.buffer_len -= request as u8;
    }
}
259
/// Byte-level match patterns for classes of UTF-8 encoded characters.
///
/// Each arm expands to slice patterns over raw UTF-8 bytes, letting the
/// lexer classify characters without decoding them first.
macro_rules! utf8_class {
    // U+FEFF byte order mark.
    (bom) => {[0xEF, 0xBB, 0xBF]};
    // Characters forbidden in a document: C0 controls (minus whitespace),
    // DEL, LRM/RLM and the U+202A..=U+202E bidi controls, the
    // U+2066..=U+2069 bidi isolates, surrogate encodings (0xED 0xA0..),
    // and the BOM anywhere but the start.
    (invalid) => {
        [0x00..=0x08 | 0x0E..=0x1F | 0x7F]
        | [0xE2, 0x80, 0x8E | 0x8F | 0xAA..=0xAE]
        | [0xE2, 0x81, 0xA6..=0xA9]
        | [0xED, 0xA0..=0xBF, _]
        | utf8_class!(bom)
    };
    // Newlines: LF/VT/FF/CR, NEL (U+0085), LS (U+2028), PS (U+2029).
    (line) => {
        [0x0A..=0x0D]
        | [0xC2, 0x85]
        | [0xE2, 0x80, 0xA8 | 0xA9]
    };
    // Inline whitespace: tab, space, NBSP (U+00A0), ogham space mark
    // (U+1680), U+2000..=U+200A, NNBSP (U+202F), MMSP (U+205F),
    // ideographic space (U+3000).
    (space) => {
        [0x09 | 0x20]
        | [0xC2, 0xA0]
        | [0xE1, 0x9A, 0x80]
        | [0xE2, 0x80, 0x80..=0x8A | 0xAF]
        | [0xE2, 0x81, 0x9F]
        | [0xE3, 0x80, 0x80]
    };
    // Anything that terminates a bare identifier.
    (not_ident) => {
        utf8_class!(invalid)
        | utf8_class!(line)
        | utf8_class!(space)
        | b"/" | b"\\" | b"(" | b")" | b"{" | b"}" | b"[" | b"]" | b";" | b"\"" | b"#" | b"="
    };
}
307
/// Expected byte length of a UTF-8 sequence, judged from its first byte.
///
/// Continuation bytes (0x80..=0xBF) and other invalid lead bytes report a
/// length that lets the caller consume something and surface the UTF-8
/// error itself, exactly mirroring the original lookup table.
fn utf8_len(first: u8) -> usize {
    match first {
        // ASCII and continuation bytes: single byte.
        0x00..=0xBF => 1,
        // Two-byte lead (0b110x_xxxx range).
        0xC0..=0xDF => 2,
        // Three-byte lead (0b1110_xxxx).
        0xE0..=0xEF => 3,
        // Four-byte lead (0b1111_xxxx).
        0xF0..=0xFF => 4,
    }
}
313
/// Sink for decoded string/identifier content.
///
/// Implemented by [`SmolStrBuilder2`] (captures the text) and by `()`
/// (validates but discards, producing the `Skipped*` tokens).
pub(crate) trait StringOutput {
    /// Appends an already-validated UTF-8 fragment.
    fn so_push_str(&mut self, text: &str);
    /// Appends a single character (e.g. a decoded escape).
    fn so_push_char(&mut self, c: char);
    /// Appends a `"` followed by `hashes` `#`s — a partial raw-string
    /// close that turned out to be literal content.
    fn so_push_close(&mut self, hashes: usize);
    /// Finishes and yields the resulting token.
    fn so_finish(self) -> Token;
    /// Finishes as raw number text; `first` is spliced in as the first
    /// byte (via `swap0` — see the `SmolStrBuilder2` impl).
    fn so_finish_num(self, first: u8) -> Option<SmolStr>;
}
// Capturing sink: accumulates the actual string content.
impl StringOutput for SmolStrBuilder2 {
    fn so_push_str(&mut self, text: &str) { self.push_str(text); }
    fn so_push_char(&mut self, c: char) { self.push(c); }
    fn so_push_close(&mut self, hashes: usize) {
        self.push_str("\"");
        self.push_repeated(b'#', hashes);
    }
    fn so_finish(self) -> Token { Token::String(self.finish()) }
    fn so_finish_num(mut self, first: u8) -> Option<SmolStr> {
        // NOTE(review): `swap0` appears to replace the buffer's first
        // byte with `first` — confirm against SmolStrBuilder2's docs.
        self.swap0(first);
        Some(self.finish())
    }
}
// Discarding sink: validates input while keeping nothing, yielding the
// `Skipped*` placeholder results.
impl StringOutput for () {
    fn so_push_str(&mut self, _text: &str) {}
    fn so_push_char(&mut self, _c: char) {}
    fn so_push_close(&mut self, _hashes: usize) {}
    fn so_finish(self) -> Token { Token::SkippedString }
    fn so_finish_num(self, _first: u8) -> Option<SmolStr> { None }
}
344
/// Streaming tokenizer over an [`Input`] source.
#[derive(Debug)]
pub struct Lexer<T> {
    // Byte source.
    reader: T,
    // Absolute byte offset from the start of the input.
    cursor: usize,
    // Work deferred until the next token is requested (whitespace runs,
    // error recovery); see `NextSkip`.
    next_skip: NextSkip,
}
352
353#[expect(clippy::unnested_or_patterns, reason = "does not respect utf8_class")]
360impl<T: Input> Lexer<T> {
    /// Creates a lexer over `input`, positioned at byte offset 0.
    pub const fn new(input: T) -> Self {
        Self {
            reader: input,
            cursor: 0,
            next_skip: NextSkip::None,
        }
    }
    /// Requests at least `n.start` bytes and clamps the result to at most
    /// `n.end` bytes. An input error poisons the lexer: `next_skip` becomes
    /// `IrrecoverableError`, so later token requests report EOF.
    fn peek(&mut self, n: Range<usize>) -> LexerResult<&[u8]> {
        match self.reader.peek(n.start) {
            Ok(result) => Ok(&result[..result.len().min(n.end)]),
            Err(err) => {
                self.next_skip = NextSkip::IrrecoverableError;
                Err(err)
            }
        }
    }
    /// Peeks one byte, asks `f` how long the full sequence should be,
    /// then peeks exactly that many bytes. Returns an empty slice at end
    /// of input.
    fn peek_table(&mut self, f: impl FnOnce(u8) -> usize) -> LexerResult<&[u8]> {
        let &[first] = self.peek(1..1)? else {
            return Ok(&[]);
        };
        let size = f(first);
        self.peek(size..size)
    }
    /// Consumes `n` bytes, keeping `cursor` in sync with the reader.
    fn advance(&mut self, n: usize) {
        self.cursor += n;
        self.reader.advance(n);
    }
    /// Consumes `text`, which the caller has already verified is next in
    /// the input (re-checked in debug builds only).
    fn adv_certain(&mut self, text: &[u8]) {
        debug_assert_eq!(
            self.peek(text.len()..text.len()).unwrap(),
            text,
            "adv_certain was certainly wrong"
        );
        self.advance(text.len());
    }
401 fn adv_uncertain(&mut self, text: &[u8]) -> LexerResult<()> {
402 if self.peek(text.len()..text.len())? == text {
403 self.advance(text.len());
404 Ok(())
405 } else {
406 Err(LexerError::MissingText)
407 }
408 }
    /// Consumes `size` bytes, records `skip` to be performed lazily before
    /// the next token, and returns the matching whitespace token.
    /// Only `Spaces` and `Lines` are valid here.
    fn begin_skip(&mut self, size: usize, skip: NextSkip) -> Token {
        self.advance(size);
        self.next_skip = skip;
        match skip {
            NextSkip::Spaces => Token::Spaces,
            NextSkip::Lines => Token::Lines,
            _ => unreachable!(),
        }
    }
    /// Consumes `size` bytes and returns `token` unchanged.
    fn just(&mut self, size: usize, token: Token) -> Token {
        self.advance(size);
        token
    }
    /// Consumes a keyword split into its already-matched `head` and its
    /// yet-unverified `tail`, returning `token` on success.
    fn keyword(&mut self, head: &[u8], tail: &[u8], token: Token) -> LexerResult<Token> {
        self.adv_certain(head);
        self.adv_uncertain(tail)?;
        Ok(token)
    }
    /// Like [`Self::keyword`] for number keywords (`#inf` etc.); in skip
    /// mode the value is discarded and `SkippedNumber` returned instead.
    fn keyword_number(
        &mut self,
        head: &[u8],
        tail: &[u8],
        skip: bool,
        number: Number,
    ) -> LexerResult<Token> {
        let token = if skip {
            Token::SkippedNumber
        } else {
            Token::Number(number)
        };
        self.keyword(head, tail, token)
    }
    /// Consumes a (possibly nested) `/* ... */` block comment including
    /// its delimiters. `next_skip` tracks the current nesting depth so
    /// recovery can resume if an error interrupts the comment.
    fn block_comment(&mut self) -> LexerResult<()> {
        self.adv_certain(b"/*");
        let mut depth = 0_usize;
        loop {
            self.next_skip = NextSkip::RecoverBlockComment(depth);
            let peek = self.peek_table(|first| match first {
                // '/' or '*' may start a two-byte delimiter.
                b'/' | b'*' => 2,
                _ => utf8_len(first),
            })?;
            let size = peek.len();
            match peek {
                [] => return Err(LexerError::UnexpectedEof(self.cursor)),
                utf8_class!(invalid) => {
                    return Err(LexerError::InvalidCharacter(self.cursor));
                }
                b"/*" => {
                    self.advance(2);
                    depth = depth.checked_add(1).expect("excessive comment depth");
                }
                b"*/" => {
                    self.advance(2);
                    // Depth 0 means this `*/` closes the outermost comment.
                    match depth.checked_sub(1) {
                        Some(new) => depth = new,
                        None => break,
                    }
                }
                // Lone '/' or '*' that did not form a delimiter.
                [b'/' | b'*', ..] => self.advance(1),
                text if str::from_utf8(text).is_ok() => self.advance(size),
                _ => return Err(LexerError::InvalidUtf8(self.cursor)),
            }
        }
        self.next_skip = NextSkip::None;
        Ok(())
    }
    /// Consumes a `//` comment through its terminating newline (or EOF).
    fn line_comment(&mut self) -> LexerResult<()> {
        self.adv_certain(b"//");
        self.next_skip = NextSkip::RecoverLineComment;
        loop {
            let peek = self.peek_table(utf8_len)?;
            let size = peek.len();
            match peek {
                [] => break,
                utf8_class!(line) => {
                    // The newline itself belongs to the comment.
                    self.advance(size);
                    break;
                }
                utf8_class!(invalid) => {
                    return Err(LexerError::InvalidCharacter(self.cursor));
                }
                text if str::from_utf8(text).is_ok() => self.advance(size),
                _ => return Err(LexerError::InvalidUtf8(self.cursor)),
            }
        }
        self.next_skip = NextSkip::None;
        Ok(())
    }
    /// Consumes a `\` line continuation: optional spaces and block
    /// comments, ended by a newline, a line comment, or EOF.
    fn escline(&mut self) -> LexerResult<()> {
        self.adv_certain(b"\\");
        loop {
            let peek = self.peek_table(|first| match first {
                b'/' => 2,
                _ => utf8_len(first),
            })?;
            let size = peek.len();
            match peek {
                [] => break,
                utf8_class!(space) => self.advance(size),
                utf8_class!(line) => break self.advance(size),
                b"/*" => self.block_comment()?,
                // A line comment (which eats its newline) also ends the escline.
                b"//" => break self.line_comment()?,
                _ => return Err(LexerError::BadEscline(self.cursor)),
            }
        }
        Ok(())
    }
    /// Decodes one backslash escape inside a quoted string.
    ///
    /// Returns `Ok(None)` for whitespace escapes (`\` followed by a run of
    /// spaces/newlines, which produce no output) and `Ok(Some(ch))` for
    /// character escapes, including `\u{…}`.
    fn string_escape(&mut self) -> LexerResult<Option<char>> {
        let start_of_escape = self.cursor;
        self.adv_certain(b"\\");
        let peek = self.peek_table(utf8_len)?;
        let size = peek.len();
        let ch = match peek {
            // Whitespace escape: swallow the whole run of spaces/newlines.
            utf8_class!(space) | utf8_class!(line) => {
                let mut size = size;
                loop {
                    self.advance(size);
                    let peek_space = self.peek_table(utf8_len)?;
                    size = peek_space.len();
                    if !matches!(peek_space, utf8_class!(space) | utf8_class!(line)) {
                        return Ok(None);
                    }
                }
            }
            b"\"" => '\"',
            b"\\" => '\\',
            b"b" => '\x08',
            b"f" => '\x0C',
            b"n" => '\n',
            b"r" => '\r',
            b"t" => '\t',
            b"s" => ' ',
            b"u" => {
                // Branchless hex digit value: doubling a letter byte
                // (>= 0x40) overflows into the sign bit, selecting the +9
                // letter adjustment; digits keep just their low nibble.
                // Only valid for bytes matched by `hex!` below.
                #[expect(
                    clippy::cast_possible_wrap,
                    clippy::cast_sign_loss,
                    reason = "cursed anyways"
                )]
                fn hex(v: u8) -> u32 { ((((v + v) as i8 >> 7) & 9) as u8 + (v & 15)).into() }
                macro_rules! hex { ($id:ident) => {$id @ (b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f')} }
                self.advance(1);
                self.adv_uncertain(b"{")?;
                // Parse 1..=6 hex digits plus '}' using fixed-size peeks:
                // try 3 digits, then extend by up to 2, then by 1.
                let value = match *self.peek(3..3)? {
                    [hex!(c2), hex!(c1), hex!(c0)] => {
                        self.advance(3);
                        let base = hex(c2) << 8 | hex(c1) << 4 | hex(c0);
                        match *self.peek(2..2)? {
                            [hex!(c1), hex!(c0)] => {
                                let base = base << 8 | hex(c1) << 4 | hex(c0);
                                self.advance(2);
                                match *self.peek(2..2)? {
                                    [hex!(c0), b'}'] => {
                                        self.advance(2);
                                        base << 4 | hex(c0)
                                    }
                                    [b'}', ..] => {
                                        self.advance(1);
                                        base
                                    }
                                    _ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
                                }
                            }
                            [hex!(c0), b'}'] => {
                                self.advance(2);
                                base << 4 | hex(c0)
                            }
                            [b'}', ..] => {
                                self.advance(1);
                                base
                            }
                            _ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
                        }
                    }
                    [hex!(c1), hex!(c0), b'}'] => {
                        self.advance(3);
                        hex(c1) << 4 | hex(c0)
                    }
                    [hex!(c0), b'}', ..] => {
                        self.advance(2);
                        hex(c0)
                    }
                    _ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
                };
                return Ok(Some(
                    // Rejects surrogates and values beyond U+10FFFF.
                    char::from_u32(value).ok_or(LexerError::BadUnicodeEscape(start_of_escape))?,
                ));
            }
            _ => return Err(LexerError::InvalidEscape(start_of_escape)),
        };
        self.advance(size);
        Ok(Some(ch))
    }
    /// Consumes inline whitespace, esclines, and block comments. Stops
    /// (without consuming) at anything else, including newlines.
    fn spaces(&mut self) -> LexerResult<()> {
        loop {
            let peek = self.peek_table(|first| match first {
                b'/' => 2,
                _ => utf8_len(first),
            })?;
            let size = peek.len();
            match peek {
                [] => break,
                utf8_class!(space) => self.advance(size),
                b"\\" => self.escline()?,
                b"/*" => self.block_comment()?,
                _ => break,
            }
        }
        Ok(())
    }
    /// Consumes newlines plus whitespace, esclines, and comments —
    /// including line comments, unlike [`Self::spaces`].
    fn lines(&mut self) -> LexerResult<()> {
        loop {
            let peek = self.peek_table(|first| match first {
                b'/' => 2,
                _ => utf8_len(first),
            })?;
            let size = peek.len();
            match peek {
                [] => break,
                utf8_class!(space) | utf8_class!(line) => self.advance(size),
                b"\\" => self.escline()?,
                b"/*" => self.block_comment()?,
                b"//" => self.line_comment()?,
                _ => break,
            }
        }
        Ok(())
    }
    /// Consumes a bare identifier (`number == false`) or a number literal
    /// (`number == true`), feeding captured text into `text`.
    fn ident_inner(&mut self, number: bool, mut text: impl StringOutput) -> LexerResult<Token> {
        if number {
            let mut builder = NumberBuilder::new(text);
            'text: loop {
                let peek = self.peek(1..usize::MAX)?;
                if peek.is_empty() {
                    break 'text;
                }
                for (i, &byte) in peek.iter().enumerate() {
                    // Number bodies are limited to this ASCII set.
                    // NOTE(review): b'+'..=b'9' also covers ',' and '/';
                    // presumably `NumberBuilder::step` rejects those — confirm.
                    if !matches!(byte, b'+'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z') {
                        self.advance(i);
                        break 'text;
                    }
                    if !builder.step(byte) {
                        self.advance(i);
                        return Err(LexerError::InvalidNumber);
                    }
                }
                let size = peek.len();
                self.advance(size);
            }
            // A number must end at a non-identifier character or EOF.
            if matches!(self.peek_table(utf8_len)?, [] | utf8_class!(not_ident)) {
                match builder.finish() {
                    Some(Some(value)) => Ok(Token::Number(value)),
                    Some(None) => Ok(Token::SkippedNumber),
                    None => Err(LexerError::InvalidNumber),
                }
            } else {
                Err(LexerError::InvalidNumber)
            }
        } else {
            let debug_start = self.cursor;
            loop {
                let cursor = self.cursor;
                let peek = self.peek_table(utf8_len)?;
                let size = peek.len();
                match peek {
                    [] | utf8_class!(not_ident) => {
                        // Callers guarantee at least one identifier character.
                        debug_assert_ne!(debug_start, cursor, "empty ident!");
                        break;
                    }
                    ch => {
                        if let Ok(ch) = str::from_utf8(ch) {
                            text.so_push_str(ch);
                            self.advance(size);
                        } else {
                            self.advance(1);
                            return Err(LexerError::InvalidUtf8(cursor));
                        }
                    }
                }
            }
            Ok(text.so_finish())
        }
    }
    /// Lexes a bare identifier, dispatching to number lexing or reserved
    /// keyword validation based on a short lookahead.
    fn ident(&mut self, skip: bool) -> LexerResult<Token> {
        #[derive(Clone, Copy, PartialEq)]
        enum Preview {
            Regular,
            Number,
            Keyword,
        }
        let preview = match self.peek(2..2)? {
            [b'0'..=b'9', ..] | [b'+' | b'-', b'0'..=b'9'] => Preview::Number,
            // `.5`-style literals are rejected outright.
            [b'.', b'0'..=b'9'] => {
                self.advance(1);
                return Err(LexerError::InvalidNumber);
            }
            // ...as are their signed forms `+.5` / `-.5`.
            [b'+' | b'-', b'.'] => match self.peek(3..3)? {
                [b'+' | b'-', b'.', b'0'..=b'9'] => {
                    self.advance(2);
                    return Err(LexerError::InvalidNumber);
                }
                _ => Preview::Regular,
            },
            // Could be a reserved word: true/false/null/inf/-inf/nan.
            b"tr" | b"fa" | b"nu" | b"in" | b"-i" | b"na" => Preview::Keyword,
            _ => Preview::Regular,
        };
        match (skip, preview) {
            (skip, Preview::Keyword) => {
                // Even in skip mode the text must be captured so it can be
                // compared against the reserved keywords.
                let Token::String(text) = self.ident_inner(false, SmolStrBuilder2::new())? else {
                    unreachable!()
                };
                if matches!(&*text, "true" | "false" | "null" | "inf" | "-inf" | "nan") {
                    Err(LexerError::UnexpectedKeyword)
                } else if skip {
                    Ok(Token::SkippedString)
                } else {
                    Ok(Token::String(text))
                }
            }
            (true, _) => self.ident_inner(preview == Preview::Number, ()),
            (false, _) => self.ident_inner(preview == Preview::Number, SmolStrBuilder2::new()),
        }
    }
    /// Lexes the body of a single-line string, starting just after the
    /// opening quote. `hashes` is the raw-string `#` count; raw strings
    /// (`Some`) do not process escapes.
    fn singleline_string(
        &mut self,
        hashes: Option<NonZeroUsize>,
        mut text: impl StringOutput,
    ) -> LexerResult<Token> {
        'text: loop {
            let cursor = self.cursor;
            let peek = self.peek_table(utf8_len)?;
            let size = peek.len();
            match peek {
                [] => return Err(LexerError::UnexpectedEof(cursor)),
                utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
                utf8_class!(line) => return Err(LexerError::UnexpectedStringNewline(cursor)),
                b"\"" => {
                    self.advance(1);
                    // A raw string only closes when the quote is followed
                    // by the full run of `#`s; otherwise the quote and any
                    // partial `#` run are literal content.
                    let hashes = hashes.map_or(0, NonZeroUsize::get);
                    let mut hashes_left = hashes;
                    while hashes_left > 0 {
                        let tail = self.peek(1..hashes_left)?;
                        if tail.is_empty() {
                            self.next_skip = NextSkip::None;
                            // NOTE(review): reports the quote's offset; the
                            // multi-line variants report `self.cursor` here
                            // — confirm which is intended.
                            return Err(LexerError::UnexpectedEof(cursor));
                        }
                        if !tail.iter().all(|&v| v == b'#') {
                            text.so_push_close(hashes - hashes_left);
                            continue 'text;
                        }
                        let len = tail.len();
                        hashes_left -= len;
                        self.advance(len);
                    }
                    self.next_skip = NextSkip::None;
                    break Ok(text.so_finish());
                }
                b"\\" if hashes.is_none() => {
                    if let Some(ch) = self.string_escape()? {
                        text.so_push_char(ch);
                    }
                }
                ch => {
                    text.so_push_str(
                        str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?,
                    );
                    self.advance(size);
                }
            }
        }
    }
    /// Peeks (without consuming) one newline sequence, treating CRLF as a
    /// single two-byte newline. Returns its byte length, or `None` if the
    /// next character is not a newline.
    fn newline_crlf(&mut self) -> LexerResult<Option<NonZeroUsize>> {
        let peek = self.peek_table(|first| match first {
            b'\r' => 2,
            first => utf8_len(first),
        })?;
        Ok(NonZeroUsize::new(match peek {
            b"\r\n" => 2,
            [b'\r', ..] => 1,
            utf8_class!(line) => peek.len(),
            _ => 0,
        }))
    }
    /// Lexes a multi-line string body (after the opening `"""` and its
    /// newline), capturing the content.
    ///
    /// The raw text of each line is collected first; once the closing line
    /// is found, its indentation becomes the prefix every content line must
    /// share, and that prefix is stripped from the output.
    #[expect(clippy::too_many_lines, reason = "off-by-one :)")]
    fn multiline_string_regular(&mut self, hashes: Option<NonZeroUsize>) -> LexerResult<Token> {
        let mut full_text = String::new();
        // One entry per line: `None` for whitespace-only lines, otherwise
        // (cursor at line start, line start, post-indent text start, line
        // end) — the last three being offsets into `full_text`.
        let mut lines = Vec::<Option<(usize, usize, usize, usize)>>::new();
        let tail = 'line: loop {
            let line_cursor = self.cursor;
            let line_start = full_text.len();
            // Record leading whitespace too; it is needed both for the
            // indent check and as the closing line's prefix candidate.
            let mut peek_indent = self.peek_table(utf8_len)?;
            while matches!(peek_indent, utf8_class!(space)) {
                full_text.push_str(str::from_utf8(peek_indent).unwrap_or_else(|_| unreachable!()));
                let size = peek_indent.len();
                self.advance(size);
                peek_indent = self.peek_table(utf8_len)?;
            }
            let text_start = full_text.len();
            let newline = self.newline_crlf()?;
            if let Some(size) = newline {
                // Whitespace-only line: contributes a newline but is exempt
                // from the indent check.
                lines.push(None);
                self.advance(size.get());
                continue;
            }
            'text: loop {
                let cursor = self.cursor;
                let peek = self.peek_table(|first| match first {
                    b'"' => 3,
                    b'\r' => 2,
                    first => utf8_len(first),
                })?;
                let size = peek.len();
                match peek {
                    [] => return Err(LexerError::UnexpectedEof(cursor)),
                    utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
                    [b'\r', ..] | utf8_class!(line) => {
                        lines.push(Some((line_cursor, line_start, text_start, full_text.len())));
                        // CRLF counts as one newline.
                        let size = match peek {
                            b"\r\n" => 2,
                            [b'\r', ..] => 1,
                            _ => size,
                        };
                        self.advance(size);
                        break;
                    }
                    b"\"\"\"" => {
                        self.advance(3);
                        // A raw string only closes with the full `#` run;
                        // a partial run is literal content.
                        let hashes = hashes.map_or(0, NonZeroUsize::get);
                        let mut hashes_left = hashes;
                        while hashes_left > 0 {
                            let tail = self.peek(1..hashes_left)?;
                            if tail.is_empty() {
                                self.next_skip = NextSkip::None;
                                return Err(LexerError::UnexpectedEof(self.cursor));
                            }
                            if !tail.iter().all(|&v| v == b'#') {
                                full_text.push_str("\"\"\"");
                                full_text.extend(repeat_n('#', hashes - hashes_left));
                                continue 'text;
                            }
                            let len = tail.len();
                            hashes_left -= len;
                            self.advance(len);
                        }
                        self.next_skip = NextSkip::None;
                        // The closing line may hold only whitespace.
                        if full_text.len() > text_start {
                            return Err(LexerError::BadEndString(cursor));
                        }
                        break 'line line_start..text_start;
                    }
                    [b'"', ..] => {
                        full_text.push('"');
                        self.advance(1);
                    }
                    b"\\" if hashes.is_none() => {
                        if let Some(ch) = self.string_escape()? {
                            full_text.push(ch);
                        }
                    }
                    ch => {
                        full_text.push_str(
                            str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?,
                        );
                        self.advance(size);
                    }
                }
            }
        };
        // `tail` spans the closing line's indentation inside `full_text`.
        let tail_len = tail.end - tail.start;
        let mut text = SmolStrBuilder2::new();
        let mut pre_newline = false;
        for line in lines {
            if pre_newline {
                text.push('\n');
            }
            pre_newline = true;
            if let Some((line_cursor, line_start, text_start, line_end)) = line {
                // Every content line must begin with exactly the closing
                // line's indentation.
                if text_start - line_start < tail_len
                    || full_text[tail.clone()] != full_text[line_start..line_start + tail_len]
                {
                    return Err(LexerError::BadIndent(Some(line_cursor)));
                }
                text.push_str(&full_text[line_start + tail_len..line_end]);
            }
        }
        Ok(Token::String(text.finish()))
    }
    /// Skip-mode twin of [`Self::multiline_string_regular`]: validates a
    /// multi-line string (including the indent rule) without buffering its
    /// contents.
    ///
    /// Instead of storing all lines, it tracks the longest indent prefix
    /// shared by every content line seen so far and finally checks the
    /// closing line against it.
    fn multiline_string_skip(&mut self, hashes: Option<NonZeroUsize>) -> LexerResult<Token> {
        let mut indent = SmolStrBuilder2::new();
        // Capture the first line's indentation as the initial candidate.
        let mut peek_indent = self.peek_table(utf8_len)?;
        while matches!(peek_indent, utf8_class!(space)) {
            indent.push_str(str::from_utf8(peek_indent).unwrap_or_else(|_| unreachable!()));
            let size = peek_indent.len();
            self.advance(size);
            peek_indent = self.peek_table(utf8_len)?;
        }
        let mut next_truncate_length = indent.len();
        'line: loop {
            let has_leading_space = matches!(self.peek_table(utf8_len)?, utf8_class!(space));
            let mut has_body = false;
            'text: loop {
                let cursor = self.cursor;
                let peek = self.peek_table(|first| match first {
                    b'"' => 3,
                    b'\r' => 2,
                    first => utf8_len(first),
                })?;
                let size = peek.len();
                match peek {
                    [] => return Err(LexerError::UnexpectedEof(cursor)),
                    utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
                    utf8_class!(space) => {
                        self.advance(size);
                    }
                    [b'\r', ..] | utf8_class!(line) => {
                        // CRLF counts as one newline.
                        let size = match peek {
                            b"\r\n" => 2,
                            [b'\r', ..] => 1,
                            _ => size,
                        };
                        self.advance(size);
                        break;
                    }
                    b"\"\"\"" => {
                        self.advance(3);
                        let hashes = hashes.map_or(0, NonZeroUsize::get);
                        let mut hashes_left = hashes;
                        while hashes_left > 0 {
                            let tail = self.peek(1..hashes_left)?;
                            if tail.is_empty() {
                                self.next_skip = NextSkip::None;
                                return Err(LexerError::UnexpectedEof(self.cursor));
                            }
                            if !tail.iter().all(|&v| v == b'#') {
                                // Not a real close: the quotes were content.
                                has_body = true;
                                continue 'text;
                            }
                            let len = tail.len();
                            hashes_left -= len;
                            self.advance(len);
                        }
                        self.next_skip = NextSkip::None;
                        if has_body {
                            return Err(LexerError::BadEndString(cursor));
                        } else if has_leading_space {
                            // The closing line's whitespace did not exactly
                            // match the shared indent.
                            return Err(LexerError::BadIndent(None));
                        }
                        break 'line;
                    }
                    [b'"', ..] => {
                        has_body = true;
                        self.advance(1);
                    }
                    b"\\" if hashes.is_none() => has_body |= self.string_escape()?.is_some(),
                    ch => {
                        _ = str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?;
                        has_body = true;
                        self.advance(size);
                    }
                }
            }
            if has_body {
                // A content line locks in however much of the indent it
                // actually matched.
                indent.truncate_floor(next_truncate_length);
            }
            next_truncate_length = indent.len();
            // Match the next line's leading whitespace against the current
            // candidate, shrinking the candidate on mismatch.
            let mut matched_bytes = 0;
            while matched_bytes < indent.len() {
                fn common_prefix(a: &[u8], b: &[u8]) -> usize {
                    a.iter().zip(b).take_while(|(a, b)| a == b).count()
                }
                let peek = self.peek(1..indent.len() - matched_bytes)?;
                let next = &indent.as_bytes()[matched_bytes..];
                let common = common_prefix(peek, next);
                matched_bytes += common;
                let size = peek.len();
                self.advance(common);
                if common < size {
                    next_truncate_length = matched_bytes;
                    break;
                }
            }
        }
        Ok(Token::SkippedString)
    }
    /// Lexes a string starting at `"` or at the leading `#`s of a raw
    /// string, dispatching to the single-line or multi-line (`"""`) paths.
    fn string(&mut self, skip: bool) -> LexerResult<Token> {
        // Count the raw string's leading `#`s (possibly zero).
        let mut hashes = 0_usize;
        'count: loop {
            let mut advance = 0;
            for &byte in self.peek(1..usize::MAX)? {
                if byte != b'#' {
                    self.advance(advance);
                    hashes = hashes.checked_add(advance).unwrap();
                    break 'count;
                }
                advance += 1;
            }
            // An empty peek here means the input ended before any quote.
            if advance == 0 {
                return Err(LexerError::UnexpectedEof(self.cursor));
            }
            self.advance(advance);
            hashes = hashes.checked_add(advance).unwrap();
        }
        let hashes = NonZeroUsize::new(hashes);
        match self.peek(3..3)? {
            b"\"\"\"" => {
                self.next_skip = NextSkip::RecoverString {
                    multiline: true,
                    hashes,
                };
                self.advance(3);
                // The opening `"""` must be followed immediately by a newline.
                let Some(size) = self.newline_crlf()? else {
                    return Err(LexerError::MissingStringNewline);
                };
                let size = size.get();
                self.advance(size);
                if skip {
                    self.multiline_string_skip(hashes)
                } else {
                    self.multiline_string_regular(hashes)
                }
            }
            [b'"', ..] => {
                self.next_skip = NextSkip::RecoverString {
                    multiline: false,
                    hashes,
                };
                self.advance(1);
                if skip {
                    self.singleline_string(hashes, ())
                } else {
                    self.singleline_string(hashes, SmolStrBuilder2::new())
                }
            }
            // `#`s not followed by any quote.
            _ => Err(LexerError::BadRawString),
        }
    }
    /// Consumes `n` bytes and passes `err` through, enabling one-line
    /// error returns. Note callers build `err` before the advance happens,
    /// so offsets in `err` refer to the pre-advance position.
    fn advance_err(&mut self, n: usize, err: LexerError) -> LexerError {
        self.advance(n);
        err
    }
    /// Error-recovery scan: peeks windows sized by `table` and advances
    /// one byte at a time until `done` accepts a window (which is then
    /// consumed whole) or the input ends.
    fn recover_until(
        &mut self,
        table: impl Fn(u8) -> usize,
        mut done: impl FnMut(&[u8]) -> bool,
    ) -> LexerResult<()> {
        loop {
            match self.peek_table(&table)? {
                [] => break,
                ch => {
                    if done(ch) {
                        // Consume the terminator itself.
                        let size = ch.len();
                        self.advance(size);
                        break;
                    }
                    self.advance(1);
                }
            }
        }
        Ok(())
    }
    /// Core tokenizer: performs any pending skip/recovery work, records
    /// the token's start offset into `out_cursor`, then dispatches on the
    /// first character (or short byte sequence).
    fn next_token_value(&mut self, skip: bool, out_cursor: &mut usize) -> LexerResult<Token> {
        match self.next_skip {
            NextSkip::None => {}
            NextSkip::Spaces => self.spaces()?,
            NextSkip::Lines => self.lines()?,
            // Recovery modes: scan forward past the construct that failed
            // on the previous call.
            NextSkip::RecoverLineComment => {
                self.recover_until(utf8_len, |ch| matches!(ch, utf8_class!(line)))?;
            }
            NextSkip::RecoverBlockComment(mut depth) => {
                self.recover_until(
                    |_| 2,
                    |ch| {
                        if ch == b"*/" {
                            if depth == 0 {
                                true
                            } else {
                                depth -= 1;
                                false
                            }
                        } else if ch == b"/*" {
                            depth += 1;
                            false
                        } else {
                            false
                        }
                    },
                )?;
            }
            NextSkip::RecoverString { multiline, hashes } => {
                // Search for the closing quote run followed by the hashes.
                let quotes = if multiline { 3_usize } else { 1_usize };
                let length = quotes + hashes.map_or(0, NonZeroUsize::get);
                let mut distance = 0;
                self.recover_until(
                    |_| 1,
                    |ch| {
                        if (ch == b"\"" && distance < quotes) || (ch == b"#" && distance < length) {
                            distance += 1;
                        } else {
                            distance = 0;
                        }
                        distance == length
                    },
                )?;
            }
            // A failed Input never recovers; keep reporting EOF.
            NextSkip::IrrecoverableError => return Ok(Token::Eof),
        }
        self.next_skip = NextSkip::None;
        let start = self.cursor;
        *out_cursor = start;
        let peek = self.peek_table(|first: u8| match first {
            b'/' => 2,
            b'#' => 3,
            _ => utf8_len(first),
        })?;
        let size = peek.len();
        Ok(match peek {
            [] => Token::Eof,
            // The BOM is only a token at the very start of the input.
            utf8_class!(bom) if start == 0 => self.just(3, Token::Bom),
            utf8_class!(invalid) => {
                return Err(self.advance_err(size, LexerError::InvalidCharacter(self.cursor)));
            }
            utf8_class!(line) => self.begin_skip(size, NextSkip::Lines),
            utf8_class!(space) => self.begin_skip(size, NextSkip::Spaces),
            // Esclines and comments count as whitespace; consume nothing
            // here and let the skip routines eat them from the start.
            b"\\" => self.begin_skip(0, NextSkip::Spaces),
            b";" => self.just(1, Token::SemiColon),
            b"=" => self.just(1, Token::Equals),
            b"(" => self.just(1, Token::OpenParen),
            b")" => self.just(1, Token::CloseParen),
            b"{" => self.just(1, Token::OpenCurly),
            b"}" => self.just(1, Token::CloseCurly),
            b"[" | b"]" => return Err(self.advance_err(1, LexerError::InvalidOperator)),
            b"#tr" => self.keyword(b"#tr", b"ue", Token::Bool(true))?,
            b"#fa" => self.keyword(b"#fa", b"lse", Token::Bool(false))?,
            b"#nu" => self.keyword(b"#nu", b"ll", Token::Null)?,
            b"#in" => self.keyword_number(b"#in", b"f", skip, Number::INFINITY)?,
            b"#-i" => self.keyword_number(b"#-i", b"nf", skip, Number::NEG_INFINITY)?,
            b"#na" => self.keyword_number(b"#na", b"n", skip, Number::NAN)?,
            // Any other `#...` is a raw string (or an error found inside).
            [b'#', ..] | b"\"" => self.string(skip)?,
            b"/-" => self.just(2, Token::SlashDash),
            b"/*" => self.begin_skip(0, NextSkip::Spaces),
            b"//" => self.begin_skip(0, NextSkip::Lines),
            [b'/', ..] => return Err(self.advance_err(1, LexerError::InvalidOperator)),
            _ => self.ident(skip)?,
        })
    }
    /// Returns the next token along with its starting byte offset.
    /// With `skip == true`, string/number contents are validated but not
    /// captured (yielding `Skipped*` tokens).
    pub fn next_token(&mut self, skip: bool) -> (LexerResult<Token>, usize) {
        let mut pos = self.cursor;
        let token = self.next_token_value(skip, &mut pos);
        (token, pos)
    }
    /// Current absolute byte offset into the input.
    pub fn current_position(&mut self) -> usize { self.cursor }
1221}