// just_kdl/lexer.rs

// SPDX-License-Identifier: MIT OR Apache-2.0
//! Read raw tokens out of a file.
//!
//! Possibly useful if you want to implement syntax highlighting.
//!
//! You probably want to start at [`Lexer`].
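//!
//! A minimal sketch (untested, `just_kdl::lexer` paths assumed) of reading
//! tokens out of a byte slice:
//!
//! ```ignore
//! use just_kdl::lexer::{Lexer, Token};
//!
//! let mut lexer = Lexer::new(&b"node 1 #true"[..]);
//! loop {
//!     let (token, position) = lexer.next_token(false);
//!     match token {
//!         Ok(Token::Eof) => break,
//!         Ok(token) => println!("{position}: {token}"),
//!         Err(err) => panic!("lex error at {position}: {err}"),
//!     }
//! }
//! ```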

use alloc::string::String;
use alloc::vec::Vec;
use core::fmt;
use core::iter::repeat_n;
use core::mem::discriminant;
use core::num::NonZeroUsize;
use core::ops::Range;

use smol_str::SmolStr;
use thiserror::Error;

use crate::dom::Number;
use crate::dom::number::NumberBuilder;
use crate::ssb2::SmolStrBuilder2;

#[cfg(test)]
mod tests;

/// A successful token of text.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
	/// `\u{FEFF}` at position 0 (can only be the first token).
	Bom,
	/// End of file.
	Eof,
	/// Some vertical gap.
	Lines,
	/// Some horizontal gap.
	Spaces,
	/// Any textual value.
	String(SmolStr),
	/// A numeric value, including `#inf`, `#-inf`, and `#nan`.
	Number(Number),
	/// `String` without any value.
	SkippedString,
	/// `Number` without any value.
	SkippedNumber,
	/// `/-`
	SlashDash,
	/// `;`
	SemiColon,
	/// `=`
	Equals,
	/// `(`
	OpenParen,
	/// `)`
	CloseParen,
	/// `{`
	OpenCurly,
	/// `}`
	CloseCurly,
	/// `#true` or `#false`
	Bool(bool),
	/// `#null`
	Null,
}

impl fmt::Display for Token {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
		match self {
			Token::Bom => f.write_str("byte order mark"),
			Token::Eof => f.write_str("end of file"),
			Token::Lines => f.write_str("'\\n'"),
			Token::Spaces => f.write_str("' '"),
			Token::String(value) => fmt::Debug::fmt(value, f),
			Token::Number(value) => fmt::Display::fmt(value, f),
			Token::SkippedString => f.write_str("a string"),
			Token::SkippedNumber => f.write_str("a number"),
			Token::SlashDash => f.write_str("'/-'"),
			Token::SemiColon => f.write_str("';'"),
			Token::Equals => f.write_str("'='"),
			Token::OpenParen => f.write_str("'('"),
			Token::CloseParen => f.write_str("')'"),
			Token::OpenCurly => f.write_str("'{'"),
			Token::CloseCurly => f.write_str("'}'"),
			&Token::Bool(value) => f.write_str(if value { "#true" } else { "#false" }),
			Token::Null => f.write_str("#null"),
		}
	}
}

// error terminology
// invalid = not one of many possible choices (branches)
// unexpected = valid in other places but not here
// missing = opposite of unexpected (expected, but not present)
// bad = not one of possible options (for one thing)

/// An error while lexing.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum LexerError {
	#[cfg(feature = "std")]
	#[error(transparent)]
	#[expect(clippy::absolute_paths, reason = "feature-gated")]
	/// IO error; in `no_std` environments this is a `()`
	Io(std::io::Error),
	#[cfg(not(feature = "std"))]
	#[error("IO error")]
	/// IO error; in `std` environments this is a `std::io::Error`
	Io(()),
	#[error("Invalid UTF-8 text at {0}")]
	#[doc = "Invalid UTF-8 text at {0}"]
	InvalidUtf8(usize),
	#[error("Invalid document character at {0}")]
	#[doc = "Invalid document character at {0}"]
	InvalidCharacter(usize),
	#[error("Unexpected end-of-file at {0}")]
	#[doc = "Unexpected end-of-file at {0}"]
	UnexpectedEof(usize),
	#[error("Bad escline body at {0}")]
	#[doc = "Bad escline body at {0}"]
	BadEscline(usize),
	#[error("Unexpected plain keyword")]
	#[doc = "Unexpected plain keyword"]
	UnexpectedKeyword,
	#[error("Invalid string escape at {0}")]
	#[doc = "Invalid string escape at {0}"]
	InvalidEscape(usize),
	#[error("Invalid number value")]
	#[doc = "Invalid number value"]
	InvalidNumber,
	#[error("Bad unicode string escape at {0}")]
	#[doc = "Bad unicode string escape at {0}"]
	BadUnicodeEscape(usize),
	#[error("Unexpected newline in single-line string at {0}")]
	#[doc = "Unexpected newline in single-line string at {0}"]
	UnexpectedStringNewline(usize),
	#[error("Bad raw string start")]
	#[doc = "Bad raw string start"]
	BadRawString,
	#[error("Missing newline after multi-line string start")]
	#[doc = "Missing newline after multi-line string start"]
	MissingStringNewline,
	#[error("Text before multi-line string end at {0}")]
	#[doc = "Text before multi-line string end at {0}"]
	BadEndString(usize),
	#[error("Bad multi-line string indent at {0:?}")]
	#[doc = "Bad multi-line string indent at {0}"]
	BadIndent(Option<usize>),
	#[error("Invalid operator")]
	#[doc = "Invalid operator"]
	InvalidOperator,
	#[error("Missing expected text")]
	#[doc = "Missing expected text"]
	MissingText,
}

/// Don't trust this impl :) (it only compares enum discriminants, not payloads)
impl PartialEq for LexerError {
	fn eq(&self, other: &Self) -> bool { discriminant(self) == discriminant(other) }
}

type LexerResult<T> = Result<T, LexerError>;

// stored state between lexer calls
#[derive(Debug, Clone, Copy)]
enum NextSkip {
	None,
	Spaces,
	Lines,
	RecoverLineComment,
	RecoverBlockComment(usize),
	RecoverString {
		multiline: bool,
		hashes: Option<NonZeroUsize>,
	},
	/// An IO error was encountered, which could infinitely repeat without
	/// advancing the cursor
	IrrecoverableError,
}

/// Abstract lexer input trait, essentially [`BufRead`] with better ergonomics.
///
/// Notably implemented for <code>&\[[u8]\]</code> and [`ReadInput`].
///
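/// A minimal sketch (untested) of the peek/advance contract, using the
/// <code>&\[[u8]\]</code> impl:
///
/// ```ignore
/// use just_kdl::lexer::Input;
///
/// let mut input: &[u8] = b"node";
/// assert_eq!(input.peek(1)?, b"node"); // may return more than requested
/// input.advance(1);
/// assert_eq!(input.peek(1)?, b"ode");
/// ```
///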
/// [`BufRead`]: std::io::BufRead
pub trait Input {
	/// Peek at least `n` bytes; any less means end-of-file.
	/// Must work for `n` in `1..=char::MAX_LEN_UTF8`.
	///
	/// # Errors
	/// On IO error.
	fn peek(&mut self, n: usize) -> LexerResult<&[u8]>;
	/// Advance `n` bytes; always called after a peek of at least `n`.
	fn advance(&mut self, n: usize);
}

impl Input for &[u8] {
	fn peek(&mut self, _n: usize) -> LexerResult<&[u8]> { Ok(self) }
	fn advance(&mut self, n: usize) { *self = &self[n..]; }
}

#[cfg(feature = "std")]
const MAX_PEEK: usize = char::MAX_LEN_UTF8;

/// Input from a [`std::io::Read`].
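///
/// A minimal sketch (untested, the file path is hypothetical) of wrapping a
/// reader:
///
/// ```ignore
/// use just_kdl::lexer::{Lexer, ReadInput};
///
/// let file = std::fs::File::open("document.kdl")?;
/// let mut lexer = Lexer::new(ReadInput::new(file));
/// ```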
#[cfg(feature = "std")]
#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
#[derive(Debug)]
pub struct ReadInput<T> {
	reader: T,
	// TODO/perf: able to use BufRead directly somehow?
	buffer: [u8; MAX_PEEK],
	buffer_len: u8,
}
#[cfg(feature = "std")]
impl<T> ReadInput<T> {
	/// Create a new instance.
	pub fn new(reader: T) -> Self {
		Self {
			reader,
			buffer: [0; MAX_PEEK],
			buffer_len: 0,
		}
	}
}

#[cfg(feature = "std")]
#[expect(clippy::absolute_paths, reason = "feature-gated")]
#[expect(clippy::panic_in_result_fn, reason = "precondition validation")]
#[expect(
	clippy::cast_possible_truncation,
	reason = "start <= request <= MAX_PEEK"
)]
impl<T: std::io::Read> Input for ReadInput<T> {
	fn peek(&mut self, request: usize) -> LexerResult<&[u8]> {
		assert!(request <= MAX_PEEK, "target length too long");
		// manual impl of Read::read_exact, to correctly handle EOF
		let mut start = usize::from(self.buffer_len);
		while start < request {
			// allow reading past requested length, that data will be kept after advance
			match self.reader.read(&mut self.buffer[start..]) {
				Ok(0) => break,
				Ok(n) => start += n,
				Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
				Err(e) => return Err(LexerError::Io(e)),
			}
		}
		self.buffer_len = start as u8;
		Ok(&self.buffer[..start])
	}
	fn advance(&mut self, request: usize) {
		assert!(
			request <= usize::from(self.buffer_len),
			"target length larger than buffer"
		);
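		// drop the first `request` bytes: treat the 4-byte buffer as a
		// little-endian u32 and shift it right, so e.g. [a, b, c, d] with
		// request == 1 becomes [b, c, d, 0]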
		self.buffer =
			(u32::from_le_bytes(self.buffer).unbounded_shr(8 * request as u32)).to_le_bytes();
		self.buffer_len -= request as u8;
	}
}

/// return a matcher for a `&[u8; utf8]`
// use `printf '\u____' | xxd` to calculate these :)
macro_rules! utf8_class {
	// FEFF
	(bom) => {[0xEF, 0xBB, 0xBF]};
	// TODO/perf: consider including invalid utf8 encodings / surrogates
	(invalid) => {
		// 0..=8, 0E..=1F, 7F
		[0x00..=0x08 | 0x0E..=0x1F | 0x7F]
		// 200E, 200F, 202A..=202E
		| [0xE2, 0x80, 0x8E | 0x8F | 0xAA..=0xAE]
		// 2066..=2069
		| [0xE2, 0x81, 0xA6..=0xA9]
		// D800..=DFFF
		| [0xED, 0xA0..=0xBF, _]
		| utf8_class!(bom)
	};
	(line) => {
		// A..=D
		[0x0A..=0x0D]
		// 85
		| [0xC2, 0x85]
		// 2028, 2029
		| [0xE2, 0x80, 0xA8 | 0xA9]
	};
	(space) => {
		// 9, 20
		[0x09 | 0x20]
		// A0
		| [0xC2, 0xA0]
		// 1680
		| [0xE1, 0x9A, 0x80]
		// 2000..=200A, 202F
		| [0xE2, 0x80, 0x80..=0x8A | 0xAF]
		// 205F
		| [0xE2, 0x81, 0x9F]
		// 3000
		| [0xE3, 0x80, 0x80]
	};
	// invalid, line, space, \/(){}[];"#=
	(not_ident) => {
		utf8_class!(invalid)
		| utf8_class!(line)
		| utf8_class!(space)
		| b"/" | b"\\" | b"(" | b")" | b"{" | b"}" | b"[" | b"]" | b";" | b"\"" | b"#" | b"="
	};
}
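// example: U+2028 LINE SEPARATOR encodes as [0xE2, 0x80, 0xA8], so it is
// matched by the `[0xE2, 0x80, 0xA8 | 0xA9]` arm of `utf8_class!(line)`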

fn utf8_len(first: u8) -> usize {
	// TODO/perf: compare with leading_ones-based solution?
	const LUT: [u8; 16] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4];
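	// the high nibble of the leading byte determines the sequence length:
	// 0x0..=0x7 (ASCII) -> 1, 0xC..=0xD -> 2, 0xE -> 3, 0xF -> 4; continuation
	// bytes (0x8..=0xB) also map to 1, so malformed input is stepped over one
	// byte at a time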
	LUT[(first >> 4) as usize] as usize
}

/// Generic "buffer" to abstract over allocation type
// TODO/perf: try dynamic dispatching this instead of mono
pub(crate) trait StringOutput {
	fn so_push_str(&mut self, text: &str);
	fn so_push_char(&mut self, c: char);
	fn so_push_close(&mut self, hashes: usize);
	fn so_finish(self) -> Token;
	fn so_finish_num(self, first: u8) -> Option<SmolStr>;
}
//impl RawStringOutput for Vec<u8> {}
impl StringOutput for SmolStrBuilder2 {
	fn so_push_str(&mut self, text: &str) { self.push_str(text); }
	fn so_push_char(&mut self, c: char) { self.push(c); }
	fn so_push_close(&mut self, hashes: usize) {
		self.push_str("\"");
		self.push_repeated(b'#', hashes);
	}
	fn so_finish(self) -> Token { Token::String(self.finish()) }
	fn so_finish_num(mut self, first: u8) -> Option<SmolStr> {
		self.swap0(first);
		Some(self.finish())
	}
}
impl StringOutput for () {
	fn so_push_str(&mut self, _text: &str) {}
	fn so_push_char(&mut self, _c: char) {}
	fn so_push_close(&mut self, _hashes: usize) {}
	fn so_finish(self) -> Token { Token::SkippedString }
	fn so_finish_num(self, _first: u8) -> Option<SmolStr> { None }
}

/// Lexer to turn a text stream into tokens.
#[derive(Debug)]
pub struct Lexer<T> {
	reader: T,
	cursor: usize,
	next_skip: NextSkip,
}

// TODO/perf: we now only allow utf-8 text again,
// consider rewriting the parser to take advantage of that.
// using `char`s might not be worth it, but using strings instead of
// byte-strings could be. this is best explored as a whole-lexer rewrite.

// TODO/perf: attempt to use a pure state machine, for better error recovery
#[expect(clippy::unnested_or_patterns, reason = "does not respect utf8_class")]
impl<T: Input> Lexer<T> {
	/// Create a new lexer from the input.
	pub const fn new(input: T) -> Self {
		Self {
			reader: input,
			cursor: 0,
			next_skip: NextSkip::None,
		}
	}
	fn peek(&mut self, n: Range<usize>) -> LexerResult<&[u8]> {
		match self.reader.peek(n.start) {
			Ok(result) => Ok(&result[..result.len().min(n.end)]),
			Err(err) => {
				// TODO/perf: i think it's better to have it here (instead of in next_token)
				self.next_skip = NextSkip::IrrecoverableError;
				Err(err)
			}
		}
	}
	// TODO/perf: replace f with some specific solution like a 256-LUT
	fn peek_table(&mut self, f: impl FnOnce(u8) -> usize) -> LexerResult<&[u8]> {
		let &[first] = self.peek(1..1)? else {
			// TODO/perf: return peek result directly?
			return Ok(&[]);
		};
		// fucked up lifetime things mean that i need to do 2 peeks :(
		let size = f(first);
		self.peek(size..size)
	}
	fn advance(&mut self, n: usize) {
		self.cursor += n;
		self.reader.advance(n);
	}
	fn adv_certain(&mut self, text: &[u8]) {
		debug_assert_eq!(
			self.peek(text.len()..text.len()).unwrap(),
			text,
			"adv_certain was certainly wrong"
		);
		self.advance(text.len());
	}
	fn adv_uncertain(&mut self, text: &[u8]) -> LexerResult<()> {
		if self.peek(text.len()..text.len())? == text {
			self.advance(text.len());
			Ok(())
		} else {
			Err(LexerError::MissingText)
		}
	}
	fn begin_skip(&mut self, size: usize, skip: NextSkip) -> Token {
		self.advance(size);
		self.next_skip = skip;
		match skip {
			NextSkip::Spaces => Token::Spaces,
			NextSkip::Lines => Token::Lines,
			_ => unreachable!(),
		}
	}
	fn just(&mut self, size: usize, token: Token) -> Token {
		self.advance(size);
		token
	}
	// TODO/perf: check that head never appears in release builds
	fn keyword(&mut self, head: &[u8], tail: &[u8], token: Token) -> LexerResult<Token> {
		self.adv_certain(head);
		self.adv_uncertain(tail)?;
		Ok(token)
	}
	fn keyword_number(
		&mut self,
		head: &[u8],
		tail: &[u8],
		skip: bool,
		number: Number,
	) -> LexerResult<Token> {
		let token = if skip {
			Token::SkippedNumber
		} else {
			Token::Number(number)
		};
		self.keyword(head, tail, token)
	}
	fn block_comment(&mut self) -> LexerResult<()> {
		self.adv_certain(b"/*");
		let mut depth = 0_usize;
		loop {
			self.next_skip = NextSkip::RecoverBlockComment(depth);
			let peek = self.peek_table(|first| match first {
				b'/' | b'*' => 2,
				_ => utf8_len(first),
			})?;
			let size = peek.len();
			match peek {
				[] => return Err(LexerError::UnexpectedEof(self.cursor)),
				utf8_class!(invalid) => {
					return Err(LexerError::InvalidCharacter(self.cursor));
				}
				b"/*" => {
					self.advance(2);
					depth = depth.checked_add(1).expect("excessive comment depth");
				}
				b"*/" => {
					self.advance(2);
					match depth.checked_sub(1) {
						Some(new) => depth = new,
						None => break,
					}
				}
				[b'/' | b'*', ..] => self.advance(1),
				text if str::from_utf8(text).is_ok() => self.advance(size),
				_ => return Err(LexerError::InvalidUtf8(self.cursor)),
			}
		}
		self.next_skip = NextSkip::None;
		Ok(())
	}
	fn line_comment(&mut self) -> LexerResult<()> {
		self.adv_certain(b"//");
		self.next_skip = NextSkip::RecoverLineComment;
		loop {
			let peek = self.peek_table(utf8_len)?;
			let size = peek.len();
			match peek {
				[] => break,
				utf8_class!(line) => {
					// consume newline
					self.advance(size);
					break;
				}
				utf8_class!(invalid) => {
					return Err(LexerError::InvalidCharacter(self.cursor));
				}
				text if str::from_utf8(text).is_ok() => self.advance(size),
				_ => return Err(LexerError::InvalidUtf8(self.cursor)),
			}
		}
		self.next_skip = NextSkip::None;
		Ok(())
	}
	fn escline(&mut self) -> LexerResult<()> {
		self.adv_certain(b"\\");
		loop {
			let peek = self.peek_table(|first| match first {
				b'/' => 2,
				_ => utf8_len(first),
			})?;
			let size = peek.len();
			match peek {
				[] => break,
				utf8_class!(space) => self.advance(size),
				utf8_class!(line) => break self.advance(size),
				b"/*" => self.block_comment()?,
				b"//" => break self.line_comment()?,
				// /_ fallthrough
				_ => return Err(LexerError::BadEscline(self.cursor)),
			}
		}
		Ok(())
	}
	// advances entire length of text
	fn string_escape(&mut self) -> LexerResult<Option<char>> {
		let start_of_escape = self.cursor;
		self.adv_certain(b"\\");
		let peek = self.peek_table(utf8_len)?;
		let size = peek.len();
		let ch = match peek {
			utf8_class!(space) | utf8_class!(line) => {
				let mut size = size;
				loop {
					self.advance(size);
					let peek_space = self.peek_table(utf8_len)?;
					size = peek_space.len();
					if !matches!(peek_space, utf8_class!(space) | utf8_class!(line)) {
						return Ok(None);
					}
				}
			}
			b"\"" => '\"',
			b"\\" => '\\',
			b"b" => '\x08',
			b"f" => '\x0C',
			b"n" => '\n',
			b"r" => '\r',
			b"t" => '\t',
			b"s" => ' ',
			b"u" => {
				#[expect(
					clippy::cast_possible_wrap,
					clippy::cast_sign_loss,
					reason = "cursed anyways"
				)]
				/// thanks "needsmoreestrogen" from RPLCS
				fn hex(v: u8) -> u32 { ((((v + v) as i8 >> 7) & 9) as u8 + (v & 15)).into() }
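				// why this works: for b'0'..=b'9', v + v stays below 0x80, so the
				// arithmetic shift gives 0 and the result is just the low nibble;
				// for b'A'..=b'F' and b'a'..=b'f', v + v overflows into the sign
				// bit, the shift gives -1, `& 9` leaves 9, and 9 + (v & 15) maps
				// e.g. b'a' (low nibble 1) to 10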
				macro_rules! hex { ($id:ident) => {$id @ (b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f')} }
				self.advance(1);
				self.adv_uncertain(b"{")?;
				// TODO/style: improve how this looks? This uses fixed width peeks
				// but it could be possible with variable width ones
				// {^0}"
				let value = match *self.peek(3..3)? {
					[hex!(c2), hex!(c1), hex!(c0)] => {
						self.advance(3);
						let base = hex(c2) << 8 | hex(c1) << 4 | hex(c0);
						// {000^}"
						match *self.peek(2..2)? {
							[hex!(c1), hex!(c0)] => {
								let base = base << 8 | hex(c1) << 4 | hex(c0);
								self.advance(2);
								// {00000^}"
								match *self.peek(2..2)? {
									[hex!(c0), b'}'] => {
										self.advance(2);
										base << 4 | hex(c0)
									}
									[b'}', ..] => {
										self.advance(1);
										base
									}
									_ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
								}
							}
							[hex!(c0), b'}'] => {
								self.advance(2);
								base << 4 | hex(c0)
							}
							[b'}', ..] => {
								self.advance(1);
								base
							}
							_ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
						}
					}
					[hex!(c1), hex!(c0), b'}'] => {
						self.advance(3);
						hex(c1) << 4 | hex(c0)
					}
					[hex!(c0), b'}', ..] => {
						self.advance(2);
						hex(c0)
					}
					_ => return Err(LexerError::BadUnicodeEscape(start_of_escape)),
				};
				return Ok(Some(
					char::from_u32(value).ok_or(LexerError::BadUnicodeEscape(start_of_escape))?,
				));
			}
			_ => return Err(LexerError::InvalidEscape(start_of_escape)),
		};
		self.advance(size);
		Ok(Some(ch))
	}
	fn spaces(&mut self) -> LexerResult<()> {
		loop {
			let peek = self.peek_table(|first| match first {
				b'/' => 2,
				_ => utf8_len(first),
			})?;
			let size = peek.len();
			match peek {
				[] => break,
				utf8_class!(space) => self.advance(size),
				b"\\" => self.escline()?,
				b"/*" => self.block_comment()?,
				// /_ fallthrough
				_ => break,
			}
		}
		Ok(())
	}
	fn lines(&mut self) -> LexerResult<()> {
		loop {
			let peek = self.peek_table(|first| match first {
				b'/' => 2,
				_ => utf8_len(first),
			})?;
			let size = peek.len();
			match peek {
				[] => break,
				utf8_class!(space) | utf8_class!(line) => self.advance(size),
				b"\\" => self.escline()?,
				b"/*" => self.block_comment()?,
				b"//" => self.line_comment()?,
				// /_ fallthrough
				_ => break,
			}
		}
		Ok(())
	}
	fn ident_inner(&mut self, number: bool, mut text: impl StringOutput) -> LexerResult<Token> {
		if number {
			let mut builder = NumberBuilder::new(text);
			'text: loop {
				let peek = self.peek(1..usize::MAX)?;
				if peek.is_empty() {
					break 'text;
				}
				for (i, &byte) in peek.iter().enumerate() {
					// subset of valid identifiers that could be valid numeric characters
					// TODO/perf: compare codegen with making these ranges more specific
					// or use a u128 lookup table :)
					if !matches!(byte, b'+'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z') {
						self.advance(i);
						break 'text;
					}
					if !builder.step(byte) {
						self.advance(i);
						return Err(LexerError::InvalidNumber);
					}
				}
				let size = peek.len();
				self.advance(size);
			}
			if matches!(self.peek_table(utf8_len)?, [] | utf8_class!(not_ident)) {
				match builder.finish() {
					Some(Some(value)) => Ok(Token::Number(value)),
					Some(None) => Ok(Token::SkippedNumber),
					None => Err(LexerError::InvalidNumber),
				}
			} else {
				// number didn't consume the entire ident (e.g. high-unicode tail)
				Err(LexerError::InvalidNumber)
			}
		} else {
			let debug_start = self.cursor;
			loop {
				let cursor = self.cursor;
				let peek = self.peek_table(utf8_len)?;
				let size = peek.len();
				match peek {
					[] | utf8_class!(not_ident) => {
						debug_assert_ne!(debug_start, cursor, "empty ident!");
						break;
					}
					ch => {
						if let Ok(ch) = str::from_utf8(ch) {
							text.so_push_str(ch);
							self.advance(size);
						} else {
							self.advance(1);
							return Err(LexerError::InvalidUtf8(cursor));
						}
					}
				}
			}
			Ok(text.so_finish())
		}
	}
	fn ident(&mut self, skip: bool) -> LexerResult<Token> {
		#[derive(Clone, Copy, PartialEq)]
		enum Preview {
			Regular,
			Number,
			Keyword,
		}
		// check for number-like value or bad keyword
		let preview = match self.peek(2..2)? {
			[b'0'..=b'9', ..] | [b'+' | b'-', b'0'..=b'9'] => Preview::Number,
			[b'.', b'0'..=b'9'] => {
				self.advance(1);
				return Err(LexerError::InvalidNumber);
			}
			// peek further to see if it's an error (since both bytes would now be known)
			[b'+' | b'-', b'.'] => match self.peek(3..3)? {
				[b'+' | b'-', b'.', b'0'..=b'9'] => {
					self.advance(2);
					return Err(LexerError::InvalidNumber);
				}
				_ => Preview::Regular,
			},
			b"tr" | b"fa" | b"nu" | b"in" | b"-i" | b"na" => Preview::Keyword,
			_ => Preview::Regular,
		};
		match (skip, preview) {
			(skip, Preview::Keyword) => {
				// since we need the full text, always allocate for this case
				// it could be done without but it's not worth the complexity
				let Token::String(text) = self.ident_inner(false, SmolStrBuilder2::new())? else {
					unreachable!()
				};
				if matches!(&*text, "true" | "false" | "null" | "inf" | "-inf" | "nan") {
					Err(LexerError::UnexpectedKeyword)
				} else if skip {
					Ok(Token::SkippedString)
				} else {
					Ok(Token::String(text))
				}
			}
			(true, _) => self.ident_inner(preview == Preview::Number, ()),
			(false, _) => self.ident_inner(preview == Preview::Number, SmolStrBuilder2::new()),
		}
	}
	fn singleline_string(
		&mut self,
		hashes: Option<NonZeroUsize>,
		mut text: impl StringOutput,
	) -> LexerResult<Token> {
		'text: loop {
			let cursor = self.cursor;
			let peek = self.peek_table(utf8_len)?;
			let size = peek.len();
			match peek {
				[] => return Err(LexerError::UnexpectedEof(cursor)),
				utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
				utf8_class!(line) => return Err(LexerError::UnexpectedStringNewline(cursor)),
				b"\"" => {
					self.advance(1);
					let hashes = hashes.map_or(0, NonZeroUsize::get);
					let mut hashes_left = hashes;
					while hashes_left > 0 {
						let tail = self.peek(1..hashes_left)?;
						if tail.is_empty() {
							self.next_skip = NextSkip::None;
							return Err(LexerError::UnexpectedEof(cursor));
						}
						// TODO/perf: ensure this check is vectorized in some way
						if !tail.iter().all(|&v| v == b'#') {
							text.so_push_close(hashes - hashes_left);
							// rather than trying to find the exact point where the
							// hashes stop, just let the regular text parser handle them
							continue 'text;
						}
						let len = tail.len();
						hashes_left -= len;
						self.advance(len);
					}
					self.next_skip = NextSkip::None;
					break Ok(text.so_finish());
				}
				b"\\" if hashes.is_none() => {
					if let Some(ch) = self.string_escape()? {
						text.so_push_char(ch);
					}
				}
				ch => {
					text.so_push_str(
						str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?,
					);
					self.advance(size);
				}
			}
		}
	}
	// return advance distance for newline (if there is any)
	fn newline_crlf(&mut self) -> LexerResult<Option<NonZeroUsize>> {
		let peek = self.peek_table(|first| match first {
			b'\r' => 2,
			first => utf8_len(first),
		})?;
		Ok(NonZeroUsize::new(match peek {
			b"\r\n" => 2,
			[b'\r', ..] => 1,
			utf8_class!(line) => peek.len(),
			_ => 0,
		}))
	}
	#[expect(clippy::too_many_lines, reason = "off-by-one :)")]
	fn multiline_string_regular(&mut self, hashes: Option<NonZeroUsize>) -> LexerResult<Token> {
		// buffer of contents, escaped & normalized but not aligned
		let mut full_text = String::new();
		// (line_cursor, line_start, text_start, line_end)
		let mut lines = Vec::<Option<(usize, usize, usize, usize)>>::new();
		let tail = 'line: loop {
			let line_cursor = self.cursor;
			let line_start = full_text.len();
			let mut peek_indent = self.peek_table(utf8_len)?;
			// indent
			while matches!(peek_indent, utf8_class!(space)) {
				full_text.push_str(str::from_utf8(peek_indent).unwrap_or_else(|_| unreachable!()));
				let size = peek_indent.len();
				self.advance(size);
				peek_indent = self.peek_table(utf8_len)?;
			}
			let text_start = full_text.len();
			let newline = self.newline_crlf()?;
			if let Some(size) = newline {
				lines.push(None);
				self.advance(size.get());
				continue;
			}
			'text: loop {
				let cursor = self.cursor;
				let peek = self.peek_table(|first| match first {
					b'"' => 3,
					b'\r' => 2,
					first => utf8_len(first),
				})?;
				let size = peek.len();
				match peek {
					[] => return Err(LexerError::UnexpectedEof(cursor)),
					utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
					[b'\r', ..] | utf8_class!(line) => {
						lines.push(Some((line_cursor, line_start, text_start, full_text.len())));
						let size = match peek {
							b"\r\n" => 2,
							[b'\r', ..] => 1,
							_ => size,
						};
						self.advance(size);
						break;
					}
					b"\"\"\"" => {
						self.advance(3);
						let hashes = hashes.map_or(0, NonZeroUsize::get);
						let mut hashes_left = hashes;
						while hashes_left > 0 {
							let tail = self.peek(1..hashes_left)?;
							if tail.is_empty() {
								self.next_skip = NextSkip::None;
								return Err(LexerError::UnexpectedEof(self.cursor));
							}
							// TODO/perf: ensure this check is vectorized in some way
							if !tail.iter().all(|&v| v == b'#') {
								full_text.push_str("\"\"\"");
								full_text.extend(repeat_n('#', hashes - hashes_left));
								// rather than trying to find the exact point where the
								// hashes stop, just let the regular text parser handle them
								continue 'text;
							}
							let len = tail.len();
							hashes_left -= len;
							self.advance(len);
						}
						self.next_skip = NextSkip::None;
						if full_text.len() > text_start {
							return Err(LexerError::BadEndString(cursor));
						}
						break 'line line_start..text_start;
					}
					[b'"', ..] => {
						full_text.push('"');
						self.advance(1);
					}
					b"\\" if hashes.is_none() => {
						if let Some(ch) = self.string_escape()? {
							full_text.push(ch);
						}
					}
					ch => {
						full_text.push_str(
							str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?,
						);
						self.advance(size);
					}
				}
			}
		};
		let tail_len = tail.end - tail.start;
		// create final text
		let mut text = SmolStrBuilder2::new();
		let mut pre_newline = false;
		for line in lines {
			if pre_newline {
				text.push('\n');
			}
			pre_newline = true;
			if let Some((line_cursor, line_start, text_start, line_end)) = line {
				if text_start - line_start < tail_len
					|| full_text[tail.clone()] != full_text[line_start..line_start + tail_len]
				{
					return Err(LexerError::BadIndent(Some(line_cursor)));
				}
				text.push_str(&full_text[line_start + tail_len..line_end]);
			}
		}
		Ok(Token::String(text.finish()))
	}
	// TODO/style: merge inner loop with multiline_string_regular
	fn multiline_string_skip(&mut self, hashes: Option<NonZeroUsize>) -> LexerResult<Token> {
		// it's impossible to make a multiline string reader that doesn't allocate.
		// instead, notice that for the close quote (and thus the string) to be
		// valid, the closing line must have no body text. by only tracking the
		// smallest common indent, this validates indents while allocating at
		// most one line
		let mut indent = SmolStrBuilder2::new();
		let mut peek_indent = self.peek_table(utf8_len)?;
		// get first line indent
		while matches!(peek_indent, utf8_class!(space)) {
			indent.push_str(str::from_utf8(peek_indent).unwrap_or_else(|_| unreachable!()));
			let size = peek_indent.len();
			self.advance(size);
			peek_indent = self.peek_table(utf8_len)?;
		}
		let mut next_truncate_length = indent.len();
		'line: loop {
			// if there's any leftover indent it's more indented than minimum (or empty)
			let has_leading_space = matches!(self.peek_table(utf8_len)?, utf8_class!(space));
			// line body / end of string
			let mut has_body = false;
			'text: loop {
				let cursor = self.cursor;
				let peek = self.peek_table(|first| match first {
					b'"' => 3,
					b'\r' => 2,
					first => utf8_len(first),
				})?;
				let size = peek.len();
				match peek {
					[] => return Err(LexerError::UnexpectedEof(cursor)),
					utf8_class!(invalid) => return Err(LexerError::InvalidCharacter(cursor)),
					utf8_class!(space) => {
						// doesn't set body flag
						self.advance(size);
					}
					[b'\r', ..] | utf8_class!(line) => {
						let size = match peek {
							b"\r\n" => 2,
							[b'\r', ..] => 1,
							_ => size,
						};
						self.advance(size);
						break;
					}
					b"\"\"\"" => {
						self.advance(3);
						let hashes = hashes.map_or(0, NonZeroUsize::get);
						let mut hashes_left = hashes;
						while hashes_left > 0 {
							let tail = self.peek(1..hashes_left)?;
							if tail.is_empty() {
								self.next_skip = NextSkip::None;
								return Err(LexerError::UnexpectedEof(self.cursor));
							}
							// TODO/perf: ensure this check is vectorized/usize-ized in some way
							if !tail.iter().all(|&v| v == b'#') {
								// rather than trying to find the exact point where the
								// hashes stop, just let the regular text parser handle them
								has_body = true;
								continue 'text;
							}
							let len = tail.len();
							hashes_left -= len;
							self.advance(len);
						}
						self.next_skip = NextSkip::None;
						if has_body {
							return Err(LexerError::BadEndString(cursor));
						} else if has_leading_space {
							return Err(LexerError::BadIndent(None));
						}
						break 'line;
					}
					[b'"', ..] => {
						has_body = true;
						self.advance(1);
					}
					// TODO/perf: duplicate loop with outer check? probably not worth it
					b"\\" if hashes.is_none() => has_body |= self.string_escape()?.is_some(),
					ch => {
						_ = str::from_utf8(ch).map_err(|_| LexerError::InvalidUtf8(cursor))?;
						has_body = true;
						self.advance(size);
					}
				}
			}
			// truncate if line isn't empty
			if has_body {
				indent.truncate_floor(next_truncate_length);
			}
			// take longest matching indent
			next_truncate_length = indent.len();
			let mut matched_bytes = 0;
			while matched_bytes < indent.len() {
				// TODO/perf: vectorizable approach https://users.rust-lang.org/t/25815
				fn common_prefix(a: &[u8], b: &[u8]) -> usize {
					a.iter().zip(b).take_while(|(a, b)| a == b).count()
				}
				let peek = self.peek(1..indent.len() - matched_bytes)?;
				let next = &indent.as_bytes()[matched_bytes..];
				let common = common_prefix(peek, next);
				matched_bytes += common;
				let size = peek.len();
				self.advance(common);
				if common < size {
					next_truncate_length = matched_bytes;
					break;
				}
			}
		}
		Ok(Token::SkippedString)
	}
	// all strings entry point
	fn string(&mut self, skip: bool) -> LexerResult<Token> {
		let mut hashes = 0_usize;
		'count: loop {
			let mut advance = 0;
			for &byte in self.peek(1..usize::MAX)? {
				if byte != b'#' {
					self.advance(advance);
					hashes = hashes.checked_add(advance).unwrap();
					break 'count;
				}
				advance += 1;
			}
			if advance == 0 {
				return Err(LexerError::UnexpectedEof(self.cursor));
			}
			self.advance(advance);
			// if this panics you've got bigger issues
			hashes = hashes.checked_add(advance).unwrap();
		}
		let hashes = NonZeroUsize::new(hashes);
		match self.peek(3..3)? {
			b"\"\"\"" => {
				self.next_skip = NextSkip::RecoverString {
					multiline: true,
					hashes,
				};
				self.advance(3);
				let Some(size) = self.newline_crlf()? else {
					return Err(LexerError::MissingStringNewline);
				};
				let size = size.get();
				self.advance(size);
				if skip {
					self.multiline_string_skip(hashes)
				} else {
					self.multiline_string_regular(hashes)
				}
			}
			[b'"', ..] => {
				self.next_skip = NextSkip::RecoverString {
					multiline: false,
					hashes,
				};
				self.advance(1);
				if skip {
					self.singleline_string(hashes, ())
				} else {
					self.singleline_string(hashes, SmolStrBuilder2::new())
				}
			}
			_ => Err(LexerError::BadRawString),
		}
	}
	fn advance_err(&mut self, n: usize, err: LexerError) -> LexerError {
		self.advance(n);
		err
	}
	fn recover_until(
		&mut self,
		table: impl Fn(u8) -> usize,
		mut done: impl FnMut(&[u8]) -> bool,
	) -> LexerResult<()> {
		loop {
			match self.peek_table(&table)? {
				[] => break,
				ch => {
					if done(ch) {
						let size = ch.len();
						self.advance(size);
						break;
					}
					self.advance(1);
				}
			}
		}
		Ok(())
	}
	fn next_token_value(&mut self, skip: bool, out_cursor: &mut usize) -> LexerResult<Token> {
		match self.next_skip {
			NextSkip::None => {}
			NextSkip::Spaces => self.spaces()?,
			NextSkip::Lines => self.lines()?,
			NextSkip::RecoverLineComment => {
				self.recover_until(utf8_len, |ch| matches!(ch, utf8_class!(line)))?;
			}
			NextSkip::RecoverBlockComment(mut depth) => {
				self.recover_until(
					|_| 2,
					|ch| {
						if ch == b"*/" {
							if depth == 0 {
								true
							} else {
								depth -= 1;
								false
							}
						} else if ch == b"/*" {
							depth += 1;
							false
						} else {
							false
						}
					},
				)?;
			}
			NextSkip::RecoverString { multiline, hashes } => {
				let quotes = if multiline { 3_usize } else { 1_usize };
				let length = quotes + hashes.map_or(0, NonZeroUsize::get);
				let mut distance = 0;
				self.recover_until(
					|_| 1,
					|ch| {
						if (ch == b"\"" && distance < quotes) || (ch == b"#" && distance < length) {
							distance += 1;
						} else {
							distance = 0;
						}
						distance == length
					},
				)?;
			}
			NextSkip::IrrecoverableError => return Ok(Token::Eof),
		}
		self.next_skip = NextSkip::None;
		let start = self.cursor;
		*out_cursor = start;
		// TODO/perf: ideas for general parsing improvements
		// - splitting match by result length (0/1/2/3/4 bytes)
		// - match keywords with 4 bytes instead of 3 (u32?)
		// - check on byte-slice matching codegen (in general)
		let peek = self.peek_table(|first: u8| match first {
			b'/' => 2,
			b'#' => 3,
			_ => utf8_len(first),
		})?;
		let size = peek.len();
		Ok(match peek {
			[] => Token::Eof,
			utf8_class!(bom) if start == 0 => self.just(3, Token::Bom),
			utf8_class!(invalid) => {
				return Err(self.advance_err(size, LexerError::InvalidCharacter(self.cursor)));
			}
			utf8_class!(line) => self.begin_skip(size, NextSkip::Lines),
			utf8_class!(space) => self.begin_skip(size, NextSkip::Spaces),
			b"\\" => self.begin_skip(0, NextSkip::Spaces),
			b";" => self.just(1, Token::SemiColon),
			b"=" => self.just(1, Token::Equals),
			b"(" => self.just(1, Token::OpenParen),
			b")" => self.just(1, Token::CloseParen),
			b"{" => self.just(1, Token::OpenCurly),
			b"}" => self.just(1, Token::CloseCurly),
			b"[" | b"]" => return Err(self.advance_err(1, LexerError::InvalidOperator)),
			// silly trick I'm taking from serde_json
			b"#tr" => self.keyword(b"#tr", b"ue", Token::Bool(true))?,
			b"#fa" => self.keyword(b"#fa", b"lse", Token::Bool(false))?,
			b"#nu" => self.keyword(b"#nu", b"ll", Token::Null)?,
			b"#in" => self.keyword_number(b"#in", b"f", skip, Number::INFINITY)?,
			b"#-i" => self.keyword_number(b"#-i", b"nf", skip, Number::NEG_INFINITY)?,
			b"#na" => self.keyword_number(b"#na", b"n", skip, Number::NAN)?,
			[b'#', ..] | b"\"" => self.string(skip)?,
			b"/-" => self.just(2, Token::SlashDash),
			b"/*" => self.begin_skip(0, NextSkip::Spaces),
			b"//" => self.begin_skip(0, NextSkip::Lines),
			[b'/', ..] => return Err(self.advance_err(1, LexerError::InvalidOperator)),
			_ => self.ident(skip)?,
		})
	}
	/// Read one token from the input, returning the token and its starting
	/// position. This can safely be called again after an error, to attempt
	/// recovery.
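	///
	/// A minimal sketch (untested, `just_kdl::lexer` paths assumed) of a loop
	/// that keeps lexing after recoverable errors:
	///
	/// ```ignore
	/// use just_kdl::lexer::{Lexer, Token};
	///
	/// // `\x00` is an invalid document character
	/// let mut lexer = Lexer::new(&b"node \x00 child"[..]);
	/// loop {
	///     match lexer.next_token(false) {
	///         (Ok(Token::Eof), _) => break,
	///         (Ok(token), pos) => println!("{pos}: {token}"),
	///         (Err(err), pos) => println!("{pos}: error: {err}"),
	///     }
	/// }
	/// ```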
	pub fn next_token(&mut self, skip: bool) -> (LexerResult<Token>, usize) {
		// TODO/style: this whole mut ref thing is kinda jank
		let mut pos = self.cursor;
		let token = self.next_token_value(skip, &mut pos);
		(token, pos)
	}
	/// Get the current position; this is not the same as the next
	/// [`next_token`] position if the last token read was Spaces, Lines, or an
	/// error.
	///
	/// [`next_token`]: Self::next_token
	pub fn current_position(&mut self) -> usize { self.cursor }
}
1221}