Skip to main content

surql_parser/upstream/syn/lexer/
ident.rs

1use super::unicode::is_identifier_continue;
2use crate::upstream::syn::error::{SyntaxError, bail, syntax_error};
3use crate::upstream::syn::lexer::keywords::KEYWORDS;
4use crate::upstream::syn::lexer::{BytesReader, Lexer};
5use crate::upstream::syn::token::{Span, Token, TokenKind};
6use unicase::UniCase;
/// UTF-8 encoding of the opening bracket character `⟨`, computed at compile
/// time.
///
/// Evaluation fails at compile time if the character does not encode to
/// exactly three bytes.
const BRACKET_CHARACTERS: [u8; 3] = {
	let mut encoded = [0u8; 3];
	let written = '⟨'.encode_utf8(&mut encoded).len();
	assert!(written == 3);
	encoded
};
/// First byte of the UTF-8 encoding of `⟨`; used to recognise a possible
/// bracket delimiter while reading input one byte at a time.
const BRACKET_START_CHARACTER: u8 = BRACKET_CHARACTERS[0];
15impl Lexer<'_> {
16	pub fn unescape_ident_span<'a>(
17		str: &'a str,
18		span: Span,
19		buffer: &'a mut Vec<u8>,
20	) -> Result<&'a str, SyntaxError> {
21		let mut reader = BytesReader::new(str.as_bytes());
22		match reader.next() {
23			Some(b'`') => Self::unescape_backtick_span(reader, span, buffer),
24			Some(BRACKET_START_CHARACTER) => Self::unescape_bracket_span(reader, span, buffer),
25			_ => Ok(str),
26		}
27	}
28	fn unescape_backtick_span<'a>(
29		mut reader: BytesReader,
30		span: Span,
31		buffer: &'a mut Vec<u8>,
32	) -> Result<&'a str, SyntaxError> {
33		buffer.clear();
34		loop {
35			let before = reader.offset();
36			let x = reader.next().expect("lexer validated input");
37			match x {
38				b'\\' => {
39					Self::lex_common_escape_sequence(&mut reader, span, before, buffer)?;
40				}
41				b'`' => break,
42				x => {
43					buffer.push(x);
44				}
45			}
46		}
47		Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
48	}
49	fn unescape_bracket_span<'a>(
50		mut reader: BytesReader,
51		span: Span,
52		buffer: &'a mut Vec<u8>,
53	) -> Result<&'a str, SyntaxError> {
54		buffer.clear();
55		assert_eq!(
56			reader
57				.complete_char(BRACKET_START_CHARACTER)
58				.expect("valid character"),
59			'⟨'
60		);
61		loop {
62			let before = reader.offset();
63			let x = reader.next().expect("lexer validated input");
64			match x {
65				b'\\' => {
66					Self::lex_common_escape_sequence(&mut reader, span, before, buffer)?;
67				}
68				x if !x.is_ascii() => {
69					let c = reader.complete_char(x).expect("valid character");
70					if c == '⟩' {
71						break;
72					} else {
73						let mut char_buffer = [0u8; 4];
74						buffer.extend_from_slice(c.encode_utf8(&mut char_buffer).as_bytes());
75					}
76				}
77				x => {
78					buffer.push(x);
79				}
80			}
81		}
82		Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
83	}
84	/// Lex a parameter in the form of `$[a-zA-Z0-9_]*`
85	///
86	/// # Lexer State
87	/// Expected the lexer to have already eaten the param starting `$`
88	pub(super) fn lex_param(&mut self) -> Token {
89		loop {
90			if let Some(x) = self.reader.peek()
91				&& (x.is_ascii_alphanumeric() || x == b'_')
92			{
93				self.reader.next();
94				continue;
95			}
96			return self.finish_token(TokenKind::Parameter);
97		}
98	}
99	pub(super) fn lex_surrounded_param(&mut self, is_backtick: bool) -> Token {
100		match self.lex_surrounded_ident_err(is_backtick) {
101			Ok(_) => self.finish_token(TokenKind::Parameter),
102			Err(e) => self.invalid_token(e),
103		}
104	}
105	/// Lex an not surrounded identifier in the form of `[a-zA-Z0-9_]*`
106	///
107	/// The start byte should already a valid byte of the identifier.
108	///
109	/// When calling the caller should already know that the token can't be any
110	/// other token covered by `[a-zA-Z0-9_]*`.
111	pub(super) fn lex_ident_from_next_byte(&mut self, start: u8) -> Token {
112		debug_assert!(matches!(start, b'a'..= b'z' | b'A'..= b'Z' | b'_'));
113		self.lex_ident()
114	}
115	/// Lex a not surrounded identfier.
116	///
117	/// The scratch should contain only identifier valid chars.
118	pub(super) fn lex_ident(&mut self) -> Token {
119		loop {
120			if let Some(x) = self.reader.peek()
121				&& is_identifier_continue(x)
122			{
123				self.reader.next();
124				continue;
125			}
126			let str = self.span_str(self.current_span());
127			if let Some(x) = KEYWORDS.get(&UniCase::ascii(str)).copied() {
128				if x != TokenKind::Identifier {
129					return self.finish_token(x);
130				}
131			} else if str == "NaN" {
132				return self.finish_token(TokenKind::NaN);
133			} else if str == "Infinity" {
134				return self.finish_token(TokenKind::Infinity);
135			}
136			return self.finish_token(TokenKind::Identifier);
137		}
138	}
139	/// Lex an ident which is surround by delimiters.
140	pub(super) fn lex_surrounded_ident(&mut self, is_backtick: bool) -> Token {
141		match self.lex_surrounded_ident_err(is_backtick) {
142			Ok(_) => self.finish_token(TokenKind::Identifier),
143			Err(e) => self.invalid_token(e),
144		}
145	}
146	/// Lex an ident surrounded either by `⟨⟩` or `\`\``
147	pub(super) fn lex_surrounded_ident_err(
148		&mut self,
149		is_backtick: bool,
150	) -> Result<(), SyntaxError> {
151		let start_span = self.current_span();
152		loop {
153			let Some(x) = self.reader.next() else {
154				let end_char = if is_backtick { '`' } else { '⟩' };
155				let error = syntax_error!(
156					"Unexpected end of file, expected identifier to end with `{end_char}`",
157					@ self.current_span()
158				);
159				return Err(error);
160			};
161			match x {
162				b'`' if is_backtick => {
163					return Ok(());
164				}
165				b'\\' => {
166					let Some(next) = self.reader.next() else {
167						bail!(
168							"Unexpected end of file, expected identifier to end.", @
169							start_span => "Identifier starting here."
170						);
171					};
172					if !next.is_ascii() {
173						self.reader.complete_char(next)?;
174					}
175				}
176				BRACKET_START_CHARACTER if !is_backtick => {
177					if self.reader.complete_char(BRACKET_START_CHARACTER)? == '⟩' {
178						return Ok(());
179					}
180				}
181				x => {
182					if !x.is_ascii() {
183						self.reader.complete_char(x)?;
184					}
185				}
186			}
187		}
188	}
189}