// surql_parser/upstream/syn/lexer/mod.rs
mod byte;
mod char;
pub mod compound;
mod ident;
pub mod keywords;
mod reader;
mod strings;
mod unicode;

use crate::upstream::syn::error::{SyntaxError, bail};
use crate::upstream::syn::token::{Span, Token, TokenKind};

pub use reader::{BytesReader, CharError};
12/// The SurrealQL lexer.
13/// Takes a slice of bytes and turns it into tokens. The lexer is designed with
14/// possible invalid utf-8 in mind and will handle bytes which are invalid utf-8
15/// with an error.
16///
17/// The lexer generates tokens lazily. whenever [`Lexer::next_token`] is called
18/// on the lexer it will try to lex the next bytes in the give source as a
19/// token. The lexer always returns a token, even if the source contains invalid
20/// tokens or as at the end of the source. In both cases a specific
21/// type of token is returned.
22///
23/// Note that SurrealQL syntax cannot be lexed in advance. For example, record
24/// strings and regexes, both cannot be parsed correctly without knowledge of
25/// previous tokens as they are both ambigious with other tokens.
26pub struct Lexer<'a> {
27	/// The reader for reading the source bytes.
28	pub(super) reader: BytesReader<'a>,
29	/// The one past the last character of the previous token.
30	last_offset: u32,
31	pub(super) error: Option<SyntaxError>,
32}
33impl<'a> Lexer<'a> {
34	/// Create a new lexer.
35	/// # Panic
36	/// This function will panic if the source is longer then u32::MAX.
37	pub fn new(source: &'a [u8]) -> Lexer<'a> {
38		assert!(
39			source.len() <= u32::MAX as usize,
40			"source code exceeded maximum size"
41		);
42		let reader = BytesReader::new(source);
43		Lexer {
44			reader,
45			last_offset: 0,
46			error: None,
47		}
48	}
49	/// Reset the state of the lexer.
50	///
51	/// Doesn't change the state of the reader.
52	pub fn reset(&mut self) {
53		self.last_offset = 0;
54		self.error = None;
55	}
56	/// Change the used source from the lexer to a new buffer.
57	///
58	/// Usefull for reusing buffers.
59	///
60	/// # Panic
61	/// This function will panic if the source is longer then u32::MAX.
62	pub fn change_source<'b>(self, source: &'b [u8]) -> Lexer<'b> {
63		assert!(
64			source.len() <= u32::MAX as usize,
65			"source code exceeded maximum size"
66		);
67		let reader = BytesReader::<'b>::new(source);
68		Lexer {
69			reader,
70			last_offset: 0,
71			error: self.error,
72		}
73	}
74	/// Returns the next token, driving the lexer forward.
75	///
76	/// If the lexer is at the end the source it will always return the Eof
77	/// token.
78	pub fn next_token(&mut self) -> Token {
79		let Some(byte) = self.reader.next() else {
80			return self.eof_token();
81		};
82		if byte.is_ascii() {
83			self.lex_ascii(byte)
84		} else {
85			self.lex_char(byte)
86		}
87	}
88	/// Creates the eof token.
89	///
90	/// An eof token has tokenkind Eof and an span which points to the last
91	/// character of the source.
92	fn eof_token(&mut self) -> Token {
93		Token {
94			kind: TokenKind::Eof,
95			span: Span {
96				offset: self.last_offset,
97				len: 0,
98			},
99		}
100	}
101	/// Return an invalid token.
102	fn invalid_token(&mut self, error: SyntaxError) -> Token {
103		self.error = Some(error);
104		self.finish_token(TokenKind::Invalid)
105	}
106	pub fn current_span(&self) -> Span {
107		let new_offset = self.reader.offset();
108		let len = new_offset - self.last_offset;
109		Span {
110			offset: self.last_offset,
111			len,
112		}
113	}
114	pub fn span_since(&self, offset: u32) -> Span {
115		let new_offset = self.reader.offset();
116		let len = new_offset - offset;
117		Span { offset, len }
118	}
119	fn advance_span(&mut self) -> Span {
120		let span = self.current_span();
121		self.last_offset = self.reader.offset();
122		span
123	}
124	/// Builds a token from an TokenKind.
125	///
126	/// Attaches a span to the token and returns, updates the new offset.
127	fn finish_token(&mut self, kind: TokenKind) -> Token {
128		Token {
129			kind,
130			span: self.advance_span(),
131		}
132	}
133	/// Moves the lexer state to after the give span.
134	///
135	/// # Warning
136	/// Moving the lexer into a state where the next byte is within a multibyte
137	/// character will result in spurious errors.
138	pub fn backup_after(&mut self, span: Span) {
139		let offset = span.offset + span.len;
140		self.reader.backup(offset);
141		self.last_offset = offset;
142	}
143	/// Checks if the next byte is the given byte, if it is it consumes the byte
144	/// and returns true. Otherwise returns false.
145	///
146	/// Also returns false if there is no next character.
147	fn eat(&mut self, byte: u8) -> bool {
148		if self.reader.peek() == Some(byte) {
149			self.reader.next();
150			true
151		} else {
152			false
153		}
154	}
155	/// Checks if the closure returns true when given the next byte, if it is it
156	/// consumes the byte and returns true. Otherwise returns false.
157	///
158	/// Also returns false if there is no next character.
159	fn eat_when<F: FnOnce(u8) -> bool>(&mut self, f: F) -> bool {
160		let Some(x) = self.reader.peek() else {
161			return false;
162		};
163		if f(x) {
164			self.reader.next();
165			true
166		} else {
167			false
168		}
169	}
170	fn expect(&mut self, c: char) -> Result<(), SyntaxError> {
171		match self.reader.peek() {
172			Some(x) => {
173				let offset = self.reader.offset();
174				self.reader.next();
175				let char = self.reader.convert_to_char(x)?;
176				if char == c {
177					return Ok(());
178				}
179				let len = self.reader.offset() - offset;
180				bail!(
181					"Unexpected character `{char}` expected `{c}`", @ Span { offset, len
182					}
183				)
184			}
185			None => {
186				bail!(
187					"Unexpected end of file, expected character `{c}`", @ self
188					.current_span()
189				)
190			}
191		}
192	}
193	/// Returns the string for a given span of the source.
194	/// Will panic if the given span was not valid for the source, or invalid
195	/// utf8
196	pub fn span_str(&self, span: Span) -> &'a str {
197		std::str::from_utf8(self.span_bytes(span)).expect("invalid span segment for source")
198	}
199	/// Returns the string for a given span of the source.
200	/// Will panic if the given span was not valid for the source, or invalid
201	/// utf8
202	pub fn span_bytes(&self, span: Span) -> &'a [u8] {
203		self.reader.span(span)
204	}
205	/// Returns an error if not all bytes were consumed.
206	pub fn assert_finished(&self) -> Result<(), SyntaxError> {
207		if !self.reader.is_empty() {
208			let offset = self.reader.offset();
209			let len = self.reader.remaining().len() as u32;
210			let span = Span { offset, len };
211			bail!("Trailing characters", @ span)
212		}
213		Ok(())
214	}
215}
216impl Iterator for Lexer<'_> {
217	type Item = Token;
218	fn next(&mut self) -> Option<Self::Item> {
219		let token = self.next_token();
220		if token.is_eof() {
221			return None;
222		}
223		Some(token)
224	}
225}