// surql_parser/upstream/syn/parser/mod.rs
1//! Module implementing the SurrealQL parser.
2//!
//! The SurrealQL parser is a relatively simple recursive descent parser.
4//! Most of the functions of the SurrealQL parser peek a token from the lexer
5//! and then decide to take a path depending on which token is next.
6//!
7//! # Implementation Details
8//!
//! There are a bunch of common patterns for which this module has some
//! convenience functions.
//! - Whenever only one token can be next you should use the `expected!` macro. This macro ensures
//!   that the given token type is next and if not returns a parser error.
//! - Whenever a limited set of tokens can be next it is common to match the token kind and then
//!   have a catch-all arm which calls the macro `unexpected!`. This macro will raise a parse
//!   error with information about the type of token it receives and what it expected.
16//! - If a single token can be optionally next use [`Parser::eat`] this function returns a bool
17//!   depending on if the given tokenkind was eaten.
18//! - If a closing delimiting token is expected use `Parser::expect_closing_delimiter`. This
19//!   function will raise an error if the expected delimiter isn't the next token. This error will
20//!   also point to which delimiter the parser expected to be closed.
21//!
22//! ## Far Token Peek
23//!
24//! Occasionally the parser needs to check further ahead than peeking allows.
25//! This is done with the [`Parser::peek1`] function. This function peeks one
//! token further than peek.
27//!
//! ## WhiteSpace Tokens
//!
//! The lexer produces whitespace tokens, these are tokens which are normally
//! ignored in most places in the syntax as they have no bearing on the meaning
//! of a statement. [`Parser::next`] and [`Parser::peek`] automatically skip
//! over any whitespace tokens. However in some places, like in a record-id and
//! when gluing tokens, these white-space tokens are required for correct
//! parsing. In which case the function [`Parser::next_whitespace`] and others
//! with `_whitespace` are used. These functions don't skip whitespace tokens.
//! However these functions do not undo whitespace tokens which might have been
//! skipped. Implementers must be careful to not call a function which
//! requires whitespace tokens when they may already have been skipped.
40//!
41//! ## Compound tokens and token gluing.
42//!
43//! SurrealQL has a bunch of tokens which have complex rules for when they are
44//! allowed and the value they contain. Such tokens are named compound tokens,
45//! and examples include a javascript body, strand-like tokens, regex, numbers,
46//! etc.
47//!
48//! These tokens need to be manually requested from the lexer with the
49//! [`Lexer::lex_compound`] function.
50//!
//! This manual requesting of tokens leads to a problem when used in conjunction
//! with peeking. Take for instance the production `{ "foo": "bar"}`. `"foo"` is
//! a compound token so when initially encountered the lexer only returns a `"`
//! token and then that token needs to be collected into the full strand
55//! token. However the parser needs to figure out if we are parsing an object or
56//! a block so it needs to look past the compound token to see if the next token
57//! is `:`. This is where gluing comes in. Calling `Parser::glue` checks if the
58//! next token could start a compound token and combines them into a single
59//! token. This can only be done in places where we know if we encountered a
60//! leading token of a compound token it will result in the 'default' compound
61//! token.
62use self::token_buffer::TokenBuffer;
63use crate::upstream::sql;
64use crate::upstream::syn::error::{SyntaxError, bail};
65use crate::upstream::syn::lexer::Lexer;
66use crate::upstream::syn::lexer::compound::CompoundToken;
67use crate::upstream::syn::token::{Span, Token, TokenKind, t};
68use bytes::BytesMut;
69use reblessive::{Stack, Stk};
70mod basic;
71mod builtin;
72mod expression;
73mod function;
74mod idiom;
75mod kind;
76pub mod mac;
77mod object;
78mod prime;
79mod record_id;
80mod stmt;
81mod token;
82mod token_buffer;
83mod value;
84use super::error::{RenderedError, syntax_error};
85#[cfg(feature = "arbitrary")]
86pub use builtin::{PATHS, PathKind};
87pub(crate) use mac::{enter_object_recursion, enter_query_recursion, unexpected};
/// The result returned by most parser functions.
pub type ParseResult<T> = Result<T, SyntaxError>;
/// A result of trying to parse a possibly partial query.
#[derive(Debug)]
pub enum PartialResult<T> {
	/// The source ended before a full value could be parsed; more data is
	/// required.
	MoreData,
	/// Parsing the source produced no reasonable value.
	Empty {
		/// The number of bytes of the source which were consumed.
		used: usize,
	},
	/// A value was successfully parsed.
	Ok {
		/// The parsed value.
		value: T,
		/// The number of bytes of the source which were consumed.
		used: usize,
	},
	/// Parsing failed with an error.
	Err {
		/// The error raised while parsing.
		err: SyntaxError,
		/// The number of bytes of the source which were consumed.
		used: usize,
	},
}
#[derive(Clone, Debug)]
pub struct ParserSettings {
	/// Parse strands like the old parser, where a strand which looks like a UUID,
	/// Record-Id, or a DateTime will be parsed as a date-time.
	pub legacy_strands: bool,
	/// Set whether to allow record-id's which don't adhere to regular ident
	/// rules. Setting this to true will allow parsing of, for example,
	/// `foo:0bar`. This would be rejected by normal identifier rules as most
	/// identifiers can't start with a number.
	pub flexible_record_id: bool,
	/// Disallow a query to have objects deeper than the limit.
	/// Arrays also count towards objects. So `[{foo: [] }]` would be 3 deep.
	pub object_recursion_limit: usize,
	/// Disallow a query from being deeper than the given limit.
	/// A query recurses when a statement contains another statement within
	/// itself. Examples are subquery and blocks like block statements and if
	/// statements and such.
	pub query_recursion_limit: usize,
	/// Whether the files feature is enabled
	pub files_enabled: bool,
	/// Whether the surrealism feature is enabled
	pub surrealism_enabled: bool,
}
130impl Default for ParserSettings {
131	fn default() -> Self {
132		ParserSettings {
133			legacy_strands: false,
134			flexible_record_id: true,
135			object_recursion_limit: 100,
136			query_recursion_limit: 20,
137			files_enabled: false,
138			surrealism_enabled: false,
139		}
140	}
141}
142impl ParserSettings {
143	pub fn default_with_experimental(enabled: bool) -> Self {
144		ParserSettings {
145			files_enabled: enabled,
146			surrealism_enabled: enabled,
147			..Self::default()
148		}
149	}
150}
/// The SurrealQL parser.
pub struct Parser<'a> {
	/// The lexer which produces tokens from the source bytes.
	lexer: Lexer<'a>,
	/// The span of the most recently consumed token.
	last_span: Span,
	/// Buffer holding up to 4 peeked but not yet consumed tokens.
	token_buffer: TokenBuffer<4>,
	// NOTE(review): semantics not visible in this file — presumably controls
	// whether a bare table name may be parsed as a field; confirm at use sites.
	pub table_as_field: bool,
	/// Settings controlling parser behavior and limits.
	settings: ParserSettings,
	/// Scratch buffer reused by the `unescape_*_span` methods.
	// NOTE(review): field name looks like a typo of `unescape_buffer`.
	unscape_buffer: Vec<u8>,
}
impl<'a> Parser<'a> {
	/// Create a new parser from a given source, with default settings.
	pub fn new(source: &'a [u8]) -> Self {
		Parser::new_with_settings(source, ParserSettings::default())
	}
	/// Create a new parser from a given source, with experimental features
	/// toggled by `enabled`.
	pub fn new_with_experimental(source: &'a [u8], enabled: bool) -> Self {
		Parser::new_with_settings(source, ParserSettings::default_with_experimental(enabled))
	}
	/// Create a new parser from a given source, using the given settings.
	pub fn new_with_settings(source: &'a [u8], settings: ParserSettings) -> Self {
		Parser {
			lexer: Lexer::new(source),
			last_span: Span::empty(),
			token_buffer: TokenBuffer::new(),
			table_as_field: true,
			settings,
			unscape_buffer: Vec::new(),
		}
	}
	/// Replace the parser's settings, returning the parser.
	pub fn with_settings(mut self, settings: ParserSettings) -> Self {
		self.settings = settings;
		self
	}
	/// Returns the next token and advances the parser one token forward.
	#[expect(clippy::should_implement_trait)]
	pub fn next(&mut self) -> Token {
		// Prefer a previously peeked token from the buffer; otherwise pull a
		// fresh token from the lexer.
		let res = self
			.token_buffer
			.pop()
			.unwrap_or_else(|| self.lexer.next_token());
		self.last_span = res.span;
		res
	}
	/// Returns the next token and advances the parser one token forward.
	///
	/// This function is like next but returns whitespace tokens which are
	/// normally skipped. Returns `None` when the next token does not directly
	/// follow the previously consumed token (see [`Parser::peek_whitespace`]).
	pub fn next_whitespace(&mut self) -> Option<Token> {
		if let Some(x) = self.peek_whitespace() {
			self.pop_peek();
			return Some(x);
		}
		None
	}
	/// Returns if there is a token in the token buffer, meaning that a token
	/// was peeked.
	// NOTE(review): this returns `true` when the buffer is *empty*, i.e. when
	// no token was peeked, which contradicts the doc above. Looks inverted —
	// confirm against callers before changing.
	pub fn has_peek(&self) -> bool {
		self.token_buffer.is_empty()
	}
	/// Consume the current peeked value and advance the parser one token
	/// forward.
	///
	/// Should only be called after peeking a value.
	///
	/// # Panics
	/// Panics if no token was peeked (the token buffer is empty).
	pub fn pop_peek(&mut self) -> Token {
		let res = self.token_buffer.pop().expect("token buffer is non-empty");
		self.last_span = res.span;
		res
	}
	/// Returns the next token without consuming it.
	pub fn peek(&mut self) -> Token {
		// Lex and buffer a token if none was peeked before.
		let Some(x) = self.token_buffer.first() else {
			let res = self.lexer.next_token();
			self.token_buffer.push(res);
			return res;
		};
		x
	}
	/// Returns the next token without consuming it.
	///
	/// This function is like peek but returns whitespace tokens which are
	/// normally skipped. Does not undo tokens skipped in a previous normal
	/// peek; returns `None` when the buffered token does not directly follow
	/// the last consumed token.
	pub fn peek_whitespace(&mut self) -> Option<Token> {
		let token = if let Some(x) = self.token_buffer.first() {
			x
		} else {
			let token = self.lexer.next_token();
			self.token_buffer.push(token);
			token
		};
		// A gap between the spans means whitespace was skipped in between.
		if !token.span.follows_from(&self.last_span) {
			return None;
		}
		Some(token)
	}
	/// Return the token kind of the next token without consuming it.
	pub fn peek_kind(&mut self) -> TokenKind {
		self.peek().kind
	}
	/// Returns the next n'th token without consuming it.
	/// `peek_token_at(0)` is equivalent to `peek`.
	pub fn peek_token_at(&mut self, at: u8) -> Token {
		// Fill the buffer up to and including index `at`.
		for _ in self.token_buffer.len()..=at {
			let r = self.lexer.next_token();
			self.token_buffer.push(r);
		}
		self.token_buffer.at(at).expect("token exists at index")
	}
	/// Returns the token after the next token without consuming it.
	pub fn peek1(&mut self) -> Token {
		self.peek_token_at(1)
	}
	/// Returns the second token after the next token without consuming it.
	pub fn peek2(&mut self) -> Token {
		self.peek_token_at(2)
	}
	/// Returns the next n'th token without consuming it.
	/// This function will return None if there was any whitespace between the n'th - 1 token
	/// and the n'th token.
	///
	/// `peek_whitespace_token_at::<0>()` is equivalent to `peek_whitespace`.
	pub fn peek_whitespace_token_at<const AT: u8>(&mut self) -> Option<Token> {
		// The token buffer holds at most 4 tokens.
		const { assert!(AT < 4, "Peeking more then 4 tokens is not supported") };
		if AT == 0 {
			return self.peek_whitespace();
		}
		// Fill the buffer up to and including index `AT`.
		for _ in self.token_buffer.len()..=AT {
			let res = self.lexer.next_token();
			self.token_buffer.push(res);
		}
		let Some(token) = self.token_buffer.at(AT) else {
			// The loop above guarantees the buffer is filled to `AT`.
			unreachable!()
		};
		let Some(prev_token) = self.token_buffer.at(AT - 1) else {
			unreachable!()
		};
		// A gap between consecutive spans means whitespace separated them.
		if !token.span.follows_from(&prev_token.span) {
			return None;
		}
		Some(token)
	}
	/// Whitespace-sensitive variant of [`Parser::peek1`].
	pub fn peek_whitespace1(&mut self) -> Option<Token> {
		self.peek_whitespace_token_at::<1>()
	}
	/// Whitespace-sensitive variant of [`Parser::peek2`].
	pub fn peek_whitespace2(&mut self) -> Option<Token> {
		self.peek_whitespace_token_at::<2>()
	}
	/// Returns the span of the next token if it was already peeked, otherwise
	/// returns the span of the last consumed token.
	pub fn recent_span(&mut self) -> Span {
		self.token_buffer
			.first()
			.map(|x| x.span)
			.unwrap_or(self.last_span)
	}
	/// Returns the span of the last consumed token.
	pub fn last_span(&mut self) -> Span {
		self.last_span
	}
	/// Returns an error if the parser has not yet consumed all tokens of the
	/// source.
	pub fn assert_finished(&mut self) -> ParseResult<()> {
		let p = self.peek();
		if p.kind != TokenKind::Eof {
			bail!("Unexpected token `{}`, expected no more tokens", p.kind, @ p.span);
		}
		Ok(())
	}
	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	pub fn eat(&mut self, token: TokenKind) -> bool {
		let peek = self.peek();
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}
	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	///
	/// Unlike [`Parser::eat`] this function will not consume the token if there is whitespace
	/// between the last and next token.
	pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
		let Some(peek) = self.peek_whitespace() else {
			return false;
		};
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}
	/// Checks if the next token is of the given kind. If it isn't it returns an
	/// UnclosedDelimiter error.
	///
	/// `should_close` is the span of the matching opening delimiter; it is
	/// included in the error so the message can point at the delimiter that
	/// the parser expected to be closed.
	fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
		let peek = self.peek();
		if peek.kind != kind {
			bail!(
				"Unexpected token `{}` expected delimiter `{kind}`", peek.kind, @ self
				.recent_span(), @ should_close => "expected this delimiter to close"
			);
		}
		self.pop_peek();
		Ok(())
	}
	/// Recover the parser state to after a given span.
	///
	/// Clears any peeked tokens and rewinds the lexer.
	pub fn backup_after(&mut self, span: Span) {
		self.token_buffer.clear();
		self.lexer.backup_after(span);
	}
	/// Parse a full query.
	///
	/// This is the primary entry point of the parser.
	pub async fn parse_query(&mut self, stk: &mut Stk) -> ParseResult<sql::Ast> {
		let statements = self.parse_stmt_list(stk).await?;
		Ok(sql::Ast {
			expressions: statements,
		})
	}
	/// Parse a single statement.
	async fn parse_statement(&mut self, stk: &mut Stk) -> ParseResult<sql::TopLevelExpr> {
		self.parse_top_level_expr(stk).await
	}
	/// Parse a single expression.
	pub async fn parse_expr(&mut self, stk: &mut Stk) -> ParseResult<sql::Expr> {
		self.parse_expr_start(stk).await
	}
	/// Lex a compound token starting at `start` with the given lexing
	/// function, updating the last-consumed span on success.
	///
	/// See the module docs on compound tokens and token gluing.
	pub fn lex_compound<F, R>(
		&mut self,
		start: Token,
		f: F,
	) -> Result<CompoundToken<R>, SyntaxError>
	where
		F: Fn(&mut Lexer, Token) -> Result<R, SyntaxError>,
	{
		let res = self.lexer.lex_compound(start, f)?;
		self.last_span = res.span;
		Ok(res)
	}
	/// Returns the source text covered by the given span.
	pub fn span_str(&self, span: Span) -> &str {
		self.lexer.span_str(span)
	}
	/// Unescape the identifier covered by `span`, reusing the parser's
	/// internal scratch buffer.
	pub fn unescape_ident_span(&mut self, span: Span) -> Result<&str, SyntaxError> {
		let str = self.lexer.span_str(span);
		Lexer::unescape_ident_span(str, span, &mut self.unscape_buffer)
	}
	/// Unescape the string covered by `span`, reusing the parser's internal
	/// scratch buffer.
	pub fn unescape_string_span(&mut self, span: Span) -> Result<&str, SyntaxError> {
		let str = self.lexer.span_str(span);
		Lexer::unescape_string_span(str, span, &mut self.unscape_buffer)
	}
	/// Unescape the regex covered by `span`, reusing the parser's internal
	/// scratch buffer.
	pub fn unescape_regex_span(&mut self, span: Span) -> Result<&str, SyntaxError> {
		let str = self.lexer.span_str(span);
		Lexer::unescape_regex_span(str, span, &mut self.unscape_buffer)
	}
	/// Speculatively parse a branch.
	///
	/// If the callback returns `Ok(Some(_))` then the lexer state advances like it would normally.
	/// However if the callback returns `Ok(None)` the lexer is rolled back to before
	/// the function was called. Note that an `Err(_)` result is propagated
	/// as-is, without rolling back.
	///
	/// This function can be used for cases where the right branch cannot be determined from the
	/// n'th next token.
	///
	/// # Usage
	/// This function is very powerful but also has drawbacks.
	/// - First it enables ambiguous grammar, when implementing new syntax using this function please
	///   first see if it is possible to implement the feature using the peek functions and otherwise
	///   maybe consider redesigning the syntax so it is `LL(n)`.
	///
	/// - Second because it doesn't provide feedback on what exactly happened it can result in
	///   errors being unpredictable
	///
	/// - Third, any parsing using speculating and then recovering is doing extra work it ideally
	///   didn't have to do.
	///
	/// Please limit the usage to only syntax that can't be efficiently parsed without backtracking
	/// and do not use it for implementing new syntax. If new syntax requires this function to
	/// implement it consider altering the syntax to remove the need for backtracking.
	pub async fn speculate<T, F>(&mut self, stk: &mut Stk, cb: F) -> ParseResult<Option<T>>
	where
		F: AsyncFnOnce(&mut Stk, &mut Parser) -> ParseResult<Option<T>>,
	{
		// Remember where we were so the lexer can be rewound on `Ok(None)`.
		let backup = self.last_span();
		match cb(stk, self).await {
			Ok(Some(x)) => Ok(Some(x)),
			Ok(None) => {
				self.backup_after(backup);
				Ok(None)
			}
			Err(e) => Err(e),
		}
	}
}
/// A struct which can parse queries statement by statement
pub struct StatementStream {
	/// Stack used to drive the recursive parser (see `stack.enter` in the
	/// parse methods) without overflowing the native call stack.
	stack: Stack,
	/// Settings forwarded to each [`Parser`] created per call.
	settings: ParserSettings,
	/// Column offset of the data consumed so far, used to adjust rendered
	/// error locations.
	col_offset: usize,
	/// Line offset of the data consumed so far, used to adjust rendered
	/// error locations.
	line_offset: usize,
}
impl StatementStream {
	/// Create a statement stream with default parser settings.
	#[expect(clippy::new_without_default)]
	pub fn new() -> Self {
		Self::new_with_settings(ParserSettings::default())
	}
	/// Create a statement stream with the given parser settings.
	pub fn new_with_settings(settings: ParserSettings) -> Self {
		StatementStream {
			stack: Stack::new(),
			settings,
			col_offset: 0,
			line_offset: 0,
		}
	}
	/// Updates the line and column offset after consuming bytes.
	fn accumulate_line_col(&mut self, bytes: &[u8]) {
		// `lines().enumerate().last()` yields the zero-based index of the last
		// line (i.e. the number of line breaks crossed) and its text.
		let (line_num, remaining) = std::str::from_utf8(bytes)
			.expect("parser validated utf8")
			.lines()
			.enumerate()
			.last()
			.unwrap_or((0, ""));
		self.line_offset += line_num;
		if line_num > 0 {
			// A new line was started, restart column counting from zero.
			self.col_offset = 0;
		}
		// NOTE(review): `str::lines` does not yield a final empty line when
		// `bytes` ends with a newline, so in that case `remaining` is the text
		// *before* that newline and the column offset may be off — confirm.
		self.col_offset += remaining.chars().count();
	}
	/// Parses a statement if the buffer contains sufficient data to parse a
	/// statement.
	///
	/// When it has done so it will remove the read bytes from the
	/// buffer and return Ok(Some(_)). In case of a parsing error it will
	/// return Err(_), this will not consume data.
	///
	/// If the function returns Ok(None), not enough data was in the buffer to
	/// fully parse a statement, the function might still consume data from the
	/// buffer, like whitespace between statements, when `None` is returned.
	pub fn parse_partial(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::TopLevelExpr>, RenderedError> {
		// Spans use u32 offsets, so at most the first 4GB of the buffer can be
		// handed to the parser at once.
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			slice = &slice[..u32::MAX as usize];
		}
		let mut parser = Parser::new_with_settings(slice, self.settings.clone());
		// Skip any leading empty statements.
		while parser.eat(t!(";")) {}
		// When the buffer exceeds the span range, drop the already-consumed
		// leading semicolons so more of the source fits in range, and restart
		// the parser on the trimmed buffer.
		if parser.peek().span.offset != 0 && buffer.len() > u32::MAX as usize {
			let eaten = buffer.split_to(parser.peek().span.offset as usize);
			self.accumulate_line_col(&eaten);
			slice = &**buffer;
			if slice.len() > u32::MAX as usize {
				slice = &slice[..u32::MAX as usize];
			}
			parser = Parser::new_with_settings(slice, self.settings.clone());
		}
		// Nothing but semicolons/whitespace so far: wait for more data.
		if parser.peek().is_eof() {
			return Ok(None);
		}
		let res = self.stack.enter(|stk| parser.parse_statement(stk)).finish();
		if parser.peek().is_eof() {
			// The statement ran to the end of the (possibly truncated) slice.
			if buffer.len() > u32::MAX as usize {
				// The statement can never fit within the span range.
				let error = syntax_error!(
					"Cannot parse query, statement exceeded maximum size of 4GB", @
					parser.last_span()
				);
				return Err(error
					.render_on_bytes(buffer)
					.offset_location(self.line_offset, self.col_offset));
			}
			// The statement might merely be incomplete: wait for more data.
			return Ok(None);
		}
		if !parser.eat(t!(";")) {
			let peek = parser.next();
			// NOTE(review): `peek1` here inspects the token *after* the one
			// just consumed by `next()`; confirm this is the intended
			// "more data may still arrive" check.
			if parser.peek1().is_eof() {
				return Ok(None);
			}
			// Prefer reporting the parse error over the missing-semicolon
			// error when both occurred.
			if let Err(e) = res {
				return Err(e
					.render_on_bytes(slice)
					.offset_location(self.line_offset, self.col_offset));
			}
			let error = syntax_error!(
				"Unexpected token `{}` expected the query to end.", peek.kind.as_str(), @
				peek.span => "maybe forgot a semicolon after the previous statement?"
			);
			return Err(error
				.render_on_bytes(slice)
				.offset_location(self.line_offset, self.col_offset));
		}
		// Consume any trailing empty statements.
		while parser.eat(t!(";")) {}
		// Remove the consumed bytes from the buffer and keep the line/column
		// offsets up to date for subsequent error rendering.
		let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
		let res = res.map(Some).map_err(|e| {
			e.render_on_bytes(&eaten)
				.offset_location(self.line_offset, self.col_offset)
		});
		self.accumulate_line_col(&eaten);
		res
	}
	/// Parse remaining statements once the buffer is complete.
	///
	/// Unlike [`StatementStream::parse_partial`], a trailing statement without
	/// a terminating semicolon is parsed rather than waiting for more data.
	pub fn parse_complete(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::TopLevelExpr>, RenderedError> {
		// Spans use u32 offsets, so at most the first 4GB can be parsed.
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			slice = &slice[..u32::MAX as usize];
		}
		let mut parser = Parser::new_with_settings(slice, self.settings.clone());
		// Skip any leading empty statements.
		while parser.eat(t!(";")) {}
		if parser.peek().is_eof() {
			// Only semicolons/whitespace remained; the buffer is exhausted.
			buffer.clear();
			return Ok(None);
		}
		match self.stack.enter(|stk| parser.parse_statement(stk)).finish() {
			Ok(x) => {
				if !parser.peek().is_eof() && !parser.eat(t!(";")) {
					let peek = parser.peek();
					let error = syntax_error!(
						"Unexpected token `{}` expected the query to end.", peek.kind
						.as_str(), @ peek.span =>
						"maybe forgot a semicolon after the previous statement?"
					);
					return Err(error
						.render_on_bytes(slice)
						.offset_location(self.line_offset, self.col_offset));
				}
				// Drop the consumed bytes and keep the offsets up to date.
				let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
				self.accumulate_line_col(&eaten);
				Ok(Some(x))
			}
			Err(e) => Err(e
				.render_on_bytes(slice)
				.offset_location(self.line_offset, self.col_offset)),
		}
	}
}