Skip to main content

reifydb_testing/testscript/
parser.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (c) 2025 ReifyDB
3
4// This file includes and modifies code from the toydb project (https://github.com/erikgrinaker/toydb),
5// originally licensed under the Apache License, Version 2.0.
6// Original copyright:
7//   Copyright (c) 2024 Erik Grinaker
8//
9// The original Apache License can be found at:
10//   http://www.apache.org/licenses/LICENSE-2.0
11
12use std::{collections::HashSet, fmt};
13
14use crate::testscript::command::{Argument, Block, Command};
15
16#[derive(Debug, Clone)]
17pub struct ParseError {
18	pub message: String,
19	pub line: u32,
20	pub column: usize,
21	pub input: LocatedSpan,
22	pub code: String,
23}
24
25impl fmt::Display for ParseError {
26	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
27		write!(f, "Parse error at line {}:{}: {}", self.line, self.column, self.message)
28	}
29}
30
31impl std::error::Error for ParseError {}
32
/// Location information attached to a [`ParseError`]: a column, a line number
/// and the text of the line the error refers to.
#[derive(Debug, Clone)]
pub struct LocatedSpan {
	column: usize,
	line: u32,
	line_text: String,
}

impl LocatedSpan {
	/// Creates a span. The first argument (the line's start offset) is
	/// accepted for call-site compatibility but is not stored.
	fn new(_line_start: usize, column: usize, line: u32, line_text: String) -> Self {
		Self {
			column,
			line,
			line_text,
		}
	}

	/// Line number recorded for this span.
	pub fn location_line(&self) -> u32 {
		self.line
	}

	/// Column recorded for this span.
	pub fn get_column(&self) -> usize {
		self.column
	}

	/// Same value as [`LocatedSpan::get_column`]; no separate UTF-8 column
	/// is tracked.
	pub fn get_utf8_column(&self) -> usize {
		self.get_column()
	}

	/// Bytes of the stored line text.
	pub fn get_line_beginning(&self) -> &[u8] {
		self.line_text.as_str().as_bytes()
	}
}
65
66pub(crate) fn parse(input: &str) -> Result<Vec<Block>, ParseError> {
67	let mut parser = Parser::new(input);
68	parser.parse_blocks()
69}
70
/// Test-only helper: parses `input` as a single command.
#[cfg(test)]
pub(crate) fn parse_command(input: &str) -> Result<Command, ParseError> {
	Parser::new(input).parse_command()
}
76
/// Hand-written cursor over the testscript text.
///
/// `advance` is the only method that moves the cursor forward; it keeps the
/// position fields below in sync with each other.
struct Parser<'a> {
	/// The complete text being parsed.
	input: &'a str,
	/// Byte offset into `input` of the next unread character.
	pos: usize,
	/// Current line number; starts at 1 and grows by one per consumed `'\n'`.
	line: u32,
	/// Current column; starts at 1, grows by one per consumed character and
	/// resets to 1 after a `'\n'`.
	column: usize,
	/// Byte offset into `input` at which the current line starts.
	line_start_pos: usize,
}
84
85impl<'a> Parser<'a> {
86	fn new(input: &'a str) -> Self {
87		Parser {
88			input,
89			pos: 0,
90			line: 1,
91			column: 1,
92			line_start_pos: 0,
93		}
94	}
95
96	fn current_char(&self) -> Option<char> {
97		self.input[self.pos..].chars().next()
98	}
99
100	fn peek_char(&self) -> Option<char> {
101		self.current_char()
102	}
103
104	fn peek_str(&self, n: usize) -> &str {
105		// This function returns n bytes from current position
106		// It's only used for checking ASCII patterns like "---" and
107		// "//"
108		let end = (self.pos + n).min(self.input.len());
109
110		// Make sure we don't split a UTF-8 character
111		let mut safe_end = end;
112		while safe_end > self.pos && !self.input.is_char_boundary(safe_end) {
113			safe_end -= 1;
114		}
115
116		&self.input[self.pos..safe_end]
117	}
118
119	fn advance(&mut self) -> Option<char> {
120		if let Some(ch) = self.current_char() {
121			self.pos += ch.len_utf8();
122			if ch == '\n' {
123				self.line += 1;
124				self.column = 1;
125				self.line_start_pos = self.pos;
126			} else {
127				self.column += 1;
128			}
129			Some(ch)
130		} else {
131			None
132		}
133	}
134
135	fn skip_whitespace(&mut self) {
136		while let Some(ch) = self.peek_char() {
137			if ch.is_whitespace() && ch != '\n' {
138				self.advance();
139			} else {
140				break;
141			}
142		}
143	}
144
145	fn skip_line(&mut self) {
146		while let Some(ch) = self.peek_char() {
147			if ch == '\n' {
148				self.advance();
149				break;
150			}
151			self.advance();
152		}
153	}
154
155	fn is_at_end(&self) -> bool {
156		self.pos >= self.input.len()
157	}
158
159	fn error(&self, message: impl Into<String>) -> ParseError {
160		let line_end = self.input[self.line_start_pos..]
161			.find('\n')
162			.map(|i| self.line_start_pos + i)
163			.unwrap_or(self.input.len());
164		let line_text = &self.input[self.line_start_pos..line_end];
165
166		ParseError {
167			message: message.into(),
168			line: self.line,
169			column: self.column,
170			input: LocatedSpan::new(self.line_start_pos, self.column, self.line, line_text.to_string()),
171			code: format!("{:?}", line_text),
172		}
173	}
174
175	fn parse_blocks(&mut self) -> Result<Vec<Block>, ParseError> {
176		let mut blocks = Vec::new();
177
178		while !self.is_at_end() {
179			if let Some(block) = self.parse_block()? {
180				blocks.push(block);
181			}
182		}
183
184		Ok(blocks)
185	}
186
187	fn parse_block(&mut self) -> Result<Option<Block>, ParseError> {
188		let line_number = self.line;
189		let literal_start = self.pos;
190
191		// Parse commands
192		let commands = self.parse_commands()?;
193
194		// Capture literal
195		let literal_end = self.pos;
196		let literal = self.input[literal_start..literal_end].to_string();
197
198		// Handle empty block at EOF
199		if self.is_at_end() && commands.is_empty() {
200			return Ok(Some(Block {
201				literal,
202				commands,
203				line_number,
204			}));
205		}
206
207		// If no commands and not at EOF, this isn't a valid block
208		if commands.is_empty() {
209			return Ok(None);
210		}
211
212		// Parse separator
213		if !self.parse_separator()? {
214			return Err(self.error("Expected --- separator"));
215		}
216
217		// Parse and skip output
218		self.parse_output()?;
219
220		Ok(Some(Block {
221			literal,
222			commands,
223			line_number,
224		}))
225	}
226
227	fn parse_commands(&mut self) -> Result<Vec<Command>, ParseError> {
228		let mut commands = Vec::new();
229
230		loop {
231			// Skip empty and comment lines
232			if self.skip_empty_or_comment_line() {
233				continue;
234			}
235
236			// Check for EOF
237			if self.is_at_end() {
238				break;
239			}
240
241			// Check for separator
242			if self.peek_str(3) == "---" {
243				if !commands.is_empty() {
244					break;
245				}
246			}
247
248			// Check for leading whitespace (not allowed for
249			// commands)
250			if let Some(ch) = self.peek_char() {
251				if ch.is_whitespace() && ch != '\n' {
252					return Err(self.error("Command cannot start with whitespace"));
253				}
254			}
255
256			// Parse command
257			match self.parse_command() {
258				Ok(cmd) => commands.push(cmd),
259				Err(e) => {
260					// If we hit a separator but have no
261					// commands, let parse_command error
262					if self.peek_str(3) == "---" && commands.is_empty() {
263						return Err(e);
264					}
265					return Err(e);
266				}
267			}
268		}
269
270		Ok(commands)
271	}
272
273	fn parse_command(&mut self) -> Result<Command, ParseError> {
274		let line_number = self.line;
275
276		// Check for silencing (
277		let silent = if self.peek_char() == Some('(') {
278			self.advance();
279			self.skip_whitespace();
280			true
281		} else {
282			false
283		};
284
285		// Parse prefix and tags
286		let mut tags = HashSet::new();
287		let mut prefix = None;
288
289		// Try to parse prefix (string followed by :)
290		let saved_pos = self.pos;
291		self.skip_whitespace();
292		if let Ok(s) = self.parse_string() {
293			self.skip_whitespace();
294			if self.peek_char() == Some(':') {
295				self.advance();
296				self.skip_whitespace();
297				prefix = Some(s);
298			} else {
299				// Backtrack
300				self.pos = saved_pos;
301			}
302		}
303
304		// Parse tags before command
305		self.skip_whitespace();
306		if let Some(parsed_tags) = self.parse_taglist()? {
307			tags.extend(parsed_tags);
308		}
309		self.skip_whitespace();
310
311		// Check for fail marker
312		let fail = if self.peek_char() == Some('!') {
313			self.advance();
314			self.skip_whitespace();
315			true
316		} else {
317			false
318		};
319
320		// Check for literal command (>)
321		if self.peek_char() == Some('>') {
322			self.advance();
323			self.skip_whitespace();
324			let name = self.parse_line_continuation()?;
325			return Ok(Command {
326				name,
327				args: Vec::new(),
328				tags,
329				prefix,
330				silent,
331				fail,
332				line_number,
333			});
334		}
335
336		// Parse command name
337		self.skip_whitespace();
338		let name = self.parse_string().map_err(|_| self.error("Expected command name"))?;
339
340		// Parse arguments
341		let mut args = Vec::new();
342		loop {
343			self.skip_whitespace();
344			if self.peek_char() == Some('[') {
345				// Might be trailing tags
346				if let Some(parsed_tags) = self.parse_taglist()? {
347					tags.extend(parsed_tags);
348					break;
349				}
350			}
351
352			// Check for end of command
353			if silent && self.peek_char() == Some(')') {
354				break;
355			}
356
357			if self.peek_char() == Some('#') || self.peek_str(2) == "//" {
358				break;
359			}
360
361			if self.peek_char() == Some('\n') || self.is_at_end() {
362				break;
363			}
364
365			// Try to parse an argument
366			let saved_pos = self.pos;
367			let saved_line = self.line;
368			let saved_column = self.column;
369			let saved_line_start = self.line_start_pos;
370			match self.parse_argument() {
371				Ok(arg) => args.push(arg),
372				Err(_) => {
373					self.pos = saved_pos;
374					self.line = saved_line;
375					self.column = saved_column;
376					self.line_start_pos = saved_line_start;
377					break;
378				}
379			}
380		}
381
382		// Handle closing ) for silent commands
383		if silent {
384			self.skip_whitespace();
385			if self.peek_char() != Some(')') {
386				return Err(self.error("Expected closing ) for silent command"));
387			}
388			self.advance();
389		}
390
391		// Skip trailing whitespace and comments
392		self.skip_whitespace();
393		if self.peek_char() == Some('#') || self.peek_str(2) == "//" {
394			self.skip_line();
395		} else if self.peek_char() == Some('\n') {
396			self.advance();
397		} else if !self.is_at_end() {
398			return Err(self.error("Expected end of line"));
399		}
400
401		Ok(Command {
402			name,
403			args,
404			tags,
405			prefix,
406			silent,
407			fail,
408			line_number,
409		})
410	}
411
412	fn parse_argument(&mut self) -> Result<Argument, ParseError> {
413		// Try key=value format first
414		let saved_pos = self.pos;
415		let saved_line = self.line;
416		let saved_column = self.column;
417		let saved_line_start = self.line_start_pos;
418
419		self.skip_whitespace();
420		if let Ok(key) = self.parse_string() {
421			if self.peek_char() == Some('=') {
422				self.advance();
423				// Allow empty value after =
424				// First check if we have an empty value (next
425				// is whitespace or another key)
426				let value = if matches!(self.peek_char(), Some(ch) if ch.is_whitespace())
427					|| matches!(self.peek_char(), Some('[' | ')' | '#'))
428					|| self.peek_char().is_none() || self.peek_str(2) == "//"
429				{
430					// Empty value case
431					String::new()
432				} else {
433					// Try to parse a value
434					match self.parse_string() {
435						Ok(v) => v,
436						Err(_) => {
437							// Check if this looks
438							// like another
439							// key=value pair
440							// by looking for a
441							// string followed by =
442							let check_pos = self.pos;
443							let check_line = self.line;
444							let check_column = self.column;
445							let check_line_start = self.line_start_pos;
446
447							// Try to parse what
448							// comes next as a
449							// potential key
450							if let Ok(_) = self.parse_string() {
451								if self.peek_char() == Some('=') {
452									// This looks like another key=value, so current
453									// value is empty
454									self.pos = check_pos;
455									self.line = check_line;
456									self.column = check_column;
457									self.line_start_pos = check_line_start;
458									String::new()
459								} else {
460									// Not a key=value, this is an error
461									self.pos = saved_pos;
462									self.line = saved_line;
463									self.column = saved_column;
464									self.line_start_pos = saved_line_start;
465									return Err(self.error(
466										"Expected argument value after =",
467									));
468								}
469							} else {
470								// Can't parse
471								// anything, this
472								// is an error
473								self.pos = saved_pos;
474								self.line = saved_line;
475								self.column = saved_column;
476								self.line_start_pos = saved_line_start;
477								return Err(
478									self.error("Expected argument value after =")
479								);
480							}
481						}
482					}
483				};
484				return Ok(Argument {
485					key: Some(key),
486					value,
487				});
488			}
489			// Just a value
490			return Ok(Argument {
491				key: None,
492				value: key,
493			});
494		}
495
496		self.pos = saved_pos;
497		Err(self.error("Expected argument"))
498	}
499
500	fn parse_taglist(&mut self) -> Result<Option<HashSet<String>>, ParseError> {
501		if self.peek_char() != Some('[') {
502			return Ok(None);
503		}
504
505		self.advance();
506		let mut tags = HashSet::new();
507
508		loop {
509			self.skip_whitespace();
510
511			if self.peek_char() == Some(']') {
512				// Empty tag list is an error
513				if tags.is_empty() {
514					return Err(self.error("Empty tag list"));
515				}
516				self.advance();
517				break;
518			}
519
520			self.skip_whitespace();
521			let tag = self.parse_string().map_err(|_| self.error("Expected tag name"))?;
522			tags.insert(tag);
523
524			self.skip_whitespace();
525			if self.peek_char() == Some(',') {
526				self.advance();
527				self.skip_whitespace();
528			} else if self.peek_char() == Some(' ') {
529				self.skip_whitespace();
530			}
531		}
532
533		Ok(Some(tags))
534	}
535
536	fn parse_string(&mut self) -> Result<String, ParseError> {
537		// Note: Don't skip whitespace here - the caller should handle
538		// that self.skip_whitespace();
539
540		match self.peek_char() {
541			Some('\'') => self.parse_quoted_string('\''),
542			Some('"') => self.parse_quoted_string('"'),
543			_ => self.parse_unquoted_string(),
544		}
545	}
546
547	fn parse_unquoted_string(&mut self) -> Result<String, ParseError> {
548		let mut result = String::new();
549
550		// First character must be alphanumeric or _
551		match self.peek_char() {
552			Some(ch) if ch.is_alphanumeric() || ch == '_' => {
553				result.push(ch);
554				self.advance();
555			}
556			_ => return Err(self.error("Expected string")),
557		}
558
559		// Subsequent characters
560		while let Some(ch) = self.peek_char() {
561			if ch.is_alphanumeric() || "_-./@".contains(ch) {
562				result.push(ch);
563				self.advance();
564			} else {
565				break;
566			}
567		}
568
569		Ok(result)
570	}
571
572	fn parse_quoted_string(&mut self, quote: char) -> Result<String, ParseError> {
573		let mut result = String::new();
574
575		// Skip opening quote
576		if self.peek_char() != Some(quote) {
577			return Err(self.error(format!("Expected {} quote", quote)));
578		}
579		self.advance();
580
581		while let Some(ch) = self.peek_char() {
582			if ch == quote {
583				self.advance();
584				return Ok(result);
585			} else if ch == '\\' {
586				self.advance();
587				match self.peek_char() {
588					Some('\'') => {
589						result.push('\'');
590						self.advance();
591					}
592					Some('"') => {
593						result.push('"');
594						self.advance();
595					}
596					Some('\\') => {
597						result.push('\\');
598						self.advance();
599					}
600					Some('0') => {
601						result.push('\0');
602						self.advance();
603					}
604					Some('n') => {
605						result.push('\n');
606						self.advance();
607					}
608					Some('r') => {
609						result.push('\r');
610						self.advance();
611					}
612					Some('t') => {
613						result.push('\t');
614						self.advance();
615					}
616					Some('x') => {
617						self.advance();
618						let hex = self.parse_hex_digits(2, 2)?;
619						let byte = u8::from_str_radix(&hex, 16)
620							.map_err(|_| self.error("Invalid hex escape"))?;
621						result.push(char::from(byte));
622					}
623					Some('u') => {
624						self.advance();
625						if self.peek_char() != Some('{') {
626							return Err(self.error("Expected { after \\u"));
627						}
628						self.advance();
629						let hex = self.parse_hex_digits(1, 6)?;
630						if self.peek_char() != Some('}') {
631							return Err(self.error("Expected } after unicode escape"));
632						}
633						self.advance();
634						let codepoint = u32::from_str_radix(&hex, 16)
635							.map_err(|_| self.error("Invalid unicode escape"))?;
636						let ch = char::from_u32(codepoint)
637							.ok_or_else(|| self.error("Invalid unicode codepoint"))?;
638						result.push(ch);
639					}
640					_ => {
641						return Err(self.error("Invalid escape sequence"));
642					}
643				}
644			} else {
645				result.push(ch);
646				self.advance();
647			}
648		}
649
650		Err(self.error(format!("Unterminated string (missing {})", quote)))
651	}
652
653	fn parse_hex_digits(&mut self, min: usize, max: usize) -> Result<String, ParseError> {
654		let mut hex = String::new();
655		for i in 0..max {
656			match self.peek_char() {
657				Some(ch) if ch.is_ascii_hexdigit() => {
658					hex.push(ch);
659					self.advance();
660				}
661				_ => {
662					if i < min {
663						return Err(self.error(format!("Expected at least {} hex digits", min)));
664					}
665					break;
666				}
667			}
668		}
669		if hex.len() < min {
670			return Err(self.error(format!("Expected at least {} hex digits", min)));
671		}
672		Ok(hex)
673	}
674
675	fn skip_empty_or_comment_line(&mut self) -> bool {
676		let saved_pos = self.pos;
677
678		self.skip_whitespace();
679
680		// Check for comment
681		if self.peek_char() == Some('#') || self.peek_str(2) == "//" {
682			self.skip_line();
683			return true;
684		}
685
686		// Check for empty line
687		if self.peek_char() == Some('\n') {
688			self.advance();
689			return true;
690		}
691
692		// Not an empty or comment line, restore position
693		self.pos = saved_pos;
694		false
695	}
696
697	fn parse_separator(&mut self) -> Result<bool, ParseError> {
698		if self.peek_str(3) != "---" {
699			return Ok(false);
700		}
701
702		self.advance(); // -
703		self.advance(); // -
704		self.advance(); // -
705
706		// Must be followed by newline (with optional \r) or EOF
707		match self.peek_char() {
708			Some('\r') => {
709				self.advance();
710				if self.peek_char() == Some('\n') {
711					self.advance();
712				}
713				Ok(true)
714			}
715			Some('\n') => {
716				self.advance();
717				Ok(true)
718			}
719			None => Ok(true),
720			_ => Err(self.error("Separator must be followed by newline or EOF")),
721		}
722	}
723
724	fn parse_output(&mut self) -> Result<(), ParseError> {
725		// Special case: no output (immediate newline or EOF)
726		if self.peek_char() == Some('\n') || self.is_at_end() {
727			if self.peek_char() == Some('\n') {
728				self.advance();
729			}
730			return Ok(());
731		}
732
733		// Read until double newline or EOF
734		let mut last_was_newline = false;
735		while !self.is_at_end() {
736			let ch = self.advance().unwrap();
737			if ch == '\n' {
738				if last_was_newline {
739					break;
740				}
741				last_was_newline = true;
742			} else {
743				last_was_newline = false;
744			}
745		}
746
747		Ok(())
748	}
749
750	fn parse_line_continuation(&mut self) -> Result<String, ParseError> {
751		let mut result = String::new();
752
753		loop {
754			// Read until end of line
755			while let Some(ch) = self.peek_char() {
756				if ch == '\n' {
757					break;
758				}
759				result.push(ch);
760				self.advance();
761			}
762
763			// Check for continuation
764			if result.ends_with('\\') {
765				result.pop(); // Remove the backslash
766				if self.peek_char() == Some('\n') {
767					self.advance(); // Skip newline
768					continue;
769				}
770			}
771
772			// Skip the final newline
773			if self.peek_char() == Some('\n') {
774				self.advance();
775			}
776
777			break;
778		}
779
780		Ok(result)
781	}
782}