dpc/parse/
lex.rs

use std::fmt::{Debug, Display};

use anyhow::bail;

// Yes this is all copied from mcvm

/// Create a list of tokens from package text contents that we will
/// then use for parsing
pub fn lex(text: &str) -> anyhow::Result<Vec<(Token, TextPos)>> {
	let mut tokens: Vec<(Token, TextPos)> = Vec::new();

	// Positional
	let mut line_n: usize = 1;
	let mut last_line_i: usize = 0;
	let mut tok_start_pos = TextPos(line_n, 0, 0);

	// Current token
	let mut tok: Token = Token::None;
	let mut tok_finished = false;

	// Specific token-related vars
	let mut escape = false;
	let mut num_str = String::new();

	for (i, c) in text.chars().enumerate() {
		let pos = TextPos(line_n, i - last_line_i, i);
		if c == '\n' {
			line_n += 1;
			// We add one since otherwise the next line starts at column 1 instead of 0
			last_line_i = i + 1;
		}

		// Using this loop as a goto
		loop {
			let mut repeat = false;
			match &mut tok {
				Token::None => match c {
					';' => {
						tok = Token::Semicolon;
						tok_finished = true;
					}
					':' => {
						tok = Token::Colon;
						tok_finished = true;
					}
					',' => {
						tok = Token::Comma;
						tok_finished = true;
					}
					'|' => {
						tok = Token::Pipe;
						tok_finished = true;
					}
					'{' => {
						tok = Token::Curly(Side::Left);
						tok_finished = true;
					}
					'}' => {
						tok = Token::Curly(Side::Right);
						tok_finished = true;
					}
					'[' => {
						tok = Token::Square(Side::Left);
						tok_finished = true;
					}
					']' => {
						tok = Token::Square(Side::Right);
						tok_finished = true;
					}
					'(' => {
						tok = Token::Paren(Side::Left);
						tok_finished = true;
					}
					')' => {
						tok = Token::Paren(Side::Right);
						tok_finished = true;
					}
					'<' => {
						tok = Token::Angle(Side::Left);
						tok_finished = true;
					}
					'>' => {
						tok = Token::Angle(Side::Right);
						tok_finished = true;
					}
					'@' => {
						tok = Token::At;
						tok_finished = true;
					}
					'!' => {
						tok = Token::Bang;
						tok_finished = true;
					}
					'=' => {
						tok = Token::Equal;
						tok_finished = true;
					}
					'%' => {
						tok = Token::Percent;
						tok_finished = true;
					}
					'&' => {
						tok = Token::Ampersand;
						tok_finished = true;
					}
					'~' => {
						tok = Token::Tilde;
						tok_finished = true;
					}
					'"' => tok = Token::Str(String::new()),
					'#' => tok = Token::Comment(String::new()),
					'$' => tok = Token::Variable(String::new()),
					c if is_whitespace(c) => tok = Token::Whitespace,
					c if is_num(c, true) => {
						tok = Token::Num(0);
						num_str = c.to_string();
					}
					c if is_ident(c, true) => tok = Token::Ident(c.into()),
					_ => bail!("Unexpected character {c:?} {pos}"),
				},
				Token::Str(string) => match lex_string_char(c, escape) {
					StrLexResult::Append => {
						string.push(c);
						escape = false;
					}
					StrLexResult::Escape => escape = true,
					StrLexResult::End => {
						escape = false;
						tok_finished = true;
					}
				},
				Token::Comment(string) => {
					if c == '\n' {
						tok_finished = true;
					} else {
						string.push(c);
					}
				}
				Token::Variable(name) => {
					let allowed = is_ident(c, name.is_empty());

					if allowed {
						name.push(c);
					} else {
						repeat = true;
						tokens.push((tok, tok_start_pos.clone()));
						tok_start_pos = pos.clone();
						tok = Token::None;
					}
				}
				Token::Whitespace => {
					if !is_whitespace(c) {
						repeat = true;
						tokens.push((tok, tok_start_pos.clone()));
						tok_start_pos = pos.clone();
						tok = Token::None;
					}
				}
				Token::Ident(name) => {
					if is_ident(c, false) {
						name.push(c);
					} else {
						repeat = true;
						tokens.push((tok, tok_start_pos.clone()));
						tok_start_pos = pos.clone();
						tok = Token::None;
					}
				}
				Token::Num(num) => {
					if is_num(c, false) {
						num_str.push(c);
					} else if c == '.' {
						// Switch over to a decimal token. The actual value is parsed
						// from num_str when the token ends, so the value here is just
						// a placeholder
						tok = Token::Decimal(TryInto::<i32>::try_into(*num)?.try_into()?);
						num_str.push('.');
					} else {
						repeat = true;
						if num_str == "-" {
							bail!("Invalid number '{num_str}', {pos}");
						}
						*num = num_str.parse().expect("Number contains invalid characters");
						tokens.push((tok, tok_start_pos.clone()));
						tok_start_pos = pos.clone();
						tok = Token::None;
					}
				}
				Token::Decimal(num) => {
					if is_decimal(c, false, true) {
						num_str.push(c);
					} else {
						repeat = true;
						if num_str == "-" {
							bail!("Invalid number '{num_str}', {pos}");
						}
						*num = num_str
							.parse()
							.expect("Decimal number contains invalid characters");
						tokens.push((tok, tok_start_pos.clone()));
						tok_start_pos = pos.clone();
						tok = Token::None;
					}
				}
				_ => {}
			}
			if !repeat {
				break;
			}
		}
		if tok_finished {
			tok_finished = false;
			tokens.push((tok, tok_start_pos));
			tok_start_pos = pos.clone();
			// Since these are not greedy we need to increase the col by 1
			tok_start_pos.increase_col(1);
			tok = Token::None;
		}
	}

	match &mut tok {
		Token::Num(num) => {
			*num = num_str.parse().expect("Number contains invalid characters");
			tokens.push((tok, tok_start_pos.clone()));
		}
		Token::Decimal(num) => {
			*num = num_str.parse().expect("Decimal number contains invalid characters");
			tokens.push((tok, tok_start_pos.clone()));
		}
		Token::None => {}
		_ => tokens.push((tok, tok_start_pos.clone())),
	}
	Ok(tokens)
}

/// A token that we derive from text
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
	/// An empty token with no meaning. These technically shouldn't appear in the
	/// output, but consumers should just skip over them if they do.
	None,
	/// Any whitespace, such as tabs and newlines
	Whitespace,
	/// A semicolon (;)
	Semicolon,
	/// A colon (:)
	Colon,
	/// A comma (,)
	Comma,
	/// A pipe / bar (|)
	Pipe,
	/// An at symbol (@)
	At,
	/// An exclamation point (!)
	Bang,
	/// An equal sign (=)
	Equal,
	/// A percent sign (%)
	Percent,
	/// An ampersand (&)
	Ampersand,
	/// A tilde (~)
	Tilde,
	/// A variable ($var_name)
	Variable(String),
	/// A curly brace ({ / })
	Curly(Side),
	/// A square bracket ([ / ])
	Square(Side),
	/// A parenthesis (( / ))
	Paren(Side),
	/// An angle bracket (< / >)
	Angle(Side),
	/// A comment
	Comment(String),
	/// An identifier (foo)
	Ident(String),
	/// An integer number (-12, 6, 128, etc.)
	Num(i128),
	/// A decimal number (-2.4, 6.0, 88.9, etc.)
	Decimal(f64),
	/// A string literal ("'hello' there")
	Str(String),
}

impl Token {
	/// Get the string representation of this token
	pub fn as_string(&self) -> String {
		match self {
			Token::None => "none".into(),
			Token::Whitespace => " ".into(),
			Token::Semicolon => ";".into(),
			Token::Colon => ":".into(),
			Token::Comma => ",".into(),
			Token::Pipe => "|".into(),
			Token::At => "@".into(),
			Token::Bang => "!".into(),
			Token::Equal => "=".into(),
			Token::Percent => "%".into(),
			Token::Ampersand => "&".into(),
			Token::Tilde => "~".into(),
			Token::Variable(name) => "$".to_string() + name,
			Token::Curly(Side::Left) => "{".into(),
			Token::Curly(Side::Right) => "}".into(),
			Token::Square(Side::Left) => "[".into(),
			Token::Square(Side::Right) => "]".into(),
			Token::Paren(Side::Left) => "(".into(),
			Token::Paren(Side::Right) => ")".into(),
			Token::Angle(Side::Left) => "<".into(),
			Token::Angle(Side::Right) => ">".into(),
			Token::Comment(text) => "# ".to_string() + text,
			Token::Ident(name) => name.clone(),
			Token::Num(num) => num.to_string(),
			Token::Decimal(num) => num.to_string(),
			Token::Str(string) => format!("\"{string}\""),
		}
	}

	/// Checks if this token carries no meaning (whitespace, comments, or none) and can be skipped
	pub fn is_ignored(&self) -> bool {
		matches!(self, Token::None | Token::Comment(..) | Token::Whitespace)
	}
}

/// Generic side for something like a bracket
#[derive(Debug, PartialEq, Copy, Clone)]
pub enum Side {
	/// Something on the left side (e.g. [)
	Left,
	/// Something on the right side (e.g. ])
	Right,
}

/// Text positional information with row, column, and absolute index
#[derive(Clone, PartialEq, Eq)]
pub struct TextPos(usize, usize, usize);

impl Debug for TextPos {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		write!(f, "({}:{}:{})", self.0, self.1, self.2)
	}
}

impl Display for TextPos {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		write!(f, "({}:{})", self.0, self.1)
	}
}

impl TextPos {
	/// Create a new TextPos
	pub fn new(row: usize, col: usize, abs: usize) -> Self {
		Self(row, col, abs)
	}

	/// Get the row
	pub fn row(&self) -> &usize {
		&self.0
	}

	/// Get the column
	pub fn col(&self) -> &usize {
		&self.1
	}

	/// Get the absolute index
	pub fn absolute(&self) -> &usize {
		&self.2
	}

	/// Increase the column and absolute index of the pos
	pub fn increase_col(&mut self, amt: usize) {
		self.1 += amt;
		self.2 += amt;
	}
}

/// Token and TextPos
pub type TokenAndPos = (Token, TextPos);

/// What action to perform after lexing a string character
#[derive(Debug, PartialEq)]
enum StrLexResult {
	Append,
	Escape,
	End,
}

/// Figure out what to do with a character of a string when lexing
fn lex_string_char(c: char, escape: bool) -> StrLexResult {
	if escape {
		StrLexResult::Append
	} else {
		match c {
			'"' => StrLexResult::End,
			'\\' => StrLexResult::Escape,
			_ => StrLexResult::Append,
		}
	}
}

/// Checks if a character counts as whitespace
fn is_whitespace(c: char) -> bool {
	c.is_whitespace()
}

/// Checks if a character is part of a valid identifier
fn is_ident(c: char, first: bool) -> bool {
	if first && c.is_numeric() {
		return false;
	}
	c.is_alphanumeric() || c == '_'
}

/// Checks if a character is part of an integer
fn is_num(c: char, first: bool) -> bool {
	if first {
		c.is_numeric() || c == '-'
	} else {
		c.is_numeric()
	}
}

/// Checks if a character is part of a decimal number
fn is_decimal(c: char, first: bool, after_decimal: bool) -> bool {
	if first {
		c.is_numeric() || c == '-' || c == '.'
	} else if after_decimal {
		c.is_numeric()
	} else {
		c.is_numeric() || c == '.'
	}
}

/// Removes whitespace characters and comments from an iterator of tokens
pub fn reduce_tokens<'a, T: Iterator<Item = &'a TokenAndPos>>(
	tokens: T,
) -> impl Iterator<Item = &'a TokenAndPos> {
	tokens.filter(|(tok, ..)| !tok.is_ignored())
}

#[cfg(test)]
mod tests {
	use super::*;

	macro_rules! assert_tokens {
		($text:literal, $toks:expr) => {
			assert_tokens!(lex($text), $toks)
		};

		($lexed:expr, $toks:expr) => {
			match $lexed {
				Ok(lexed) => {
					assert_eq!(lexed.len(), $toks.len());
					for ((left, _), right) in lexed.iter().zip($toks) {
						assert_eq!(left, &right);
					}
				}
				Err(e) => {
					println!("{e}");
					panic!();
				}
			};
		};
	}

	#[test]
	fn test_chars() {
		assert!(is_ident('a', false));
		assert!(is_ident('a', true));
		assert!(is_ident('B', false));
		assert!(is_ident('B', true));
		assert!(is_ident('_', false));
		assert!(is_ident('_', true));

		assert!(is_ident('5', false));
		assert!(!is_ident('2', true));

		assert!(is_num('8', false));
		assert!(is_num('8', true));
		assert!(!is_num('t', false));
		assert!(!is_num('t', true));
		assert!(!is_num('.', false));
		assert!(!is_num('.', true));
		assert!(is_num('-', true));
		assert!(!is_num('-', false));

		assert!(is_whitespace(' '));
		assert!(is_whitespace('\n'));
		assert!(!is_whitespace('a'));
		assert!(!is_whitespace('%'));
	}
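
	// A small sketch of the is_decimal predicate, mirroring the character checks
	// above; the sample characters are arbitrary.
	#[test]
	fn test_decimal_chars() {
		assert!(is_decimal('0', true, false));
		assert!(is_decimal('-', true, false));
		assert!(is_decimal('.', true, false));
		assert!(is_decimal('.', false, false));
		assert!(!is_decimal('.', false, true));
		assert!(is_decimal('9', false, true));
		assert!(!is_decimal('a', false, false));
	}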

	#[test]
	fn test_semicolon() {
		assert_tokens!(";;", vec![Token::Semicolon, Token::Semicolon]);
	}
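
	// A sketch showing that a run of consecutive whitespace characters is
	// collapsed into a single Whitespace token; the input is made up for the
	// example.
	#[test]
	fn test_whitespace() {
		assert_tokens!(
			"foo \t  bar",
			vec![
				Token::Ident("foo".into()),
				Token::Whitespace,
				Token::Ident("bar".into())
			]
		);
	}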

	#[test]
	fn test_string_chars() {
		assert_eq!(lex_string_char('d', false), StrLexResult::Append);
		assert_eq!(lex_string_char('\'', false), StrLexResult::Append);
		assert_eq!(lex_string_char('"', false), StrLexResult::End);
		assert_eq!(lex_string_char('"', true), StrLexResult::Append);
		assert_eq!(lex_string_char('\\', false), StrLexResult::Escape);
		assert_eq!(lex_string_char('\\', true), StrLexResult::Append);
	}
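
	// A sketch of escape handling in string literals, based on lex_string_char
	// above: the backslash is consumed and the escaped character is kept.
	#[test]
	fn test_string_escape() {
		assert_tokens!("\"a \\\" b\"", vec![Token::Str("a \" b".into())]);
	}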

	#[test]
	fn test_string() {
		assert_tokens!("\"Hello\"", vec![Token::Str("Hello".into())]);
	}
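
	// A sketch of variable lexing; the variable names are made up for the
	// example and only assume the identifier rules above.
	#[test]
	fn test_variable() {
		assert_tokens!(
			"$foo;$bar_2",
			vec![
				Token::Variable("foo".into()),
				Token::Semicolon,
				Token::Variable("bar_2".into())
			]
		);
	}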

	#[test]
	fn test_combo() {
		assert_tokens!(
			"\"Uno\"; \"Dos\"; \"Tres\"; Identifier",
			vec![
				Token::Str("Uno".into()),
				Token::Semicolon,
				Token::Whitespace,
				Token::Str("Dos".into()),
				Token::Semicolon,
				Token::Whitespace,
				Token::Str("Tres".into()),
				Token::Semicolon,
				Token::Whitespace,
				Token::Ident("Identifier".into())
			]
		);
	}

	#[test]
	fn test_all() {
		assert_tokens!(
			"\"Hello\"; ident{}@routine[]$var():-1000,|# comment",
			vec![
				Token::Str("Hello".into()),
				Token::Semicolon,
				Token::Whitespace,
				Token::Ident("ident".into()),
				Token::Curly(Side::Left),
				Token::Curly(Side::Right),
				Token::At,
				Token::Ident("routine".into()),
				Token::Square(Side::Left),
				Token::Square(Side::Right),
				Token::Variable("var".into()),
				Token::Paren(Side::Left),
				Token::Paren(Side::Right),
				Token::Colon,
				Token::Num(-1000),
				Token::Comma,
				Token::Pipe,
				Token::Comment(" comment".into())
			]
		);
	}

	#[test]
	fn test_comment() {
		assert_tokens!(
			"\"Foo\" # Comment\n \"Bar\"",
			vec![
				Token::Str("Foo".into()),
				Token::Whitespace,
				Token::Comment(" Comment".into()),
				Token::Whitespace,
				Token::Str("Bar".into())
			]
		);
	}
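
	// A small sketch of Token::as_string and Token::is_ignored, checking a few
	// representative variants against the conversions defined above.
	#[test]
	fn test_token_strings() {
		assert_eq!(Token::Semicolon.as_string(), ";");
		assert_eq!(Token::Variable("var".into()).as_string(), "$var");
		assert_eq!(Token::Str("hello".into()).as_string(), "\"hello\"");
		assert_eq!(Token::Num(-12).as_string(), "-12");
		assert_eq!(Token::Comment("note".into()).as_string(), "# note");
		assert!(Token::Whitespace.is_ignored());
		assert!(Token::Comment("note".into()).is_ignored());
		assert!(!Token::Ident("foo".into()).is_ignored());
	}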

	#[test]
	fn test_num() {
		assert_tokens!(
			"12345;888;0;-10",
			vec![
				Token::Num(12345),
				Token::Semicolon,
				Token::Num(888),
				Token::Semicolon,
				Token::Num(0),
				Token::Semicolon,
				Token::Num(-10)
			]
		);
	}
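
	// A sketch of the TextPos helpers; the position values are arbitrary and
	// only assume the accessors and Display/Debug impls above.
	#[test]
	fn test_text_pos() {
		let mut pos = TextPos::new(3, 4, 10);
		assert_eq!(*pos.row(), 3);
		assert_eq!(*pos.col(), 4);
		assert_eq!(*pos.absolute(), 10);
		pos.increase_col(2);
		assert_eq!(*pos.col(), 6);
		assert_eq!(*pos.absolute(), 12);
		assert_eq!(format!("{pos}"), "(3:6)");
		assert_eq!(format!("{pos:?}"), "(3:6:12)");
	}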

	#[test]
	fn test_decimal() {
		assert_tokens!(
			"12345;88.0,-73.5,-0.03",
			vec![
				Token::Num(12345),
				Token::Semicolon,
				Token::Decimal(88.0),
				Token::Comma,
				Token::Decimal(-73.5),
				Token::Comma,
				Token::Decimal(-0.03)
			]
		);
	}
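
	// A sketch of reduce_tokens: whitespace and comment tokens are filtered
	// out, leaving only meaningful tokens. The input snippet is made up for
	// the example.
	#[test]
	fn test_reduce_tokens() {
		let lexed = lex("foo # comment\nbar").expect("Lexing should succeed");
		let reduced: Vec<_> = reduce_tokens(lexed.iter()).collect();
		assert_eq!(reduced.len(), 2);
		assert_eq!(reduced[0].0, Token::Ident("foo".into()));
		assert_eq!(reduced[1].0, Token::Ident("bar".into()));
	}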

	macro_rules! assert_token_positions {
		($text:literal, $positions:expr) => {
			assert_token_positions!(lex($text), $positions)
		};

		($toks:expr, $positions:expr) => {
			match $toks {
				Ok(toks) => {
					dbg!(&toks, $positions);
					assert_eq!(toks.len(), $positions.len());
					for (i, ((_, tok_pos), (expected_row, expected_col))) in
						toks.iter().zip($positions.iter()).enumerate()
					{
						// let expected_pos = TextPos(*expected_row, *expected_col);
						assert_eq!(tok_pos.0, *expected_row as usize, "Index: {i}");
						assert_eq!(tok_pos.1, *expected_col as usize, "Index: {i}");
					}
				}
				Err(e) => {
					println!("{e}");
					panic!();
				}
			}
		};
	}

	#[test]
	fn test_token_pos_simple() {
		assert_token_positions!(
			"hello;world!!\nwhy\n\"where\";",
			[
				(1, 0),
				(1, 5),
				(1, 6),
				(1, 11),
				(1, 12),
				(1, 13),
				(2, 0),
				(2, 3),
				(3, 0),
				(3, 7),
			]
		);
	}
}
645}