trion/text/token/
mod.rs

1use core::fmt;
2use std::collections::VecDeque;
3use std::error::Error;
4
5use crate::asm::arcob::Arcob;
6use crate::text::Positioned;
7
8#[cfg(test)]
9mod test;
10
/// Numeric value carried by a [`TokenValue::Number`] token.
#[derive(Debug, Clone, Copy)]
pub enum Number
{
	/// Signed 64-bit integer, as produced by integer and character literals.
	Integer(i64),
}
16
17pub type Token<'l> = Positioned<TokenValue<'l>>;
18
19#[derive(Clone, Debug)]
20pub enum TokenValue<'l>
21{
22	Separator, Terminator,
23	LabelMark, DirectiveMark,
24	Plus, Minus, Multiply, Divide, Modulo,
25	Not, BitAnd, BitOr, BitXor, LeftShift, RightShift,
26	Number(Number),
27	Identifier(&'l str),
28	String(Arcob<'l, str>),
29	// TODO implement ByteString(Arcob<'l, [u8]>),
30	BeginGroup, EndGroup,
31	BeginAddr, EndAddr,
32	BeginSeq, EndSeq,
33}
34
35impl<'l> TokenValue<'l>
36{
37	pub fn desc(&self) -> &'static str
38	{
39		match self
40		{
41			TokenValue::Separator => "','",
42			TokenValue::Terminator => "';'",
43			TokenValue::LabelMark => "':'",
44			TokenValue::DirectiveMark => "'.'",
45			TokenValue::Plus => "'+'",
46			TokenValue::Minus => "'-'",
47			TokenValue::Multiply => "'*'",
48			TokenValue::Divide => "'/'",
49			TokenValue::Modulo => "'%'",
50			TokenValue::Not => "'!'",
51			TokenValue::BitAnd => "'&'",
52			TokenValue::BitOr => "'|'",
53			TokenValue::BitXor => "'^'",
54			TokenValue::LeftShift => "\"<<\"",
55			TokenValue::RightShift => "\">>\"",
56			TokenValue::Number(..) => "number",
57			TokenValue::Identifier(..) => "identifier",
58			TokenValue::String(..) => "string",
59			TokenValue::BeginGroup => "'('",
60			TokenValue::EndGroup => "')'",
61			TokenValue::BeginAddr => "'['",
62			TokenValue::EndAddr => "']'",
63			TokenValue::BeginSeq => "'{'",
64			TokenValue::EndSeq => "'}'",
65		}
66	}
67}
68
69impl<'l> fmt::Display for TokenValue<'l>
70{
71	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
72	{
73		f.write_str(self.desc())
74	}
75}
76
77#[derive(Clone, Debug)]
78pub struct Tokenizer<'l>
79{
80	data: &'l str,
81	utf_err: bool,
82	col: u32,
83	line: u32,
84	tokens: VecDeque<Token<'l>>,
85	token_err: Option<TokenError>,
86}
87
88impl<'l> Tokenizer<'l>
89{
90	pub fn new(data: &'l [u8]) -> Self
91	{
92		let (data, utf_err) = match core::str::from_utf8(data)
93		{
94			Ok(s) => (s, false),
95			Err(e) =>
96			{
97				// SAFETY: the error guarantees this slice is valid
98				(unsafe{core::str::from_utf8_unchecked(&data[..e.valid_up_to()])}, true)
99			},
100		};
101		Self
102		{
103			data, utf_err,
104			line: 1,
105			col: 1,
106			tokens: VecDeque::new(),
107			token_err: None,
108		}
109	}
110	
111	pub fn get_line(&self) -> u32
112	{
113		self.line
114	}
115	
116	pub fn get_column(&self) -> u32
117	{
118		self.col
119	}
120	
121	pub fn clear(&mut self)
122	{
123		self.data = "";
124		self.utf_err = false;
125	}
126	
127	fn update_pos(&mut self, data: &str)
128	{
129		let lines = u32::try_from(data.bytes().filter(|&b| b == b'\n').count()).unwrap_or(u32::MAX);
130		if lines > 0
131		{
132			self.line = self.line.saturating_add(lines);
133			self.col = 1;
134		}
135		let data = &data[data.bytes().rposition(|b| b == b'\n').map_or(0, |v| v + 1)..];
136		// faster character count by ignoring continuation bytes (instead of `str::chars`)
137		let coluns = u32::try_from(data.bytes().filter(|&b| (b & 0b11_000000) != 0b10_000000).count()).unwrap_or(u32::MAX);
138		self.col = self.col.saturating_add(coluns);
139	}
140	
141	fn positioned<T>(&self, value: T) -> Positioned<T>
142	{
143		Positioned{line: self.line, col: self.col, value}
144	}
145	
146	fn next_token(&mut self) -> Option<Result<Token<'l>, TokenError>>
147	{
148		while self.data.len() > 0
149		{
150			let space_bytes = match self.data.bytes().position(|b| b != b'\t' && b != b'\n' && b != b'\r' && b != b' ')
151			{
152				None => self.data.len(),
153				Some(cnt) => cnt,
154			};
155			if space_bytes > 0
156			{
157				self.update_pos(&self.data[..space_bytes]);
158				self.data = &self.data[space_bytes..];
159			}
160			
161			if self.data.starts_with("//")
162			{
163				match self.data.bytes().position(|b| b == b'\n')
164				{
165					None =>
166					{
167						self.update_pos(self.data);
168						let utf_err = self.utf_err;
169						self.clear();
170						if utf_err
171						{
172							return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
173						}
174					},
175					Some(line_len) =>
176					{
177						self.data = &self.data[line_len + 1..];
178						self.line = self.line.saturating_add(1);
179						self.col = 1;
180					},
181				}
182			}
183			else if self.data.starts_with("/*")
184			{
185				let mut depth = 1; // support nested block comments
186				let mut start = 2; // to avoid combinations like "/*/" and "*/*"
187				let comment_bytes = self.data[..self.data.len() - 1].bytes().enumerate().skip(2).position(|(p, b)|
188				{
189					if b == b'/' && p >= start && self.data.as_bytes()[p + 1] == b'*'
190					{
191						// no overflow because `p < self.data.len() - 1 <= usize::MAX - 1`
192						start = p + 2;
193						// no overflow because this can't happen more than `usize::MAX / 2` times
194						depth += 1;
195					}
196					else if b == b'*' && p >= start && self.data.as_bytes()[p + 1] == b'/'
197					{
198						// no overflow because `p < self.data.len() - 1 <= usize::MAX - 1`
199						start = p + 2;
200						// no overflow because this can't happen more than `usize::MAX / 2` times
201						depth -= 1;
202					}
203					depth == 0
204				});
205				if let Some(comment_bytes) = comment_bytes
206				{
207					self.update_pos(&self.data[..4 + comment_bytes]);
208					self.data = &self.data[4 + comment_bytes..];
209				}
210				else
211				{
212					self.update_pos(self.data);
213					let err = if self.utf_err {TokenErrorKind::BadUnicode} else {TokenErrorKind::BlockComment};
214					self.clear();
215					return Some(Err(self.positioned(err)))
216				}
217			}
218			else {break;}
219		}
220		
221		if !self.data.is_empty()
222		{
223			match self.do_next()
224			{
225				Some(Err(e)) =>
226				{
227					self.clear();
228					Some(Err(e))
229				},
230				r => r,
231			}
232		}
233		else
234		{
235			if self.utf_err
236			{
237				self.utf_err = false;
238				Some(Err(self.positioned(TokenErrorKind::BadUnicode)))
239			}
240			else {None}
241		}
242	}
243	
244	fn do_next(&mut self) -> Option<Result<Token<'l>, TokenError>>
245	{
246		let (n, ascii_ln, token) = match self.data.as_bytes()[0]
247		{
248			b',' => (1, true, TokenValue::Separator),
249			b';' => (1, true, TokenValue::Terminator),
250			b':' => (1, true, TokenValue::LabelMark),
251			b'.' => (1, true, TokenValue::DirectiveMark),
252			b'+' => (1, true, TokenValue::Plus),
253			b'-' => (1, true, TokenValue::Minus),
254			b'*' => (1, true, TokenValue::Multiply),
255			b'/' => (1, true, TokenValue::Divide),
256			b'%' => (1, true, TokenValue::Modulo),
257			b'!' => (1, true, TokenValue::Not),
258			b'&' => (1, true, TokenValue::BitAnd),
259			b'|' => (1, true, TokenValue::BitOr),
260			b'^' => (1, true, TokenValue::BitXor),
261			b'<' if self.data.len() >= 2 && self.data.as_bytes()[1] == b'<' => (2, true, TokenValue::LeftShift),
262			b'>' if self.data.len() >= 2 && self.data.as_bytes()[1] == b'>' => (2, true, TokenValue::RightShift),
263			b'0'..=b'9' =>
264			{
265				let (off, radix) =
266				{
267					if self.data.starts_with("0b") {(2, 2)}
268					else if self.data.starts_with("0o") {(2, 8)}
269					else if self.data.starts_with("0x") {(2, 16)}
270					else {(0, 10)}
271				};
272				let len = match self.data[off..].bytes().position(|b| !(b as char).is_digit(radix))
273				{
274					None if self.utf_err =>
275					{
276						// numbers are ASCII so `str::bytes().count() == str::chars().count()`
277						self.col = self.col.saturating_add(u32::try_from(self.data.len()).unwrap_or(u32::MAX));
278						self.clear();
279						return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
280					},
281					None => self.data.len() - off,
282					Some(n) => n,
283				};
284				let text = &self.data[off..off + len];
285				match i64::from_str_radix(text, radix)
286				{
287					Ok(v) => (off + len, true, TokenValue::Number(Number::Integer(v))),
288					Err(..) =>
289					{
290						self.clear();
291						return Some(Err(self.positioned(TokenErrorKind::BadNumber)));
292					},
293				}
294			},
295			b'\'' =>
296			{
297				let mut chars = self.data[1..].chars();
298				let result = match chars.next()
299				{
300					None => Err(self.utf_err),
301					Some('\\') =>
302					{
303						match chars.next()
304						{
305							None => Err(self.utf_err),
306							Some('t') => Ok((2, '\t')),
307							Some('n') => Ok((2, '\n')),
308							Some('r') => Ok((2, '\r')),
309							Some(c @ ('"' | '\'' | '\\')) => Ok((2, c)),
310							Some(..) => Err(false),
311						}
312					},
313					Some(c @ ('\t' | ' '..='~' | '\u{80}'..)) => Ok((c.len_utf8(), c)),
314					_ => Err(false),
315				}.and_then(|v|
316				{
317					match chars.next()
318					{
319						None => Err(self.utf_err),
320						Some('\'') => Ok(v),
321						Some(..) => Err(false),
322					}
323				});
324				match result
325				{
326					Ok((n, c)) => (1 + n + 1, c < '\u{80}', TokenValue::Number(Number::Integer(u32::from(c).into()))),
327					Err(utf_err) =>
328					{
329						let err = if utf_err
330						{
331							self.update_pos(self.data);
332							TokenErrorKind::BadUnicode
333						}
334						else {TokenErrorKind::BadCharacter};
335						self.clear();
336						return Some(Err(self.positioned(err)));
337					},
338				}
339			},
340			b'A'..=b'Z' | b'_' | b'a'..=b'z' =>
341			{
342				let len = match self.data.bytes().position(|b| !matches!(b, b'$' | b'.' | b'0'..=b'9' | b'@' | b'A'..=b'Z' | b'_' | b'a'..=b'z'))
343				{
344					None if self.utf_err =>
345					{
346						// identifiers are ASCII so `str::bytes().count() == str::chars().count()`
347						self.col = self.col.saturating_add(u32::try_from(self.data.len()).unwrap_or(u32::MAX));
348						self.clear();
349						return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
350					},
351					None => self.data.len(),
352					Some(n) => n,
353				};
354				(len, true, TokenValue::Identifier(&self.data[..len]))
355			},
356			b'"' =>
357			{
358				let mut pos = 1usize;
359				let mut escaped = String::new();
360				loop
361				{
362					match self.data[pos..].bytes().position(|b| b == b'"' || b == b'\\' || (b < b' ' && b != b'\t') || b == 0x7F)
363					{
364						None =>
365						{
366							let err = if self.utf_err
367							{
368								self.update_pos(self.data);
369								TokenErrorKind::BadUnicode
370							}
371							else {TokenErrorKind::BadString};
372							self.clear();
373							return Some(Err(self.positioned(err)));
374						},
375						Some(off) =>
376						{
377							let c = self.data.as_bytes()[pos + off];
378							// simple condition, none of the false positives are matched by above `str::bytes().position()` call
379							if c < b' ' && c >= 0x7F
380							{
381								self.clear();
382								return Some(Err(self.positioned(TokenErrorKind::BadString)));
383							}
384							if c == b'\\'
385							{
386								if off > 0
387								{
388									escaped.push_str(&self.data[pos..pos + off]);
389									pos += off;
390								}
391								// at least 3 characters are needed: backslash, escape character and closing quotation mark
392								if self.data.len() - pos < 3
393								{
394									self.clear();
395									return Some(Err(self.positioned(TokenErrorKind::BadString)));
396								}
397								match self.data.as_bytes()[pos + 1]
398								{
399									b'0' => escaped.push('\0'),
400									b't' => escaped.push('\t'),
401									b'n' => escaped.push('\n'),
402									b'r' => escaped.push('\r'),
403									c @ (b'"' | b'\'' | b'\\') => escaped.push(c as char),
404									// safe index, we checked for an extra character (see above comment)
405									b'u' if self.data.as_bytes()[pos + 2] == b'{' =>
406									{
407										// longest escape sequence is "\u{10FFFF}"
408										match self.data[pos + 3..].bytes().take(7).position(|b| b == b'}')
409										{
410											None =>
411											{
412												let err = if self.utf_err
413												{
414													self.update_pos(self.data);
415													TokenErrorKind::BadUnicode
416												}
417												else {TokenErrorKind::BadString};
418												self.clear();
419												return Some(Err(self.positioned(err)));
420											},
421											Some(end) =>
422											{
423												match u32::from_str_radix(&self.data[pos + 3..pos + 3 + end], 16).ok().and_then(char::from_u32)
424												{
425													Some(c) =>
426													{
427														escaped.push(c);
428														pos += end + 2; // 2 curly brackets + character value
429													},
430													None =>
431													{
432														self.clear();
433														return Some(Err(self.positioned(TokenErrorKind::BadString)));
434													},
435												}
436											},
437										}
438									},
439									_ =>
440									{
441										self.clear();
442										return Some(Err(self.positioned(TokenErrorKind::BadString)));
443									},
444								}
445								pos += 2; // backslash + escape character
446							}
447							else
448							{
449								assert_eq!(c, b'"');
450								if !escaped.is_empty() && off > 0
451								{
452									escaped.push_str(&self.data[pos..pos + off]);
453								}
454								pos += off + 1; // no overflow because this is at most `self.data.len()`
455								break;
456							}
457						},
458					}
459				}
460				// it's simpler to just use `Self::update_pos()` instead of manually checking if the input (!) is an ASCII line
461				(pos, false, TokenValue::String(if !escaped.is_empty() {Arcob::Arced(escaped.into())} else {Arcob::Borrowed(&self.data[1..pos - 1])}))
462			},
463			b'(' => (1, true, TokenValue::BeginGroup),
464			b')' => (1, true, TokenValue::EndGroup),
465			b'[' => (1, true, TokenValue::BeginAddr),
466			b']' => (1, true, TokenValue::EndAddr),
467			b'{' => (1, true, TokenValue::BeginSeq),
468			b'}' => (1, true, TokenValue::EndSeq),
469			_ =>
470			{
471				let c = self.data.chars().next().unwrap();
472				self.clear();
473				return Some(Err(self.positioned(TokenErrorKind::Unexpected(c))));
474			},
475		};
476		let token = self.positioned(token);
477		if ascii_ln
478		{
479			self.col = self.col.saturating_add(u32::try_from(n).unwrap_or(u32::MAX));
480		}
481		else
482		{
483			self.update_pos(&self.data[..n]);
484		}
485		self.data = &self.data[n..];
486		Some(Ok(token))
487	}
488	
489	pub fn peek<'s>(&'s mut self) -> Option<Result<&'s Token<'l>, &'s TokenError>>
490	{
491		if !self.tokens.is_empty()
492		{
493			return Some(Ok(self.tokens.front().unwrap()));
494		}
495		if let Some(ref e) = self.token_err {Some(Err(e))}
496		else
497		{
498			match self.next_token()
499			{
500				None => None,
501				Some(Ok(t)) =>
502				{
503					self.tokens.push_back(t);
504					self.tokens.back().map(Result::Ok)
505				},
506				Some(Err(e)) =>
507				{
508					self.token_err = Some(e);
509					self.token_err.as_ref().map(Result::Err)
510				},
511			}
512		}
513	}
514	
515	pub fn peek_nth(&mut self, idx: usize) -> Option<Result<&Token<'l>, &TokenError>>
516	{
517		while self.tokens.len() < idx
518		{
519			if self.token_err.is_some() {return None;}
520			match self.next_token()
521			{
522				None => return None,
523				Some(Ok(t)) => self.tokens.push_back(t),
524				Some(Err(e)) => self.token_err = Some(e),
525			}
526		}
527		if idx == self.tokens.len()
528		{
529			if let Some(ref e) = self.token_err {Some(Err(e))}
530			else {None}
531		}
532		else
533		{
534			self.tokens.get(idx).map(Result::Ok)
535		}
536	}
537}
538
539impl<'l> Iterator for Tokenizer<'l>
540{
541	type Item = Result<Token<'l>, TokenError>;
542	
543	fn next(&mut self) -> Option<Self::Item>
544	{
545		match self.tokens.pop_front()
546		{
547			None =>
548			{
549				if let Some(e) = self.token_err.take() {Some(Err(e))}
550				else {self.next_token()}
551			},
552			Some(t) => Some(Ok(t)),
553		}
554	}
555}
556
557pub type TokenError = Positioned<TokenErrorKind>;
558
/// The reasons for which tokenizing can fail.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenErrorKind
{
	/// The input contains a byte sequence that is not valid UTF-8.
	BadUnicode,
	/// Invalid character.
	Invalid,
	/// A block comment is not closed before the end of the input.
	BlockComment,
	/// A number literal could not be parsed.
	BadNumber,
	/// A character literal is malformed.
	BadCharacter,
	/// A string literal is malformed.
	BadString,
	/// A character that can not start any token.
	Unexpected(char),
}
570
571impl fmt::Display for TokenErrorKind
572{
573	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
574	{
575		match self
576		{
577			TokenErrorKind::BadUnicode => f.write_str("malformed UTF-8 code-point"),
578			TokenErrorKind::Invalid => f.write_str("invalid character"),
579			TokenErrorKind::BlockComment => f.write_str("unclosed block comment"),
580			TokenErrorKind::BadNumber => f.write_str("malformed number"),
581			TokenErrorKind::BadCharacter => f.write_str("malformed character literal"),
582			TokenErrorKind::BadString => f.write_str("malformed string literal"),
583			TokenErrorKind::Unexpected(c) => write!(f, "unexpected character {c:?}"),
584		}
585	}
586}
587
588impl Error for TokenErrorKind {}