1use core::fmt;
2use std::collections::VecDeque;
3use std::error::Error;
4
5use crate::asm::arcob::Arcob;
6use crate::text::Positioned;
7
8#[cfg(test)]
9mod test;
10
/// A numeric literal value carried by a [`TokenValue::Number`] token.
#[derive(Debug, Clone, Copy)]
pub enum Number
{
    /// A signed 64-bit integer (integer literals and character literals).
    Integer(i64),
}
16
17pub type Token<'l> = Positioned<TokenValue<'l>>;
18
19#[derive(Clone, Debug)]
20pub enum TokenValue<'l>
21{
22 Separator, Terminator,
23 LabelMark, DirectiveMark,
24 Plus, Minus, Multiply, Divide, Modulo,
25 Not, BitAnd, BitOr, BitXor, LeftShift, RightShift,
26 Number(Number),
27 Identifier(&'l str),
28 String(Arcob<'l, str>),
29 BeginGroup, EndGroup,
31 BeginAddr, EndAddr,
32 BeginSeq, EndSeq,
33}
34
35impl<'l> TokenValue<'l>
36{
37 pub fn desc(&self) -> &'static str
38 {
39 match self
40 {
41 TokenValue::Separator => "','",
42 TokenValue::Terminator => "';'",
43 TokenValue::LabelMark => "':'",
44 TokenValue::DirectiveMark => "'.'",
45 TokenValue::Plus => "'+'",
46 TokenValue::Minus => "'-'",
47 TokenValue::Multiply => "'*'",
48 TokenValue::Divide => "'/'",
49 TokenValue::Modulo => "'%'",
50 TokenValue::Not => "'!'",
51 TokenValue::BitAnd => "'&'",
52 TokenValue::BitOr => "'|'",
53 TokenValue::BitXor => "'^'",
54 TokenValue::LeftShift => "\"<<\"",
55 TokenValue::RightShift => "\">>\"",
56 TokenValue::Number(..) => "number",
57 TokenValue::Identifier(..) => "identifier",
58 TokenValue::String(..) => "string",
59 TokenValue::BeginGroup => "'('",
60 TokenValue::EndGroup => "')'",
61 TokenValue::BeginAddr => "'['",
62 TokenValue::EndAddr => "']'",
63 TokenValue::BeginSeq => "'{'",
64 TokenValue::EndSeq => "'}'",
65 }
66 }
67}
68
69impl<'l> fmt::Display for TokenValue<'l>
70{
71 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
72 {
73 f.write_str(self.desc())
74 }
75}
76
77#[derive(Clone, Debug)]
78pub struct Tokenizer<'l>
79{
80 data: &'l str,
81 utf_err: bool,
82 col: u32,
83 line: u32,
84 tokens: VecDeque<Token<'l>>,
85 token_err: Option<TokenError>,
86}
87
88impl<'l> Tokenizer<'l>
89{
90 pub fn new(data: &'l [u8]) -> Self
91 {
92 let (data, utf_err) = match core::str::from_utf8(data)
93 {
94 Ok(s) => (s, false),
95 Err(e) =>
96 {
97 (unsafe{core::str::from_utf8_unchecked(&data[..e.valid_up_to()])}, true)
99 },
100 };
101 Self
102 {
103 data, utf_err,
104 line: 1,
105 col: 1,
106 tokens: VecDeque::new(),
107 token_err: None,
108 }
109 }
110
111 pub fn get_line(&self) -> u32
112 {
113 self.line
114 }
115
116 pub fn get_column(&self) -> u32
117 {
118 self.col
119 }
120
121 pub fn clear(&mut self)
122 {
123 self.data = "";
124 self.utf_err = false;
125 }
126
127 fn update_pos(&mut self, data: &str)
128 {
129 let lines = u32::try_from(data.bytes().filter(|&b| b == b'\n').count()).unwrap_or(u32::MAX);
130 if lines > 0
131 {
132 self.line = self.line.saturating_add(lines);
133 self.col = 1;
134 }
135 let data = &data[data.bytes().rposition(|b| b == b'\n').map_or(0, |v| v + 1)..];
136 let coluns = u32::try_from(data.bytes().filter(|&b| (b & 0b11_000000) != 0b10_000000).count()).unwrap_or(u32::MAX);
138 self.col = self.col.saturating_add(coluns);
139 }
140
141 fn positioned<T>(&self, value: T) -> Positioned<T>
142 {
143 Positioned{line: self.line, col: self.col, value}
144 }
145
146 fn next_token(&mut self) -> Option<Result<Token<'l>, TokenError>>
147 {
148 while self.data.len() > 0
149 {
150 let space_bytes = match self.data.bytes().position(|b| b != b'\t' && b != b'\n' && b != b'\r' && b != b' ')
151 {
152 None => self.data.len(),
153 Some(cnt) => cnt,
154 };
155 if space_bytes > 0
156 {
157 self.update_pos(&self.data[..space_bytes]);
158 self.data = &self.data[space_bytes..];
159 }
160
161 if self.data.starts_with("//")
162 {
163 match self.data.bytes().position(|b| b == b'\n')
164 {
165 None =>
166 {
167 self.update_pos(self.data);
168 let utf_err = self.utf_err;
169 self.clear();
170 if utf_err
171 {
172 return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
173 }
174 },
175 Some(line_len) =>
176 {
177 self.data = &self.data[line_len + 1..];
178 self.line = self.line.saturating_add(1);
179 self.col = 1;
180 },
181 }
182 }
183 else if self.data.starts_with("/*")
184 {
185 let mut depth = 1; let mut start = 2; let comment_bytes = self.data[..self.data.len() - 1].bytes().enumerate().skip(2).position(|(p, b)|
188 {
189 if b == b'/' && p >= start && self.data.as_bytes()[p + 1] == b'*'
190 {
191 start = p + 2;
193 depth += 1;
195 }
196 else if b == b'*' && p >= start && self.data.as_bytes()[p + 1] == b'/'
197 {
198 start = p + 2;
200 depth -= 1;
202 }
203 depth == 0
204 });
205 if let Some(comment_bytes) = comment_bytes
206 {
207 self.update_pos(&self.data[..4 + comment_bytes]);
208 self.data = &self.data[4 + comment_bytes..];
209 }
210 else
211 {
212 self.update_pos(self.data);
213 let err = if self.utf_err {TokenErrorKind::BadUnicode} else {TokenErrorKind::BlockComment};
214 self.clear();
215 return Some(Err(self.positioned(err)))
216 }
217 }
218 else {break;}
219 }
220
221 if !self.data.is_empty()
222 {
223 match self.do_next()
224 {
225 Some(Err(e)) =>
226 {
227 self.clear();
228 Some(Err(e))
229 },
230 r => r,
231 }
232 }
233 else
234 {
235 if self.utf_err
236 {
237 self.utf_err = false;
238 Some(Err(self.positioned(TokenErrorKind::BadUnicode)))
239 }
240 else {None}
241 }
242 }
243
244 fn do_next(&mut self) -> Option<Result<Token<'l>, TokenError>>
245 {
246 let (n, ascii_ln, token) = match self.data.as_bytes()[0]
247 {
248 b',' => (1, true, TokenValue::Separator),
249 b';' => (1, true, TokenValue::Terminator),
250 b':' => (1, true, TokenValue::LabelMark),
251 b'.' => (1, true, TokenValue::DirectiveMark),
252 b'+' => (1, true, TokenValue::Plus),
253 b'-' => (1, true, TokenValue::Minus),
254 b'*' => (1, true, TokenValue::Multiply),
255 b'/' => (1, true, TokenValue::Divide),
256 b'%' => (1, true, TokenValue::Modulo),
257 b'!' => (1, true, TokenValue::Not),
258 b'&' => (1, true, TokenValue::BitAnd),
259 b'|' => (1, true, TokenValue::BitOr),
260 b'^' => (1, true, TokenValue::BitXor),
261 b'<' if self.data.len() >= 2 && self.data.as_bytes()[1] == b'<' => (2, true, TokenValue::LeftShift),
262 b'>' if self.data.len() >= 2 && self.data.as_bytes()[1] == b'>' => (2, true, TokenValue::RightShift),
263 b'0'..=b'9' =>
264 {
265 let (off, radix) =
266 {
267 if self.data.starts_with("0b") {(2, 2)}
268 else if self.data.starts_with("0o") {(2, 8)}
269 else if self.data.starts_with("0x") {(2, 16)}
270 else {(0, 10)}
271 };
272 let len = match self.data[off..].bytes().position(|b| !(b as char).is_digit(radix))
273 {
274 None if self.utf_err =>
275 {
276 self.col = self.col.saturating_add(u32::try_from(self.data.len()).unwrap_or(u32::MAX));
278 self.clear();
279 return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
280 },
281 None => self.data.len() - off,
282 Some(n) => n,
283 };
284 let text = &self.data[off..off + len];
285 match i64::from_str_radix(text, radix)
286 {
287 Ok(v) => (off + len, true, TokenValue::Number(Number::Integer(v))),
288 Err(..) =>
289 {
290 self.clear();
291 return Some(Err(self.positioned(TokenErrorKind::BadNumber)));
292 },
293 }
294 },
295 b'\'' =>
296 {
297 let mut chars = self.data[1..].chars();
298 let result = match chars.next()
299 {
300 None => Err(self.utf_err),
301 Some('\\') =>
302 {
303 match chars.next()
304 {
305 None => Err(self.utf_err),
306 Some('t') => Ok((2, '\t')),
307 Some('n') => Ok((2, '\n')),
308 Some('r') => Ok((2, '\r')),
309 Some(c @ ('"' | '\'' | '\\')) => Ok((2, c)),
310 Some(..) => Err(false),
311 }
312 },
313 Some(c @ ('\t' | ' '..='~' | '\u{80}'..)) => Ok((c.len_utf8(), c)),
314 _ => Err(false),
315 }.and_then(|v|
316 {
317 match chars.next()
318 {
319 None => Err(self.utf_err),
320 Some('\'') => Ok(v),
321 Some(..) => Err(false),
322 }
323 });
324 match result
325 {
326 Ok((n, c)) => (1 + n + 1, c < '\u{80}', TokenValue::Number(Number::Integer(u32::from(c).into()))),
327 Err(utf_err) =>
328 {
329 let err = if utf_err
330 {
331 self.update_pos(self.data);
332 TokenErrorKind::BadUnicode
333 }
334 else {TokenErrorKind::BadCharacter};
335 self.clear();
336 return Some(Err(self.positioned(err)));
337 },
338 }
339 },
340 b'A'..=b'Z' | b'_' | b'a'..=b'z' =>
341 {
342 let len = match self.data.bytes().position(|b| !matches!(b, b'$' | b'.' | b'0'..=b'9' | b'@' | b'A'..=b'Z' | b'_' | b'a'..=b'z'))
343 {
344 None if self.utf_err =>
345 {
346 self.col = self.col.saturating_add(u32::try_from(self.data.len()).unwrap_or(u32::MAX));
348 self.clear();
349 return Some(Err(self.positioned(TokenErrorKind::BadUnicode)));
350 },
351 None => self.data.len(),
352 Some(n) => n,
353 };
354 (len, true, TokenValue::Identifier(&self.data[..len]))
355 },
356 b'"' =>
357 {
358 let mut pos = 1usize;
359 let mut escaped = String::new();
360 loop
361 {
362 match self.data[pos..].bytes().position(|b| b == b'"' || b == b'\\' || (b < b' ' && b != b'\t') || b == 0x7F)
363 {
364 None =>
365 {
366 let err = if self.utf_err
367 {
368 self.update_pos(self.data);
369 TokenErrorKind::BadUnicode
370 }
371 else {TokenErrorKind::BadString};
372 self.clear();
373 return Some(Err(self.positioned(err)));
374 },
375 Some(off) =>
376 {
377 let c = self.data.as_bytes()[pos + off];
378 if c < b' ' && c >= 0x7F
380 {
381 self.clear();
382 return Some(Err(self.positioned(TokenErrorKind::BadString)));
383 }
384 if c == b'\\'
385 {
386 if off > 0
387 {
388 escaped.push_str(&self.data[pos..pos + off]);
389 pos += off;
390 }
391 if self.data.len() - pos < 3
393 {
394 self.clear();
395 return Some(Err(self.positioned(TokenErrorKind::BadString)));
396 }
397 match self.data.as_bytes()[pos + 1]
398 {
399 b'0' => escaped.push('\0'),
400 b't' => escaped.push('\t'),
401 b'n' => escaped.push('\n'),
402 b'r' => escaped.push('\r'),
403 c @ (b'"' | b'\'' | b'\\') => escaped.push(c as char),
404 b'u' if self.data.as_bytes()[pos + 2] == b'{' =>
406 {
407 match self.data[pos + 3..].bytes().take(7).position(|b| b == b'}')
409 {
410 None =>
411 {
412 let err = if self.utf_err
413 {
414 self.update_pos(self.data);
415 TokenErrorKind::BadUnicode
416 }
417 else {TokenErrorKind::BadString};
418 self.clear();
419 return Some(Err(self.positioned(err)));
420 },
421 Some(end) =>
422 {
423 match u32::from_str_radix(&self.data[pos + 3..pos + 3 + end], 16).ok().and_then(char::from_u32)
424 {
425 Some(c) =>
426 {
427 escaped.push(c);
428 pos += end + 2; },
430 None =>
431 {
432 self.clear();
433 return Some(Err(self.positioned(TokenErrorKind::BadString)));
434 },
435 }
436 },
437 }
438 },
439 _ =>
440 {
441 self.clear();
442 return Some(Err(self.positioned(TokenErrorKind::BadString)));
443 },
444 }
445 pos += 2; }
447 else
448 {
449 assert_eq!(c, b'"');
450 if !escaped.is_empty() && off > 0
451 {
452 escaped.push_str(&self.data[pos..pos + off]);
453 }
454 pos += off + 1; break;
456 }
457 },
458 }
459 }
460 (pos, false, TokenValue::String(if !escaped.is_empty() {Arcob::Arced(escaped.into())} else {Arcob::Borrowed(&self.data[1..pos - 1])}))
462 },
463 b'(' => (1, true, TokenValue::BeginGroup),
464 b')' => (1, true, TokenValue::EndGroup),
465 b'[' => (1, true, TokenValue::BeginAddr),
466 b']' => (1, true, TokenValue::EndAddr),
467 b'{' => (1, true, TokenValue::BeginSeq),
468 b'}' => (1, true, TokenValue::EndSeq),
469 _ =>
470 {
471 let c = self.data.chars().next().unwrap();
472 self.clear();
473 return Some(Err(self.positioned(TokenErrorKind::Unexpected(c))));
474 },
475 };
476 let token = self.positioned(token);
477 if ascii_ln
478 {
479 self.col = self.col.saturating_add(u32::try_from(n).unwrap_or(u32::MAX));
480 }
481 else
482 {
483 self.update_pos(&self.data[..n]);
484 }
485 self.data = &self.data[n..];
486 Some(Ok(token))
487 }
488
489 pub fn peek<'s>(&'s mut self) -> Option<Result<&'s Token<'l>, &'s TokenError>>
490 {
491 if !self.tokens.is_empty()
492 {
493 return Some(Ok(self.tokens.front().unwrap()));
494 }
495 if let Some(ref e) = self.token_err {Some(Err(e))}
496 else
497 {
498 match self.next_token()
499 {
500 None => None,
501 Some(Ok(t)) =>
502 {
503 self.tokens.push_back(t);
504 self.tokens.back().map(Result::Ok)
505 },
506 Some(Err(e)) =>
507 {
508 self.token_err = Some(e);
509 self.token_err.as_ref().map(Result::Err)
510 },
511 }
512 }
513 }
514
515 pub fn peek_nth(&mut self, idx: usize) -> Option<Result<&Token<'l>, &TokenError>>
516 {
517 while self.tokens.len() < idx
518 {
519 if self.token_err.is_some() {return None;}
520 match self.next_token()
521 {
522 None => return None,
523 Some(Ok(t)) => self.tokens.push_back(t),
524 Some(Err(e)) => self.token_err = Some(e),
525 }
526 }
527 if idx == self.tokens.len()
528 {
529 if let Some(ref e) = self.token_err {Some(Err(e))}
530 else {None}
531 }
532 else
533 {
534 self.tokens.get(idx).map(Result::Ok)
535 }
536 }
537}
538
539impl<'l> Iterator for Tokenizer<'l>
540{
541 type Item = Result<Token<'l>, TokenError>;
542
543 fn next(&mut self) -> Option<Self::Item>
544 {
545 match self.tokens.pop_front()
546 {
547 None =>
548 {
549 if let Some(e) = self.token_err.take() {Some(Err(e))}
550 else {self.next_token()}
551 },
552 Some(t) => Some(Ok(t)),
553 }
554 }
555}
556
557pub type TokenError = Positioned<TokenErrorKind>;
558
/// The reasons tokenizing can fail.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenErrorKind
{
    /// The input is not valid UTF-8.
    BadUnicode,
    /// An invalid character.
    Invalid,
    /// A block comment was opened but never closed.
    BlockComment,
    /// A number literal could not be parsed.
    BadNumber,
    /// A character literal is malformed.
    BadCharacter,
    /// A string literal is malformed.
    BadString,
    /// A character that cannot start any token.
    Unexpected(char),
}
570
571impl fmt::Display for TokenErrorKind
572{
573 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
574 {
575 match self
576 {
577 TokenErrorKind::BadUnicode => f.write_str("malformed UTF-8 code-point"),
578 TokenErrorKind::Invalid => f.write_str("invalid character"),
579 TokenErrorKind::BlockComment => f.write_str("unclosed block comment"),
580 TokenErrorKind::BadNumber => f.write_str("malformed number"),
581 TokenErrorKind::BadCharacter => f.write_str("malformed character literal"),
582 TokenErrorKind::BadString => f.write_str("malformed string literal"),
583 TokenErrorKind::Unexpected(c) => write!(f, "unexpected character {c:?}"),
584 }
585 }
586}
587
588impl Error for TokenErrorKind {}