1use alloc::{format, string::String, sync::Arc, vec::Vec};
2use core::fmt::Debug;
3
4use crate::error::{Error, Result};
5
/// A lexical token produced by the tokenizer.
///
/// Text payloads are stored as `Arc<str>`, so cloning a token bumps a reference
/// count instead of copying the text.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Delimiters and separators ---
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `;`
    Semicolon,

    // --- Keywords and operators ---
    /// The keyword `nil`.
    Nil,
    /// `==`
    Eq,
    /// `!=`
    Ne,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `>=`
    Ge,
    /// `<=`
    Le,
    /// The keyword `in`.
    In,
    /// `&&`
    And,
    /// `||`
    Or,
    /// `!`
    Not,
    /// `+`
    Add,
    /// `-`
    Sub,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `@`
    At,
    /// `?`
    Question,
    /// `??`
    QuestionQuestion,

    // --- Literals and names ---
    /// A quoted string literal with its escape sequences already resolved.
    Str(Arc<str>),
    /// An integer literal (decimal, `0x` hexadecimal or `0o` octal).
    Int(i64),
    /// A decimal literal containing a `.`.
    Float(f64),
    /// The keywords `true` / `false`.
    Bool(bool),
    /// An identifier.
    Id(Arc<str>),
}
43
/// Character-level scanner that turns source text into a list of [`Token`]s.
pub struct Tokenizer {
    /// The input decoded into `char`s, so positions can be indexed directly.
    chars: Vec<char>,
    /// Index into `chars` of the next character to examine.
    idx: usize,
    /// Cached `chars.len()`, set once at construction.
    len: usize,
    /// Tokens emitted so far, in source order.
    pub tokens: Vec<Token>,
}
51
52impl Tokenizer {
53 #[allow(clippy::new_ret_no_self)]
54 pub fn new(s: &str) -> Result<Vec<Token>> {
55 let chars: Vec<char> = s.chars().collect();
56 let len = chars.len();
57 let mut t = Tokenizer {
58 chars,
59 idx: 0,
60 len,
61 tokens: Vec::with_capacity(s.len() / 4), };
63 t.parse()?;
64 Ok(t.tokens)
65 }
66
67 fn eof(&self) -> bool {
68 self.idx >= self.len
69 }
70
71 fn peek(&self, offset: usize) -> Option<char> {
72 self.chars.get(self.idx + offset).copied()
73 }
74
75 fn err<T: AsRef<str>>(&self, msg: T) -> String {
76 let mut line = 1usize;
78 let mut col = 1usize;
79 for i in 0..self.idx.min(self.len) {
80 if self.chars[i] == '\n' {
81 line += 1;
82 col = 1;
83 } else {
84 col += 1;
85 }
86 }
87 let r_idx = if self.idx + 5 < self.len {
89 self.idx + 5
90 } else {
91 self.len
92 };
93 let l_idx = self.idx.saturating_sub(5);
94 let r_idx = if r_idx > self.len { self.len } else { r_idx };
95 let chars = &self.chars[l_idx..r_idx];
96 let chars: String = chars.iter().collect();
97 let c = self.chars.get(self.idx);
98 let ctx = if let Some(&c) = c {
99 format!("'{}' at {}:{}, near '{}'", c, line, col, chars)
100 } else {
101 format!("at end ({}:{}), near '{}'", line, col, chars)
102 };
103 format!("Syntax error:\n{} ({})", msg.as_ref(), ctx)
104 }
105
106 fn skip_whitespace(&mut self) {
107 while self.idx < self.len && self.chars[self.idx].is_whitespace() {
108 self.idx += 1;
109 }
110 }
111
/// Returns `true` if `c` may begin an identifier: `_` or any alphabetic character.
fn is_id_start(c: char) -> bool {
    matches!(c, '_') || c.is_alphabetic()
}
115
/// Returns `true` if `c` may appear after the first character of an identifier:
/// `_`, `-`, or any alphanumeric character.
fn is_id_continue(c: char) -> bool {
    matches!(c, '_' | '-') || c.is_alphanumeric()
}
119
120 fn parse_str(&mut self) -> Result<()> {
121 let mut s = String::new();
122 let quote = self.chars[self.idx];
123 self.idx += 1;
124
125 while !self.eof() {
126 let c = self.chars[self.idx];
127 match c {
128 '\\' => {
129 self.idx += 1;
130 if self.eof() {
131 return Err(Error::Tokenize(self.err("Invalid escape sequence")));
132 }
133
134 let escaped = match self.chars[self.idx] {
135 '\\' => '\\',
136 '"' => '"',
137 '\'' => '\'',
138 'n' => '\n',
139 'r' => '\r',
140 't' => '\t',
141 '0' => '\0',
142 'u' => {
143 self.idx += 1;
144 if self.idx + 4 > self.len {
145 return Err(Error::Tokenize(self.err("Invalid \\uXXXX escape, need 4 hex digits")));
146 }
147 let hex: String = self.chars[self.idx..self.idx + 4].iter().collect();
148 let code = u32::from_str_radix(&hex, 16)
149 .map_err(|_| Error::Tokenize(self.err(format!("Invalid unicode escape: \\u{hex}"))))?;
150 let ch = char::from_u32(code).ok_or_else(|| {
151 Error::Tokenize(self.err(format!("Invalid unicode codepoint: \\u{hex}")))
152 })?;
153 self.idx += 4;
154 s.push(ch);
155 continue;
156 }
157 other => {
158 return Err(Error::Tokenize(
159 self.err(format!("Unsupported escape sequence: \\{other}")),
160 ));
161 }
162 };
163 s.push(escaped);
164 self.idx += 1;
165 }
166 c if c == quote => {
167 self.idx += 1;
168 self.tokens.push(Token::Str(Arc::<str>::from(s)));
169 return Ok(());
170 }
171 _ => {
172 s.push(c);
173 self.idx += 1;
174 }
175 }
176 }
177
178 Err(Error::Tokenize(self.err("String not closed")))
179 }
180
181 fn sign_starts_number(&self) -> bool {
185 match self.tokens.last() {
186 None => true, Some(tok) => !matches!(
188 tok,
189 Token::Int(_)
190 | Token::Float(_)
191 | Token::Str(_)
192 | Token::Bool(_)
193 | Token::Nil
194 | Token::Id(_)
195 | Token::RParen
196 | Token::RBracket
197 | Token::RBrace
198 ),
199 }
200 }
201
202 fn parse_num(&mut self) -> Result<()> {
207 let start_idx = self.idx;
208
209 if !self.eof() && (self.chars[self.idx] == '-' || self.chars[self.idx] == '+') {
211 self.idx += 1;
212 }
213
214 if !self.eof()
216 && self.chars[self.idx] == '0'
217 && self.idx + 1 < self.len
218 && (self.chars[self.idx + 1] == 'x'
219 || self.chars[self.idx + 1] == 'X'
220 || self.chars[self.idx + 1] == 'o'
221 || self.chars[self.idx + 1] == 'O')
222 {
223 self.idx = start_idx;
224 return self.parse_int();
225 }
226
227 self.idx = start_idx;
229 let mut dot_count = 0;
230 while !self.eof() {
231 let c = self.chars[self.idx];
232 if c.is_ascii_digit() {
233 self.idx += 1;
234 } else if c == '.' {
235 if dot_count > 0 {
236 return Err(Error::Tokenize(self.err("Invalid float, multiple '.'")));
237 }
238 self.idx += 1;
239 dot_count += 1;
240 } else if (c == '-' || c == '+') && self.idx == start_idx {
241 self.idx += 1;
242 } else {
243 break;
244 }
245 }
246
247 if self.idx > start_idx && self.chars[self.idx - 1] == '.' {
248 return Err(Error::Tokenize(self.err("Invalid float, ends with '.'")));
249 }
250
251 let num_str: String = self.chars[start_idx..self.idx].iter().collect();
252
253 let num = if dot_count > 0 {
254 match num_str.parse() {
255 Ok(f) => Token::Float(f),
256 Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid float"), num_str))),
257 }
258 } else {
259 match num_str.parse() {
260 Ok(i) => Token::Int(i),
261 Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid int"), num_str))),
262 }
263 };
264 self.tokens.push(num);
265 Ok(())
266 }
267
268 fn parse_id(&mut self) -> Result<()> {
269 if self.eof() || !Self::is_id_start(self.chars[self.idx]) {
270 return Err(Error::Tokenize(self.err("Invalid identifier start")));
271 }
272
273 let start_idx = self.idx;
274 self.idx = self.scan_id_end(start_idx);
275 self.push_id_token(start_idx, self.idx);
276 Ok(())
277 }
278
279 fn scan_id_end(&self, start_idx: usize) -> usize {
280 let mut end = start_idx + 1;
281 while end < self.len && Self::is_id_continue(self.chars[end]) {
282 end += 1;
283 }
284 end
285 }
286
287 fn push_id_token(&mut self, start_idx: usize, end_idx: usize) {
288 let id: String = self.chars[start_idx..end_idx].iter().collect();
289 self.tokens.push(Token::Id(Arc::<str>::from(id)));
290 }
291
292 fn parse_ident_or_keyword(&mut self) -> Result<()> {
293 if self.eof() || !Self::is_id_start(self.chars[self.idx]) {
294 return Err(Error::Tokenize(self.err("Invalid identifier start")));
295 }
296
297 let start_idx = self.idx;
298 let end_idx = self.scan_id_end(start_idx);
299 self.idx = end_idx;
300
301 let token = match &self.chars[start_idx..end_idx] {
302 ['t', 'r', 'u', 'e'] => Token::Bool(true),
303 ['f', 'a', 'l', 's', 'e'] => Token::Bool(false),
304 ['n', 'i', 'l'] => Token::Nil,
305 ['i', 'n'] => Token::In,
306 _ => {
307 self.push_id_token(start_idx, end_idx);
308 return Ok(());
309 }
310 };
311 self.tokens.push(token);
312 Ok(())
313 }
314
315 fn parse_at_list(&mut self) -> Result<()> {
318 self.idx += 1;
319 self.tokens.push(Token::At);
320
321 while !self.eof() {
322 let c = self.chars[self.idx];
323 if Self::is_id_start(c) {
324 self.parse_id()?;
325 continue;
326 }
327 if c.is_ascii_digit() {
328 self.parse_int()?;
329 continue;
330 }
331 if matches!(c, '+' | '-') && self.tokens.last().is_some_and(|tok| tok == &Token::Dot) {
332 if self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
333 self.parse_int()?;
334 continue;
335 }
336 }
337
338 if c == '.' {
339 self.idx += 1;
340 self.tokens.push(Token::Dot);
341 continue;
342 }
343 break;
344 }
345 Ok(())
346 }
347
348 fn parse_int(&mut self) -> Result<()> {
349 let start_idx = self.idx;
350
351 if !self.eof() && (self.chars[self.idx] == '-' || self.chars[self.idx] == '+') {
352 self.idx += 1;
353 }
354
355 if !self.eof()
356 && self.chars[self.idx] == '0'
357 && self.idx + 1 < self.len
358 && (self.chars[self.idx + 1] == 'x'
359 || self.chars[self.idx + 1] == 'X'
360 || self.chars[self.idx + 1] == 'o'
361 || self.chars[self.idx + 1] == 'O')
362 {
363 let is_hex = self.chars[self.idx + 1] == 'x' || self.chars[self.idx + 1] == 'X';
364 let radix = if is_hex { 16 } else { 8 };
365 self.idx += 2; let digits_start = self.idx;
367 while !self.eof() {
368 let c = self.chars[self.idx];
369 let valid = if is_hex {
370 c.is_ascii_hexdigit()
371 } else {
372 matches!(c, '0'..='7')
373 };
374 if valid {
375 self.idx += 1;
376 } else {
377 break;
378 }
379 }
380 if self.idx == digits_start {
381 let label = if is_hex { "hex" } else { "octal" };
382 return Err(Error::Tokenize(self.err(format!("Invalid {label} literal, no digits"))));
383 }
384 let digits: String = self.chars[digits_start..self.idx].iter().collect();
385 let val = i64::from_str_radix(&digits, radix).map_err(|_| {
386 Error::Tokenize(self.err(format!("Invalid int: 0{}{}", if is_hex { "x" } else { "o" }, digits)))
387 })?;
388 let val = if start_idx < self.chars.len() && self.chars[start_idx] == '-' {
389 val.checked_neg()
390 .ok_or_else(|| Error::Tokenize(self.err("Integer overflow")))?
391 } else {
392 val
393 };
394 self.tokens.push(Token::Int(val));
395 return Ok(());
396 }
397
398 while !self.eof() {
399 let c = self.chars[self.idx];
400 if c.is_ascii_digit() {
401 self.idx += 1;
402 } else {
403 break;
404 }
405 }
406
407 let num_str: String = self.chars[start_idx..self.idx].iter().collect();
408 let num = match num_str.parse() {
409 Ok(i) => i,
410 Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid int"), num_str))),
411 };
412 self.tokens.push(Token::Int(num));
413 Ok(())
414 }
415
416 fn parse_punctuations(&mut self) -> Result<()> {
417 let c = self.chars[self.idx];
418 match c {
419 '(' => {
420 self.idx += 1;
421 self.tokens.push(Token::LParen);
422 Ok(())
423 }
424 ')' => {
425 self.idx += 1;
426 self.tokens.push(Token::RParen);
427 Ok(())
428 }
429 '{' => {
430 self.idx += 1;
431 self.tokens.push(Token::LBrace);
432 Ok(())
433 }
434 '}' => {
435 self.idx += 1;
436 self.tokens.push(Token::RBrace);
437 Ok(())
438 }
439 '[' => {
440 self.idx += 1;
441 self.tokens.push(Token::LBracket);
442 Ok(())
443 }
444 ']' => {
445 self.idx += 1;
446 self.tokens.push(Token::RBracket);
447 Ok(())
448 }
449 ':' => {
450 self.idx += 1;
451 self.tokens.push(Token::Colon);
452 Ok(())
453 }
454 ',' => {
455 self.idx += 1;
456 self.tokens.push(Token::Comma);
457 Ok(())
458 }
459 ';' => {
460 self.idx += 1;
461 self.tokens.push(Token::Semicolon);
462 Ok(())
463 }
464 '.' => {
465 self.idx += 1;
466 self.tokens.push(Token::Dot);
467 if self.starts_int_literal() {
468 return self.parse_int();
469 }
470 Ok(())
471 }
472 '&' => {
473 if self.peek(1) == Some('&') {
474 self.idx += 2;
475 self.tokens.push(Token::And);
476 Ok(())
477 } else {
478 Err(Error::Tokenize(self.err("Expect '&&'")))
479 }
480 }
481 '|' => {
482 if self.peek(1) == Some('|') {
483 self.idx += 2;
484 self.tokens.push(Token::Or);
485 Ok(())
486 } else {
487 Err(Error::Tokenize(self.err("Expect '||'")))
488 }
489 }
490 '+' => {
491 if self.sign_starts_number() && self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
492 return self.parse_num();
493 }
494 self.idx += 1;
495 self.tokens.push(Token::Add);
496 Ok(())
497 }
498 '-' => {
499 if self.sign_starts_number() && self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
500 return self.parse_num();
501 }
502 self.idx += 1;
503 self.tokens.push(Token::Sub);
504 Ok(())
505 }
506 '*' => {
507 self.idx += 1;
508 self.tokens.push(Token::Mul);
509 Ok(())
510 }
511 '/' => {
512 if self.peek(1) == Some('*') {
513 self.idx += 2;
515 let mut depth = 1usize;
516 while !self.eof() && depth > 0 {
517 if self.chars[self.idx] == '/' && self.peek(1) == Some('*') {
518 depth += 1;
519 self.idx += 2;
520 } else if self.chars[self.idx] == '*' && self.peek(1) == Some('/') {
521 depth -= 1;
522 self.idx += 2;
523 } else {
524 self.idx += 1;
525 }
526 }
527 if depth > 0 {
528 return Err(Error::Tokenize(self.err("Unterminated block comment")));
529 }
530 } else if self.peek(1) == Some('/') {
531 self.idx += 2;
532 while !self.eof() {
534 let c = self.chars[self.idx];
535 if c == '\n' {
536 self.idx += 1;
537 break;
538 }
539 self.idx += 1;
540 }
541 } else {
542 self.idx += 1;
543 self.tokens.push(Token::Div);
544 }
545 Ok(())
546 }
547 '%' => {
548 self.idx += 1;
549 self.tokens.push(Token::Mod);
550 Ok(())
551 }
552 '@' => self.parse_at_list(),
553 '=' => {
554 if self.peek(1) == Some('=') {
555 self.idx += 2;
556 self.tokens.push(Token::Eq);
557 Ok(())
558 } else {
559 Err(Error::Tokenize(self.err("Expect '=='")))
560 }
561 }
562 '!' => {
563 if self.peek(1) == Some('=') {
564 self.idx += 2;
565 self.tokens.push(Token::Ne);
566 Ok(())
567 } else {
568 self.idx += 1;
569 self.tokens.push(Token::Not);
570 Ok(())
571 }
572 }
573 '>' => {
574 if self.peek(1) == Some('=') {
575 self.idx += 2;
576 self.tokens.push(Token::Ge);
577 Ok(())
578 } else {
579 self.idx += 1;
580 self.tokens.push(Token::Gt);
581 Ok(())
582 }
583 }
584 '<' => {
585 if self.peek(1) == Some('=') {
586 self.idx += 2;
587 self.tokens.push(Token::Le);
588 Ok(())
589 } else {
590 self.idx += 1;
591 self.tokens.push(Token::Lt);
592 Ok(())
593 }
594 }
595 '?' => {
596 if self.peek(1) == Some('?') {
597 self.idx += 2;
598 self.tokens.push(Token::QuestionQuestion);
599 } else {
600 self.idx += 1;
601 self.tokens.push(Token::Question);
602 }
603 Ok(())
604 }
605 _ => Err(Error::Tokenize(self.err("Unknown punctuation"))),
606 }
607 }
608
609 fn parse(&mut self) -> Result<()> {
610 while !self.eof() {
611 self.skip_whitespace();
612 if self.eof() {
613 break;
614 }
615 let c = self.chars[self.idx];
616 match c {
617 '"' | '\'' => {
618 self.parse_str()?;
619 }
620 '0'..='9' => {
621 self.parse_num()?;
622 }
623 _ => {
624 if self.is_punctuation(c) {
625 self.parse_punctuations()?;
626 } else if Self::is_id_start(c) {
627 self.parse_ident_or_keyword()?;
628 } else {
629 return Err(Error::Tokenize(self.err("Invalid identifier start")));
630 }
631 }
632 }
633 }
634 Ok(())
635 }
636
637 fn is_punctuation(&self, c: char) -> bool {
638 matches!(
639 c,
640 '(' | ')'
641 | '{'
642 | '}'
643 | '['
644 | ']'
645 | '.'
646 | ':'
647 | ','
648 | ';'
649 | '&'
650 | '|'
651 | '+'
652 | '-'
653 | '*'
654 | '/'
655 | '%'
656 | '@'
657 | '='
658 | '!'
659 | '>'
660 | '<'
661 | '?'
662 )
663 }
664
665 fn starts_int_literal(&self) -> bool {
666 if self.eof() {
667 return false;
668 }
669
670 let c = self.chars[self.idx];
671 if c.is_ascii_digit() {
672 return true;
673 }
674
675 if matches!(c, '+' | '-') {
676 return self.peek(1).is_some_and(|next| next.is_ascii_digit());
677 }
678
679 false
680 }
681}