Skip to main content

toon_format/decode/
scanner.rs

1use crate::types::{
2    Delimiter,
3    ToonError,
4    ToonResult,
5};
6
7/// Tokens produced by the scanner during lexical analysis.
8#[derive(Debug, Clone, PartialEq)]
9pub enum Token {
10    LeftBracket,
11    RightBracket,
12    LeftBrace,
13    RightBrace,
14    Colon,
15    Dash,
16    Newline,
17    String(String, bool),
18    Number(f64),
19    Integer(i64),
20    Bool(bool),
21    Null,
22    Delimiter(Delimiter),
23    Eof,
24}
25
26/// Scanner that tokenizes TOON input into a sequence of tokens.
27pub struct Scanner {
28    input: Vec<char>,
29    position: usize,
30    line: usize,
31    column: usize,
32    active_delimiter: Option<Delimiter>,
33    last_line_indent: usize,
34    last_whitespace_count: usize,
35    last_token_text: String,
36}
37
38impl Scanner {
39    /// Create a new scanner for the given input string.
40    pub fn new(input: &str) -> Self {
41        Self {
42            input: input.chars().collect(),
43            position: 0,
44            line: 1,
45            column: 1,
46            active_delimiter: None,
47            last_line_indent: 0,
48            last_whitespace_count: 0,
49            last_token_text: String::new(),
50        }
51    }
52
53    /// Set the active delimiter for tokenizing array elements.
54    pub fn set_active_delimiter(&mut self, delimiter: Option<Delimiter>) {
55        self.active_delimiter = delimiter;
56    }
57
58    /// Get the current position (line, column).
59    pub fn current_position(&self) -> (usize, usize) {
60        (self.line, self.column)
61    }
62
63    pub fn get_line(&self) -> usize {
64        self.line
65    }
66
67    pub fn get_column(&self) -> usize {
68        self.column
69    }
70
71    pub fn peek(&self) -> Option<char> {
72        self.input.get(self.position).copied()
73    }
74
75    pub fn count_leading_spaces(&self) -> usize {
76        let mut idx = self.position;
77        let mut count = 0;
78        while let Some(&ch) = self.input.get(idx) {
79            if ch == ' ' {
80                count += 1;
81                idx += 1;
82            } else {
83                break;
84            }
85        }
86        count
87    }
88
89    pub fn count_spaces_after_newline(&self) -> usize {
90        let mut idx = self.position;
91        if self.input.get(idx) != Some(&'\n') {
92            return 0;
93        }
94        idx += 1;
95        let mut count = 0;
96        while let Some(&ch) = self.input.get(idx) {
97            if ch == ' ' {
98                count += 1;
99                idx += 1;
100            } else {
101                break;
102            }
103        }
104        count
105    }
106
107    pub fn peek_ahead(&self, offset: usize) -> Option<char> {
108        self.input.get(self.position + offset).copied()
109    }
110
111    pub fn advance(&mut self) -> Option<char> {
112        if let Some(ch) = self.input.get(self.position) {
113            self.position += 1;
114            if *ch == '\n' {
115                self.line += 1;
116                self.column = 1;
117            } else {
118                self.column += 1;
119            }
120            Some(*ch)
121        } else {
122            None
123        }
124    }
125
126    pub fn skip_whitespace(&mut self) {
127        self.last_whitespace_count = 0;
128        while let Some(ch) = self.peek() {
129            if ch == ' ' {
130                self.last_whitespace_count += 1;
131                self.advance();
132            } else {
133                break;
134            }
135        }
136    }
137
138    pub fn last_whitespace_count(&self) -> usize {
139        self.last_whitespace_count
140    }
141
142    pub fn last_token_text(&self) -> &str {
143        &self.last_token_text
144    }
145
146    /// Scan the next token from the input.
147    pub fn scan_token(&mut self) -> ToonResult<Token> {
148        if self.column == 1 {
149            let mut count = 0;
150            let mut idx = self.position;
151
152            while let Some(&ch) = self.input.get(idx) {
153                if ch == ' ' {
154                    count += 1;
155                    idx += 1;
156                } else {
157                    if ch == '\t' {
158                        let (line, col) = self.current_position();
159                        return Err(ToonError::parse_error(
160                            line,
161                            col + count,
162                            "Tabs are not allowed in indentation",
163                        ));
164                    }
165                    break;
166                }
167            }
168            self.last_line_indent = count;
169        }
170
171        self.skip_whitespace();
172
173        match self.peek() {
174            None => Ok(Token::Eof),
175            Some('\n') => {
176                self.advance();
177                Ok(Token::Newline)
178            }
179            Some('[') => {
180                self.advance();
181                self.last_token_text = "[".to_string();
182                Ok(Token::LeftBracket)
183            }
184            Some(']') => {
185                self.advance();
186                self.last_token_text = "]".to_string();
187                Ok(Token::RightBracket)
188            }
189            Some('{') => {
190                self.advance();
191                self.last_token_text = "{".to_string();
192                Ok(Token::LeftBrace)
193            }
194            Some('}') => {
195                self.advance();
196                self.last_token_text = "}".to_string();
197                Ok(Token::RightBrace)
198            }
199            Some(':') => {
200                self.advance();
201                self.last_token_text = ":".to_string();
202                Ok(Token::Colon)
203            }
204            Some('-') => {
205                self.advance();
206                if let Some(ch) = self.peek() {
207                    if ch.is_ascii_digit() {
208                        let num_str = self.scan_number_string(true)?;
209                        self.last_token_text = num_str.clone();
210                        return self.parse_number(&num_str);
211                    }
212                }
213                self.last_token_text = "-".to_string();
214                Ok(Token::Dash)
215            }
216            Some(',') => {
217                // Delimiter only when active, otherwise part of unquoted string
218                if matches!(self.active_delimiter, Some(Delimiter::Comma)) {
219                    self.advance();
220                    self.last_token_text = ",".to_string();
221                    Ok(Token::Delimiter(Delimiter::Comma))
222                } else {
223                    self.scan_unquoted_string()
224                }
225            }
226            Some('|') => {
227                if matches!(self.active_delimiter, Some(Delimiter::Pipe)) {
228                    self.advance();
229                    self.last_token_text = "|".to_string();
230                    Ok(Token::Delimiter(Delimiter::Pipe))
231                } else {
232                    self.scan_unquoted_string()
233                }
234            }
235            Some('\t') => {
236                if matches!(self.active_delimiter, Some(Delimiter::Tab)) {
237                    self.advance();
238                    self.last_token_text = "\t".to_string();
239                    Ok(Token::Delimiter(Delimiter::Tab))
240                } else {
241                    self.scan_unquoted_string()
242                }
243            }
244            Some('"') => self.scan_quoted_string(),
245            Some(ch) if ch.is_ascii_digit() => {
246                let num_str = self.scan_number_string(false)?;
247                self.last_token_text = num_str.clone();
248                self.parse_number(&num_str)
249            }
250            Some(_) => self.scan_unquoted_string(),
251        }
252    }
253
254    fn scan_quoted_string(&mut self) -> ToonResult<Token> {
255        self.advance();
256
257        let mut value = String::new();
258        let mut escaped = false;
259
260        while let Some(ch) = self.advance() {
261            if escaped {
262                match ch {
263                    'n' => value.push('\n'),
264                    'r' => value.push('\r'),
265                    't' => value.push('\t'),
266                    '"' => value.push('"'),
267                    '\\' => value.push('\\'),
268                    _ => {
269                        let (line, col) = self.current_position();
270                        return Err(ToonError::parse_error(
271                            line,
272                            col - 1,
273                            format!("Invalid escape sequence: \\{ch}"),
274                        ));
275                    }
276                }
277                escaped = false;
278            } else if ch == '\\' {
279                escaped = true;
280            } else if ch == '"' {
281                self.last_token_text = format!("\"{}\"", crate::utils::escape_string(&value));
282                return Ok(Token::String(value, true));
283            } else {
284                value.push(ch);
285            }
286        }
287
288        Err(ToonError::UnexpectedEof)
289    }
290
291    fn scan_unquoted_string(&mut self) -> ToonResult<Token> {
292        let mut value = String::new();
293
294        while let Some(ch) = self.peek() {
295            if ch == '\n'
296                || ch == ' '
297                || ch == ':'
298                || ch == '['
299                || ch == ']'
300                || ch == '{'
301                || ch == '}'
302            {
303                break;
304            }
305
306            // Active delimiters stop the string; otherwise they're part of it
307            if let Some(active) = self.active_delimiter {
308                if (active == Delimiter::Comma && ch == ',')
309                    || (active == Delimiter::Pipe && ch == '|')
310                    || (active == Delimiter::Tab && ch == '\t')
311                {
312                    break;
313                }
314            }
315            value.push(ch);
316            self.advance();
317        }
318
319        // Single-char delimiters kept as-is, others trimmed
320        let value = if value.len() == 1 && (value == "," || value == "|" || value == "\t") {
321            value
322        } else {
323            value.trim_end().to_string()
324        };
325
326        self.last_token_text = value.clone();
327        match value.as_str() {
328            "null" => Ok(Token::Null),
329            "true" => Ok(Token::Bool(true)),
330            "false" => Ok(Token::Bool(false)),
331            _ => Ok(Token::String(value, false)),
332        }
333    }
334
335    pub fn get_last_line_indent(&self) -> usize {
336        self.last_line_indent
337    }
338
339    fn scan_number_string(&mut self, negative: bool) -> ToonResult<String> {
340        let mut num_str = if negative {
341            String::from("-")
342        } else {
343            String::new()
344        };
345
346        while let Some(ch) = self.peek() {
347            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-'
348            {
349                num_str.push(ch);
350                self.advance();
351            } else {
352                break;
353            }
354        }
355
356        Ok(num_str)
357    }
358
359    fn parse_number(&self, s: &str) -> ToonResult<Token> {
360        // Number followed immediately by other chars like "0(f)" should be a string
361        if let Some(next_ch) = self.peek() {
362            if next_ch != ' '
363                && next_ch != '\n'
364                && next_ch != ':'
365                && next_ch != '['
366                && next_ch != ']'
367                && next_ch != '{'
368                && next_ch != '}'
369                && !matches!(
370                    (self.active_delimiter, next_ch),
371                    (Some(Delimiter::Comma), ',')
372                        | (Some(Delimiter::Pipe), '|')
373                        | (Some(Delimiter::Tab), '\t')
374                )
375            {
376                return Ok(Token::String(s.to_string(), false));
377            }
378        }
379
380        // Leading zeros like "05" are strings, but "0", "0.5", "-0" are numbers
381        if s.starts_with('0') && s.len() > 1 {
382            let second_char = s.chars().nth(1).unwrap();
383            if second_char.is_ascii_digit() {
384                return Ok(Token::String(s.to_string(), false));
385            }
386        }
387
388        // Negative leading zeros like "-05" are also strings
389        if s.starts_with("-0") && s.len() > 2 {
390            let third_char = s.chars().nth(2).unwrap();
391            if third_char.is_ascii_digit() {
392                return Ok(Token::String(s.to_string(), false));
393            }
394        }
395
396        if s.contains('.') || s.contains('e') || s.contains('E') {
397            if let Ok(f) = s.parse::<f64>() {
398                Ok(Token::Number(f))
399            } else {
400                Ok(Token::String(s.to_string(), false))
401            }
402        } else if let Ok(i) = s.parse::<i64>() {
403            Ok(Token::Integer(i))
404        } else {
405            Ok(Token::String(s.to_string(), false))
406        }
407    }
408
409    /// Read the rest of the current line (until newline or EOF).
410    /// Returns the content with the count of leading spaces consumed.
411    pub fn read_rest_of_line_with_space_info(&mut self) -> (String, usize) {
412        let mut space_count = 0;
413        while let Some(' ') = self.peek() {
414            space_count += 1;
415            self.advance();
416        }
417
418        let mut result = String::new();
419        while let Some(ch) = self.peek() {
420            if ch == '\n' {
421                break;
422            }
423            result.push(ch);
424            self.advance();
425        }
426
427        (result.trim_end().to_string(), space_count)
428    }
429
430    /// Read the rest of the current line (until newline or EOF).
431    pub fn read_rest_of_line(&mut self) -> String {
432        self.read_rest_of_line_with_space_info().0
433    }
434
435    /// Read raw text until the next active delimiter, newline, or EOF.
436    /// Returns the content with the count of leading spaces consumed.
437    pub fn read_until_delimiter_with_space_info(&mut self) -> (String, usize) {
438        let mut space_count = 0;
439        while let Some(' ') = self.peek() {
440            space_count += 1;
441            self.advance();
442        }
443
444        let mut result = String::new();
445        while let Some(ch) = self.peek() {
446            if ch == '\n' {
447                break;
448            }
449            if let Some(active) = self.active_delimiter {
450                if (active == Delimiter::Comma && ch == ',')
451                    || (active == Delimiter::Pipe && ch == '|')
452                    || (active == Delimiter::Tab && ch == '\t')
453                {
454                    break;
455                }
456            }
457            result.push(ch);
458            self.advance();
459        }
460
461        (result.trim_end().to_string(), space_count)
462    }
463
464    /// Parse a complete value string into a token.
465    pub fn parse_value_string(&self, s: &str) -> ToonResult<Token> {
466        let trimmed = s.trim();
467
468        if trimmed.is_empty() {
469            return Ok(Token::String(String::new(), false));
470        }
471
472        if trimmed.starts_with('"') {
473            let mut value = String::new();
474            let mut escaped = false;
475            let chars: Vec<char> = trimmed.chars().collect();
476            let mut i = 1;
477
478            while i < chars.len() {
479                let ch = chars[i];
480                if escaped {
481                    match ch {
482                        'n' => value.push('\n'),
483                        'r' => value.push('\r'),
484                        't' => value.push('\t'),
485                        '"' => value.push('"'),
486                        '\\' => value.push('\\'),
487                        _ => {
488                            return Err(ToonError::parse_error(
489                                self.line,
490                                self.column,
491                                format!("Invalid escape sequence: \\{ch}"),
492                            ));
493                        }
494                    }
495                    escaped = false;
496                } else if ch == '\\' {
497                    escaped = true;
498                } else if ch == '"' {
499                    if i != chars.len() - 1 {
500                        return Err(ToonError::parse_error(
501                            self.line,
502                            self.column,
503                            "Unexpected characters after closing quote",
504                        ));
505                    }
506                    return Ok(Token::String(value, true));
507                } else {
508                    value.push(ch);
509                }
510                i += 1;
511            }
512
513            return Err(ToonError::parse_error(
514                self.line,
515                self.column,
516                "Unterminated string: missing closing quote",
517            ));
518        }
519
520        match trimmed {
521            "true" => return Ok(Token::Bool(true)),
522            "false" => return Ok(Token::Bool(false)),
523            "null" => return Ok(Token::Null),
524            _ => {}
525        }
526
527        if trimmed.starts_with('-') || trimmed.chars().next().unwrap().is_ascii_digit() {
528            // Leading zeros like "05" are strings
529            if trimmed.starts_with('0') && trimmed.len() > 1 {
530                let second_char = trimmed.chars().nth(1).unwrap();
531                if second_char.is_ascii_digit() {
532                    return Ok(Token::String(trimmed.to_string(), false));
533                }
534            }
535
536            // Negative leading zeros like "-05" are also strings
537            if trimmed.starts_with("-0") && trimmed.len() > 2 {
538                let third_char = trimmed.chars().nth(2).unwrap();
539                if third_char.is_ascii_digit() {
540                    return Ok(Token::String(trimmed.to_string(), false));
541                }
542            }
543
544            if trimmed.contains('.') || trimmed.contains('e') || trimmed.contains('E') {
545                if let Ok(f) = trimmed.parse::<f64>() {
546                    let normalized = if f == -0.0 { 0.0 } else { f };
547                    return Ok(Token::Number(normalized));
548                }
549            } else if let Ok(i) = trimmed.parse::<i64>() {
550                return Ok(Token::Integer(i));
551            }
552        }
553
554        Ok(Token::String(trimmed.to_string(), false))
555    }
556
557    pub fn detect_delimiter(&mut self) -> Option<Delimiter> {
558        let saved_pos = self.position;
559
560        while let Some(ch) = self.peek() {
561            match ch {
562                ',' => {
563                    self.position = saved_pos;
564                    return Some(Delimiter::Comma);
565                }
566                '|' => {
567                    self.position = saved_pos;
568                    return Some(Delimiter::Pipe);
569                }
570                '\t' => {
571                    self.position = saved_pos;
572                    return Some(Delimiter::Tab);
573                }
574                '\n' | ':' | '[' | ']' | '{' | '}' => break,
575                _ => {
576                    self.advance();
577                }
578            }
579        }
580
581        self.position = saved_pos;
582        None
583    }
584}
585
#[cfg(test)]
mod tests {
    use core::f64;

    use super::*;

    /// Scans `input` with a fresh scanner and checks the leading tokens, in order.
    fn assert_scans(input: &str, expected: &[Token]) {
        let mut scanner = Scanner::new(input);
        for token in expected {
            assert_eq!(&scanner.scan_token().unwrap(), token);
        }
    }

    #[test]
    fn test_scan_structural_tokens() {
        assert_scans(
            "[]{}:-",
            &[
                Token::LeftBracket,
                Token::RightBracket,
                Token::LeftBrace,
                Token::RightBrace,
                Token::Colon,
                Token::Dash,
            ],
        );
    }

    #[test]
    fn test_scan_numbers() {
        assert_scans(
            "42 3.141592653589793 -5",
            &[
                Token::Integer(42),
                Token::Number(f64::consts::PI),
                Token::Integer(-5),
            ],
        );
    }

    #[test]
    fn test_scan_booleans() {
        assert_scans("true false", &[Token::Bool(true), Token::Bool(false)]);
    }

    #[test]
    fn test_scan_null() {
        assert_scans("null", &[Token::Null]);
    }

    #[test]
    fn test_scan_quoted_string() {
        assert_scans(
            r#""hello world""#,
            &[Token::String("hello world".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_escaped_string() {
        assert_scans(
            r#""hello\nworld""#,
            &[Token::String("hello\nworld".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_unquoted_string() {
        assert_scans("hello", &[Token::String("hello".to_string(), false)]);
    }

    #[test]
    fn test_detect_delimiter() {
        let cases = [
            ("a,b,c", Delimiter::Comma),
            ("a|b|c", Delimiter::Pipe),
            ("a\tb\tc", Delimiter::Tab),
        ];

        for (input, expected) in cases.iter() {
            let mut scanner = Scanner::new(input);
            assert_eq!(scanner.detect_delimiter(), Some(*expected));
        }
    }

    #[test]
    fn test_read_rest_of_line_with_space_info() {
        // (input, expected content, expected leading-space count)
        let cases = [
            (" world", "world", 1),
            ("world", "world", 0),
            ("(hello)", "(hello)", 0),
            ("", "", 0),
            ("   world", "world", 3),
        ];

        for (input, want_content, want_spaces) in cases.iter() {
            let mut scanner = Scanner::new(input);
            let (content, space_count) = scanner.read_rest_of_line_with_space_info();
            assert_eq!(content, *want_content);
            assert_eq!(space_count, *want_spaces);
        }
    }

    #[test]
    fn test_parse_value_string() {
        let scanner = Scanner::new("");
        let unquoted = |text: &str| Token::String(text.to_string(), false);

        let cases = vec![
            ("hello", unquoted("hello")),
            ("(hello)", unquoted("(hello)")),
            (
                "Mostly Functions (3 of 3)",
                unquoted("Mostly Functions (3 of 3)"),
            ),
            ("0(f)", unquoted("0(f)")),
            ("42", Token::Integer(42)),
            ("true", Token::Bool(true)),
            ("false", Token::Bool(false)),
            ("null", Token::Null),
            (
                r#""hello world""#,
                Token::String("hello world".to_string(), true),
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(scanner.parse_value_string(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_number_followed_by_parenthesis() {
        let mut scanner = Scanner::new("0(f)");
        let digits = scanner.scan_number_string(false).unwrap();

        // The `(` right after the digit makes the scanned text a string.
        assert_eq!(
            scanner.parse_number(&digits).unwrap(),
            Token::String("0".to_string(), false)
        );
    }
}