// toon_format/decode/scanner.rs

1use crate::types::{
2    Delimiter,
3    ToonError,
4    ToonResult,
5};
6
7/// Tokens produced by the scanner during lexical analysis.
8#[derive(Debug, Clone, PartialEq)]
9pub enum Token {
10    LeftBracket,
11    RightBracket,
12    LeftBrace,
13    RightBrace,
14    Colon,
15    Dash,
16    Newline,
17    String(String, bool),
18    Number(f64),
19    Integer(i64),
20    Bool(bool),
21    Null,
22    Delimiter(Delimiter),
23    Eof,
24}
25
26/// Scanner that tokenizes TOON input into a sequence of tokens.
27pub struct Scanner {
28    input: Vec<char>,
29    position: usize,
30    line: usize,
31    column: usize,
32    active_delimiter: Option<Delimiter>,
33    last_line_indent: usize,
34}
35
36impl Scanner {
37    /// Create a new scanner for the given input string.
38    pub fn new(input: &str) -> Self {
39        Self {
40            input: input.chars().collect(),
41            position: 0,
42            line: 1,
43            column: 1,
44            active_delimiter: None,
45            last_line_indent: 0,
46        }
47    }
48
49    /// Set the active delimiter for tokenizing array elements.
50    pub fn set_active_delimiter(&mut self, delimiter: Option<Delimiter>) {
51        self.active_delimiter = delimiter;
52    }
53
54    /// Get the current position (line, column).
55    pub fn current_position(&self) -> (usize, usize) {
56        (self.line, self.column)
57    }
58
59    pub fn get_line(&self) -> usize {
60        self.line
61    }
62
63    pub fn get_column(&self) -> usize {
64        self.column
65    }
66
67    pub fn peek(&self) -> Option<char> {
68        self.input.get(self.position).copied()
69    }
70
71    pub fn count_leading_spaces(&self) -> usize {
72        let mut idx = self.position;
73        let mut count = 0;
74        while let Some(&ch) = self.input.get(idx) {
75            if ch == ' ' {
76                count += 1;
77                idx += 1;
78            } else {
79                break;
80            }
81        }
82        count
83    }
84
85    pub fn count_spaces_after_newline(&self) -> usize {
86        let mut idx = self.position;
87        if self.input.get(idx) != Some(&'\n') {
88            return 0;
89        }
90        idx += 1;
91        let mut count = 0;
92        while let Some(&ch) = self.input.get(idx) {
93            if ch == ' ' {
94                count += 1;
95                idx += 1;
96            } else {
97                break;
98            }
99        }
100        count
101    }
102
103    pub fn peek_ahead(&self, offset: usize) -> Option<char> {
104        self.input.get(self.position + offset).copied()
105    }
106
107    pub fn advance(&mut self) -> Option<char> {
108        if let Some(ch) = self.input.get(self.position) {
109            self.position += 1;
110            if *ch == '\n' {
111                self.line += 1;
112                self.column = 1;
113            } else {
114                self.column += 1;
115            }
116            Some(*ch)
117        } else {
118            None
119        }
120    }
121
122    pub fn skip_whitespace(&mut self) {
123        while let Some(ch) = self.peek() {
124            if ch == ' ' {
125                self.advance();
126            } else {
127                break;
128            }
129        }
130    }
131
132    /// Scan the next token from the input.
133    pub fn scan_token(&mut self) -> ToonResult<Token> {
134        if self.column == 1 {
135            let mut count = 0;
136            let mut idx = self.position;
137
138            while let Some(&ch) = self.input.get(idx) {
139                if ch == ' ' {
140                    count += 1;
141                    idx += 1;
142                } else {
143                    if ch == '\t' {
144                        let (line, col) = self.current_position();
145                        return Err(ToonError::parse_error(
146                            line,
147                            col + count,
148                            "Tabs are not allowed in indentation",
149                        ));
150                    }
151                    break;
152                }
153            }
154            self.last_line_indent = count;
155        }
156
157        self.skip_whitespace();
158
159        match self.peek() {
160            None => Ok(Token::Eof),
161            Some('\n') => {
162                self.advance();
163                Ok(Token::Newline)
164            }
165            Some('[') => {
166                self.advance();
167                Ok(Token::LeftBracket)
168            }
169            Some(']') => {
170                self.advance();
171                Ok(Token::RightBracket)
172            }
173            Some('{') => {
174                self.advance();
175                Ok(Token::LeftBrace)
176            }
177            Some('}') => {
178                self.advance();
179                Ok(Token::RightBrace)
180            }
181            Some(':') => {
182                self.advance();
183                Ok(Token::Colon)
184            }
185            Some('-') => {
186                self.advance();
187                if let Some(ch) = self.peek() {
188                    if ch.is_ascii_digit() {
189                        let num_str = self.scan_number_string(true)?;
190                        return self.parse_number(&num_str);
191                    }
192                }
193                Ok(Token::Dash)
194            }
195            Some(',') => {
196                // Delimiter only when active, otherwise part of unquoted string
197                if matches!(self.active_delimiter, Some(Delimiter::Comma)) {
198                    self.advance();
199                    Ok(Token::Delimiter(Delimiter::Comma))
200                } else {
201                    self.scan_unquoted_string()
202                }
203            }
204            Some('|') => {
205                if matches!(self.active_delimiter, Some(Delimiter::Pipe)) {
206                    self.advance();
207                    Ok(Token::Delimiter(Delimiter::Pipe))
208                } else {
209                    self.scan_unquoted_string()
210                }
211            }
212            Some('\t') => {
213                if matches!(self.active_delimiter, Some(Delimiter::Tab)) {
214                    self.advance();
215                    Ok(Token::Delimiter(Delimiter::Tab))
216                } else {
217                    self.scan_unquoted_string()
218                }
219            }
220            Some('"') => self.scan_quoted_string(),
221            Some(ch) if ch.is_ascii_digit() => {
222                let num_str = self.scan_number_string(false)?;
223                self.parse_number(&num_str)
224            }
225            Some(_) => self.scan_unquoted_string(),
226        }
227    }
228
229    fn scan_quoted_string(&mut self) -> ToonResult<Token> {
230        self.advance();
231
232        let mut value = String::new();
233        let mut escaped = false;
234
235        while let Some(ch) = self.advance() {
236            if escaped {
237                match ch {
238                    'n' => value.push('\n'),
239                    'r' => value.push('\r'),
240                    't' => value.push('\t'),
241                    '"' => value.push('"'),
242                    '\\' => value.push('\\'),
243                    _ => {
244                        let (line, col) = self.current_position();
245                        return Err(ToonError::parse_error(
246                            line,
247                            col - 1,
248                            format!("Invalid escape sequence: \\{ch}"),
249                        ));
250                    }
251                }
252                escaped = false;
253            } else if ch == '\\' {
254                escaped = true;
255            } else if ch == '"' {
256                return Ok(Token::String(value, true));
257            } else {
258                value.push(ch);
259            }
260        }
261
262        Err(ToonError::UnexpectedEof)
263    }
264
265    fn scan_unquoted_string(&mut self) -> ToonResult<Token> {
266        let mut value = String::new();
267
268        while let Some(ch) = self.peek() {
269            if ch == '\n'
270                || ch == ' '
271                || ch == ':'
272                || ch == '['
273                || ch == ']'
274                || ch == '{'
275                || ch == '}'
276            {
277                break;
278            }
279
280            // Active delimiters stop the string; otherwise they're part of it
281            if let Some(active) = self.active_delimiter {
282                if (active == Delimiter::Comma && ch == ',')
283                    || (active == Delimiter::Pipe && ch == '|')
284                    || (active == Delimiter::Tab && ch == '\t')
285                {
286                    break;
287                }
288            }
289            value.push(ch);
290            self.advance();
291        }
292
293        // Single-char delimiters kept as-is, others trimmed
294        let value = if value.len() == 1 && (value == "," || value == "|" || value == "\t") {
295            value
296        } else {
297            value.trim_end().to_string()
298        };
299
300        match value.as_str() {
301            "null" => Ok(Token::Null),
302            "true" => Ok(Token::Bool(true)),
303            "false" => Ok(Token::Bool(false)),
304            _ => Ok(Token::String(value, false)),
305        }
306    }
307
308    pub fn get_last_line_indent(&self) -> usize {
309        self.last_line_indent
310    }
311
312    fn scan_number_string(&mut self, negative: bool) -> ToonResult<String> {
313        let mut num_str = if negative {
314            String::from("-")
315        } else {
316            String::new()
317        };
318
319        while let Some(ch) = self.peek() {
320            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-'
321            {
322                num_str.push(ch);
323                self.advance();
324            } else {
325                break;
326            }
327        }
328
329        Ok(num_str)
330    }
331
332    fn parse_number(&self, s: &str) -> ToonResult<Token> {
333        // Number followed immediately by other chars like "0(f)" should be a string
334        if let Some(next_ch) = self.peek() {
335            if next_ch != ' '
336                && next_ch != '\n'
337                && next_ch != ':'
338                && next_ch != '['
339                && next_ch != ']'
340                && next_ch != '{'
341                && next_ch != '}'
342                && !matches!(
343                    (self.active_delimiter, next_ch),
344                    (Some(Delimiter::Comma), ',')
345                        | (Some(Delimiter::Pipe), '|')
346                        | (Some(Delimiter::Tab), '\t')
347                )
348            {
349                return Ok(Token::String(s.to_string(), false));
350            }
351        }
352
353        // Leading zeros like "05" are strings, but "0", "0.5", "-0" are numbers
354        if s.starts_with('0') && s.len() > 1 {
355            let second_char = s.chars().nth(1).unwrap();
356            if second_char.is_ascii_digit() {
357                return Ok(Token::String(s.to_string(), false));
358            }
359        }
360
361        if s.contains('.') || s.contains('e') || s.contains('E') {
362            if let Ok(f) = s.parse::<f64>() {
363                Ok(Token::Number(f))
364            } else {
365                Ok(Token::String(s.to_string(), false))
366            }
367        } else if let Ok(i) = s.parse::<i64>() {
368            Ok(Token::Integer(i))
369        } else {
370            Ok(Token::String(s.to_string(), false))
371        }
372    }
373
374    /// Read the rest of the current line (until newline or EOF).
375    /// Returns the content with a flag indicating if it started with
376    /// whitespace.
377    pub fn read_rest_of_line_with_space_info(&mut self) -> (String, bool) {
378        let had_leading_space = matches!(self.peek(), Some(' '));
379        self.skip_whitespace();
380
381        let mut result = String::new();
382        while let Some(ch) = self.peek() {
383            if ch == '\n' {
384                break;
385            }
386            result.push(ch);
387            self.advance();
388        }
389
390        (result.trim_end().to_string(), had_leading_space)
391    }
392
393    /// Read the rest of the current line (until newline or EOF).
394    pub fn read_rest_of_line(&mut self) -> String {
395        self.read_rest_of_line_with_space_info().0
396    }
397
398    /// Parse a complete value string into a token.
399    pub fn parse_value_string(&self, s: &str) -> ToonResult<Token> {
400        let trimmed = s.trim();
401
402        if trimmed.is_empty() {
403            return Ok(Token::String(String::new(), false));
404        }
405
406        if trimmed.starts_with('"') {
407            let mut value = String::new();
408            let mut escaped = false;
409            let chars: Vec<char> = trimmed.chars().collect();
410            let mut i = 1;
411
412            while i < chars.len() {
413                let ch = chars[i];
414                if escaped {
415                    match ch {
416                        'n' => value.push('\n'),
417                        'r' => value.push('\r'),
418                        't' => value.push('\t'),
419                        '"' => value.push('"'),
420                        '\\' => value.push('\\'),
421                        _ => {
422                            return Err(ToonError::parse_error(
423                                self.line,
424                                self.column,
425                                format!("Invalid escape sequence: \\{ch}"),
426                            ));
427                        }
428                    }
429                    escaped = false;
430                } else if ch == '\\' {
431                    escaped = true;
432                } else if ch == '"' {
433                    if i != chars.len() - 1 {
434                        return Err(ToonError::parse_error(
435                            self.line,
436                            self.column,
437                            "Unexpected characters after closing quote",
438                        ));
439                    }
440                    return Ok(Token::String(value, true));
441                } else {
442                    value.push(ch);
443                }
444                i += 1;
445            }
446
447            return Err(ToonError::parse_error(
448                self.line,
449                self.column,
450                "Unterminated string: missing closing quote",
451            ));
452        }
453
454        match trimmed {
455            "true" => return Ok(Token::Bool(true)),
456            "false" => return Ok(Token::Bool(false)),
457            "null" => return Ok(Token::Null),
458            _ => {}
459        }
460
461        if trimmed.starts_with('-') || trimmed.chars().next().unwrap().is_ascii_digit() {
462            // Leading zeros like "05" are strings
463            if trimmed.starts_with('0') && trimmed.len() > 1 {
464                let second_char = trimmed.chars().nth(1).unwrap();
465                if second_char.is_ascii_digit() {
466                    return Ok(Token::String(trimmed.to_string(), false));
467                }
468            }
469
470            if trimmed.contains('.') || trimmed.contains('e') || trimmed.contains('E') {
471                if let Ok(f) = trimmed.parse::<f64>() {
472                    let normalized = if f == -0.0 { 0.0 } else { f };
473                    return Ok(Token::Number(normalized));
474                }
475            } else if let Ok(i) = trimmed.parse::<i64>() {
476                return Ok(Token::Integer(i));
477            }
478        }
479
480        Ok(Token::String(trimmed.to_string(), false))
481    }
482
483    pub fn detect_delimiter(&mut self) -> Option<Delimiter> {
484        let saved_pos = self.position;
485
486        while let Some(ch) = self.peek() {
487            match ch {
488                ',' => {
489                    self.position = saved_pos;
490                    return Some(Delimiter::Comma);
491                }
492                '|' => {
493                    self.position = saved_pos;
494                    return Some(Delimiter::Pipe);
495                }
496                '\t' => {
497                    self.position = saved_pos;
498                    return Some(Delimiter::Tab);
499                }
500                '\n' | ':' | '[' | ']' | '{' | '}' => break,
501                _ => {
502                    self.advance();
503                }
504            }
505        }
506
507        self.position = saved_pos;
508        None
509    }
510}
511
/// Unit tests for the scanner: token scanning, delimiter detection,
/// rest-of-line reading and value-string parsing.
#[cfg(test)]
mod tests {
    use core::f64;

    use super::*;

    /// Shorthand for an unquoted string token.
    fn bare(text: &str) -> Token {
        Token::String(text.to_string(), false)
    }

    /// Shorthand for a quoted string token.
    fn quoted(text: &str) -> Token {
        Token::String(text.to_string(), true)
    }

    #[test]
    fn test_scan_structural_tokens() {
        let mut scanner = Scanner::new("[]{}:-");
        assert_eq!(scanner.scan_token().unwrap(), Token::LeftBracket);
        assert_eq!(scanner.scan_token().unwrap(), Token::RightBracket);
        assert_eq!(scanner.scan_token().unwrap(), Token::LeftBrace);
        assert_eq!(scanner.scan_token().unwrap(), Token::RightBrace);
        assert_eq!(scanner.scan_token().unwrap(), Token::Colon);
        assert_eq!(scanner.scan_token().unwrap(), Token::Dash);
    }

    #[test]
    fn test_scan_numbers() {
        let mut scanner = Scanner::new("42 3.141592653589793 -5");
        assert_eq!(scanner.scan_token().unwrap(), Token::Integer(42));
        assert_eq!(
            scanner.scan_token().unwrap(),
            Token::Number(f64::consts::PI)
        );
        assert_eq!(scanner.scan_token().unwrap(), Token::Integer(-5));
    }

    #[test]
    fn test_scan_booleans() {
        let mut scanner = Scanner::new("true false");
        assert_eq!(scanner.scan_token().unwrap(), Token::Bool(true));
        assert_eq!(scanner.scan_token().unwrap(), Token::Bool(false));
    }

    #[test]
    fn test_scan_null() {
        let mut scanner = Scanner::new("null");
        assert_eq!(scanner.scan_token().unwrap(), Token::Null);
    }

    #[test]
    fn test_scan_quoted_string() {
        let mut scanner = Scanner::new(r#""hello world""#);
        assert_eq!(scanner.scan_token().unwrap(), quoted("hello world"));
    }

    #[test]
    fn test_scan_escaped_string() {
        let mut scanner = Scanner::new(r#""hello\nworld""#);
        assert_eq!(scanner.scan_token().unwrap(), quoted("hello\nworld"));
    }

    #[test]
    fn test_scan_unquoted_string() {
        let mut scanner = Scanner::new("hello");
        assert_eq!(scanner.scan_token().unwrap(), bare("hello"));
    }

    #[test]
    fn test_scan_with_active_delimiter() {
        // With an active comma delimiter, the comma splits the values and is
        // reported as its own token.
        let mut scanner = Scanner::new("a,b");
        scanner.set_active_delimiter(Some(Delimiter::Comma));
        assert_eq!(scanner.scan_token().unwrap(), bare("a"));
        assert_eq!(
            scanner.scan_token().unwrap(),
            Token::Delimiter(Delimiter::Comma)
        );
        assert_eq!(scanner.scan_token().unwrap(), bare("b"));
        assert_eq!(scanner.scan_token().unwrap(), Token::Eof);
    }

    #[test]
    fn test_tab_in_indentation_is_rejected() {
        let mut scanner = Scanner::new("\tkey");
        assert!(scanner.scan_token().is_err());
    }

    #[test]
    fn test_detect_delimiter() {
        let mut scanner = Scanner::new("a,b,c");
        assert_eq!(scanner.detect_delimiter(), Some(Delimiter::Comma));

        let mut scanner = Scanner::new("a|b|c");
        assert_eq!(scanner.detect_delimiter(), Some(Delimiter::Pipe));

        let mut scanner = Scanner::new("a\tb\tc");
        assert_eq!(scanner.detect_delimiter(), Some(Delimiter::Tab));
    }

    #[test]
    fn test_read_rest_of_line_with_space_info() {
        let mut scanner = Scanner::new(" world");
        let (content, had_space) = scanner.read_rest_of_line_with_space_info();
        assert_eq!(content, "world");
        assert!(had_space);

        let mut scanner = Scanner::new("world");
        let (content, had_space) = scanner.read_rest_of_line_with_space_info();
        assert_eq!(content, "world");
        assert!(!had_space);

        let mut scanner = Scanner::new("(hello)");
        let (content, had_space) = scanner.read_rest_of_line_with_space_info();
        assert_eq!(content, "(hello)");
        assert!(!had_space);

        let mut scanner = Scanner::new("");
        let (content, had_space) = scanner.read_rest_of_line_with_space_info();
        assert_eq!(content, "");
        assert!(!had_space);
    }

    #[test]
    fn test_parse_value_string() {
        let scanner = Scanner::new("");
        assert_eq!(scanner.parse_value_string("hello").unwrap(), bare("hello"));

        assert_eq!(
            scanner.parse_value_string("(hello)").unwrap(),
            bare("(hello)")
        );

        assert_eq!(
            scanner
                .parse_value_string("Mostly Functions (3 of 3)")
                .unwrap(),
            bare("Mostly Functions (3 of 3)")
        );
        assert_eq!(scanner.parse_value_string("0(f)").unwrap(), bare("0(f)"));

        assert_eq!(
            scanner.parse_value_string("42").unwrap(),
            Token::Integer(42)
        );

        assert_eq!(
            scanner.parse_value_string("true").unwrap(),
            Token::Bool(true)
        );
        assert_eq!(
            scanner.parse_value_string("false").unwrap(),
            Token::Bool(false)
        );
        assert_eq!(scanner.parse_value_string("null").unwrap(), Token::Null);

        assert_eq!(
            scanner.parse_value_string(r#""hello world""#).unwrap(),
            quoted("hello world")
        );
    }

    #[test]
    fn test_number_followed_by_parenthesis() {
        // Only the numeric run is consumed; because `(` follows it directly,
        // the run is reported as a string rather than a number.
        let mut scanner = Scanner::new("0(f)");
        let num_token = scanner.scan_number_string(false).unwrap();
        let token = scanner.parse_number(&num_token).unwrap();

        assert_eq!(token, bare("0"));
    }
}