toon_format/decode/
scanner.rs

1use crate::types::{
2    Delimiter,
3    ToonError,
4    ToonResult,
5};
6
7/// Tokens produced by the scanner during lexical analysis.
8#[derive(Debug, Clone, PartialEq)]
9pub enum Token {
10    LeftBracket,
11    RightBracket,
12    LeftBrace,
13    RightBrace,
14    Colon,
15    Dash,
16    Newline,
17    String(String, bool),
18    Number(f64),
19    Integer(i64),
20    Bool(bool),
21    Null,
22    Delimiter(Delimiter),
23    Eof,
24}
25
26/// Scanner that tokenizes TOON input into a sequence of tokens.
27pub struct Scanner {
28    input: Vec<char>,
29    position: usize,
30    line: usize,
31    column: usize,
32    active_delimiter: Option<Delimiter>,
33    last_line_indent: usize,
34}
35
36impl Scanner {
37    /// Create a new scanner for the given input string.
38    pub fn new(input: &str) -> Self {
39        Self {
40            input: input.chars().collect(),
41            position: 0,
42            line: 1,
43            column: 1,
44            active_delimiter: None,
45            last_line_indent: 0,
46        }
47    }
48
49    /// Set the active delimiter for tokenizing array elements.
50    pub fn set_active_delimiter(&mut self, delimiter: Option<Delimiter>) {
51        self.active_delimiter = delimiter;
52    }
53
54    /// Get the current position (line, column).
55    pub fn current_position(&self) -> (usize, usize) {
56        (self.line, self.column)
57    }
58
59    pub fn get_line(&self) -> usize {
60        self.line
61    }
62
63    pub fn get_column(&self) -> usize {
64        self.column
65    }
66
67    pub fn peek(&self) -> Option<char> {
68        self.input.get(self.position).copied()
69    }
70
71    pub fn count_leading_spaces(&self) -> usize {
72        let mut idx = self.position;
73        let mut count = 0;
74        while let Some(&ch) = self.input.get(idx) {
75            if ch == ' ' {
76                count += 1;
77                idx += 1;
78            } else {
79                break;
80            }
81        }
82        count
83    }
84
85    pub fn count_spaces_after_newline(&self) -> usize {
86        let mut idx = self.position;
87        if self.input.get(idx) != Some(&'\n') {
88            return 0;
89        }
90        idx += 1;
91        let mut count = 0;
92        while let Some(&ch) = self.input.get(idx) {
93            if ch == ' ' {
94                count += 1;
95                idx += 1;
96            } else {
97                break;
98            }
99        }
100        count
101    }
102
103    pub fn peek_ahead(&self, offset: usize) -> Option<char> {
104        self.input.get(self.position + offset).copied()
105    }
106
107    pub fn advance(&mut self) -> Option<char> {
108        if let Some(ch) = self.input.get(self.position) {
109            self.position += 1;
110            if *ch == '\n' {
111                self.line += 1;
112                self.column = 1;
113            } else {
114                self.column += 1;
115            }
116            Some(*ch)
117        } else {
118            None
119        }
120    }
121
122    pub fn skip_whitespace(&mut self) {
123        while let Some(ch) = self.peek() {
124            if ch == ' ' {
125                self.advance();
126            } else {
127                break;
128            }
129        }
130    }
131
132    /// Scan the next token from the input.
133    pub fn scan_token(&mut self) -> ToonResult<Token> {
134        // Track indentation at the start of each line (column 1)
135        if self.column == 1 {
136            let mut count = 0;
137            let mut idx = self.position;
138
139            while let Some(&ch) = self.input.get(idx) {
140                if ch == ' ' {
141                    count += 1;
142                    idx += 1;
143                } else {
144                    if ch == '\t' {
145                        let (line, col) = self.current_position();
146                        return Err(ToonError::parse_error(
147                            line,
148                            col + count,
149                            "Tabs are not allowed in indentation",
150                        ));
151                    }
152                    break;
153                }
154            }
155            self.last_line_indent = count;
156        }
157
158        self.skip_whitespace();
159
160        match self.peek() {
161            None => Ok(Token::Eof),
162            Some('\n') => {
163                self.advance();
164                Ok(Token::Newline)
165            }
166            Some('[') => {
167                self.advance();
168                Ok(Token::LeftBracket)
169            }
170            Some(']') => {
171                self.advance();
172                Ok(Token::RightBracket)
173            }
174            Some('{') => {
175                self.advance();
176                Ok(Token::LeftBrace)
177            }
178            Some('}') => {
179                self.advance();
180                Ok(Token::RightBrace)
181            }
182            Some(':') => {
183                self.advance();
184                Ok(Token::Colon)
185            }
186            Some('-') => {
187                self.advance();
188                // Dash can be a list marker or start of negative number
189                if let Some(ch) = self.peek() {
190                    if ch.is_ascii_digit() {
191                        let num_str = self.scan_number_string(true)?;
192                        return self.parse_number(&num_str);
193                    }
194                }
195                Ok(Token::Dash)
196            }
197            Some(',') => {
198                // Comma is a delimiter only when active, otherwise part of string
199                if matches!(self.active_delimiter, Some(Delimiter::Comma)) {
200                    self.advance();
201                    Ok(Token::Delimiter(Delimiter::Comma))
202                } else {
203                    self.scan_unquoted_string()
204                }
205            }
206            Some('|') => {
207                if matches!(self.active_delimiter, Some(Delimiter::Pipe)) {
208                    self.advance();
209                    Ok(Token::Delimiter(Delimiter::Pipe))
210                } else {
211                    self.scan_unquoted_string()
212                }
213            }
214            Some('\t') => {
215                if matches!(self.active_delimiter, Some(Delimiter::Tab)) {
216                    self.advance();
217                    Ok(Token::Delimiter(Delimiter::Tab))
218                } else {
219                    self.scan_unquoted_string()
220                }
221            }
222            Some('"') => self.scan_quoted_string(),
223            Some(ch) if ch.is_ascii_digit() => {
224                let num_str = self.scan_number_string(false)?;
225                self.parse_number(&num_str)
226            }
227            Some(_) => self.scan_unquoted_string(),
228        }
229    }
230
231    fn scan_quoted_string(&mut self) -> ToonResult<Token> {
232        self.advance(); // Skip opening quote
233
234        let mut value = String::new();
235        let mut escaped = false;
236
237        while let Some(ch) = self.advance() {
238            if escaped {
239                // Process escape sequences
240                match ch {
241                    'n' => value.push('\n'),
242                    'r' => value.push('\r'),
243                    't' => value.push('\t'),
244                    '"' => value.push('"'),
245                    '\\' => value.push('\\'),
246                    _ => {
247                        let (line, col) = self.current_position();
248                        return Err(ToonError::parse_error(
249                            line,
250                            col - 1,
251                            format!("Invalid escape sequence: \\{ch}"),
252                        ));
253                    }
254                }
255                escaped = false;
256            } else if ch == '\\' {
257                escaped = true;
258            } else if ch == '"' {
259                return Ok(Token::String(value, true));
260            } else {
261                value.push(ch);
262            }
263        }
264
265        // Unclosed string
266        Err(ToonError::UnexpectedEof)
267    }
268
269    fn scan_unquoted_string(&mut self) -> ToonResult<Token> {
270        let mut value = String::new();
271
272        // Stop at structural characters or whitespace
273        while let Some(ch) = self.peek() {
274            if ch == '\n'
275                || ch == ' '
276                || ch == ':'
277                || ch == '['
278                || ch == ']'
279                || ch == '{'
280                || ch == '}'
281            {
282                break;
283            }
284
285            // If a delimiter is active, it stops the string; otherwise it's part of it
286            if let Some(active) = self.active_delimiter {
287                if (active == Delimiter::Comma && ch == ',')
288                    || (active == Delimiter::Pipe && ch == '|')
289                    || (active == Delimiter::Tab && ch == '\t')
290                {
291                    break;
292                }
293            }
294            value.push(ch);
295            self.advance();
296        }
297
298        // Single-character delimiter strings are kept as-is, others get trailing spaces
299        // trimmed
300        let value = if value.len() == 1 && (value == "," || value == "|" || value == "\t") {
301            value
302        } else {
303            value.trim_end().to_string()
304        };
305
306        // Check for keywords
307        match value.as_str() {
308            "null" => Ok(Token::Null),
309            "true" => Ok(Token::Bool(true)),
310            "false" => Ok(Token::Bool(false)),
311            _ => Ok(Token::String(value, false)),
312        }
313    }
314
315    pub fn get_last_line_indent(&self) -> usize {
316        self.last_line_indent
317    }
318
319    fn scan_number_string(&mut self, negative: bool) -> ToonResult<String> {
320        let mut num_str = if negative {
321            String::from("-")
322        } else {
323            String::new()
324        };
325
326        // Collect digits, decimal point, and scientific notation parts
327        while let Some(ch) = self.peek() {
328            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-'
329            {
330                num_str.push(ch);
331                self.advance();
332            } else {
333                break;
334            }
335        }
336
337        Ok(num_str)
338    }
339
340    fn parse_number(&self, s: &str) -> ToonResult<Token> {
341        // Reject leading zeros like "05" or "007", but allow "0", "0.5", "-0"
342        if s.starts_with('0') && s.len() > 1 {
343            let second_char = s.chars().nth(1).unwrap();
344            if second_char.is_ascii_digit() {
345                return Ok(Token::String(s.to_string(), false));
346            }
347        }
348
349        // Try parsing as float first (handles scientific notation)
350        if s.contains('.') || s.contains('e') || s.contains('E') {
351            if let Ok(f) = s.parse::<f64>() {
352                Ok(Token::Number(f))
353            } else {
354                // Invalid float format - treat as string
355                Ok(Token::String(s.to_string(), false))
356            }
357        } else if let Ok(i) = s.parse::<i64>() {
358            Ok(Token::Integer(i))
359        } else {
360            // Not a valid number - treat as string
361            Ok(Token::String(s.to_string(), false))
362        }
363    }
364
365    /// Detect the delimiter used in the input by scanning ahead.
366    /// Stops at structural characters or when a delimiter is found.
367    pub fn detect_delimiter(&mut self) -> Option<Delimiter> {
368        let saved_pos = self.position;
369
370        while let Some(ch) = self.peek() {
371            match ch {
372                ',' => {
373                    self.position = saved_pos;
374                    return Some(Delimiter::Comma);
375                }
376                '|' => {
377                    self.position = saved_pos;
378                    return Some(Delimiter::Pipe);
379                }
380                '\t' => {
381                    self.position = saved_pos;
382                    return Some(Delimiter::Tab);
383                }
384                // Stop scanning at structural characters
385                '\n' | ':' | '[' | ']' | '{' | '}' => break,
386                _ => {
387                    self.advance();
388                }
389            }
390        }
391
392        self.position = saved_pos;
393        None
394    }
395}
396
#[cfg(test)]
mod tests {
    use core::f64;

    use super::*;

    /// Scans `input` once per expected token and asserts that each scanned
    /// token equals the expected one, in order.
    fn assert_tokens(input: &str, expected: Vec<Token>) {
        let mut scanner = Scanner::new(input);
        for token in expected {
            assert_eq!(scanner.scan_token().unwrap(), token);
        }
    }

    #[test]
    fn test_scan_structural_tokens() {
        assert_tokens(
            "[]{}:-",
            vec![
                Token::LeftBracket,
                Token::RightBracket,
                Token::LeftBrace,
                Token::RightBrace,
                Token::Colon,
                Token::Dash,
            ],
        );
    }

    #[test]
    fn test_scan_numbers() {
        assert_tokens(
            "42 3.141592653589793 -5",
            vec![
                Token::Integer(42),
                Token::Number(f64::consts::PI),
                Token::Integer(-5),
            ],
        );
    }

    #[test]
    fn test_scan_booleans() {
        assert_tokens("true false", vec![Token::Bool(true), Token::Bool(false)]);
    }

    #[test]
    fn test_scan_null() {
        assert_tokens("null", vec![Token::Null]);
    }

    #[test]
    fn test_scan_quoted_string() {
        assert_tokens(
            r#""hello world""#,
            vec![Token::String("hello world".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_escaped_string() {
        assert_tokens(
            r#""hello\nworld""#,
            vec![Token::String("hello\nworld".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_unquoted_string() {
        assert_tokens("hello", vec![Token::String("hello".to_string(), false)]);
    }

    #[test]
    fn test_detect_delimiter() {
        let cases = [
            ("a,b,c", Delimiter::Comma),
            ("a|b|c", Delimiter::Pipe),
            ("a\tb\tc", Delimiter::Tab),
        ];
        for (input, expected) in cases {
            let mut scanner = Scanner::new(input);
            assert_eq!(scanner.detect_delimiter(), Some(expected));
        }
    }
}