toon_format/decode/
scanner.rs

1use crate::types::{
2    Delimiter,
3    ToonError,
4    ToonResult,
5};
6
7/// Tokens produced by the scanner during lexical analysis.
8#[derive(Debug, Clone, PartialEq)]
9pub enum Token {
10    LeftBracket,
11    RightBracket,
12    LeftBrace,
13    RightBrace,
14    Colon,
15    Dash,
16    Newline,
17    String(String, bool),
18    Number(f64),
19    Integer(i64),
20    Bool(bool),
21    Null,
22    Delimiter(Delimiter),
23    Eof,
24}
25
26/// Scanner that tokenizes TOON input into a sequence of tokens.
27pub struct Scanner {
28    input: Vec<char>,
29    position: usize,
30    line: usize,
31    column: usize,
32    active_delimiter: Option<Delimiter>,
33    last_line_indent: usize,
34}
35
36impl Scanner {
37    /// Create a new scanner for the given input string.
38    pub fn new(input: &str) -> Self {
39        Self {
40            input: input.chars().collect(),
41            position: 0,
42            line: 1,
43            column: 1,
44            active_delimiter: None,
45            last_line_indent: 0,
46        }
47    }
48
49    /// Set the active delimiter for tokenizing array elements.
50    pub fn set_active_delimiter(&mut self, delimiter: Option<Delimiter>) {
51        self.active_delimiter = delimiter;
52    }
53
54    /// Get the current position (line, column).
55    pub fn current_position(&self) -> (usize, usize) {
56        (self.line, self.column)
57    }
58
59    pub fn get_line(&self) -> usize {
60        self.line
61    }
62
63    pub fn get_column(&self) -> usize {
64        self.column
65    }
66
67    pub fn peek(&self) -> Option<char> {
68        self.input.get(self.position).copied()
69    }
70
71    pub fn count_leading_spaces(&self) -> usize {
72        let mut idx = self.position;
73        let mut count = 0;
74        while let Some(&ch) = self.input.get(idx) {
75            if ch == ' ' {
76                count += 1;
77                idx += 1;
78            } else {
79                break;
80            }
81        }
82        count
83    }
84
85    pub fn count_spaces_after_newline(&self) -> usize {
86        let mut idx = self.position;
87        if self.input.get(idx) != Some(&'\n') {
88            return 0;
89        }
90        idx += 1;
91        let mut count = 0;
92        while let Some(&ch) = self.input.get(idx) {
93            if ch == ' ' {
94                count += 1;
95                idx += 1;
96            } else {
97                break;
98            }
99        }
100        count
101    }
102
103    pub fn peek_ahead(&self, offset: usize) -> Option<char> {
104        self.input.get(self.position + offset).copied()
105    }
106
107    pub fn advance(&mut self) -> Option<char> {
108        if let Some(ch) = self.input.get(self.position) {
109            self.position += 1;
110            if *ch == '\n' {
111                self.line += 1;
112                self.column = 1;
113            } else {
114                self.column += 1;
115            }
116            Some(*ch)
117        } else {
118            None
119        }
120    }
121
122    pub fn skip_whitespace(&mut self) {
123        while let Some(ch) = self.peek() {
124            if ch == ' ' {
125                self.advance();
126            } else {
127                break;
128            }
129        }
130    }
131
132    /// Scan the next token from the input.
133    pub fn scan_token(&mut self) -> ToonResult<Token> {
134        if self.column == 1 {
135            let mut count = 0;
136            let mut idx = self.position;
137            while let Some(&ch) = self.input.get(idx) {
138                if ch == ' ' {
139                    count += 1;
140                    idx += 1;
141                } else {
142                    if ch == '\t' {
143                        let (line, col) = self.current_position();
144                        return Err(ToonError::parse_error(
145                            line,
146                            col + count,
147                            "Tabs are not allowed in indentation",
148                        ));
149                    }
150                    break;
151                }
152            }
153            self.last_line_indent = count;
154        }
155
156        self.skip_whitespace();
157
158        match self.peek() {
159            None => Ok(Token::Eof),
160            Some('\n') => {
161                self.advance();
162                Ok(Token::Newline)
163            }
164            Some('[') => {
165                self.advance();
166                Ok(Token::LeftBracket)
167            }
168            Some(']') => {
169                self.advance();
170                Ok(Token::RightBracket)
171            }
172            Some('{') => {
173                self.advance();
174                Ok(Token::LeftBrace)
175            }
176            Some('}') => {
177                self.advance();
178                Ok(Token::RightBrace)
179            }
180            Some(':') => {
181                self.advance();
182                Ok(Token::Colon)
183            }
184            Some('-') => {
185                self.advance();
186                // Check if '-' is part of a negative number
187                if let Some(ch) = self.peek() {
188                    if ch.is_ascii_digit() {
189                        let num_str = self.scan_number_string(true)?;
190                        return self.parse_number(&num_str);
191                    }
192                }
193                Ok(Token::Dash)
194            }
195            Some(',') => {
196                if matches!(self.active_delimiter, Some(Delimiter::Comma)) {
197                    self.advance();
198                    Ok(Token::Delimiter(Delimiter::Comma))
199                } else {
200                    self.scan_unquoted_string()
201                }
202            }
203            Some('|') => {
204                if matches!(self.active_delimiter, Some(Delimiter::Pipe)) {
205                    self.advance();
206                    Ok(Token::Delimiter(Delimiter::Pipe))
207                } else {
208                    self.scan_unquoted_string()
209                }
210            }
211            Some('\t') => {
212                if matches!(self.active_delimiter, Some(Delimiter::Tab)) {
213                    self.advance();
214                    Ok(Token::Delimiter(Delimiter::Tab))
215                } else {
216                    self.scan_unquoted_string()
217                }
218            }
219            Some('"') => self.scan_quoted_string(),
220            Some(ch) if ch.is_ascii_digit() => {
221                let num_str = self.scan_number_string(false)?;
222                self.parse_number(&num_str)
223            }
224            Some(_) => self.scan_unquoted_string(),
225        }
226    }
227
228    fn scan_quoted_string(&mut self) -> ToonResult<Token> {
229        self.advance();
230
231        let mut value = String::new();
232        let mut escaped = false;
233
234        while let Some(ch) = self.advance() {
235            if escaped {
236                match ch {
237                    'n' => value.push('\n'),
238                    'r' => value.push('\r'),
239                    't' => value.push('\t'),
240                    '"' => value.push('"'),
241                    '\\' => value.push('\\'),
242                    _ => {
243                        let (line, col) = self.current_position();
244                        return Err(ToonError::parse_error(
245                            line,
246                            col - 1,
247                            format!("Invalid escape sequence: \\{ch}"),
248                        ));
249                    }
250                }
251                escaped = false;
252            } else if ch == '\\' {
253                escaped = true;
254            } else if ch == '"' {
255                return Ok(Token::String(value, true));
256            } else {
257                value.push(ch);
258            }
259        }
260
261        Err(ToonError::UnexpectedEof)
262    }
263
264    fn scan_unquoted_string(&mut self) -> ToonResult<Token> {
265        let mut value = String::new();
266
267        while let Some(ch) = self.peek() {
268            if ch == '\n'
269                || ch == ' '
270                || ch == ':'
271                || ch == '['
272                || ch == ']'
273                || ch == '{'
274                || ch == '}'
275            {
276                break;
277            }
278
279            if let Some(active) = self.active_delimiter {
280                if (active == Delimiter::Comma && ch == ',')
281                    || (active == Delimiter::Pipe && ch == '|')
282                    || (active == Delimiter::Tab && ch == '\t')
283                {
284                    break;
285                }
286            }
287            value.push(ch);
288            self.advance();
289        }
290
291        let value = if value.len() == 1 && (value == "," || value == "|" || value == "\t") {
292            value
293        } else {
294            value.trim_end().to_string()
295        };
296
297        match value.as_str() {
298            "null" => Ok(Token::Null),
299            "true" => Ok(Token::Bool(true)),
300            "false" => Ok(Token::Bool(false)),
301            _ => Ok(Token::String(value, false)),
302        }
303    }
304
305    pub fn get_last_line_indent(&self) -> usize {
306        self.last_line_indent
307    }
308
309    fn scan_number_string(&mut self, negative: bool) -> ToonResult<String> {
310        let mut num_str = if negative {
311            String::from("-")
312        } else {
313            String::new()
314        };
315
316        while let Some(ch) = self.peek() {
317            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-'
318            {
319                num_str.push(ch);
320                self.advance();
321            } else {
322                break;
323            }
324        }
325
326        Ok(num_str)
327    }
328
329    fn parse_number(&self, s: &str) -> ToonResult<Token> {
330        if s.contains('.') || s.contains('e') || s.contains('E') {
331            if let Ok(f) = s.parse::<f64>() {
332                Ok(Token::Number(f))
333            } else {
334                Ok(Token::String(s.to_string(), false))
335            }
336        } else if let Ok(i) = s.parse::<i64>() {
337            Ok(Token::Integer(i))
338        } else {
339            Ok(Token::String(s.to_string(), false))
340        }
341    }
342
343    /// Detect the delimiter used in the input by scanning ahead.
344    pub fn detect_delimiter(&mut self) -> Option<Delimiter> {
345        let saved_pos = self.position;
346
347        while let Some(ch) = self.peek() {
348            match ch {
349                ',' => {
350                    self.position = saved_pos;
351                    return Some(Delimiter::Comma);
352                }
353                '|' => {
354                    self.position = saved_pos;
355                    return Some(Delimiter::Pipe);
356                }
357                '\t' => {
358                    self.position = saved_pos;
359                    return Some(Delimiter::Tab);
360                }
361                '\n' | ':' | '[' | ']' | '{' | '}' => break,
362                _ => {
363                    self.advance();
364                }
365            }
366        }
367
368        self.position = saved_pos;
369        None
370    }
371}
372
#[cfg(test)]
mod tests {
    use core::f64;

    use super::*;

    /// Scan `input` and check that its leading tokens equal `expected`, in order.
    fn assert_scans(input: &str, expected: &[Token]) {
        let mut scanner = Scanner::new(input);
        for (index, want) in expected.iter().enumerate() {
            let got = scanner.scan_token().unwrap();
            assert_eq!(&got, want, "token #{index} of {input:?}");
        }
    }

    #[test]
    fn test_scan_structural_tokens() {
        assert_scans(
            "[]{}:-",
            &[
                Token::LeftBracket,
                Token::RightBracket,
                Token::LeftBrace,
                Token::RightBrace,
                Token::Colon,
                Token::Dash,
            ],
        );
    }

    #[test]
    fn test_scan_numbers() {
        assert_scans(
            "42 3.141592653589793 -5",
            &[
                Token::Integer(42),
                Token::Number(f64::consts::PI),
                Token::Integer(-5),
            ],
        );
    }

    #[test]
    fn test_scan_booleans() {
        assert_scans("true false", &[Token::Bool(true), Token::Bool(false)]);
    }

    #[test]
    fn test_scan_null() {
        assert_scans("null", &[Token::Null]);
    }

    #[test]
    fn test_scan_quoted_string() {
        assert_scans(
            r#""hello world""#,
            &[Token::String("hello world".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_escaped_string() {
        assert_scans(
            r#""hello\nworld""#,
            &[Token::String("hello\nworld".to_string(), true)],
        );
    }

    #[test]
    fn test_scan_unquoted_string() {
        assert_scans("hello", &[Token::String("hello".to_string(), false)]);
    }

    #[test]
    fn test_detect_delimiter() {
        let cases = [
            ("a,b,c", Delimiter::Comma),
            ("a|b|c", Delimiter::Pipe),
            ("a\tb\tc", Delimiter::Tab),
        ];
        for (input, want) in cases.iter() {
            let mut scanner = Scanner::new(input);
            assert_eq!(scanner.detect_delimiter(), Some(*want));
        }
    }
}