fjson/
scanner.rs

1//! Scanner that provides an iterator over JSONC tokens.
2
3use std::{iter::Peekable, ops::Range, str::CharIndices};
4
5use crate::error::Error;
6
/// Event combines a JSON Token and range in the source string. It is emitted
/// from the Scanner.
#[derive(Clone, Debug, PartialEq)]
pub struct Event<'a> {
    /// The token that was recognised.
    pub token: Token<'a>,
    /// Byte offsets of the whole token in the source string (start inclusive,
    /// end exclusive). The range covers delimiters too: the quotes of a string
    /// and the `//` or `/* */` markers of a comment.
    pub range: Range<usize>,
}
14
/// Token represents a single JSON token and is emitted via an Event from the
/// Scanner. Its lifetime is tied to the lifetime of the source string.
///
/// All payloads are slices of the source string; nothing is unescaped,
/// converted or copied by the scanner.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A single `\n` character.
    Newline,
    /// `{`
    ObjectStart,
    /// `}`
    ObjectEnd,
    /// `[`
    ArrayStart,
    /// `]`
    ArrayEnd,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// The literal `null`.
    Null,
    /// Text following `//`, up to (not including) the line terminator.
    LineComment(&'a str),
    /// Text between `/*` and `*/`, without the markers.
    BlockComment(&'a str),
    /// Raw text between the double quotes; escape sequences are validated but
    /// left undecoded.
    String(&'a str),
    /// Raw text of the number exactly as written in the source.
    Number(&'a str),
    /// The literal `true` or `false`.
    Bool(bool),
}
33
/// ScanResult represents the output of the Scanner Iterator: either the next
/// `Event` or the `Error` that stopped the scan.
pub type ScanResult<'a> = Result<Event<'a>, Error>;
36
/// Scanner is a lexer for JSON with C-style comments and trailing commas. It is
/// itself an `Iterator` over `ScanResult`s. Usually, a Scanner is only required
/// to be used directly when you want to filter out certain Token types (like
/// with the `without_metadata` method).
pub struct Scanner<'a> {
    /// The complete source text; token payloads are slices of it.
    input: &'a str,
    /// Set once an error has been yielded; the iterator returns `None` from
    /// then on.
    has_error: bool,
    /// Byte offset of the most recently consumed character.
    current_idx: usize,
    /// Cursor over `input` yielding `(byte offset, char)` pairs with one
    /// character of lookahead.
    chars: Peekable<CharIndices<'a>>,
}
47
48impl<'a> Iterator for Scanner<'a> {
49    type Item = ScanResult<'a>;
50
51    fn next(&mut self) -> Option<ScanResult<'a>> {
52        if self.has_error {
53            None
54        } else {
55            match self.parse_value() {
56                Some(Err(err)) => {
57                    self.has_error = true;
58                    Some(Err(err))
59                }
60                v => v,
61            }
62        }
63    }
64}
65
66impl<'a> Scanner<'a> {
67    /// Creates a new Scanner from the input string.
68    pub fn new(input: &'a str) -> Self {
69        Scanner {
70            input,
71            has_error: false,
72            current_idx: 0,
73            chars: input.char_indices().peekable(),
74        }
75    }
76
77    /// Return an `Iterator` that filters out all C-style comments and newlines.
78    pub fn without_metadata(self) -> impl Iterator<Item = ScanResult<'a>> {
79        self.into_iter().filter(|event| {
80            if let Ok(event) = event {
81                match event.token {
82                    Token::BlockComment(_) | Token::LineComment(_) | Token::Newline => {
83                        return false
84                    }
85                    _ => {}
86                }
87            }
88            true
89        })
90    }
91
92    fn parse_value(&mut self) -> Option<ScanResult<'a>> {
93        self.skip_whitespace();
94        if let Some((i, c)) = self.next_char() {
95            let start = self.current_idx;
96            match c {
97                '\n' => Some(Ok(Event {
98                    token: Token::Newline,
99                    range: start..(start + 1),
100                })),
101                '{' => Some(Ok(Event {
102                    token: Token::ObjectStart,
103                    range: start..(start + 1),
104                })),
105                '}' => Some(Ok(Event {
106                    token: Token::ObjectEnd,
107                    range: start..(start + 1),
108                })),
109                '[' => Some(Ok(Event {
110                    token: Token::ArrayStart,
111                    range: start..(start + 1),
112                })),
113                ']' => Some(Ok(Event {
114                    token: Token::ArrayEnd,
115                    range: start..(start + 1),
116                })),
117                ',' => Some(Ok(Event {
118                    token: Token::Comma,
119                    range: start..(start + 1),
120                })),
121                ':' => Some(Ok(Event {
122                    token: Token::Colon,
123                    range: start..(start + 1),
124                })),
125                'n' => Some(self.parse_null(start)),
126                't' => Some(self.parse_bool_true(start)),
127                'f' => Some(self.parse_bool_false(start)),
128                '/' => Some(self.parse_comment(start)),
129                '"' => Some(self.parse_string(start)),
130                c => {
131                    if c.is_ascii_digit() || c == '-' {
132                        Some(self.parse_number(start, c))
133                    } else {
134                        Some(Err(Error::UnexpectedCharacter(i, c)))
135                    }
136                }
137            }
138        } else {
139            None
140        }
141    }
142
143    fn parse_number(&mut self, start: usize, curr: char) -> ScanResult<'a> {
144        let curr = if curr == '-' {
145            self.next_digit()?
146        } else {
147            curr
148        };
149        if curr != '0' {
150            self.skip_digits();
151        }
152
153        if let Some(&(_, '.')) = self.peek_char() {
154            self.skip_char();
155            self.next_digit()?;
156            self.skip_digits();
157        }
158
159        if let Some(&(_, 'e' | 'E')) = self.peek_char() {
160            self.skip_char();
161            if let Some((_, '-' | '+')) = self.peek_char() {
162                self.skip_char();
163            }
164            self.next_digit()?;
165            self.skip_digits();
166        }
167
168        let range = start..(self.current_idx + 1);
169        Ok(Event {
170            token: Token::Number(&self.input[range.clone()]),
171            range,
172        })
173    }
174
175    fn parse_string(&mut self, start: usize) -> ScanResult<'a> {
176        while let Some((i, c)) = self.next_char() {
177            match c {
178                '\\' => match self.next_char() {
179                    Some((i, c)) => match c {
180                        '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't' => {}
181                        'u' => {
182                            for _ in 0..4 {
183                                match self.next_char() {
184                                    Some((i, c)) => {
185                                        if !c.is_ascii_hexdigit() {
186                                            return Err(Error::UnexpectedCharacter(i, c));
187                                        }
188                                    }
189                                    None => return Err(Error::UnexpectedEOF),
190                                }
191                            }
192                        }
193                        c => return Err(Error::UnexpectedCharacter(i, c)),
194                    },
195                    None => return Err(Error::UnexpectedEOF),
196                },
197                '"' => {
198                    let end = self.current_idx;
199                    return Ok(Event {
200                        token: Token::String(&self.input[(start + 1)..end]),
201                        range: start..(end + 1),
202                    });
203                }
204                c => {
205                    if !(0x0020..0x10FFFF).contains(&(c as u32)) {
206                        return Err(Error::UnexpectedCharacter(i, c));
207                    }
208                }
209            }
210        }
211        Err(Error::UnexpectedEOF)
212    }
213
214    fn parse_comment(&mut self, start: usize) -> ScanResult<'a> {
215        match self.next_char() {
216            Some((_, '/')) => self.parse_line_comment(start),
217            Some((_, '*')) => self.parse_block_comment(start),
218            Some(v) => Err(Error::UnexpectedCharacter(v.0, v.1)),
219            None => Err(Error::UnexpectedEOF),
220        }
221    }
222
223    fn parse_line_comment(&mut self, start: usize) -> ScanResult<'a> {
224        let mut end;
225        loop {
226            match self.peek_char() {
227                Some(&(i, c)) => {
228                    end = i;
229                    if c == '\n' {
230                        break;
231                    } else if c == '\r' {
232                        self.skip_char();
233                        if let Some(&(_, c)) = self.peek_char() {
234                            if c == '\n' {
235                                break;
236                            }
237                        }
238                        continue;
239                    } else {
240                        self.skip_char();
241                    }
242                }
243                None => {
244                    end = self.input.len();
245                    break;
246                }
247            }
248        }
249        Ok(Event {
250            token: Token::LineComment(&self.input[(start + 2..end)]),
251            range: start..end,
252        })
253    }
254
255    fn parse_block_comment(&mut self, start: usize) -> ScanResult<'a> {
256        while let Some((_, c)) = self.next_char() {
257            if c == '*' {
258                if let Some(&(i, '/')) = self.peek_char() {
259                    self.skip_char();
260                    return Ok(Event {
261                        token: Token::BlockComment(&self.input[(start + 2)..(i - 1)]),
262                        range: start..(i + 1),
263                    });
264                }
265            }
266        }
267        Err(Error::UnexpectedEOF)
268    }
269
270    fn parse_null(&mut self, start: usize) -> ScanResult<'a> {
271        if self.next_chars_equal("ull") {
272            Ok(Event {
273                token: Token::Null,
274                range: start..(start + 4),
275            })
276        } else {
277            Err(Error::UnexpectedCharacter(start, 'n'))
278        }
279    }
280
281    fn parse_bool_true(&mut self, start: usize) -> ScanResult<'a> {
282        if self.next_chars_equal("rue") {
283            Ok(Event {
284                token: Token::Bool(true),
285                range: start..(start + 4),
286            })
287        } else {
288            Err(Error::UnexpectedCharacter(start, 't'))
289        }
290    }
291
292    fn parse_bool_false(&mut self, start: usize) -> ScanResult<'a> {
293        if self.next_chars_equal("alse") {
294            Ok(Event {
295                token: Token::Bool(false),
296                range: start..(start + 5),
297            })
298        } else {
299            Err(Error::UnexpectedCharacter(start, 'f'))
300        }
301    }
302
303    fn next_digit(&mut self) -> Result<char, Error> {
304        match self.next_char() {
305            Some((i, c)) => {
306                if c.is_ascii_digit() {
307                    Ok(c)
308                } else {
309                    Err(Error::UnexpectedCharacter(i, c))
310                }
311            }
312            None => Err(Error::UnexpectedEOF),
313        }
314    }
315
316    fn skip_digits(&mut self) {
317        while let Some(&(_, c)) = self.peek_char() {
318            if c.is_ascii_digit() {
319                self.skip_char();
320            } else {
321                break;
322            }
323        }
324    }
325
326    fn skip_whitespace(&mut self) {
327        while let Some(c) = self.peek_char() {
328            if c.1.is_whitespace() && c.1 != '\n' {
329                self.skip_char();
330            } else {
331                return;
332            }
333        }
334    }
335
336    fn next_chars_equal(&mut self, s: &str) -> bool {
337        for ch in s.chars() {
338            match self.next_char() {
339                Some((_, c)) => {
340                    if ch != c {
341                        return false;
342                    }
343                }
344                None => {
345                    return false;
346                }
347            }
348        }
349        true
350    }
351
352    fn next_char(&mut self) -> Option<(usize, char)> {
353        if let Some((i, c)) = self.chars.next() {
354            self.current_idx = i;
355            Some((i, c))
356        } else {
357            None
358        }
359    }
360
361    fn skip_char(&mut self) {
362        self.next_char();
363    }
364
365    fn peek_char(&mut self) -> Option<&(usize, char)> {
366        self.chars.peek()
367    }
368}
369
// Unit tests for the scanner. Expected ranges are byte offsets into the
// literal inputs, so the whitespace inside those literals is significant.
#[cfg(test)]
mod tests {
    use super::*;

    // Scans a document containing every token kind and checks both the exact
    // event sequence and that each range slices back to the token's text.
    #[test]
    fn test_scanner() {
        let input = r#"{
            // This is a comment.
            "key1": "val1",
            "key2": 100,
            /*
             * This is a block comment.
             */
            "key3":[    true,    "1", 2, {}, null,  ]
        }"#;
        let expected = vec![
            Event {
                token: Token::ObjectStart,
                range: 0..1,
            },
            Event {
                token: Token::Newline,
                range: 1..2,
            },
            Event {
                token: Token::LineComment(" This is a comment."),
                range: 14..35,
            },
            Event {
                token: Token::Newline,
                range: 35..36,
            },
            Event {
                token: Token::String("key1"),
                range: 48..54,
            },
            Event {
                token: Token::Colon,
                range: 54..55,
            },
            Event {
                token: Token::String("val1"),
                range: 56..62,
            },
            Event {
                token: Token::Comma,
                range: 62..63,
            },
            Event {
                token: Token::Newline,
                range: 63..64,
            },
            Event {
                token: Token::String("key2"),
                range: 76..82,
            },
            Event {
                token: Token::Colon,
                range: 82..83,
            },
            Event {
                token: Token::Number("100"),
                range: 84..87,
            },
            Event {
                token: Token::Comma,
                range: 87..88,
            },
            Event {
                token: Token::Newline,
                range: 88..89,
            },
            Event {
                token: Token::BlockComment(
                    "\n             * This is a block comment.\n             ",
                ),
                range: 101..159,
            },
            Event {
                token: Token::Newline,
                range: 159..160,
            },
            Event {
                token: Token::String("key3"),
                range: 172..178,
            },
            Event {
                token: Token::Colon,
                range: 178..179,
            },
            Event {
                token: Token::ArrayStart,
                range: 179..180,
            },
            Event {
                token: Token::Bool(true),
                range: 184..188,
            },
            Event {
                token: Token::Comma,
                range: 188..189,
            },
            Event {
                token: Token::String("1"),
                range: 193..196,
            },
            Event {
                token: Token::Comma,
                range: 196..197,
            },
            Event {
                token: Token::Number("2"),
                range: 198..199,
            },
            Event {
                token: Token::Comma,
                range: 199..200,
            },
            Event {
                token: Token::ObjectStart,
                range: 201..202,
            },
            Event {
                token: Token::ObjectEnd,
                range: 202..203,
            },
            Event {
                token: Token::Comma,
                range: 203..204,
            },
            Event {
                token: Token::Null,
                range: 205..209,
            },
            Event {
                token: Token::Comma,
                range: 209..210,
            },
            Event {
                token: Token::ArrayEnd,
                range: 212..213,
            },
            Event {
                token: Token::Newline,
                range: 213..214,
            },
            Event {
                token: Token::ObjectEnd,
                range: 222..223,
            },
        ];

        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, expected);

        // Round-trip check: slicing the input with each event's range must
        // reproduce the token text, including quotes and comment markers.
        for event in output {
            match event.token {
                Token::Newline => assert_eq!(&input[event.range], "\n"),
                Token::ObjectStart => assert_eq!(&input[event.range], "{"),
                Token::ObjectEnd => assert_eq!(&input[event.range], "}"),
                Token::ArrayStart => assert_eq!(&input[event.range], "["),
                Token::ArrayEnd => assert_eq!(&input[event.range], "]"),
                Token::Comma => assert_eq!(&input[event.range], ","),
                Token::Colon => assert_eq!(&input[event.range], ":"),
                Token::Null => assert_eq!(&input[event.range], "null"),
                Token::LineComment(v) => assert_eq!(&input[event.range], ["//", v].join("")),
                Token::BlockComment(v) => assert_eq!(&input[event.range], ["/*", v, "*/"].join("")),
                Token::String(v) => assert_eq!(&input[event.range], ["\"", v, "\""].join("")),
                Token::Number(v) => assert_eq!(&input[event.range], v),
                Token::Bool(v) => assert_eq!(&input[event.range], if v { "true" } else { "false" }),
            }
        }
    }

    // An empty line comment that runs straight into end of input.
    #[test]
    fn test_line_comment() {
        let input = "//";
        let exp = Event {
            token: Token::LineComment(""),
            range: 0..2,
        };
        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, vec![exp]);
    }

    // A leading zero followed by a fraction scans as a single number token.
    #[test]
    fn test_number() {
        let input = "0.01";
        let exp = Event {
            token: Token::Number("0.01"),
            range: 0..4,
        };
        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, vec![exp]);
    }
}