1use std::{iter::Peekable, ops::Range, str::CharIndices};
4
5use crate::error::Error;
6
/// A scanned token paired with the location it was read from.
#[derive(Clone, Debug, PartialEq)]
pub struct Event<'a> {
    /// The token that was recognized.
    pub token: Token<'a>,
    /// Byte offsets into the scanner's input covering the whole token,
    /// including delimiters such as string quotes or comment markers.
    pub range: Range<usize>,
}
14
/// The kinds of token the scanner can produce.
///
/// Variants that carry a `&str` borrow directly from the scanner's input.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Token<'a> {
    /// A `\n` character.
    Newline,
    /// `{`
    ObjectStart,
    /// `}`
    ObjectEnd,
    /// `[`
    ArrayStart,
    /// `]`
    ArrayEnd,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// The literal `null`.
    Null,
    /// The text following `//`, without the leading slashes.
    LineComment(&'a str),
    /// The text between `/*` and `*/`, without the delimiters.
    BlockComment(&'a str),
    /// The text between the double quotes, exactly as written in the input
    /// (escape sequences are left as-is).
    String(&'a str),
    /// The number exactly as written in the input.
    Number(&'a str),
    /// The literal `true` or `false`.
    Bool(bool),
}
33
/// The item type yielded by [`Scanner`]: a scanned [`Event`] or the error that stopped scanning.
pub type ScanResult<'a> = Result<Event<'a>, Error>;
36
/// An iterator that splits an input string into [`Event`]s.
pub struct Scanner<'a> {
    /// The full text being scanned; tokens are slices of it.
    input: &'a str,
    /// Set once an error has been yielded; the iterator then returns `None`.
    has_error: bool,
    /// Byte offset of the most recently consumed character.
    current_idx: usize,
    /// Character cursor over `input` with one character of lookahead.
    chars: Peekable<CharIndices<'a>>,
}
47
48impl<'a> Iterator for Scanner<'a> {
49 type Item = ScanResult<'a>;
50
51 fn next(&mut self) -> Option<ScanResult<'a>> {
52 if self.has_error {
53 None
54 } else {
55 match self.parse_value() {
56 Some(Err(err)) => {
57 self.has_error = true;
58 Some(Err(err))
59 }
60 v => v,
61 }
62 }
63 }
64}
65
66impl<'a> Scanner<'a> {
67 pub fn new(input: &'a str) -> Self {
69 Scanner {
70 input,
71 has_error: false,
72 current_idx: 0,
73 chars: input.char_indices().peekable(),
74 }
75 }
76
77 pub fn without_metadata(self) -> impl Iterator<Item = ScanResult<'a>> {
79 self.into_iter().filter(|event| {
80 if let Ok(event) = event {
81 match event.token {
82 Token::BlockComment(_) | Token::LineComment(_) | Token::Newline => {
83 return false
84 }
85 _ => {}
86 }
87 }
88 true
89 })
90 }
91
92 fn parse_value(&mut self) -> Option<ScanResult<'a>> {
93 self.skip_whitespace();
94 if let Some((i, c)) = self.next_char() {
95 let start = self.current_idx;
96 match c {
97 '\n' => Some(Ok(Event {
98 token: Token::Newline,
99 range: start..(start + 1),
100 })),
101 '{' => Some(Ok(Event {
102 token: Token::ObjectStart,
103 range: start..(start + 1),
104 })),
105 '}' => Some(Ok(Event {
106 token: Token::ObjectEnd,
107 range: start..(start + 1),
108 })),
109 '[' => Some(Ok(Event {
110 token: Token::ArrayStart,
111 range: start..(start + 1),
112 })),
113 ']' => Some(Ok(Event {
114 token: Token::ArrayEnd,
115 range: start..(start + 1),
116 })),
117 ',' => Some(Ok(Event {
118 token: Token::Comma,
119 range: start..(start + 1),
120 })),
121 ':' => Some(Ok(Event {
122 token: Token::Colon,
123 range: start..(start + 1),
124 })),
125 'n' => Some(self.parse_null(start)),
126 't' => Some(self.parse_bool_true(start)),
127 'f' => Some(self.parse_bool_false(start)),
128 '/' => Some(self.parse_comment(start)),
129 '"' => Some(self.parse_string(start)),
130 c => {
131 if c.is_ascii_digit() || c == '-' {
132 Some(self.parse_number(start, c))
133 } else {
134 Some(Err(Error::UnexpectedCharacter(i, c)))
135 }
136 }
137 }
138 } else {
139 None
140 }
141 }
142
143 fn parse_number(&mut self, start: usize, curr: char) -> ScanResult<'a> {
144 let curr = if curr == '-' {
145 self.next_digit()?
146 } else {
147 curr
148 };
149 if curr != '0' {
150 self.skip_digits();
151 }
152
153 if let Some(&(_, '.')) = self.peek_char() {
154 self.skip_char();
155 self.next_digit()?;
156 self.skip_digits();
157 }
158
159 if let Some(&(_, 'e' | 'E')) = self.peek_char() {
160 self.skip_char();
161 if let Some((_, '-' | '+')) = self.peek_char() {
162 self.skip_char();
163 }
164 self.next_digit()?;
165 self.skip_digits();
166 }
167
168 let range = start..(self.current_idx + 1);
169 Ok(Event {
170 token: Token::Number(&self.input[range.clone()]),
171 range,
172 })
173 }
174
175 fn parse_string(&mut self, start: usize) -> ScanResult<'a> {
176 while let Some((i, c)) = self.next_char() {
177 match c {
178 '\\' => match self.next_char() {
179 Some((i, c)) => match c {
180 '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't' => {}
181 'u' => {
182 for _ in 0..4 {
183 match self.next_char() {
184 Some((i, c)) => {
185 if !c.is_ascii_hexdigit() {
186 return Err(Error::UnexpectedCharacter(i, c));
187 }
188 }
189 None => return Err(Error::UnexpectedEOF),
190 }
191 }
192 }
193 c => return Err(Error::UnexpectedCharacter(i, c)),
194 },
195 None => return Err(Error::UnexpectedEOF),
196 },
197 '"' => {
198 let end = self.current_idx;
199 return Ok(Event {
200 token: Token::String(&self.input[(start + 1)..end]),
201 range: start..(end + 1),
202 });
203 }
204 c => {
205 if !(0x0020..0x10FFFF).contains(&(c as u32)) {
206 return Err(Error::UnexpectedCharacter(i, c));
207 }
208 }
209 }
210 }
211 Err(Error::UnexpectedEOF)
212 }
213
214 fn parse_comment(&mut self, start: usize) -> ScanResult<'a> {
215 match self.next_char() {
216 Some((_, '/')) => self.parse_line_comment(start),
217 Some((_, '*')) => self.parse_block_comment(start),
218 Some(v) => Err(Error::UnexpectedCharacter(v.0, v.1)),
219 None => Err(Error::UnexpectedEOF),
220 }
221 }
222
223 fn parse_line_comment(&mut self, start: usize) -> ScanResult<'a> {
224 let mut end;
225 loop {
226 match self.peek_char() {
227 Some(&(i, c)) => {
228 end = i;
229 if c == '\n' {
230 break;
231 } else if c == '\r' {
232 self.skip_char();
233 if let Some(&(_, c)) = self.peek_char() {
234 if c == '\n' {
235 break;
236 }
237 }
238 continue;
239 } else {
240 self.skip_char();
241 }
242 }
243 None => {
244 end = self.input.len();
245 break;
246 }
247 }
248 }
249 Ok(Event {
250 token: Token::LineComment(&self.input[(start + 2..end)]),
251 range: start..end,
252 })
253 }
254
255 fn parse_block_comment(&mut self, start: usize) -> ScanResult<'a> {
256 while let Some((_, c)) = self.next_char() {
257 if c == '*' {
258 if let Some(&(i, '/')) = self.peek_char() {
259 self.skip_char();
260 return Ok(Event {
261 token: Token::BlockComment(&self.input[(start + 2)..(i - 1)]),
262 range: start..(i + 1),
263 });
264 }
265 }
266 }
267 Err(Error::UnexpectedEOF)
268 }
269
270 fn parse_null(&mut self, start: usize) -> ScanResult<'a> {
271 if self.next_chars_equal("ull") {
272 Ok(Event {
273 token: Token::Null,
274 range: start..(start + 4),
275 })
276 } else {
277 Err(Error::UnexpectedCharacter(start, 'n'))
278 }
279 }
280
281 fn parse_bool_true(&mut self, start: usize) -> ScanResult<'a> {
282 if self.next_chars_equal("rue") {
283 Ok(Event {
284 token: Token::Bool(true),
285 range: start..(start + 4),
286 })
287 } else {
288 Err(Error::UnexpectedCharacter(start, 't'))
289 }
290 }
291
292 fn parse_bool_false(&mut self, start: usize) -> ScanResult<'a> {
293 if self.next_chars_equal("alse") {
294 Ok(Event {
295 token: Token::Bool(false),
296 range: start..(start + 5),
297 })
298 } else {
299 Err(Error::UnexpectedCharacter(start, 'f'))
300 }
301 }
302
303 fn next_digit(&mut self) -> Result<char, Error> {
304 match self.next_char() {
305 Some((i, c)) => {
306 if c.is_ascii_digit() {
307 Ok(c)
308 } else {
309 Err(Error::UnexpectedCharacter(i, c))
310 }
311 }
312 None => Err(Error::UnexpectedEOF),
313 }
314 }
315
316 fn skip_digits(&mut self) {
317 while let Some(&(_, c)) = self.peek_char() {
318 if c.is_ascii_digit() {
319 self.skip_char();
320 } else {
321 break;
322 }
323 }
324 }
325
326 fn skip_whitespace(&mut self) {
327 while let Some(c) = self.peek_char() {
328 if c.1.is_whitespace() && c.1 != '\n' {
329 self.skip_char();
330 } else {
331 return;
332 }
333 }
334 }
335
336 fn next_chars_equal(&mut self, s: &str) -> bool {
337 for ch in s.chars() {
338 match self.next_char() {
339 Some((_, c)) => {
340 if ch != c {
341 return false;
342 }
343 }
344 None => {
345 return false;
346 }
347 }
348 }
349 true
350 }
351
352 fn next_char(&mut self) -> Option<(usize, char)> {
353 if let Some((i, c)) = self.chars.next() {
354 self.current_idx = i;
355 Some((i, c))
356 } else {
357 None
358 }
359 }
360
361 fn skip_char(&mut self) {
362 self.next_char();
363 }
364
365 fn peek_char(&mut self) -> Option<&(usize, char)> {
366 self.chars.peek()
367 }
368}
369
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans a document that exercises every token kind and checks both the
    /// tokens and their byte ranges.
    #[test]
    fn test_scanner() {
        // The byte offsets in `expected` depend on the exact whitespace of
        // this literal, so its indentation must not be reformatted.
        let input = r#"{
            // This is a comment.
            "key1": "val1",
            "key2": 100,
            /*
             * This is a block comment.
             */
            "key3":[    true,    "1", 2, {}, null,  ]
        }"#;
        let expected = vec![
            Event {
                token: Token::ObjectStart,
                range: 0..1,
            },
            Event {
                token: Token::Newline,
                range: 1..2,
            },
            Event {
                token: Token::LineComment(" This is a comment."),
                range: 14..35,
            },
            Event {
                token: Token::Newline,
                range: 35..36,
            },
            Event {
                token: Token::String("key1"),
                range: 48..54,
            },
            Event {
                token: Token::Colon,
                range: 54..55,
            },
            Event {
                token: Token::String("val1"),
                range: 56..62,
            },
            Event {
                token: Token::Comma,
                range: 62..63,
            },
            Event {
                token: Token::Newline,
                range: 63..64,
            },
            Event {
                token: Token::String("key2"),
                range: 76..82,
            },
            Event {
                token: Token::Colon,
                range: 82..83,
            },
            Event {
                token: Token::Number("100"),
                range: 84..87,
            },
            Event {
                token: Token::Comma,
                range: 87..88,
            },
            Event {
                token: Token::Newline,
                range: 88..89,
            },
            Event {
                token: Token::BlockComment(
                    "\n             * This is a block comment.\n             ",
                ),
                range: 101..159,
            },
            Event {
                token: Token::Newline,
                range: 159..160,
            },
            Event {
                token: Token::String("key3"),
                range: 172..178,
            },
            Event {
                token: Token::Colon,
                range: 178..179,
            },
            Event {
                token: Token::ArrayStart,
                range: 179..180,
            },
            Event {
                token: Token::Bool(true),
                range: 184..188,
            },
            Event {
                token: Token::Comma,
                range: 188..189,
            },
            Event {
                token: Token::String("1"),
                range: 193..196,
            },
            Event {
                token: Token::Comma,
                range: 196..197,
            },
            Event {
                token: Token::Number("2"),
                range: 198..199,
            },
            Event {
                token: Token::Comma,
                range: 199..200,
            },
            Event {
                token: Token::ObjectStart,
                range: 201..202,
            },
            Event {
                token: Token::ObjectEnd,
                range: 202..203,
            },
            Event {
                token: Token::Comma,
                range: 203..204,
            },
            Event {
                token: Token::Null,
                range: 205..209,
            },
            Event {
                token: Token::Comma,
                range: 209..210,
            },
            Event {
                token: Token::ArrayEnd,
                range: 212..213,
            },
            Event {
                token: Token::Newline,
                range: 213..214,
            },
            Event {
                token: Token::ObjectEnd,
                range: 222..223,
            },
        ];

        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, expected);

        // Each range must slice the input back to the token's source text.
        for event in output {
            match event.token {
                Token::Newline => assert_eq!(&input[event.range], "\n"),
                Token::ObjectStart => assert_eq!(&input[event.range], "{"),
                Token::ObjectEnd => assert_eq!(&input[event.range], "}"),
                Token::ArrayStart => assert_eq!(&input[event.range], "["),
                Token::ArrayEnd => assert_eq!(&input[event.range], "]"),
                Token::Comma => assert_eq!(&input[event.range], ","),
                Token::Colon => assert_eq!(&input[event.range], ":"),
                Token::Null => assert_eq!(&input[event.range], "null"),
                Token::LineComment(v) => assert_eq!(&input[event.range], ["//", v].join("")),
                Token::BlockComment(v) => assert_eq!(&input[event.range], ["/*", v, "*/"].join("")),
                Token::String(v) => assert_eq!(&input[event.range], ["\"", v, "\""].join("")),
                Token::Number(v) => assert_eq!(&input[event.range], v),
                Token::Bool(v) => assert_eq!(&input[event.range], if v { "true" } else { "false" }),
            }
        }
    }

    /// A line comment that runs to the end of the input is still a comment.
    #[test]
    fn test_line_comment() {
        let input = "//";
        let exp = Event {
            token: Token::LineComment(""),
            range: 0..2,
        };
        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, vec![exp]);
    }

    /// A leading zero followed by a fraction scans as one number.
    #[test]
    fn test_number() {
        let input = "0.01";
        let exp = Event {
            token: Token::Number("0.01"),
            range: 0..4,
        };
        let scanner = Scanner::new(input);
        let output = scanner.map(|v| v.unwrap()).collect::<Vec<_>>();
        assert_eq!(output, vec![exp]);
    }

    /// `without_metadata` drops comments and newlines but keeps values.
    #[test]
    fn test_without_metadata() {
        let input = "// c\n/* b */ 1\r\n";
        let exp = Event {
            token: Token::Number("1"),
            range: 13..14,
        };
        let output = Scanner::new(input)
            .without_metadata()
            .map(|v| v.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(output, vec![exp]);
    }

    /// After yielding an error the scanner yields nothing more.
    #[test]
    fn test_error_ends_iteration() {
        let mut scanner = Scanner::new("\"abc");
        assert!(matches!(scanner.next(), Some(Err(_))));
        assert!(scanner.next().is_none());
    }
}