1#![allow(unused_assignments)]
4#![allow(unused_variables)]
5#![allow(unreachable_code)]
6
7use crate::coords::{Coords, Span};
8use std::fmt::{Display, Formatter};
9use std::io::BufRead;
10
11use crate::lexer::lexer_input::{CharWithCoords, LexerInput};
12use crate::lexer_error;
13use crate::results::{ParserError, ParserErrorDetails, ParserErrorSource, ParserResult};
14
/// Default buffer size. Not referenced anywhere in the visible part of this file.
const DEFAULT_BUFFER_SIZE: usize = 4096;
/// Characters of the `null` literal; compared against the lexer buffer in `match_null`.
const NULL_PATTERN: [char; 4] = ['n', 'u', 'l', 'l'];
/// Characters of the `true` literal; compared against the lexer buffer in `match_true`.
const TRUE_PATTERN: [char; 4] = ['t', 'r', 'u', 'e'];
/// Characters of the `false` literal; compared against the lexer buffer in `match_false`.
const FALSE_PATTERN: [char; 5] = ['f', 'a', 'l', 's', 'e'];
23
/// Lexical tokens produced by [`Lexer::consume`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The `{` character.
    StartObject,
    /// The `}` character.
    EndObject,
    /// The `[` character.
    StartArray,
    /// The `]` character.
    EndArray,
    /// The `:` character.
    Colon,
    /// The `,` character.
    Comma,
    /// A string lexeme, stored as the lexer's buffered text at the closing quote.
    /// Escape sequences are validated by the lexer but not decoded.
    Str(String),
    /// A number carried as a 64-bit float.
    Float(f64),
    /// A number carried as a signed 64-bit integer. In this file it is only
    /// constructed when the `mixed_numerics` feature is enabled.
    Integer(i64),
    /// The `null` literal.
    Null,
    /// The `true` or `false` literal.
    Boolean(bool),
    /// Produced by `consume` when the input reports `EndOfInput` before a token starts.
    EndOfInput,
}
40
41impl Display for Token {
42 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
43 match self {
44 Token::StartObject => write!(f, "StartObject"),
45 Token::EndObject => write!(f, "EndObject"),
46 Token::StartArray => write!(f, "StartArray"),
47 Token::EndArray => write!(f, "EndArray"),
48 Token::Colon => write!(f, "Colon"),
49 Token::Comma => write!(f, "Comma"),
50 Token::Str(str) => write!(f, "String(\"{}\")", str),
51 Token::Float(num) => write!(f, "Float({})", num),
52 Token::Integer(num) => write!(f, "Integer({})", num),
53 Token::Null => write!(f, "Null"),
54 Token::Boolean(bool) => write!(f, "Boolean({})", bool),
55 Token::EndOfInput => write!(f, "EndOfInput"),
56 }
57 }
58}
59
/// A token paired with the span it covers in the input.
/// The lifetime parameter is not used by the aliased tuple.
pub type PackedToken<'a> = (Token, Span);
62
/// Builds `Ok((token, Span { start, end }))`.
///
/// With three arguments the span runs from the second to the third; with two
/// arguments the same coordinate expression is used for both ends.
macro_rules! packed_token {
    ($token:expr, $start:expr, $end:expr) => {
        Ok((
            $token,
            Span {
                start: $start,
                end: $end,
            },
        ))
    };
    ($token:expr, $at:expr) => {
        packed_token!($token, $at, $at)
    };
}
72
/// Pattern matching the character `0`.
macro_rules! match_zero {
    () => {
        '0'
    };
}
78
/// Pattern matching the minus sign `-`.
macro_rules! match_minus {
    () => {
        '-'
    };
}
84
/// Pattern matching either sign character, `+` or `-`.
macro_rules! match_plus_minus {
    () => {
        '+' | '-'
    };
}
90
/// Pattern matching any ASCII decimal digit, `0` through `9`.
macro_rules! match_digit {
    () => {
        '0'..='9'
    };
}
96
/// Pattern matching the ASCII digits `1` through `9`.
macro_rules! match_non_zero_digit {
    () => {
        '1'..='9'
    };
}
102
/// Pattern matching an exponent marker, `e` or `E`.
macro_rules! match_exponent {
    () => {
        'e' | 'E'
    };
}
108
/// Pattern matching the decimal point `.`.
macro_rules! match_period {
    () => {
        '.'
    };
}
114
/// Pattern matching the punctuation that ends a number in `match_number`:
/// `]`, `}` or `,`. Whitespace is handled separately by the caller.
macro_rules! match_numeric_terminator {
    () => {
        ']' | '}' | ','
    };
}
120
/// Pattern matching the backslash that introduces an escape sequence.
macro_rules! match_escape {
    () => {
        '\\'
    };
}
126
/// Pattern matching the characters accepted directly after a backslash for
/// single-character escapes: `n`, `t`, `r`, `\`, `/`, `b`, `f` and `"`.
macro_rules! match_escape_non_unicode_suffix {
    () => {
        'n' | 't' | 'r' | '\\' | '/' | 'b' | 'f' | '\"'
    };
}
132
/// Pattern matching the `u` that follows a backslash in a unicode escape.
macro_rules! match_escape_unicode_suffix {
    () => {
        'u'
    };
}
138
/// Pattern matching the double-quote character.
macro_rules! match_quote {
    () => {
        '\"'
    };
}
144
/// Pattern matching the line-feed character.
macro_rules! match_newline {
    () => {
        '\n'
    };
}
150
/// Lexer that turns a stream of characters into [`Token`]s.
pub struct Lexer<'a> {
    /// Buffered character source that every lexing method reads from.
    input: LexerInput<'a>,
}
155
156impl<'a> Lexer<'a> {
157 pub fn new(chars: &'a mut impl Iterator<Item = char>) -> Self {
158 Lexer {
159 input: LexerInput::new(chars),
160 }
161 }
162
    /// Returns whatever `LexerInput::front` reports, if anything.
    /// In this lexer it is read right after `advance` to inspect the character just reached.
    fn front(&self) -> Option<CharWithCoords> {
        self.input.front()
    }
167
    /// Returns whatever `LexerInput::back` reports, if anything.
    /// Its coordinates are used in this lexer as the start of the current token's span.
    fn back(&self) -> Option<CharWithCoords> {
        self.input.back()
    }
172
    /// Returns the character at the front of the input buffer.
    ///
    /// # Panics
    /// Panics if the input has no front character.
    fn front_char(&self) -> char {
        self.input.front().unwrap().ch
    }
177
    /// Returns the character at the back of the input buffer.
    ///
    /// # Panics
    /// Panics if the input has no back character.
    fn back_char(&self) -> char {
        self.input.back().unwrap().ch
    }
182
    /// Returns the coordinates of the front character; used as the end of token spans.
    ///
    /// # Panics
    /// Panics if the input has no front character.
    fn front_coords(&self) -> Coords {
        self.input.front().unwrap().coords
    }
187
    /// Returns the coordinates of the back character; used as the start of
    /// token spans and as the position attached to most lexer errors.
    ///
    /// # Panics
    /// Panics if the input has no back character.
    fn back_coords(&self) -> Coords {
        self.input.back().unwrap().coords
    }
192
    /// Returns the position reported by `LexerInput::position`.
    fn absolute_position(&self) -> Coords {
        self.input.position()
    }
197
    /// Advances the input by delegating to `LexerInput::advance`, passing
    /// `skip_whitespace` through unchanged. Any error is returned as-is.
    fn advance(&mut self, skip_whitespace: bool) -> ParserResult<()> {
        self.input.advance(skip_whitespace)
    }
202
    /// Advances the input by delegating to `LexerInput::advance_n` with the
    /// given count and `skip_whitespace` flag. Any error is returned as-is.
    fn advance_n(&mut self, n: usize, skip_whitespace: bool) -> ParserResult<()> {
        self.input.advance_n(n, skip_whitespace)
    }
207
    /// Returns the current input buffer as a `String` (the `str` field of
    /// `LexerInput::buffer_as_string_with_span`).
    fn current_string(&mut self) -> String {
        self.input.buffer_as_string_with_span().str
    }
212
    /// Returns the current input buffer as a vector of characters.
    fn current_chars(&mut self) -> Vec<char> {
        self.input.buffer_as_char_array()
    }
217
    /// Returns the current input buffer as a vector of bytes.
    fn current_bytes(&mut self) -> Vec<u8> {
        self.input.buffer_as_byte_array()
    }
222
223 pub fn consume(&mut self) -> ParserResult<PackedToken> {
225 self.input.clear();
226 match self.advance(true) {
227 Ok(_) => match self.input.front() {
228 Some(CharWithCoords { ch: '{', coords }) => {
229 packed_token!(Token::StartObject, coords)
230 }
231 Some(CharWithCoords { ch: '}', coords }) => packed_token!(Token::EndObject, coords),
232 Some(CharWithCoords { ch: '[', coords }) => {
233 packed_token!(Token::StartArray, coords)
234 }
235 Some(CharWithCoords { ch: ']', coords }) => packed_token!(Token::EndArray, coords),
236 Some(CharWithCoords { ch: ':', coords }) => packed_token!(Token::Colon, coords),
237 Some(CharWithCoords { ch: ',', coords }) => packed_token!(Token::Comma, coords),
238 Some(CharWithCoords { ch: '\"', coords }) => self.match_string(),
239 Some(CharWithCoords { ch: 'n', coords }) => self.match_null(),
240 Some(CharWithCoords { ch: 't', coords }) => self.match_true(),
241 Some(CharWithCoords { ch: 'f', coords }) => self.match_false(),
242 Some(CharWithCoords { ch: '-', coords }) => self.match_number(),
243 Some(CharWithCoords { ch: d, coords }) if d.is_ascii_digit() => self.match_number(),
244 Some(CharWithCoords { ch, coords }) => lexer_error!(
245 ParserErrorDetails::InvalidCharacter(ch.clone()),
246 coords.clone()
247 ),
248 None => lexer_error!(ParserErrorDetails::EndOfInput),
249 },
250 Err(err) => match err.details {
251 ParserErrorDetails::EndOfInput => {
252 packed_token!(Token::EndOfInput, self.input.position())
253 }
254 _ => match err.coords {
255 Some(coords) => lexer_error!(err.details, coords),
256 None => lexer_error!(err.details),
257 },
258 },
259 }
260 }
261
262 fn match_string(&mut self) -> ParserResult<PackedToken> {
264 loop {
265 match self.advance(false) {
266 Ok(_) => match self.front_char() {
267 match_escape!() => match self.input.advance(false) {
268 Ok(_) => match self.front_char() {
269 match_escape_non_unicode_suffix!() => (),
270 match_escape_unicode_suffix!() => self.check_unicode_sequence()?,
271 _ => {
272 return lexer_error!(
273 ParserErrorDetails::InvalidEscapeSequence(
274 self.current_string()
275 ),
276 self.back_coords()
277 );
278 }
279 },
280 Err(err) => {
281 return lexer_error!(err.details, err.coords.unwrap());
282 }
283 },
284 match_quote!() => {
285 return packed_token!(
286 Token::Str(self.current_string()),
287 self.back_coords(),
288 self.front_coords()
289 );
290 }
291 _ => (),
292 },
293 Err(err) => return lexer_error!(err.details, err.coords.unwrap()),
294 }
295 }
296 }
297
298 fn check_unicode_sequence(&mut self) -> ParserResult<()> {
300 let start_position = self.absolute_position();
301 for i in 1..=4 {
302 match self.advance(false) {
303 Ok(_) => {
304 if !self.front_char().is_ascii_hexdigit() {
305 return lexer_error!(
306 ParserErrorDetails::InvalidUnicodeEscapeSequence(self.current_string()),
307 start_position
308 );
309 }
310 }
311 Err(e) => {
312 return lexer_error!(ParserErrorDetails::EndOfInput, self.absolute_position());
313 }
314 }
315 }
316 Ok(())
317 }
318
319 fn match_number(&mut self) -> ParserResult<PackedToken> {
330 let mut have_exponent = false;
331 let mut have_decimal = false;
332
333 match self.match_valid_number_prefix() {
334 Ok(integral) => {
335 have_decimal = !integral;
336 loop {
337 match self.advance(false) {
338 Ok(_) => match self.front_char() {
339 match_digit!() => (),
340 match_exponent!() => {
341 if !have_exponent {
342 self.check_following_exponent()?;
343 have_exponent = true;
344 } else {
345 return lexer_error!(
346 ParserErrorDetails::InvalidNumericRepresentation(
347 self.current_string()
348 ),
349 self.back_coords()
350 );
351 }
352 }
353 match_period!() => {
354 if !have_decimal {
355 have_decimal = true;
356 } else {
357 return lexer_error!(
358 ParserErrorDetails::InvalidNumericRepresentation(
359 self.current_string()
360 ),
361 self.back_coords()
362 );
363 }
364 }
365 match_numeric_terminator!() => {
366 self.input.pushback();
367 break;
368 }
369 ch if ch.is_ascii_whitespace() => {
370 self.input.pushback();
371 break;
372 }
373 ch if ch.is_alphabetic() => {
374 return lexer_error!(
375 ParserErrorDetails::InvalidNumericRepresentation(
376 self.current_string()
377 ),
378 self.back_coords()
379 );
380 }
381 _ => {
382 return lexer_error!(
383 ParserErrorDetails::InvalidNumericRepresentation(
384 self.current_string()
385 ),
386 self.back_coords()
387 );
388 }
389 },
390 Err(err) => {
391 return match err.coords {
392 Some(coords) => lexer_error!(err.details, coords),
393 None => lexer_error!(err.details),
394 };
395 }
396 }
397 }
398 }
399 Err(err) => {
400 return match err.coords {
401 Some(coords) => lexer_error!(err.details, coords),
402 None => lexer_error!(err.details),
403 }
404 }
405 }
406
407 self.parse_numeric(!have_decimal)
408 }
409
410 fn check_following_exponent(&mut self) -> ParserResult<()> {
411 self.advance(false).and_then(|_| {
412 return match self.front_char() {
413 match_plus_minus!() => Ok(()),
414 _ => lexer_error!(
415 ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
416 self.absolute_position()
417 ),
418 };
419 })
420 }
421
422 #[cfg(not(feature = "mixed_numerics"))]
423 fn parse_numeric(
424 &mut self,
425 integral: bool,
426 start_coords: Coords,
427 end_coords: Coords,
428 ) -> ParserResult<PackedToken> {
429 packed_token!(
430 Token::Float(fast_float::parse(self.input.buffer_as_bytes()).unwrap()),
431 back_input_coords!(),
432 front_input_coords!()
433 )
434 }
435
436 #[cfg(feature = "mixed_numerics")]
437 fn parse_numeric(&mut self, integral: bool) -> ParserResult<PackedToken> {
438 if integral {
439 packed_token!(
440 Token::Integer(lexical::parse(self.input.buffer_as_byte_array()).unwrap()),
441 self.back_coords(),
442 self.front_coords()
443 )
444 } else {
445 packed_token!(
446 Token::Float(fast_float::parse(self.input.buffer_as_byte_array()).unwrap()),
447 self.back_coords(),
448 self.front_coords()
449 )
450 }
451 }
452
453 fn match_valid_number_prefix(&mut self) -> ParserResult<bool> {
460 let ch = self.back_char();
461 assert!(ch.is_ascii_digit() || ch == '-');
462 match ch {
463 match_minus!() => self
464 .input
465 .advance(false)
466 .and_then(|_| self.check_following_minus()),
467 match_zero!() => self
468 .input
469 .advance(false)
470 .and_then(|_| self.check_following_zero()),
471 _ => Ok(true),
472 }
473 }
474
475 fn check_following_zero(&mut self) -> ParserResult<bool> {
476 match self.front_char() {
477 match_period!() => Ok(false),
478 match_digit!() => lexer_error!(
479 ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
480 self.back_coords()
481 ),
482 match_newline!() => {
483 self.input.pushback();
484 Ok(true)
485 }
486 _ => {
487 self.input.pushback();
488 Ok(true)
489 }
490 }
491 }
492
493 fn check_following_minus(&mut self) -> ParserResult<bool> {
494 match self.front_char() {
495 match_non_zero_digit!() => Ok(true),
496 match_zero!() => self.advance(false).and_then(|_| {
497 if self.front_char() != '.' {
498 return lexer_error!(
499 ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
500 self.back_coords()
501 );
502 }
503 Ok(false)
504 }),
505 match_newline!() => {
506 self.input.pushback();
507 Ok(true)
508 }
509 _ => lexer_error!(
510 ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
511 self.back_coords()
512 ),
513 }
514 }
515
516 fn match_null(&mut self) -> ParserResult<PackedToken> {
518 self.input.advance_n(3, false).and_then(|_| {
519 if self.current_chars() == NULL_PATTERN {
520 packed_token!(Token::Null, self.back_coords(), self.front_coords())
521 } else {
522 lexer_error!(
523 ParserErrorDetails::MatchFailed(
524 String::from_iter(NULL_PATTERN.iter()),
525 self.current_string()
526 ),
527 self.back_coords()
528 )
529 }
530 })
531 }
532
533 fn match_true(&mut self) -> ParserResult<PackedToken> {
535 self.advance_n(3, false).and_then(|_| {
536 if self.current_chars() == TRUE_PATTERN {
537 packed_token!(
538 Token::Boolean(true),
539 self.back_coords(),
540 self.front_coords()
541 )
542 } else {
543 lexer_error!(
544 ParserErrorDetails::MatchFailed(
545 String::from_iter(TRUE_PATTERN.iter()),
546 self.current_string()
547 ),
548 self.back_coords()
549 )
550 }
551 })
552 }
553
554 fn match_false(&mut self) -> ParserResult<PackedToken> {
556 self.advance_n(4, false).and_then(|_| {
557 if self.current_chars() == FALSE_PATTERN {
558 packed_token!(
559 Token::Boolean(false),
560 self.back_coords(),
561 self.front_coords()
562 )
563 } else {
564 lexer_error!(
565 ParserErrorDetails::MatchFailed(
566 String::from_iter(FALSE_PATTERN.iter()),
567 self.current_string()
568 ),
569 self.back_coords()
570 )
571 }
572 })
573 }
574}
575
576#[cfg(test)]
577mod tests {
578 use std::env;
579 use std::fs::File;
580 use std::io::{BufRead, BufReader};
581 use std::time::Instant;
582
583 use chisel_decoders::utf8::Utf8Decoder;
584
585 use crate::coords::Span;
586 use crate::lexer::lexer_core::{Lexer, PackedToken, Token};
587 use crate::results::{ParserError, ParserResult};
588 use crate::{lines_from_relative_file, reader_from_bytes};
589
590 #[test]
591 fn should_parse_basic_tokens() {
592 let mut reader = reader_from_bytes!("{}[],:");
593 let mut decoder = Utf8Decoder::new(&mut reader);
594 let mut lexer = Lexer::new(&mut decoder);
595 let mut tokens: Vec<Token> = vec![];
596 let mut spans: Vec<Span> = vec![];
597 for _ in 1..=7 {
598 let token = lexer.consume().unwrap();
599 tokens.push(token.0);
600 spans.push(token.1);
601 }
602 assert_eq!(
603 tokens,
604 [
605 Token::StartObject,
606 Token::EndObject,
607 Token::StartArray,
608 Token::EndArray,
609 Token::Comma,
610 Token::Colon,
611 Token::EndOfInput
612 ]
613 );
614 }
615
616 #[test]
617 fn should_parse_null_and_booleans() {
618 let mut reader = reader_from_bytes!("null true falsetruefalse");
619 let mut decoder = Utf8Decoder::new(&mut reader);
620 let mut lexer = Lexer::new(&mut decoder);
621 let mut tokens: Vec<Token> = vec![];
622 let mut spans: Vec<Span> = vec![];
623 for _ in 1..=6 {
624 let token = lexer.consume().unwrap();
625 tokens.push(token.0);
626 spans.push(token.1);
627 }
628 assert_eq!(
629 tokens,
630 [
631 Token::Null,
632 Token::Boolean(true),
633 Token::Boolean(false),
634 Token::Boolean(true),
635 Token::Boolean(false),
636 Token::EndOfInput
637 ]
638 );
639 }
640
641 #[test]
642 fn should_parse_strings() {
643 let lines = lines_from_relative_file!("fixtures/utf-8/strings.txt");
644 for l in lines.flatten() {
645 if !l.is_empty() {
646 let mut reader = reader_from_bytes!(l);
647 let mut decoder = Utf8Decoder::new(&mut reader);
648 let mut lexer = Lexer::new(&mut decoder);
649 let token = lexer.consume().unwrap();
650 match token.0 {
651 Token::Str(str) => {
652 assert_eq!(str, l)
653 }
654 _ => panic!(),
655 }
656 }
657 }
658 }
659
660 #[test]
661 fn should_report_correct_error_char_position() {
662 let mut reader = reader_from_bytes!("{\"abc\" : \nd}");
663 let mut decoder = Utf8Decoder::new(&mut reader);
664 let mut lexer = Lexer::new(&mut decoder);
665 let mut results = vec![];
666 for _ in 0..4 {
667 results.push(lexer.consume())
668 }
669 assert!(&results[3].is_err());
670 let coords = results[3].clone().err().unwrap().coords.unwrap();
671 assert_eq!(coords.absolute, 11);
672 assert_eq!(coords.line, 2)
673 }
674
675 #[test]
676 fn should_parse_numerics() {
677 let start = Instant::now();
678 let lines = lines_from_relative_file!("fixtures/utf-8/numbers.txt");
679 for l in lines.flatten() {
680 if !l.is_empty() {
681 println!("Parsing {}", l);
682 let mut reader = reader_from_bytes!(l);
683 let mut decoder = Utf8Decoder::new(&mut reader);
684 let mut lexer = Lexer::new(&mut decoder);
685 let token = lexer.consume().unwrap();
686 match token.0 {
687 Token::Integer(_) => {
688 assert_eq!(
689 token.0,
690 Token::Integer(l.replace(',', "").parse::<i64>().unwrap())
691 );
692 }
693 Token::Float(_) => {
694 assert_eq!(
695 token.0,
696 Token::Float(fast_float::parse(l.replace(',', "")).unwrap())
697 );
698 }
699 _ => panic!(),
700 }
701 }
702 }
703 println!("Parsed numerics in {:?}", start.elapsed());
704 }
705
706 #[test]
707 fn should_correctly_handle_invalid_numbers() {
708 let lines = lines_from_relative_file!("fixtures/utf-8/invalid_numbers.txt");
709 for l in lines.flatten() {
710 if !l.is_empty() {
711 let mut reader = reader_from_bytes!(l);
712 let mut decoder = Utf8Decoder::new(&mut reader);
713 let mut lexer = Lexer::new(&mut decoder);
714 let token = lexer.consume();
715 assert!(token.is_err());
716 }
717 }
718 }
719
720 #[test]
721 fn should_correctly_identity_dodgy_strings() {
722 let lines = lines_from_relative_file!("fixtures/utf-8/dodgy_strings.txt");
723 for l in lines.flatten() {
724 if !l.is_empty() {
725 let mut reader = reader_from_bytes!(l);
726 let mut decoder = Utf8Decoder::new(&mut reader);
727 let mut lexer = Lexer::new(&mut decoder);
728 let mut error_token: Option<ParserError> = None;
729 loop {
730 let token = lexer.consume();
731 match token {
732 Ok(packed) => {
733 if packed.0 == Token::EndOfInput {
734 break;
735 }
736 }
737 Err(err) => {
738 error_token = Some(err.clone());
739 println!("Dodgy string found: {} : {}", l, err.coords.unwrap());
740 break;
741 }
742 }
743 }
744 assert!(error_token.is_some());
745 }
746 }
747 }
748
749 #[test]
750 fn should_correctly_report_errors_for_booleans() {
751 let mut reader = reader_from_bytes!("true farse");
752 let mut decoder = Utf8Decoder::new(&mut reader);
753 let mut lexer = Lexer::new(&mut decoder);
754 let mut results: Vec<ParserResult<PackedToken>> = vec![];
755 for _ in 1..=2 {
756 results.push(lexer.consume());
757 }
758
759 assert!(results[0].is_ok());
761 assert!(results[1].is_err());
762
763 if results[0].is_ok() {
765 match &results[0] {
766 Ok(packed) => {
767 assert_eq!((*packed).1.start.column, 1)
768 }
769 Err(_) => {}
770 }
771 }
772
773 if results[1].is_err() {
775 match &results[1] {
776 Ok(_) => {}
777 Err(err) => {
778 assert_eq!(err.coords.unwrap().column, 6)
779 }
780 }
781 }
782
783 println!("Parse error: {:?}", results[1]);
784 }
785}