// jsona/parser.rs

1//! JSONA document to syntax tree parsing.
2
3use crate::dom;
4use crate::syntax::{SyntaxKind, SyntaxKind::*, SyntaxNode};
5use crate::util::validate_quote;
6use logos::{Lexer, Logos};
7use rowan::{GreenNode, GreenNodeBuilder, TextRange, TextSize};
8use std::collections::HashSet;
9
// Wraps the tokens produced by an expression in a node of the given kind:
// opens `$kind` on the builder, evaluates the expression (which may add
// tokens/child nodes), closes the node, and yields the expression's result.
macro_rules! with_node {
    ($builder:expr, $kind:ident, $($content:tt)*) => {
        {
            $builder.start_node($kind.into());
            let res = $($content)*;
            $builder.finish_node();
            res
        }
    };
}
20
/// A syntax error that can occur during parsing.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Error {
    /// The span of the error, as byte offsets into the source text.
    pub range: TextRange,

    /// Human-friendly error message.
    pub message: String,
}
30
31impl core::fmt::Display for Error {
32    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
33        write!(f, "{} ({:?})", &self.message, &self.range)
34    }
35}
36impl std::error::Error for Error {}
37
38/// Parse a JSONA document into a [Rowan green tree](rowan::GreenNode).
39///
40/// The parsing will not stop at unexpected or invalid tokens.
41/// Instead errors will be collected with their character offsets and lengths,
42/// and the invalid token(s) will have the `ERROR` kind in the final tree.
43///
44/// The parser will also validate comment and string contents, looking for
45/// invalid escape sequences and invalid characters.
46/// These will also be reported as syntax errors.
47///
48/// This does not check for semantic errors such as duplicate keys.
49pub fn parse(source: &str) -> Parse {
50    Parser::new(source).parse()
51}
52
/// A hand-written parser that uses the Logos lexer
/// to tokenize the source, then constructs
/// a Rowan green tree from them.
pub(crate) struct Parser<'p> {
    // One-token lookahead; `None` means "not fetched yet" or end of input.
    current_token: Option<SyntaxKind>,
    // Token stream over the source text.
    lexer: Lexer<'p, SyntaxKind>,
    // Accumulates the green tree, including trivia and error tokens.
    builder: GreenNodeBuilder<'p>,
    // Syntax errors collected so far; parsing never aborts early.
    errors: Vec<Error>,
    // True while parsing an annotation's value; used to reject nesting.
    annotation_scope: bool,
    // Whether we parse a full document or only a (glob) key path.
    parse_keys_mode: ParseKeysMode,
}
64
/// Selects what `Parser` accepts: a full document (`None`), a plain
/// dotted key path (`Keys`), or a key path with glob segments (`QueryKeys`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum ParseKeysMode {
    None,
    Keys,
    QueryKeys,
}

impl Default for ParseKeysMode {
    /// Document parsing is the default mode.
    fn default() -> Self {
        Self::None
    }
}
77
/// This is just a convenience type during parsing.
/// It allows using "?", making the code cleaner.
/// The `Err` variant carries no payload: error details are pushed onto
/// `Parser::errors` instead.
type ParserResult<T> = Result<T, ()>;
81
impl<'p> Parser<'p> {
    /// Creates a parser over `source`; nothing is lexed until one of the
    /// `parse*` entry points is called.
    pub(crate) fn new(source: &'p str) -> Self {
        Parser {
            current_token: None,
            lexer: SyntaxKind::lexer(source),
            builder: Default::default(),
            errors: Default::default(),
            annotation_scope: false,
            parse_keys_mode: Default::default(),
        }
    }

    /// Parses the entire source as a key path (wrapped in a `KEYS` node)
    /// instead of a document. With `glob == true`, glob identifiers
    /// (`*`, `**`) are accepted in key segments as well.
    pub(crate) fn parse_keys_only(mut self, glob: bool) -> Parse {
        if glob {
            self.parse_keys_mode = ParseKeysMode::QueryKeys
        } else {
            self.parse_keys_mode = ParseKeysMode::Keys
        }
        // Errors are already recorded; the Result itself can be dropped.
        let _ = with_node!(self.builder, KEYS, self.parse_keys());

        Parse {
            green_node: self.builder.finish(),
            errors: self.errors,
        }
    }

    /// Parses the whole source as a single document rooted in a `VALUE`
    /// node, collecting errors instead of bailing out.
    fn parse(mut self) -> Parse {
        let _ = with_node!(self.builder, VALUE, self.parse_root());

        Parse {
            green_node: self.builder.finish(),
            errors: self.errors,
        }
    }

    /// Top-level value, optionally followed by annotations, then EOF.
    fn parse_root(&mut self) -> ParserResult<()> {
        self.parse_value()?;
        self.parse_annotations()?;
        self.must_peek_eof()
    }

    /// Parses a (possibly empty) run of annotations into one
    /// `ANNOTATIONS` node. The absence of annotations is not an error.
    fn parse_annotations(&mut self) -> ParserResult<()> {
        if let Ok(ANNOTATION_KEY) = self.peek_token() {
            self.builder.start_node(ANNOTATIONS.into());
            while let Ok(ANNOTATION_KEY) = self.peek_token() {
                // A one-character key is just the sigil with no name after it.
                if self.lexer.slice().len() == 1 {
                    self.report_error("invalid annotation key");
                }
                let _ = with_node!(self.builder, ANNOTATION_PROPERTY, self.parse_anno_entry());
            }
            self.builder.finish_node();
        }
        Ok(())
    }

    /// A single annotation: its key plus an optional parenthesized value.
    fn parse_anno_entry(&mut self) -> ParserResult<()> {
        self.must_token_or(ANNOTATION_KEY, r#"expected annotation key"#)?;
        // Annotations may not appear inside another annotation's value.
        if self.annotation_scope {
            self.report_error("nested annotation");
        }
        if let Ok(PARENTHESES_START) = self.peek_token() {
            self.annotation_scope = true;
            let ret = with_node!(self.builder, ANNOTATION_VALUE, self.parse_anno_value());
            // Always reset before propagating so a failure can't leak the flag.
            self.annotation_scope = false;
            ret?;
        }
        Ok(())
    }

    /// `( value? )` — the parenthesized payload of an annotation.
    fn parse_anno_value(&mut self) -> ParserResult<()> {
        self.must_token_or(PARENTHESES_START, r#"expected "(""#)?;
        // Empty parentheses are allowed.
        if PARENTHESES_END == self.peek_token()? {
            self.must_token_or(PARENTHESES_END, r#"expected ")""#)?;
            return Ok(());
        }
        let ret = with_node!(self.builder, VALUE, self.parse_value());
        self.must_token_or(PARENTHESES_END, r#"expected ")""#)?;
        ret
    }

    /// One `key: value` object entry. Returns `Ok(true)` when the entry
    /// consumed its own trailing comma (so the caller knows no separator
    /// is still pending).
    fn parse_property(&mut self) -> ParserResult<bool> {
        with_node!(self.builder, KEY, self.parse_key())?;
        // A missing colon is reported, but recovery continues with the value.
        let _ = self.must_token_or(COLON, r#"expected ":""#);
        if let Ok(t) = self.peek_token() {
            match t {
                COMMA => {
                    // `key: ,` — value missing; eat the comma and move on.
                    self.report_error("expected value");
                    self.consume_current_token()?;
                    return Ok(true);
                }
                BRACE_END => {
                    // `key: }` — value missing; leave `}` for the caller.
                    self.report_error("expected value");
                    return Ok(false);
                }
                _ => {}
            }
        }
        let ret = with_node!(self.builder, VALUE, self.parse_value_with_annotations());
        Ok(ret.ok().unwrap_or_default())
    }

    /// Parses any single value, validating number/string lexemes as it
    /// goes. EOF here is treated as success; callers detect truncation.
    fn parse_value(&mut self) -> ParserResult<()> {
        let t = match self.peek_token() {
            Ok(t) => t,
            Err(_) => return Ok(()),
        };
        match t {
            BRACE_START => {
                with_node!(self.builder, OBJECT, self.parse_object())
            }
            BRACKET_START => {
                with_node!(self.builder, ARRAY, self.parse_array())
            }
            NULL | BOOL => with_node!(self.builder, SCALAR, self.consume_current_token()),
            INTEGER => {
                // Reject zero-padded integers such as `01`, `+01` or `-01`;
                // a plain `0` (with optional sign) is still fine.
                if (self.lexer.slice().starts_with('0') && self.lexer.slice() != "0")
                    || (self.lexer.slice().starts_with("+0") && self.lexer.slice() != "+0")
                    || (self.lexer.slice().starts_with("-0") && self.lexer.slice() != "-0")
                {
                    self.consume_error_token("zero-padded integers are not allowed")
                } else if !validate_underscore_integer(self.lexer.slice(), 10) {
                    self.consume_error_token("invalid underscores")
                } else {
                    with_node!(self.builder, SCALAR, self.consume_current_token())
                }
            }
            INTEGER_BIN => {
                if !validate_underscore_integer(self.lexer.slice(), 2) {
                    self.consume_error_token("invalid underscores")
                } else {
                    with_node!(self.builder, SCALAR, self.consume_current_token())
                }
            }
            INTEGER_HEX => {
                if !validate_underscore_integer(self.lexer.slice(), 16) {
                    self.consume_error_token("invalid underscores")
                } else {
                    with_node!(self.builder, SCALAR, self.consume_current_token())
                }
            }
            INTEGER_OCT => {
                if !validate_underscore_integer(self.lexer.slice(), 8) {
                    self.consume_error_token("invalid underscores")
                } else {
                    with_node!(self.builder, SCALAR, self.consume_current_token())
                }
            }
            FLOAT => {
                // Only the integer part (before `.` or the exponent) is
                // checked for zero padding.
                // NOTE(review): assumes the exponent marker is lowercase `e`;
                // if the lexer also accepts `1E5`, that form skips this check
                // — confirm against the FLOAT token definition.
                let int_slice = if self.lexer.slice().contains('.') {
                    self.lexer.slice().split('.').next().unwrap()
                } else {
                    self.lexer.slice().split('e').next().unwrap()
                };

                if (int_slice.starts_with('0') && int_slice != "0")
                    || (int_slice.starts_with("+0") && int_slice != "+0")
                    || (int_slice.starts_with("-0") && int_slice != "-0")
                {
                    self.consume_error_token("zero-padded numbers are not allowed")
                } else if !validate_underscore_integer(self.lexer.slice(), 10) {
                    self.consume_error_token("invalid underscores")
                } else {
                    with_node!(self.builder, SCALAR, self.consume_current_token())
                }
            }
            DOUBLE_QUOTE | SINGLE_QUOTE => {
                self.validate_string();
                with_node!(self.builder, SCALAR, self.consume_current_token())
            }
            BACKTICK_QUOTE => {
                self.validate_backtick();
                with_node!(self.builder, SCALAR, self.consume_current_token())
            }
            COMMA => {
                // Don't consume: the caller's comma handling recovers better.
                self.report_error("expected value");
                Err(())
            }
            _ => self.consume_error_token("expected value"),
        }
    }

    /// A value plus its optional trailing comma and annotations. Returns
    /// whether a comma was consumed (used for separator bookkeeping).
    fn parse_value_with_annotations(&mut self) -> ParserResult<bool> {
        self.parse_value()?;
        let mut has_comma = false;
        if let Ok(COMMA) = self.peek_token() {
            has_comma = true;
            self.consume_current_token()?;
        }
        // Annotations may follow the comma: `value, @anno`.
        self.parse_annotations()?;
        Ok(has_comma)
    }

    /// `{ key: value, ... }` with optional annotations right after `{`
    /// and tolerant separator handling: missing or duplicate commas are
    /// reported but parsing continues.
    fn parse_object(&mut self) -> ParserResult<()> {
        self.must_token_or(BRACE_START, r#"expected "{""#)?;
        self.parse_annotations()?;
        // True while the previous entry still owes us a separating comma.
        let mut needs_comma = false;

        while let Ok(t) = self.must_peek_token() {
            match t {
                BRACE_END => {
                    return self.consume_current_token();
                }
                COMMA => {
                    if needs_comma {
                        needs_comma = false;
                        self.consume_current_token()?;
                    } else {
                        let _ = self.consume_error_token(r#"unexpected ",""#);
                    }
                }
                _ => {
                    if needs_comma {
                        // Zero-width error just before this token.
                        self.point_error(r#"expected ",""#);
                    }
                    let ret = with_node!(self.builder, PROPERTY, self.parse_property());
                    if let Ok(has_comma) = ret {
                        needs_comma = !has_comma;
                    }
                }
            }
        }
        Ok(())
    }

    /// `[ value, ... ]` — same separator recovery strategy as
    /// `parse_object`.
    fn parse_array(&mut self) -> ParserResult<()> {
        self.must_token_or(BRACKET_START, r#"expected "[""#)?;
        let _ = self.parse_annotations();
        // True while the previous element still owes us a separating comma.
        let mut needs_comma = false;

        while let Ok(t) = self.must_peek_token() {
            match t {
                BRACKET_END => {
                    return self.consume_current_token();
                }
                COMMA => {
                    if needs_comma {
                        needs_comma = false;
                        self.consume_current_token()?;
                    } else {
                        let _ = self.consume_error_token(r#"unexpected ",""#);
                    }
                }
                _ => {
                    if needs_comma {
                        self.point_error(r#"expected ",""#);
                    }
                    let ret = with_node!(self.builder, VALUE, self.parse_value_with_annotations());
                    needs_comma = !ret.ok().unwrap_or_default();
                }
            }
        }

        Ok(())
    }

    /// Parses a dotted key path such as `a.b[0].c`. State:
    /// `first` — no segment consumed yet; `after_dot` — a `.` was just
    /// consumed and a segment must follow; `exist_annotation_key` — at
    /// most one annotation key is allowed, and only before any `.`.
    fn parse_keys(&mut self) -> ParserResult<()> {
        let mut first = true;
        let mut after_dot = false;
        let mut exist_annotation_key = false;
        loop {
            let t = match self.peek_token() {
                Ok(token) => token,
                Err(_) => {
                    // EOF is fine unless a trailing `.` left a segment pending.
                    if !after_dot {
                        return Ok(());
                    }
                    return self.consume_error_token("unexpected EOF");
                }
            };

            match t {
                ANNOTATION_KEY => {
                    if after_dot || exist_annotation_key {
                        return self.consume_error_token("unexpected annotation key");
                    } else {
                        self.consume_current_token()?;
                        exist_annotation_key = true;
                        after_dot = false;
                        first = false;
                    }
                }
                PERIOD => {
                    if after_dot {
                        return self.consume_error_token(r#"unexpected ".""#);
                    } else {
                        self.consume_current_token()?;
                        after_dot = true;
                    }
                }
                FLOAT => {
                    // The lexer can glue numeric key segments (e.g. `1.2`)
                    // into one FLOAT token; split it back into IDENT/PERIOD
                    // pieces here.
                    let value = self.lexer.slice();
                    if value.starts_with(['+', '-']) {
                        return self.consume_error_token("unexpected identifier");
                    } else {
                        let mut dot = false;
                        for (i, s) in value.split('.').enumerate() {
                            if s.is_empty() {
                                // An empty split segment marks a leading or
                                // trailing dot in the FLOAT text.
                                if i == 0 && after_dot {
                                    return self.consume_error_token(r#"unexpected ".""#);
                                }
                                self.consume_token(PERIOD, ".");
                                dot = true;
                            } else {
                                // NOTE(review): two adjacent non-empty
                                // segments (`1.2` -> `1`, `2`) emit no PERIOD
                                // token between them, so the separating dot
                                // is dropped from the tree — verify intended.
                                self.consume_token(IDENT, s);
                                dot = false;
                            }
                        }
                        if dot {
                            after_dot = true;
                        }
                        // Advance past the split FLOAT to the next token.
                        self.next_token();
                    }
                }
                BRACKET_START => {
                    // Index segment: `[key]`.
                    self.consume_current_token()?;

                    self.parse_key()?;

                    let token = self.peek_token()?;

                    if !matches!(token, BRACKET_END) {
                        self.consume_error_token(r#"expected "]""#)?;
                    }
                    self.consume_current_token()?;

                    after_dot = false;
                }
                _ => {
                    // Any other token can begin a key segment, but only at
                    // the very start or directly after a dot.
                    if after_dot || first {
                        match self.parse_key() {
                            Ok(_) => {}
                            Err(_) => {
                                self.report_error("expected identifier");
                                return Err(());
                            }
                        }
                        after_dot = false;
                        first = false;
                    } else {
                        return self.consume_error_token(r#"expect ".""#);
                    }
                }
            };
        }
    }

    /// Parses a single key segment: identifiers, globs (query-key mode
    /// only), keywords, numbers and quoted strings.
    fn parse_key(&mut self) -> ParserResult<()> {
        let t = self.must_peek_token()?;

        match t {
            IDENT => self.consume_current_token(),
            IDENT_WITH_GLOB if self.parse_keys_mode == ParseKeysMode::QueryKeys => {
                if let Err(err_indices) = validates::glob(self.lexer.slice()) {
                    for e in err_indices {
                        let span = self.lexer.span();
                        // NOTE(review): this range is zero-width, unlike the
                        // one-byte-wide ranges used by validate_string —
                        // confirm the inconsistency is intentional.
                        self.add_error(&Error {
                            range: TextRange::new(
                                TextSize::from((span.start + e) as u32),
                                TextSize::from((span.start + e) as u32),
                            ),
                            message: "invalid glob".into(),
                        });
                    }
                };
                self.consume_current_token()
            }
            NULL | BOOL => self.consume_current_token(),
            INTEGER_HEX | INTEGER_BIN | INTEGER_OCT => self.consume_current_token(),
            INTEGER => {
                // `+`-signed numbers are not valid keys.
                // NOTE(review): this Err neither reports nor consumes the
                // token; in document mode parse_object would see the same
                // token again — confirm this case is unreachable there.
                if self.lexer.slice().starts_with('+') {
                    Err(())
                } else {
                    self.consume_current_token()
                }
            }
            SINGLE_QUOTE | DOUBLE_QUOTE => {
                self.validate_string();
                self.consume_current_token()
            }
            BACKTICK_QUOTE => {
                self.validate_backtick();
                self.consume_current_token()
            }
            // In key-path modes, FLOAT is split by parse_keys before it can
            // reach this function; the guard keeps the arms consistent.
            FLOAT if self.parse_keys_mode == ParseKeysMode::None => {
                if self.lexer.slice().starts_with('0') {
                    self.consume_error_token("zero-padded numbers are not allowed")
                } else if self.lexer.slice().starts_with('+') {
                    Err(())
                } else {
                    self.consume_current_token()
                }
            }
            _ => self.consume_error_token("expect identifier"),
        }
    }

    /// Like `peek_token`, but reports "unexpected EOF" on end of input.
    fn must_peek_token(&mut self) -> ParserResult<SyntaxKind> {
        match self.peek_token() {
            Ok(t) => Ok(t),
            Err(_) => {
                self.report_error("unexpected EOF");
                Err(())
            }
        }
    }

    /// Succeeds only when the input is exhausted.
    fn must_peek_eof(&mut self) -> ParserResult<()> {
        match self.peek_token() {
            Ok(_) => {
                self.report_error("expect EOF");
                Err(())
            }
            Err(_) => Ok(()),
        }
    }

    /// Consumes the next token if it has `kind`; otherwise reports
    /// `message` and returns `Err` without consuming.
    fn must_token_or(&mut self, kind: SyntaxKind, message: &str) -> ParserResult<()> {
        let t = self.must_peek_token()?;
        if kind == t {
            self.consume_current_token()
        } else {
            self.report_error(message);
            Err(())
        }
    }

    /// Appends the current token to the tree as-is; `Err` only at EOF.
    fn consume_current_token(&mut self) -> ParserResult<()> {
        match self.peek_token() {
            Err(_) => Err(()),
            Ok(token) => {
                self.consume_token(token, self.lexer.slice());
                Ok(())
            }
        }
    }

    /// Reports `message`, appends the current lexeme as an `ERROR` token,
    /// and always returns `Err`.
    fn consume_error_token(&mut self, message: &str) -> ParserResult<()> {
        self.report_error(message);

        self.consume_token(ERROR, self.lexer.slice());

        Err(())
    }

    /// Returns the next significant token, lexing forward (and attaching
    /// trivia to the tree) when the lookahead is empty; `Err` means EOF.
    fn peek_token(&mut self) -> ParserResult<SyntaxKind> {
        if self.current_token.is_none() {
            self.next_token();
        }

        self.current_token.ok_or(())
    }

    /// Advances the lexer to the next significant token, consuming and
    /// validating trivia (whitespace, newlines, comments) along the way.
    /// Leaves `current_token` as `None` at EOF.
    fn next_token(&mut self) {
        self.current_token = None;
        while let Some(token) = self.lexer.next() {
            match token {
                LINE_COMMENT | BLOCK_COMMENT => {
                    let multiline = token == BLOCK_COMMENT;
                    if let Err(err_indices) = validates::comment(self.lexer.slice(), multiline) {
                        for e in err_indices {
                            let span = self.lexer.span();
                            // NOTE(review): validates::comment yields *char*
                            // indices while span.start is a byte offset, so
                            // positions drift in non-ASCII comments — verify.
                            self.add_error(&Error {
                                range: TextRange::new(
                                    TextSize::from((span.start + e) as u32),
                                    TextSize::from((span.start + e) as u32),
                                ),
                                message: "invalid character in comment".into(),
                            });
                        }
                    };

                    self.consume_token(token, self.lexer.slice());
                }
                WHITESPACE | NEWLINE => {
                    self.consume_token(token, self.lexer.slice());
                }
                ERROR => {
                    let _ = self.consume_error_token("unexpected token");
                }
                _ => {
                    self.current_token = Some(token);
                    break;
                }
            }
        }
    }

    /// Appends a token of `kind` with `text` to the tree and clears the
    /// lookahead (the lexer itself is not advanced here).
    fn consume_token(&mut self, kind: SyntaxKind, text: &str) {
        self.builder.token(kind.into(), text);
        self.current_token = None;
    }

    /// Records an error covering the current lexeme's span.
    fn report_error(&mut self, message: &str) {
        let span = self.lexer.span();

        let err = Error {
            range: TextRange::new(
                TextSize::from(span.start as u32),
                TextSize::from(span.end as u32),
            ),
            message: message.into(),
        };
        self.add_error(&err);
    }

    /// Records a zero-width error one byte before the current lexeme
    /// (used e.g. for a missing comma between entries).
    fn point_error(&mut self, message: &str) {
        let span = self.lexer.span();
        let point = TextSize::from(span.start.saturating_sub(1) as u32);
        let err = Error {
            range: TextRange::new(point, point),
            message: message.into(),
        };
        self.add_error(&err);
    }

    /// Pushes `e`, skipping it when the most recent error already covers
    /// the exact same range (avoids duplicate reports at one location).
    fn add_error(&mut self, e: &Error) {
        if let Some(last_err) = self.errors.last_mut() {
            if last_err.range == e.range {
                return;
            }
        }
        self.errors.push(e.clone());
    }

    /// Validates the current quoted-string lexeme's characters and
    /// quoting, merging the offending byte offsets from both checks
    /// (deduplicated) into one-byte-wide error ranges.
    fn validate_string(&mut self) {
        let mut indexes: HashSet<usize> = HashSet::default();

        if let Err(err_indices) = validates::string(self.lexer.slice()) {
            indexes.extend(err_indices);
        };
        if let Err(err_indices) = validate_quote(self.lexer.slice()) {
            indexes.extend(err_indices);
        };
        let span = self.lexer.span();
        // NOTE(review): HashSet iteration order is unspecified, so the
        // order of these errors is nondeterministic across runs.
        for e in indexes {
            self.add_error(&Error {
                range: TextRange::new(
                    TextSize::from((span.start + e) as u32),
                    TextSize::from((span.start + e + 1) as u32),
                ),
                message: "invalid character in string".into(),
            });
        }
    }
    /// Validates the current backtick-string lexeme; newlines are legal
    /// here, unlike in single/double-quoted strings.
    fn validate_backtick(&mut self) {
        if let Err(err_indices) = validates::backtick_string(self.lexer.slice()) {
            for e in err_indices {
                let span = self.lexer.span();
                self.add_error(&Error {
                    range: TextRange::new(
                        TextSize::from((span.start + e) as u32),
                        TextSize::from((span.start + e + 1) as u32),
                    ),
                    message: "invalid character in string".into(),
                });
            }
        };
    }
}
642
/// Returns `true` when every `_` in `s` sits between two digits of the
/// given radix. Non-digit characters (sign, `.`, exponent marker) are
/// tolerated as long as no underscore touches them.
fn validate_underscore_integer(s: &str, radix: u32) -> bool {
    // An underscore may not begin or end the literal.
    if s.starts_with('_') || s.ends_with('_') {
        return false;
    }

    // Walk adjacent character pairs: each underscore must have a digit
    // on both sides.
    s.chars().zip(s.chars().skip(1)).all(|(prev, cur)| {
        (cur != '_' || prev.is_digit(radix)) && (prev != '_' || cur.is_digit(radix))
    })
}
662
/// The final results of a parsing.
/// It contains the green tree, and
/// the errors that occurred during parsing.
#[derive(Debug, Clone)]
pub struct Parse {
    // The lossless green tree, including trivia and error tokens.
    pub green_node: GreenNode,
    // Syntax errors collected while parsing.
    pub errors: Vec<Error>,
}
671
impl Parse {
    /// Turn the parse into a syntax node.
    pub fn into_syntax(self) -> SyntaxNode {
        SyntaxNode::new_root(self.green_node)
    }
    /// Turn the parse into a DOM tree.
    ///
    /// Any semantic errors that occur will be collected
    /// in the returned DOM node.
    ///
    /// Note: the syntax errors stored in `self.errors` are dropped here;
    /// only the tree is carried over into the DOM.
    pub fn into_dom(self) -> dom::Node {
        dom::from_syntax(self.into_syntax().into())
    }
}
685
/// Character-level validators for lexemes. Each returns `Ok(())` or the
/// positions of every offending character.
pub(crate) mod validates {
    /// Checks a comment's characters. Line comments allow `\t`; block
    /// (`multiline`) comments additionally allow `\n` and `\r`.
    ///
    /// The reported positions are *character* indices, not byte offsets.
    pub(crate) fn comment(s: &str, multiline: bool) -> Result<(), Vec<usize>> {
        let allowed = |c: char| {
            if multiline {
                c == '\t' || c == '\n' || c == '\r' || !c.is_control()
            } else {
                c == '\t' || !c.is_control()
            }
        };

        let err_indices: Vec<usize> = s
            .chars()
            .enumerate()
            .filter(|&(_, c)| !allowed(c))
            .map(|(i, _)| i)
            .collect();

        if err_indices.is_empty() {
            Ok(())
        } else {
            Err(err_indices)
        }
    }

    /// Checks a quoted string's characters: ASCII control characters
    /// other than `\t` are rejected. Positions are byte offsets.
    pub(crate) fn string(s: &str) -> Result<(), Vec<usize>> {
        let err_indices: Vec<usize> = s
            .char_indices()
            .filter(|&(_, c)| c != '\t' && c.is_ascii_control())
            .map(|(i, _)| i)
            .collect();

        if err_indices.is_empty() {
            Ok(())
        } else {
            Err(err_indices)
        }
    }

    /// Like [`string`], but for backtick strings, where `\n` and `\r`
    /// are also legal. Positions are byte offsets.
    pub(crate) fn backtick_string(s: &str) -> Result<(), Vec<usize>> {
        let err_indices: Vec<usize> = s
            .char_indices()
            .filter(|&(_, c)| !matches!(c, '\t' | '\n' | '\r') && c.is_ascii_control())
            .map(|(i, _)| i)
            .collect();

        if err_indices.is_empty() {
            Ok(())
        } else {
            Err(err_indices)
        }
    }

    /// Checks a glob key segment: a segment that is exactly `*` or `**`
    /// is fine; otherwise an embedded `**` is an error at its byte offset.
    pub(crate) fn glob(s: &str) -> Result<(), Vec<usize>> {
        if matches!(s, "*" | "**") {
            return Ok(());
        }
        match s.find("**") {
            Some(i) => Err(vec![i]),
            None => Ok(()),
        }
    }
}