jsonc_parser/
parse_to_ast.rs

1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::rc::Rc;
4
5use super::ast::*;
6use super::common::Range;
7use super::errors::*;
8use super::scanner::Scanner;
9use super::scanner::ScannerOptions;
10use super::tokens::Token;
11use super::tokens::TokenAndRange;
12
/// Map of source positions to the comments found next to them.
///
/// Each run of comments is stored under two keys: the end position of the
/// previous token (or `0` when no token has been scanned yet) and the start
/// position the scanner reports for the following token. Both entries share
/// the same reference-counted vector.
pub type CommentMap<'a> = HashMap<usize, Rc<Vec<Comment<'a>>>>;
17
/// Strategy for handling comments during parsing.
///
/// This enum determines how comments in the JSON/JSONC input are collected
/// and represented in the parse result.
///
/// The enum is field-less, so it is `Copy` and `Eq` in addition to `Clone`
/// and `PartialEq`.
#[derive(Default, Debug, PartialEq, Eq, Clone, Copy)]
pub enum CommentCollectionStrategy {
  /// Comments are not collected and are effectively ignored during parsing.
  #[default]
  Off,
  /// Comments are collected and stored separately from the main AST structure.
  ///
  /// When this strategy is used, comments are placed in a [`CommentMap`] where
  /// the key is the previous token end or start of file, or the next token start
  /// or end of file.
  Separate,
  /// Comments are collected and treated as tokens.
  ///
  /// When this strategy is used, comments appear alongside other tokens in the
  /// token stream when `tokens: true` is set in [`CollectOptions`].
  AsTokens,
}
39
40/// Options for collecting comments and tokens.
41#[derive(Default, Clone)]
42pub struct CollectOptions {
43  /// Include comments in the result.
44  pub comments: CommentCollectionStrategy,
45  /// Include tokens in the result.
46  pub tokens: bool,
47}
48
/// Options for parsing.
///
/// Every flag defaults to `true` (see the `Default` implementation); set a
/// flag to `false` to reject the corresponding syntax.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseOptions {
  /// Allow comments (defaults to `true`).
  pub allow_comments: bool,
  /// Allow words and numbers as object property names (defaults to `true`).
  pub allow_loose_object_property_names: bool,
  /// Allow trailing commas on object literal and array literal values (defaults to `true`).
  pub allow_trailing_commas: bool,
  /// Allow single-quoted strings (defaults to `true`).
  pub allow_single_quoted_strings: bool,
  /// Allow hexadecimal numbers like 0xFF (defaults to `true`).
  pub allow_hexadecimal_numbers: bool,
  /// Allow unary plus sign on numbers like +42 (defaults to `true`).
  pub allow_unary_plus_numbers: bool,
}
65
66impl Default for ParseOptions {
67  fn default() -> Self {
68    Self {
69      allow_comments: true,
70      allow_loose_object_property_names: true,
71      allow_trailing_commas: true,
72      allow_single_quoted_strings: true,
73      allow_hexadecimal_numbers: true,
74      allow_unary_plus_numbers: true,
75    }
76  }
77}
78
/// Result of parsing the text.
pub struct ParseResult<'a> {
  /// Collection of comments in the text.
  ///
  /// Provide `comments: CommentCollectionStrategy::Separate` to the
  /// `CollectOptions` for this to have a value.
  ///
  /// Remarks: The key is the start and end position of the tokens.
  pub comments: Option<CommentMap<'a>>,
  /// The JSON value the text contained, or `None` when the text held no value.
  pub value: Option<Value<'a>>,
  /// Collection of tokens.
  ///
  /// Provide `tokens: true` to the `CollectOptions` for this to have a value.
  /// Comments only appear in this collection when the comment strategy is
  /// `CommentCollectionStrategy::AsTokens`.
  pub tokens: Option<Vec<TokenAndRange<'a>>>,
}
94
/// Mutable state shared by the parsing functions in this file.
struct Context<'a> {
  /// Source of tokens.
  scanner: Scanner<'a>,
  /// Collected comments keyed by position; `Some` only when comments are
  /// collected separately.
  comments: Option<CommentMap<'a>>,
  /// Comments seen since the last non-comment token; drained by `scan`.
  current_comments: Option<Vec<Comment<'a>>>,
  /// End position the scanner reported after the most recent `scan` call.
  last_token_end: usize,
  /// Stack of in-progress ranges, pushed by `start_range` and popped by `end_range`.
  range_stack: Vec<Range>,
  /// Collected tokens; `Some` only when token collection was requested.
  tokens: Option<Vec<TokenAndRange<'a>>>,
  /// When `true`, comments are pushed into `tokens` instead of `comments`.
  collect_comments_as_tokens: bool,
  /// When `false`, any comment is reported as an error.
  allow_comments: bool,
  /// When `false`, a comma directly before a closing brace/bracket is an error.
  allow_trailing_commas: bool,
  /// When `false`, word and number property names are an error.
  allow_loose_object_property_names: bool,
}
107
108impl<'a> Context<'a> {
109  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
110    let previous_end = self.last_token_end;
111    let token = self.scan_handling_comments()?;
112    self.last_token_end = self.scanner.token_end();
113
114    // store the comment for the previous token end, and current token start
115    if let Some(comments) = self.comments.as_mut()
116      && let Some(current_comments) = self.current_comments.take()
117    {
118      let current_comments = Rc::new(current_comments);
119      comments.insert(previous_end, current_comments.clone());
120      comments.insert(self.scanner.token_start(), current_comments);
121    }
122
123    if let Some(token) = &token
124      && self.tokens.is_some()
125    {
126      self.capture_token(token.clone());
127    }
128
129    Ok(token)
130  }
131
132  pub fn token(&self) -> Option<Token<'a>> {
133    self.scanner.token()
134  }
135
136  pub fn start_range(&mut self) {
137    self.range_stack.push(Range {
138      start: self.scanner.token_start(),
139      end: 0,
140    });
141  }
142
143  pub fn end_range(&mut self) -> Range {
144    let mut range = self
145      .range_stack
146      .pop()
147      .expect("Range was popped from the stack, but the stack was empty.");
148    range.end = self.scanner.token_end();
149    range
150  }
151
152  pub fn create_range_from_last_token(&self) -> Range {
153    Range {
154      start: self.scanner.token_start(),
155      end: self.scanner.token_end(),
156    }
157  }
158
159  pub fn create_error(&self, kind: ParseErrorKind) -> ParseError {
160    self.scanner.create_error_for_current_token(kind)
161  }
162
163  pub fn create_error_for_current_range(&mut self, kind: ParseErrorKind) -> ParseError {
164    let range = self.end_range();
165    self.create_error_for_range(range, kind)
166  }
167
168  pub fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
169    self.scanner.create_error_for_range(range, kind)
170  }
171
172  fn scan_handling_comments(&mut self) -> Result<Option<Token<'a>>, ParseError> {
173    loop {
174      let token = self.scanner.scan()?;
175      match token {
176        Some(token @ Token::CommentLine(_) | token @ Token::CommentBlock(_)) if self.collect_comments_as_tokens => {
177          self.capture_token(token);
178        }
179        Some(Token::CommentLine(text)) => {
180          self.handle_comment(Comment::Line(CommentLine {
181            range: self.create_range_from_last_token(),
182            text,
183          }))?;
184        }
185        Some(Token::CommentBlock(text)) => {
186          self.handle_comment(Comment::Block(CommentBlock {
187            range: self.create_range_from_last_token(),
188            text,
189          }))?;
190        }
191        _ => return Ok(token),
192      }
193    }
194  }
195
196  fn capture_token(&mut self, token: Token<'a>) {
197    let range = self.create_range_from_last_token();
198    if let Some(tokens) = self.tokens.as_mut() {
199      tokens.push(TokenAndRange {
200        token: token.clone(),
201        range,
202      });
203    }
204  }
205
206  fn handle_comment(&mut self, comment: Comment<'a>) -> Result<(), ParseError> {
207    if !self.allow_comments {
208      return Err(self.create_error(ParseErrorKind::CommentsNotAllowed));
209    }
210
211    if self.comments.is_some() {
212      if let Some(comments) = self.current_comments.as_mut() {
213        comments.push(comment);
214      } else {
215        self.current_comments = Some(vec![comment]);
216      }
217    }
218
219    Ok(())
220  }
221}
222
223/// Parses a string containing JSONC to an AST with comments and tokens.
224///
225/// # Example
226///
227/// ```
228/// use jsonc_parser::CollectOptions;
229/// use jsonc_parser::CommentCollectionStrategy;
230/// use jsonc_parser::parse_to_ast;
231/// use jsonc_parser::ParseOptions;
232///
233/// let parse_result = parse_to_ast(r#"{ "test": 5 } // test"#, &CollectOptions {
234///     comments: CommentCollectionStrategy::Separate, // include comments in result
235///     tokens: true, // include tokens in result
236/// }, &Default::default()).expect("Should parse.");
237/// // ...inspect parse_result for value, tokens, and comments here...
238/// ```
239pub fn parse_to_ast<'a>(
240  text: &'a str,
241  collect_options: &CollectOptions,
242  parse_options: &ParseOptions,
243) -> Result<ParseResult<'a>, ParseError> {
244  let mut context = Context {
245    scanner: Scanner::new(
246      text,
247      &ScannerOptions {
248        allow_single_quoted_strings: parse_options.allow_single_quoted_strings,
249        allow_hexadecimal_numbers: parse_options.allow_hexadecimal_numbers,
250        allow_unary_plus_numbers: parse_options.allow_unary_plus_numbers,
251      },
252    ),
253    comments: match collect_options.comments {
254      CommentCollectionStrategy::Separate => Some(Default::default()),
255      CommentCollectionStrategy::Off | CommentCollectionStrategy::AsTokens => None,
256    },
257    current_comments: None,
258    last_token_end: 0,
259    range_stack: Vec::new(),
260    tokens: if collect_options.tokens { Some(Vec::new()) } else { None },
261    collect_comments_as_tokens: collect_options.comments == CommentCollectionStrategy::AsTokens,
262    allow_comments: parse_options.allow_comments,
263    allow_trailing_commas: parse_options.allow_trailing_commas,
264    allow_loose_object_property_names: parse_options.allow_loose_object_property_names,
265  };
266  context.scan()?;
267  let value = parse_value(&mut context)?;
268
269  if context.scan()?.is_some() {
270    return Err(context.create_error(ParseErrorKind::MultipleRootJsonValues));
271  }
272
273  debug_assert!(context.range_stack.is_empty());
274
275  Ok(ParseResult {
276    comments: context.comments,
277    tokens: context.tokens,
278    value,
279  })
280}
281
282fn parse_value<'a>(context: &mut Context<'a>) -> Result<Option<Value<'a>>, ParseError> {
283  match context.token() {
284    None => Ok(None),
285    Some(token) => match token {
286      Token::OpenBrace => Ok(Some(Value::Object(parse_object(context)?))),
287      Token::OpenBracket => Ok(Some(Value::Array(parse_array(context)?))),
288      Token::String(value) => Ok(Some(Value::StringLit(create_string_lit(context, value)))),
289      Token::Boolean(value) => Ok(Some(Value::BooleanLit(create_boolean_lit(context, value)))),
290      Token::Number(value) => Ok(Some(Value::NumberLit(create_number_lit(context, value)))),
291      Token::Null => Ok(Some(Value::NullKeyword(create_null_keyword(context)))),
292      Token::CloseBracket => Err(context.create_error(ParseErrorKind::UnexpectedCloseBracket)),
293      Token::CloseBrace => Err(context.create_error(ParseErrorKind::UnexpectedCloseBrace)),
294      Token::Comma => Err(context.create_error(ParseErrorKind::UnexpectedComma)),
295      Token::Colon => Err(context.create_error(ParseErrorKind::UnexpectedColon)),
296      Token::Word(_) => Err(context.create_error(ParseErrorKind::UnexpectedWord)),
297      Token::CommentLine(_) => unreachable!(),
298      Token::CommentBlock(_) => unreachable!(),
299    },
300  }
301}
302
303fn parse_object<'a>(context: &mut Context<'a>) -> Result<Object<'a>, ParseError> {
304  debug_assert!(context.token() == Some(Token::OpenBrace));
305  let mut properties = Vec::new();
306
307  context.start_range();
308  context.scan()?;
309
310  loop {
311    match context.token() {
312      Some(Token::CloseBrace) => break,
313      Some(Token::String(prop_name)) => {
314        properties.push(parse_object_property(context, PropName::String(prop_name))?);
315      }
316      Some(Token::Word(prop_name)) | Some(Token::Number(prop_name)) => {
317        properties.push(parse_object_property(context, PropName::Word(prop_name))?);
318      }
319      None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedObject)),
320      _ => return Err(context.create_error(ParseErrorKind::UnexpectedTokenInObject)),
321    }
322
323    // skip the comma
324    if let Some(Token::Comma) = context.scan()? {
325      let comma_range = context.create_range_from_last_token();
326      if let Some(Token::CloseBrace) = context.scan()?
327        && !context.allow_trailing_commas
328      {
329        return Err(context.create_error_for_range(comma_range, ParseErrorKind::TrailingCommasNotAllowed));
330      }
331    }
332  }
333
334  Ok(Object {
335    range: context.end_range(),
336    properties,
337  })
338}
339
/// Property name as handed from `parse_object` to `parse_object_property`.
enum PropName<'a> {
  /// Name taken from a string token.
  String(Cow<'a, str>),
  /// Name taken from a word or number token.
  Word(&'a str),
}
344
345fn parse_object_property<'a>(context: &mut Context<'a>, prop_name: PropName<'a>) -> Result<ObjectProp<'a>, ParseError> {
346  context.start_range();
347
348  let name = match prop_name {
349    PropName::String(prop_name) => ObjectPropName::String(create_string_lit(context, prop_name)),
350    PropName::Word(prop_name) => {
351      if context.allow_loose_object_property_names {
352        ObjectPropName::Word(create_word(context, prop_name))
353      } else {
354        return Err(context.create_error(ParseErrorKind::ExpectedStringObjectProperty));
355      }
356    }
357  };
358
359  match context.scan()? {
360    Some(Token::Colon) => {}
361    _ => return Err(context.create_error(ParseErrorKind::ExpectedColonAfterObjectKey)),
362  }
363
364  context.scan()?;
365  let value = parse_value(context)?;
366
367  match value {
368    Some(value) => Ok(ObjectProp {
369      range: context.end_range(),
370      name,
371      value,
372    }),
373    None => Err(context.create_error(ParseErrorKind::ExpectedObjectValue)),
374  }
375}
376
377fn parse_array<'a>(context: &mut Context<'a>) -> Result<Array<'a>, ParseError> {
378  debug_assert!(context.token() == Some(Token::OpenBracket));
379  let mut elements = Vec::new();
380
381  context.start_range();
382  context.scan()?;
383
384  loop {
385    match context.token() {
386      Some(Token::CloseBracket) => break,
387      None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedArray)),
388      _ => match parse_value(context)? {
389        Some(value) => elements.push(value),
390        None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedArray)),
391      },
392    }
393
394    // skip the comma
395    if let Some(Token::Comma) = context.scan()? {
396      let comma_range = context.create_range_from_last_token();
397      if let Some(Token::CloseBracket) = context.scan()?
398        && !context.allow_trailing_commas
399      {
400        return Err(context.create_error_for_range(comma_range, ParseErrorKind::TrailingCommasNotAllowed));
401      }
402    }
403  }
404
405  Ok(Array {
406    range: context.end_range(),
407    elements,
408  })
409}
410
411// factory functions
412
413fn create_string_lit<'a>(context: &Context<'a>, value: Cow<'a, str>) -> StringLit<'a> {
414  StringLit {
415    range: context.create_range_from_last_token(),
416    value,
417  }
418}
419
420fn create_word<'a>(context: &Context<'a>, value: &'a str) -> WordLit<'a> {
421  WordLit {
422    range: context.create_range_from_last_token(),
423    value,
424  }
425}
426
427fn create_boolean_lit(context: &Context, value: bool) -> BooleanLit {
428  BooleanLit {
429    range: context.create_range_from_last_token(),
430    value,
431  }
432}
433
434fn create_number_lit<'a>(context: &Context<'a>, value: &'a str) -> NumberLit<'a> {
435  NumberLit {
436    range: context.create_range_from_last_token(),
437    value,
438  }
439}
440
441fn create_null_keyword(context: &Context) -> NullKeyword {
442  NullKeyword {
443    range: context.create_range_from_last_token(),
444  }
445}
446
#[cfg(test)]
mod tests {
  use super::*;
  use pretty_assertions::assert_eq;

  #[test]
  fn it_should_error_when_has_multiple_values() {
    assert_has_error(
      "[][]",
      "Text cannot contain more than one JSON value on line 1 column 3",
    );
  }

  #[test]
  fn it_should_error_when_object_is_not_terminated() {
    assert_has_error("{", "Unterminated object on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_object_has_unexpected_token() {
    assert_has_error("{ [] }", "Unexpected token in object on line 1 column 3");
  }

  #[test]
  fn it_should_error_when_object_has_two_non_string_tokens() {
    assert_has_error(
      "{ asdf asdf: 5 }",
      "Expected colon after the string or word in object property on line 1 column 8",
    );
  }

  #[test]
  fn it_should_error_when_array_is_not_terminated() {
    assert_has_error("[", "Unterminated array on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_array_has_unexpected_token() {
    assert_has_error("[:]", "Unexpected colon on line 1 column 2");
  }

  #[test]
  fn it_should_error_when_comment_block_not_closed() {
    assert_has_error("/* test", "Unterminated comment block on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_string_lit_not_closed() {
    assert_has_error("\" test", "Unterminated string literal on line 1 column 1");
  }

  /// Asserts that parsing `text` with the default (lenient) options fails
  /// with exactly `message`.
  // `track_caller` makes a failure point at the calling test, matching
  // `assert_has_strict_error`.
  #[track_caller]
  fn assert_has_error(text: &str, message: &str) {
    let result = parse_to_ast(text, &Default::default(), &Default::default());
    match result {
      Ok(_) => panic!("Expected error, but did not find one."),
      Err(err) => assert_eq!(err.to_string(), message),
    }
  }

  #[test]
  fn strict_should_error_object_trailing_comma() {
    assert_has_strict_error(
      r#"{ "test": 5, }"#,
      "Trailing commas are not allowed on line 1 column 12",
    );
  }

  #[test]
  fn strict_should_error_array_trailing_comma() {
    assert_has_strict_error(r#"[ "test", ]"#, "Trailing commas are not allowed on line 1 column 9");
  }

  #[test]
  fn strict_should_error_comment_line() {
    assert_has_strict_error(r#"[ "test" ] // 1"#, "Comments are not allowed on line 1 column 12");
  }

  #[test]
  fn strict_should_error_comment_block() {
    assert_has_strict_error(r#"[ "test" /* 1 */]"#, "Comments are not allowed on line 1 column 10");
  }

  #[test]
  fn strict_should_error_word_property() {
    assert_has_strict_error(
      r#"{ word: 5 }"#,
      "Expected string for object property on line 1 column 3",
    );
  }

  #[test]
  fn strict_should_error_single_quoted_string() {
    assert_has_strict_error(
      r#"{ "key": 'value' }"#,
      "Single-quoted strings are not allowed on line 1 column 10",
    );
  }

  #[test]
  fn strict_should_error_hexadecimal_number() {
    assert_has_strict_error(
      r#"{ "key": 0xFF }"#,
      "Hexadecimal numbers are not allowed on line 1 column 10",
    );
  }

  #[test]
  fn strict_should_error_unary_plus_number() {
    assert_has_strict_error(
      r#"{ "key": +42 }"#,
      "Unary plus on numbers is not allowed on line 1 column 10",
    );
  }

  /// Asserts that parsing `text` with every relaxation disabled fails with
  /// exactly `message`.
  #[track_caller]
  fn assert_has_strict_error(text: &str, message: &str) {
    let result = parse_to_ast(
      text,
      &Default::default(),
      &ParseOptions {
        allow_comments: false,
        allow_loose_object_property_names: false,
        allow_trailing_commas: false,
        allow_single_quoted_strings: false,
        allow_hexadecimal_numbers: false,
        allow_unary_plus_numbers: false,
      },
    );
    match result {
      Ok(_) => panic!("Expected error, but did not find one."),
      Err(err) => assert_eq!(err.to_string(), message),
    }
  }

  #[test]
  fn it_should_not_include_tokens_by_default() {
    let result = parse_to_ast("{}", &Default::default(), &Default::default()).unwrap();
    assert!(result.tokens.is_none());
  }

  #[test]
  fn it_should_include_tokens_when_specified() {
    let result = parse_to_ast(
      "{}",
      &CollectOptions {
        tokens: true,
        ..Default::default()
      },
      &Default::default(),
    )
    .unwrap();
    let tokens = result.tokens.unwrap();
    assert_eq!(tokens.len(), 2);
  }

  #[test]
  fn it_should_not_include_comments_by_default() {
    let result = parse_to_ast("{}", &Default::default(), &Default::default()).unwrap();
    assert!(result.comments.is_none());
  }

  #[test]
  fn it_should_include_comments_when_specified() {
    let result = parse_to_ast(
      "{} // 2",
      &CollectOptions {
        comments: CommentCollectionStrategy::Separate,
        ..Default::default()
      },
      &Default::default(),
    )
    .unwrap();
    let comments = result.comments.unwrap();
    assert_eq!(comments.len(), 2); // for both positions, but it's the same comment
  }

  #[cfg(not(feature = "error_unicode_width"))]
  #[test]
  fn error_correct_line_column_unicode_width() {
    assert_has_strict_error(r#"["🧑‍🦰", ["#, "Unterminated array on line 1 column 9");
  }

  #[cfg(feature = "error_unicode_width")]
  #[test]
  fn error_correct_line_column_unicode_width() {
    assert_has_strict_error(r#"["🧑‍🦰", ["#, "Unterminated array on line 1 column 10");
  }

  #[test]
  fn it_should_parse_unquoted_keys_with_hex_and_trailing_comma() {
    let text = r#"{
      CP_CanFuncReqId: 0x7DF,  // 2015
  }"#;
    {
      let parse_result = parse_to_ast(text, &Default::default(), &Default::default()).unwrap();

      let value = parse_result.value.unwrap();
      let obj = value.as_object().unwrap();
      assert_eq!(obj.properties.len(), 1);
      assert_eq!(obj.properties[0].name.as_str(), "CP_CanFuncReqId");

      let number_value = obj.properties[0].value.as_number_lit().unwrap();
      assert_eq!(number_value.value, "0x7DF");
    }
    #[cfg(feature = "serde")]
    {
      let value = crate::parse_to_serde_value(text, &Default::default()).unwrap().unwrap();
      // hexadecimal numbers are converted to decimal in serde output
      assert_eq!(
        value,
        serde_json::json!({
          "CP_CanFuncReqId": 2015
        })
      );
    }
  }

  #[test]
  fn it_should_parse_unary_plus_numbers() {
    let result = parse_to_ast(r#"{ "test": +42 }"#, &Default::default(), &Default::default()).unwrap();

    let value = result.value.unwrap();
    let obj = value.as_object().unwrap();
    assert_eq!(obj.properties.len(), 1);
    assert_eq!(obj.properties[0].name.as_str(), "test");

    let number_value = obj.properties[0].value.as_number_lit().unwrap();
    assert_eq!(number_value.value, "+42");
  }
}