rill_json/
parser.rs

1//! Contains the `StreamingParser` and its state machine.
2//!
3//! This module defines the core logic of the parser, which is implemented
4//! as a state machine that consumes `Token`s from the `Tokenizer` and
5//! emits `ParserEvent`s.
6
7use crate::error::ParseError;
8use crate::token::{Token, TokenType};
9use crate::tokenizer::Tokenizer;
10use crate::value::JsonNumber;
11use std::borrow::Cow;
12use std::iter::Peekable;
13
14/// A single event emitted by the `StreamingParser`.
15///
16/// The parser is an `Iterator` that yields these events, allowing you
17/// to react to JSON data as it's being parsed without loading the
18/// entire structure into memory.
19#[derive(Debug, PartialEq, Clone)]
20pub enum ParserEvent<'a> {
21    /// The start of a JSON object (`{`).
22    StartObject,
23    /// The end of a JSON object (`}`).
24    EndObject,
25    /// The start of a JSON array (`[`).
26    StartArray,
27    /// The end of a JSON array (`]`).
28    EndArray,
29    /// A JSON object key (e.g., `"name": ...`).
30    Key(Cow<'a, str>),
31    /// A JSON string value (e.g., `... : "value"`).
32    String(Cow<'a, str>),
33    /// A JSON number value (e.g., `123`, `-0.5`, `1e10`).
34    Number(JsonNumber),
35    /// A JSON boolean value (`true` or `false`).
36    Boolean(bool),
37    /// A JSON `null` value.
38    Null,
39}
40
/// What the parser is prepared to accept next.
///
/// Each entry on the parser's state stack is one of these. Because the state
/// records the *expected* continuation, the parser can reject input that
/// breaks the JSON grammar (for example, a value in an array must be followed
/// by a comma or a closing bracket).
// Every variant deliberately starts with `Expect`; that shared prefix is what
// makes the state names read naturally, so the clippy lint is silenced.
#[allow(clippy::enum_variant_names)]
#[derive(Clone, Debug, PartialEq)]
enum ParserState {
    /// A single value is expected (this is the parser's initial state).
    ExpectValue,
    /// Just after `[`: a value, or `]` for an empty array.
    ExpectArrayFirstValueOrEnd,
    /// Just after `,` inside an array: a value (`]` is not allowed here).
    ExpectArrayValue,
    /// Just after a value inside an array: `,` or `]`.
    ExpectArrayCommaOrEnd,
    /// Just after `{`: a key, or `}` for an empty object.
    ExpectObjectFirstKeyOrEnd,
    /// Just after `,` inside an object: a key (`}` is not allowed here).
    ExpectObjectKey,
    /// Just after a key: `:`.
    ExpectObjectColon,
    /// Just after `:`: a value.
    ExpectObjectValue,
    /// Just after a value inside an object: `,` or `}`.
    ExpectObjectCommaOrEnd,
}
59
60/// The main streaming JSON parser.
61///
62/// This struct is an `Iterator` that yields `Result<ParserEvent, ParseError>`.
63/// It is created by the `parse_streaming` function.
64pub struct StreamingParser<'a> {
65    /// The internal tokenizer (lexer) that breaks the input string into `Token`s.
66    tokenizer: Peekable<Tokenizer<'a>>,
67    /// A stack of states, used for tracking nested objects and arrays.
68    state_stack: Vec<ParserState>,
69    /// The maximum allowed nesting depth to prevent DoS attacks.
70    max_depth: usize,
71    /// The *current* nesting depth of the parser.
72    depth: usize,
73}
74
75impl<'a> StreamingParser<'a> {
76    /// Creates a new `StreamingParser` for a given input string.
77    ///
78    /// This is called by the `parse_streaming` function in `lib.rs`.
79    pub fn new(input: &'a str, max_depth: usize) -> Self {
80        StreamingParser {
81            tokenizer: Tokenizer::new(input).peekable(),
82            state_stack: vec![ParserState::ExpectValue],
83            max_depth,
84            depth: 0,
85        }
86    }
87
88    /// A helper function to create a `ParseError` from a token's location.
89    fn error_from_token(&self, message: String, token: &Token<'a>) -> ParseError {
90        ParseError {
91            message,
92            line: token.line,
93            column: token.column,
94        }
95    }
96}
97
98/// The main implementation of the parser's `Iterator` trait.
99/// This is where the state machine logic lives.
100impl<'a> Iterator for StreamingParser<'a> {
101    type Item = Result<ParserEvent<'a>, ParseError>;
102
103    /// Consumes the next token and advances the parser's state.
104    fn next(&mut self) -> Option<Self::Item> {
105        // Get the next token from the tokenizer.
106        let token_result = self.tokenizer.next();
107
108        let mut current_token = match token_result {
109            Some(Ok(token)) => Some(token),
110            Some(Err(e)) => return Some(Err(e)), // Tokenizer error
111            None => None,                        // End of input
112        };
113
114        // Loop handles "non-event" tokens (like `,` or `:`)
115        // that advance the state but don't emit a `ParserEvent`.
116        loop {
117            let state_tuple = (current_token.as_ref(), self.state_stack.last());
118
119            // Determine the current token and state.
120            let (token, state) = match state_tuple {
121                (Some(token), Some(state)) => (token, state.clone()),
122                // End of input, but we're still in a nested state.
123                (None, Some(state)) => {
124                    if *state == ParserState::ExpectValue && self.state_stack.len() == 1 {
125                        // We expected a value, but got clean EOF. Valid for empty input.
126                        // The tokenizer.next() would have returned None, so we'd be in (None, None).
127                        // This path is for cases like `[1,` and then EOF.
128                        // A clean EOF on ExpectValue (e.g. empty string) is handled by (None, None)
129                    }
130                    // Handle clean EOF on empty input
131                    if *state == ParserState::ExpectValue && self.state_stack.len() == 1 {
132                        if let Some(_tok) = &current_token {
133                            // This case should not be hit if current_token is None
134                        } else {
135                            // This means tokenizer.next() returned None *and* state is ExpectValue
136                            return None;
137                        }
138                    }
139
140                    // Return a more specific error message based on the parser's state
141                    let msg = match state {
142                        ParserState::ExpectObjectCommaOrEnd
143                        | ParserState::ExpectObjectFirstKeyOrEnd
144                        | ParserState::ExpectObjectKey
145                        | ParserState::ExpectObjectColon
146                        | ParserState::ExpectObjectValue => "Unclosed object",
147                        ParserState::ExpectArrayCommaOrEnd
148                        | ParserState::ExpectArrayFirstValueOrEnd
149                        | ParserState::ExpectArrayValue => "Unclosed array",
150                        _ => "Unexpected end of input, unclosed structure",
151                    };
152
153                    return Some(Err(ParseError {
154                        message: msg.to_string(),
155                        line: 0, // We don't have a token for location info
156                        column: 0,
157                    }));
158                }
159                (None, None) => return None, // Clean end
160                // We have a token, but the state stack is empty (parser finished).
161                (Some(token), None) => {
162                    return Some(Err(
163                        self.error_from_token("Unexpected trailing token".to_string(), token)
164                    ));
165                }
166            };
167
168            // This is the main state machine logic.
169            let result = match (state, &token.kind) {
170                // --- Root level or nested value expected ---
171                (ParserState::ExpectValue, TokenType::LeftBracket) => {
172                    if self.depth >= self.max_depth {
173                        return Some(Err(self.error_from_token(
174                            "Maximum nesting depth exceeded".to_string(),
175                            token,
176                        )));
177                    }
178                    self.depth += 1;
179                    self.state_stack.pop();
180                    self.state_stack
181                        .push(ParserState::ExpectArrayFirstValueOrEnd);
182                    Ok(Some(ParserEvent::StartArray))
183                }
184                (ParserState::ExpectValue, TokenType::LeftBrace) => {
185                    if self.depth >= self.max_depth {
186                        return Some(Err(self.error_from_token(
187                            "Maximum nesting depth exceeded".to_string(),
188                            token,
189                        )));
190                    }
191                    self.depth += 1;
192                    self.state_stack.pop();
193                    self.state_stack
194                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
195                    Ok(Some(ParserEvent::StartObject))
196                }
197                (ParserState::ExpectValue, TokenType::String(s)) => {
198                    self.state_stack.pop();
199                    Ok(Some(ParserEvent::String(s.clone())))
200                }
201                (ParserState::ExpectValue, TokenType::Number(n)) => {
202                    self.state_stack.pop();
203                    Ok(Some(ParserEvent::Number(*n)))
204                }
205                (ParserState::ExpectValue, TokenType::Boolean(b)) => {
206                    self.state_stack.pop();
207                    Ok(Some(ParserEvent::Boolean(*b)))
208                }
209                (ParserState::ExpectValue, TokenType::Null) => {
210                    self.state_stack.pop();
211                    Ok(Some(ParserEvent::Null))
212                }
213                (ParserState::ExpectValue, _) => {
214                    Err(self.error_from_token("Expected a value".to_string(), token))
215                }
216
217                // --- Inside Array: expecting first value or ']' (empty array) ---
218                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::RightBracket) => {
219                    self.depth -= 1;
220                    self.state_stack.pop();
221                    Ok(Some(ParserEvent::EndArray))
222                }
223                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::LeftBracket) => {
224                    if self.depth >= self.max_depth {
225                        return Some(Err(self.error_from_token(
226                            "Maximum nesting depth exceeded".to_string(),
227                            token,
228                        )));
229                    }
230                    self.depth += 1;
231                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
232                    self.state_stack
233                        .push(ParserState::ExpectArrayFirstValueOrEnd);
234                    Ok(Some(ParserEvent::StartArray))
235                }
236                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::LeftBrace) => {
237                    if self.depth >= self.max_depth {
238                        return Some(Err(self.error_from_token(
239                            "Maximum nesting depth exceeded".to_string(),
240                            token,
241                        )));
242                    }
243                    self.depth += 1;
244                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
245                    self.state_stack
246                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
247                    Ok(Some(ParserEvent::StartObject))
248                }
249                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::String(s)) => {
250                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
251                    Ok(Some(ParserEvent::String(s.clone())))
252                }
253                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Number(n)) => {
254                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
255                    Ok(Some(ParserEvent::Number(*n)))
256                }
257                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Boolean(b)) => {
258                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
259                    Ok(Some(ParserEvent::Boolean(*b)))
260                }
261                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Null) => {
262                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
263                    Ok(Some(ParserEvent::Null))
264                }
265                (ParserState::ExpectArrayFirstValueOrEnd, _) => {
266                    Err(self.error_from_token("Expected value or ']'".to_string(), token))
267                }
268
269                // --- Inside Array: after comma, expecting value (no ']' allowed) ---
270                (ParserState::ExpectArrayValue, TokenType::LeftBracket) => {
271                    if self.depth >= self.max_depth {
272                        return Some(Err(self.error_from_token(
273                            "Maximum nesting depth exceeded".to_string(),
274                            token,
275                        )));
276                    }
277                    self.depth += 1;
278                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
279                    self.state_stack
280                        .push(ParserState::ExpectArrayFirstValueOrEnd);
281                    Ok(Some(ParserEvent::StartArray))
282                }
283                (ParserState::ExpectArrayValue, TokenType::LeftBrace) => {
284                    if self.depth >= self.max_depth {
285                        return Some(Err(self.error_from_token(
286                            "Maximum nesting depth exceeded".to_string(),
287                            token,
288                        )));
289                    }
290                    self.depth += 1;
291                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
292                    self.state_stack
293                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
294                    Ok(Some(ParserEvent::StartObject))
295                }
296                (ParserState::ExpectArrayValue, TokenType::String(s)) => {
297                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
298                    Ok(Some(ParserEvent::String(s.clone())))
299                }
300                (ParserState::ExpectArrayValue, TokenType::Number(n)) => {
301                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
302                    Ok(Some(ParserEvent::Number(*n)))
303                }
304                (ParserState::ExpectArrayValue, TokenType::Boolean(b)) => {
305                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
306                    Ok(Some(ParserEvent::Boolean(*b)))
307                }
308                (ParserState::ExpectArrayValue, TokenType::Null) => {
309                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
310                    Ok(Some(ParserEvent::Null))
311                }
312                // Check for invalid trailing comma `[1,,2]`
313                (ParserState::ExpectArrayValue, TokenType::RightBracket) => {
314                    Err(self
315                        .error_from_token("Unexpected ']', expected a value".to_string(), token))
316                }
317                (ParserState::ExpectArrayValue, _) => {
318                    Err(self.error_from_token("Expected a value".to_string(), token))
319                }
320
321                // --- Inside Array: after a value, expecting ',' or ']' ---
322                (ParserState::ExpectArrayCommaOrEnd, TokenType::Comma) => {
323                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayValue;
324                    Ok(None) // Comma is consumed, state changes, but no event emitted.
325                }
326                (ParserState::ExpectArrayCommaOrEnd, TokenType::RightBracket) => {
327                    self.depth -= 1;
328                    self.state_stack.pop();
329                    Ok(Some(ParserEvent::EndArray))
330                }
331                (ParserState::ExpectArrayCommaOrEnd, _) => {
332                    Err(self.error_from_token("Expected ',' or ']'".to_string(), token))
333                }
334
335                // --- Inside Object: expecting first key or '}' (empty object) ---
336                (ParserState::ExpectObjectFirstKeyOrEnd, TokenType::String(s)) => {
337                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectColon;
338                    Ok(Some(ParserEvent::Key(s.clone())))
339                }
340                (ParserState::ExpectObjectFirstKeyOrEnd, TokenType::RightBrace) => {
341                    self.depth -= 1;
342                    self.state_stack.pop();
343                    Ok(Some(ParserEvent::EndObject))
344                }
345                (ParserState::ExpectObjectFirstKeyOrEnd, _) => {
346                    Err(self.error_from_token("Expected '}' or a string key".to_string(), token))
347                }
348
349                // --- Inside Object: after comma, expecting key (no '}' allowed) ---
350                (ParserState::ExpectObjectKey, TokenType::String(s)) => {
351                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectColon;
352                    Ok(Some(ParserEvent::Key(s.clone())))
353                }
354                // Check for invalid trailing comma `{"key":1,}`
355                (ParserState::ExpectObjectKey, TokenType::RightBrace) => Err(self
356                    .error_from_token("Unexpected '}', expected a string key".to_string(), token)),
357                (ParserState::ExpectObjectKey, _) => {
358                    Err(self.error_from_token("Expected a string key".to_string(), token))
359                }
360
361                // --- Inside Object: after key, expecting ':' ---
362                (ParserState::ExpectObjectColon, TokenType::Colon) => {
363                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectValue;
364                    Ok(None) // Colon is consumed, state changes, no event emitted.
365                }
366                (ParserState::ExpectObjectColon, _) => {
367                    Err(self.error_from_token("Expected ':'".to_string(), token))
368                }
369
370                // --- Inside Object: after ':', expecting value ---
371                (ParserState::ExpectObjectValue, TokenType::LeftBracket) => {
372                    if self.depth >= self.max_depth {
373                        return Some(Err(self.error_from_token(
374                            "Maximum nesting depth exceeded".to_string(),
375                            token,
376                        )));
377                    }
378                    self.depth += 1;
379                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
380                    self.state_stack
381                        .push(ParserState::ExpectArrayFirstValueOrEnd);
382                    Ok(Some(ParserEvent::StartArray))
383                }
384                (ParserState::ExpectObjectValue, TokenType::LeftBrace) => {
385                    if self.depth >= self.max_depth {
386                        return Some(Err(self.error_from_token(
387                            "Maximum nesting depth exceeded".to_string(),
388                            token,
389                        )));
390                    }
391                    self.depth += 1;
392                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
393                    self.state_stack
394                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
395                    Ok(Some(ParserEvent::StartObject))
396                }
397                (ParserState::ExpectObjectValue, TokenType::String(s)) => {
398                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
399                    Ok(Some(ParserEvent::String(s.clone())))
400                }
401                (ParserState::ExpectObjectValue, TokenType::Number(n)) => {
402                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
403                    Ok(Some(ParserEvent::Number(*n)))
404                }
405                (ParserState::ExpectObjectValue, TokenType::Boolean(b)) => {
406                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
407                    Ok(Some(ParserEvent::Boolean(*b)))
408                }
409                (ParserState::ExpectObjectValue, TokenType::Null) => {
410                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
411                    Ok(Some(ParserEvent::Null))
412                }
413                (ParserState::ExpectObjectValue, _) => {
414                    Err(self.error_from_token("Expected a value".to_string(), token))
415                }
416
417                // --- Inside Object: after value, expecting ',' or '}' ---
418                (ParserState::ExpectObjectCommaOrEnd, TokenType::Comma) => {
419                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectKey;
420                    Ok(None) // Comma consumed, state changes, no event.
421                }
422                (ParserState::ExpectObjectCommaOrEnd, TokenType::RightBrace) => {
423                    self.depth -= 1;
424                    self.state_stack.pop();
425                    Ok(Some(ParserEvent::EndObject))
426                }
427                (ParserState::ExpectObjectCommaOrEnd, _) => {
428                    Err(self.error_from_token("Expected ',' or '}'".to_string(), token))
429                }
430            };
431
432            // Handle the result of the `match` expression
433            match result {
434                Ok(Some(event)) => {
435                    // We have an event to emit. Return it.
436                    return Some(Ok(event));
437                }
438                Ok(None) => {
439                    // This was a non-event token (like `,` or `:`).
440                    // We loop again to get the *next* token.
441                    current_token = match self.tokenizer.next() {
442                        Some(Ok(token)) => Some(token),
443                        Some(Err(e)) => return Some(Err(e)),
444                        None => None,
445                    };
446                    continue;
447                }
448                Err(e) => {
449                    // A parsing error occurred.
450                    return Some(Err(e));
451                }
452            }
453        }
454    }
455}