rill_json/
parser.rs

1//! Contains the `StreamingParser` and its state machine.
2//!
3//! This module defines the core logic of the parser, which is implemented
4//! as a state machine that consumes `Token`s from the `Tokenizer` and
5//! emits `ParserEvent`s.
6
7use crate::error::ParseError;
8use crate::token::{Token, TokenType};
9use crate::tokenizer::Tokenizer;
10use std::iter::Peekable;
11
12// --- 6. "True" Streaming Parser (Stage 15) ---
13
/// One unit of output produced by the `StreamingParser`.
///
/// The parser implements `Iterator` and hands these events out one at a
/// time, so a caller can process a JSON document incrementally instead of
/// first building the whole structure in memory.
#[derive(Clone, Debug, PartialEq)]
pub enum ParserEvent {
    /// An opening `{` was read: a JSON object begins.
    StartObject,
    /// A closing `}` was read: the current JSON object ends.
    EndObject,
    /// An opening `[` was read: a JSON array begins.
    StartArray,
    /// A closing `]` was read: the current JSON array ends.
    EndArray,
    /// The key of an object member (the `"name"` in `"name": ...`).
    Key(String),
    /// A string appearing in value position (the `"value"` in `...: "value"`).
    String(String),
    /// A numeric value such as `123`, `-0.5` or `1e10`, carried as an `f64`.
    Number(f64),
    /// One of the literals `true` or `false`.
    Boolean(bool),
    /// The literal `null`.
    Null,
}
40
/// Internal state machine for the parser.
///
/// Each variant names what the parser *expects* to see next, which is how
/// the JSON grammar is enforced (e.g. "a comma or closing bracket must
/// follow a value in an array"). The parser keeps a stack of these states,
/// one entry per container that is currently open.
///
/// The enum is a plain fieldless tag, so it is `Copy`: states can be read
/// off the stack by value without an explicit `.clone()`.
// Every variant deliberately starts with `Expect` because each one describes
// an expectation; the shared prefix is what `clippy::enum_variant_names`
// would otherwise complain about.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(clippy::enum_variant_names)]
enum ParserState {
    /// A value is expected; this is the state the parser starts in.
    ExpectValue,
    /// After `[`: expect a value or `]` (empty array).
    ExpectArrayFirstValueOrEnd,
    /// After `,` in an array: expect a value (`]` is not allowed here).
    ExpectArrayValue,
    /// After a value in an array: expect `,` or `]`.
    ExpectArrayCommaOrEnd,
    /// After `{`: expect a key or `}` (empty object).
    ExpectObjectFirstKeyOrEnd,
    /// After `,` in an object: expect a key (`}` is not allowed here).
    ExpectObjectKey,
    /// After a key: expect `:`.
    ExpectObjectColon,
    /// After `:`: expect a value.
    ExpectObjectValue,
    /// After a value in an object: expect `,` or `}`.
    ExpectObjectCommaOrEnd,
}
59
60/// The main streaming JSON parser.
61///
62/// This struct is an `Iterator` that yields `Result<ParserEvent, ParseError>`.
63/// It is created by the `parse_streaming` function.
64pub struct StreamingParser<'a> {
65    /// The internal tokenizer (lexer) that breaks the input string into `Token`s.
66    tokenizer: Peekable<Tokenizer<'a>>,
67    /// A stack of states, used for tracking nested objects and arrays.
68    state_stack: Vec<ParserState>,
69    /// The maximum allowed nesting depth to prevent DoS attacks.
70    max_depth: usize,
71    /// The *current* nesting depth of the parser.
72    depth: usize,
73}
74
75impl<'a> StreamingParser<'a> {
76    /// Creates a new `StreamingParser` for a given input string.
77    ///
78    /// This is called by the `parse_streaming` function in `lib.rs`.
79    pub fn new(input: &'a str, max_depth: usize) -> Self {
80        StreamingParser {
81            tokenizer: Tokenizer::new(input).peekable(),
82            state_stack: vec![ParserState::ExpectValue],
83            max_depth,
84            depth: 0,
85        }
86    }
87
88    /// A helper function to create a `ParseError` from a token's location.
89    fn error_from_token(&self, message: String, token: &Token) -> ParseError {
90        ParseError {
91            message,
92            line: token.line,
93            column: token.column,
94        }
95    }
96}
97
98/// The main implementation of the parser's `Iterator` trait.
99/// This is where the state machine logic lives.
100impl<'a> Iterator for StreamingParser<'a> {
101    type Item = Result<ParserEvent, ParseError>;
102
103    /// Consumes the next token and advances the parser's state.
104    fn next(&mut self) -> Option<Self::Item> {
105        // Get the next token from the tokenizer.
106        let token_result = self.tokenizer.next();
107
108        let mut current_token = match token_result {
109            Some(Ok(token)) => Some(token),
110            Some(Err(e)) => return Some(Err(e)), // Tokenizer error
111            None => None,                        // End of input
112        };
113
114        // Loop handles "non-event" tokens (like `,` or `:`)
115        // that advance the state but don't emit a `ParserEvent`.
116        loop {
117            let state_tuple = (current_token.as_ref(), self.state_stack.last());
118
119            // Determine the current token and state.
120            let (token, state) = match state_tuple {
121                (Some(token), Some(state)) => (token, state.clone()),
122                // End of input, but we're still in a nested state.
123                (None, Some(state)) => {
124                    if *state == ParserState::ExpectValue && self.state_stack.len() == 1 {
125                        return None; // We expected a value, but got clean EOF. Valid.
126                    }
127                    return Some(Err(ParseError {
128                        message: "Unexpected end of input, unclosed structure".to_string(),
129                        line: 0, // We don't have a token for location info
130                        column: 0,
131                    }));
132                }
133                (None, None) => return None, // Clean end
134                // We have a token, but the state stack is empty (parser finished).
135                (Some(token), None) => {
136                    return Some(Err(
137                        self.error_from_token("Unexpected trailing token".to_string(), token)
138                    ));
139                }
140            };
141
142            // This is the main state machine logic.
143            let result = match (state, &token.kind) {
144                // --- Root level or nested value expected ---
145                (ParserState::ExpectValue, TokenType::LeftBracket) => {
146                    if self.depth >= self.max_depth {
147                        return Some(Err(self.error_from_token(
148                            "Maximum nesting depth exceeded".to_string(),
149                            token,
150                        )));
151                    }
152                    self.depth += 1;
153                    self.state_stack.pop();
154                    self.state_stack
155                        .push(ParserState::ExpectArrayFirstValueOrEnd);
156                    Ok(Some(ParserEvent::StartArray))
157                }
158                (ParserState::ExpectValue, TokenType::LeftBrace) => {
159                    if self.depth >= self.max_depth {
160                        return Some(Err(self.error_from_token(
161                            "Maximum nesting depth exceeded".to_string(),
162                            token,
163                        )));
164                    }
165                    self.depth += 1;
166                    self.state_stack.pop();
167                    self.state_stack
168                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
169                    Ok(Some(ParserEvent::StartObject))
170                }
171                (ParserState::ExpectValue, TokenType::String(s)) => {
172                    self.state_stack.pop();
173                    Ok(Some(ParserEvent::String(s.clone())))
174                }
175                (ParserState::ExpectValue, TokenType::Number(n)) => {
176                    self.state_stack.pop();
177                    Ok(Some(ParserEvent::Number(*n)))
178                }
179                (ParserState::ExpectValue, TokenType::Boolean(b)) => {
180                    self.state_stack.pop();
181                    Ok(Some(ParserEvent::Boolean(*b)))
182                }
183                (ParserState::ExpectValue, TokenType::Null) => {
184                    self.state_stack.pop();
185                    Ok(Some(ParserEvent::Null))
186                }
187                (ParserState::ExpectValue, _) => {
188                    Err(self.error_from_token("Expected a value".to_string(), token))
189                }
190
191                // --- Inside Array: expecting first value or ']' (empty array) ---
192                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::RightBracket) => {
193                    self.depth -= 1;
194                    self.state_stack.pop();
195                    Ok(Some(ParserEvent::EndArray))
196                }
197                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::LeftBracket) => {
198                    if self.depth >= self.max_depth {
199                        return Some(Err(self.error_from_token(
200                            "Maximum nesting depth exceeded".to_string(),
201                            token,
202                        )));
203                    }
204                    self.depth += 1;
205                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
206                    self.state_stack
207                        .push(ParserState::ExpectArrayFirstValueOrEnd);
208                    Ok(Some(ParserEvent::StartArray))
209                }
210                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::LeftBrace) => {
211                    if self.depth >= self.max_depth {
212                        return Some(Err(self.error_from_token(
213                            "Maximum nesting depth exceeded".to_string(),
214                            token,
215                        )));
216                    }
217                    self.depth += 1;
218                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
219                    self.state_stack
220                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
221                    Ok(Some(ParserEvent::StartObject))
222                }
223                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::String(s)) => {
224                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
225                    Ok(Some(ParserEvent::String(s.clone())))
226                }
227                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Number(n)) => {
228                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
229                    Ok(Some(ParserEvent::Number(*n)))
230                }
231                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Boolean(b)) => {
232                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
233                    Ok(Some(ParserEvent::Boolean(*b)))
234                }
235                (ParserState::ExpectArrayFirstValueOrEnd, TokenType::Null) => {
236                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
237                    Ok(Some(ParserEvent::Null))
238                }
239                (ParserState::ExpectArrayFirstValueOrEnd, _) => {
240                    Err(self.error_from_token("Expected value or ']'".to_string(), token))
241                }
242
243                // --- Inside Array: after comma, expecting value (no ']' allowed) ---
244                (ParserState::ExpectArrayValue, TokenType::LeftBracket) => {
245                    if self.depth >= self.max_depth {
246                        return Some(Err(self.error_from_token(
247                            "Maximum nesting depth exceeded".to_string(),
248                            token,
249                        )));
250                    }
251                    self.depth += 1;
252                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
253                    self.state_stack
254                        .push(ParserState::ExpectArrayFirstValueOrEnd);
255                    Ok(Some(ParserEvent::StartArray))
256                }
257                (ParserState::ExpectArrayValue, TokenType::LeftBrace) => {
258                    if self.depth >= self.max_depth {
259                        return Some(Err(self.error_from_token(
260                            "Maximum nesting depth exceeded".to_string(),
261                            token,
262                        )));
263                    }
264                    self.depth += 1;
265                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
266                    self.state_stack
267                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
268                    Ok(Some(ParserEvent::StartObject))
269                }
270                (ParserState::ExpectArrayValue, TokenType::String(s)) => {
271                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
272                    Ok(Some(ParserEvent::String(s.clone())))
273                }
274                (ParserState::ExpectArrayValue, TokenType::Number(n)) => {
275                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
276                    Ok(Some(ParserEvent::Number(*n)))
277                }
278                (ParserState::ExpectArrayValue, TokenType::Boolean(b)) => {
279                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
280                    Ok(Some(ParserEvent::Boolean(*b)))
281                }
282                (ParserState::ExpectArrayValue, TokenType::Null) => {
283                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayCommaOrEnd;
284                    Ok(Some(ParserEvent::Null))
285                }
286                // Check for invalid trailing comma `[1,,2]`
287                (ParserState::ExpectArrayValue, TokenType::RightBracket) => {
288                    Err(self
289                        .error_from_token("Unexpected ']', expected a value".to_string(), token))
290                }
291                (ParserState::ExpectArrayValue, _) => {
292                    Err(self.error_from_token("Expected a value".to_string(), token))
293                }
294
295                // --- Inside Array: after a value, expecting ',' or ']' ---
296                (ParserState::ExpectArrayCommaOrEnd, TokenType::Comma) => {
297                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectArrayValue;
298                    Ok(None) // Comma is consumed, state changes, but no event emitted.
299                }
300                (ParserState::ExpectArrayCommaOrEnd, TokenType::RightBracket) => {
301                    self.depth -= 1;
302                    self.state_stack.pop();
303                    Ok(Some(ParserEvent::EndArray))
304                }
305                (ParserState::ExpectArrayCommaOrEnd, _) => {
306                    Err(self.error_from_token("Expected ',' or ']'".to_string(), token))
307                }
308
309                // --- Inside Object: expecting first key or '}' (empty object) ---
310                (ParserState::ExpectObjectFirstKeyOrEnd, TokenType::String(s)) => {
311                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectColon;
312                    Ok(Some(ParserEvent::Key(s.clone())))
313                }
314                (ParserState::ExpectObjectFirstKeyOrEnd, TokenType::RightBrace) => {
315                    self.depth -= 1;
316                    self.state_stack.pop();
317                    Ok(Some(ParserEvent::EndObject))
318                }
319                (ParserState::ExpectObjectFirstKeyOrEnd, _) => {
320                    Err(self.error_from_token("Expected '}' or a string key".to_string(), token))
321                }
322
323                // --- Inside Object: after comma, expecting key (no '}' allowed) ---
324                (ParserState::ExpectObjectKey, TokenType::String(s)) => {
325                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectColon;
326                    Ok(Some(ParserEvent::Key(s.clone())))
327                }
328                // Check for invalid trailing comma `{"key":1,}`
329                (ParserState::ExpectObjectKey, TokenType::RightBrace) => Err(self
330                    .error_from_token("Unexpected '}', expected a string key".to_string(), token)),
331                (ParserState::ExpectObjectKey, _) => {
332                    Err(self.error_from_token("Expected a string key".to_string(), token))
333                }
334
335                // --- Inside Object: after key, expecting ':' ---
336                (ParserState::ExpectObjectColon, TokenType::Colon) => {
337                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectValue;
338                    Ok(None) // Colon is consumed, state changes, no event emitted.
339                }
340                (ParserState::ExpectObjectColon, _) => {
341                    Err(self.error_from_token("Expected ':'".to_string(), token))
342                }
343
344                // --- Inside Object: after ':', expecting value ---
345                (ParserState::ExpectObjectValue, TokenType::LeftBracket) => {
346                    if self.depth >= self.max_depth {
347                        return Some(Err(self.error_from_token(
348                            "Maximum nesting depth exceeded".to_string(),
349                            token,
350                        )));
351                    }
352                    self.depth += 1;
353                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
354                    self.state_stack
355                        .push(ParserState::ExpectArrayFirstValueOrEnd);
356                    Ok(Some(ParserEvent::StartArray))
357                }
358                (ParserState::ExpectObjectValue, TokenType::LeftBrace) => {
359                    if self.depth >= self.max_depth {
360                        return Some(Err(self.error_from_token(
361                            "Maximum nesting depth exceeded".to_string(),
362                            token,
363                        )));
364                    }
365                    self.depth += 1;
366                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
367                    self.state_stack
368                        .push(ParserState::ExpectObjectFirstKeyOrEnd);
369                    Ok(Some(ParserEvent::StartObject))
370                }
371                (ParserState::ExpectObjectValue, TokenType::String(s)) => {
372                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
373                    Ok(Some(ParserEvent::String(s.clone())))
374                }
375                (ParserState::ExpectObjectValue, TokenType::Number(n)) => {
376                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
377                    Ok(Some(ParserEvent::Number(*n)))
378                }
379                (ParserState::ExpectObjectValue, TokenType::Boolean(b)) => {
380                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
381                    Ok(Some(ParserEvent::Boolean(*b)))
382                }
383                (ParserState::ExpectObjectValue, TokenType::Null) => {
384                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectCommaOrEnd;
385                    Ok(Some(ParserEvent::Null))
386                }
387                (ParserState::ExpectObjectValue, _) => {
388                    Err(self.error_from_token("Expected a value".to_string(), token))
389                }
390
391                // --- Inside Object: after value, expecting ',' or '}' ---
392                (ParserState::ExpectObjectCommaOrEnd, TokenType::Comma) => {
393                    *self.state_stack.last_mut().unwrap() = ParserState::ExpectObjectKey;
394                    Ok(None) // Comma consumed, state changes, no event.
395                }
396                (ParserState::ExpectObjectCommaOrEnd, TokenType::RightBrace) => {
397                    self.depth -= 1;
398                    self.state_stack.pop();
399                    Ok(Some(ParserEvent::EndObject))
400                }
401                (ParserState::ExpectObjectCommaOrEnd, _) => {
402                    Err(self.error_from_token("Expected ',' or '}'".to_string(), token))
403                }
404            };
405
406            // Handle the result of the `match` expression
407            match result {
408                Ok(Some(event)) => {
409                    // We have an event to emit. Return it.
410                    return Some(Ok(event));
411                }
412                Ok(None) => {
413                    // This was a non-event token (like `,` or `:`).
414                    // We loop again to get the *next* token.
415                    current_token = match self.tokenizer.next() {
416                        Some(Ok(token)) => Some(token),
417                        Some(Err(e)) => return Some(Err(e)),
418                        None => None,
419                    };
420                    continue;
421                }
422                Err(e) => {
423                    // A parsing error occurred.
424                    return Some(Err(e));
425                }
426            }
427        }
428    }
429}