Skip to main content

styx_parse/parser/
mod.rs

1//! Pull-based event parser for Styx.
2
3use std::borrow::Cow;
4use std::collections::{HashMap, VecDeque};
5
6use styx_tokenizer::Span;
7use tracing::trace;
8
9use crate::events::{EventKind, ParseErrorKind, ScalarKind};
10use crate::{Event, Lexeme, Lexer};
11
12/// Wraps lexer with a single pending slot for stashing boundary lexemes.
13#[derive(Clone)]
14struct LexemeSource<'src> {
15    lexer: Lexer<'src>,
16    /// Single pending lexeme slot. When collect_entry_atoms hits a boundary
17    /// (comma, closing brace, etc.), it stashes the lexeme here instead of
18    /// discarding it. Limited to exactly one slot - if we ever need more,
19    /// that's a bug in our logic.
20    pending: Option<Lexeme<'src>>,
21}
22
23impl<'src> LexemeSource<'src> {
24    fn new(source: &'src str) -> Self {
25        Self {
26            lexer: Lexer::new(source),
27            pending: None,
28        }
29    }
30
31    fn next(&mut self) -> Lexeme<'src> {
32        self.pending
33            .take()
34            .unwrap_or_else(|| self.lexer.next_lexeme())
35    }
36
37    fn stash(&mut self, lexeme: Lexeme<'src>) {
38        assert!(self.pending.is_none(), "double stash - this is a bug");
39        self.pending = Some(lexeme);
40    }
41}
42
43/// Pull-based event parser for Styx.
44#[derive(Clone)]
45pub struct Parser<'src> {
46    input: &'src str,
47    source: LexemeSource<'src>,
48    state: ParserState,
49    event_queue: VecDeque<Event<'src>>,
50}
51
52/// Parser state machine states.
53#[derive(Clone)]
54enum ParserState {
55    /// Haven't emitted DocumentStart yet.
56    BeforeDocument,
57
58    /// Expression mode: parse a single value without document wrapper.
59    BeforeExpression,
60
61    /// At implicit document root.
62    DocumentRoot {
63        seen_keys: HashMap<KeyValue, Span>,
64        pending_doc_comment: Option<Span>,
65        path_state: PathState,
66        /// Whether we've emitted ObjectStart for the implicit root object.
67        emitted_object_start: bool,
68    },
69
70    /// Inside explicit object { ... }.
71    InObject {
72        start_span: Span,
73        seen_keys: HashMap<KeyValue, Span>,
74        pending_doc_comment: Option<Span>,
75        /// Parent state to restore when we pop.
76        parent: Box<ParserState>,
77    },
78
79    /// Document ended.
80    AfterDocument,
81
82    /// Expression mode ended.
83    AfterExpression,
84}
85
86impl<'src> Parser<'src> {
87    /// Create a new parser for the given source.
88    pub fn new(source: &'src str) -> Self {
89        Self {
90            input: source,
91            source: LexemeSource::new(source),
92            state: ParserState::BeforeDocument,
93            event_queue: VecDeque::new(),
94        }
95    }
96
97    /// Create a new parser in expression mode.
98    ///
99    /// Expression mode parses a single value rather than a document with implicit root object.
100    /// Use this for parsing embedded values like default values in schemas.
101    pub fn new_expr(source: &'src str) -> Self {
102        Self {
103            input: source,
104            source: LexemeSource::new(source),
105            state: ParserState::BeforeExpression,
106            event_queue: VecDeque::new(),
107        }
108    }
109
110    /// Get the next event from the parser.
111    pub fn next_event(&mut self) -> Option<Event<'src>> {
112        trace!(
113            queue_len = self.event_queue.len(),
114            "styx-parse next_event called"
115        );
116        // Drain queue first
117        if let Some(event) = self.event_queue.pop_front() {
118            trace!(?event, "styx-parse returning queued event");
119            return Some(event);
120        }
121
122        // Advance state machine
123        let event = self.advance();
124        trace!(?event, "styx-parse returning from advance");
125        event
126    }
127
128    /// Parse all events into a vector.
129    pub fn parse_to_vec(mut self) -> Vec<Event<'src>> {
130        let mut events = Vec::new();
131        while let Some(event) = self.next_event() {
132            events.push(event);
133        }
134        events
135    }
136
137    /// Advance the state machine.
138    fn advance(&mut self) -> Option<Event<'src>> {
139        match &self.state {
140            ParserState::BeforeDocument => {
141                self.state = ParserState::DocumentRoot {
142                    seen_keys: HashMap::new(),
143                    pending_doc_comment: None,
144                    path_state: PathState::default(),
145                    emitted_object_start: false,
146                };
147                Some(Event {
148                    span: Span::empty(0),
149                    kind: EventKind::DocumentStart,
150                })
151            }
152            ParserState::BeforeExpression => self.advance_expression(),
153            ParserState::AfterExpression => None,
154            ParserState::AfterDocument => self.check_trailing_content(),
155            ParserState::DocumentRoot { .. } => self.advance_document_root(),
156            ParserState::InObject { .. } => self.advance_in_object(),
157        }
158    }
159
160    /// Advance when in expression mode - parse a single value.
161    fn advance_expression(&mut self) -> Option<Event<'src>> {
162        loop {
163            let lexeme = self.source.next();
164            match lexeme {
165                // Skip whitespace/newlines/comments
166                Lexeme::Newline { .. } | Lexeme::Comment { .. } => continue,
167                Lexeme::Eof => {
168                    self.state = ParserState::AfterExpression;
169                    return None;
170                }
171                _ => {
172                    // Parse a single atom as the value
173                    let atom = self.parse_atom(lexeme);
174                    self.emit_atom_as_value(&atom);
175                    self.state = ParserState::AfterExpression;
176                    return self.event_queue.pop_front();
177                }
178            }
179        }
180    }
181
182    /// Check for trailing content after explicit root object.
183    /// Returns an error event if there's non-whitespace content, otherwise None.
184    fn check_trailing_content(&mut self) -> Option<Event<'src>> {
185        loop {
186            let lexeme = self.source.next();
187            match lexeme {
188                // Skip whitespace, newlines, and comments - these are allowed after document
189                Lexeme::Newline { .. } | Lexeme::Comment { .. } => continue,
190                Lexeme::Eof => return None,
191                // Any other content is an error
192                _ => {
193                    let span = lexeme.span();
194                    // Consume remaining tokens to find the full extent of trailing content
195                    let mut end = span.end;
196                    loop {
197                        match self.source.next() {
198                            Lexeme::Eof => break,
199                            lex => end = lex.span().end,
200                        }
201                    }
202                    return Some(Event {
203                        span: Span::new(span.start, end),
204                        kind: EventKind::Error {
205                            kind: ParseErrorKind::TrailingContent,
206                        },
207                    });
208                }
209            }
210        }
211    }
212
213    /// Advance when in DocumentRoot state.
214    fn advance_document_root(&mut self) -> Option<Event<'src>> {
215        loop {
216            let lexeme = self.source.next();
217            match lexeme {
218                Lexeme::Eof => {
219                    if let ParserState::DocumentRoot {
220                        pending_doc_comment,
221                        emitted_object_start,
222                        ..
223                    } = &mut self.state
224                    {
225                        if let Some(span) = pending_doc_comment.take() {
226                            self.event_queue.push_back(Event {
227                                span,
228                                kind: EventKind::Error {
229                                    kind: ParseErrorKind::DanglingDocComment,
230                                },
231                            });
232                        }
233                        // Close implicit root object if we opened it
234                        if *emitted_object_start {
235                            self.event_queue.push_back(Event {
236                                span: Span::empty(0),
237                                kind: EventKind::ObjectEnd,
238                            });
239                        }
240                    }
241                    self.event_queue.push_back(Event {
242                        span: Span::empty(self.input.len() as u32),
243                        kind: EventKind::DocumentEnd,
244                    });
245                    self.state = ParserState::AfterDocument;
246                    return self.event_queue.pop_front();
247                }
248                Lexeme::Newline { .. } | Lexeme::Comma { .. } => continue,
249                Lexeme::Comment { span, text } => {
250                    return Some(Event {
251                        span,
252                        kind: EventKind::Comment { text },
253                    });
254                }
255                Lexeme::DocComment { span, text } => {
256                    if let ParserState::DocumentRoot {
257                        pending_doc_comment,
258                        emitted_object_start,
259                        ..
260                    } = &mut self.state
261                    {
262                        *pending_doc_comment = Some(span);
263                        // Doc comments are content, so emit implicit ObjectStart first
264                        if !*emitted_object_start {
265                            *emitted_object_start = true;
266                            self.event_queue.push_back(Event {
267                                span: Span::empty(0),
268                                kind: EventKind::ObjectStart,
269                            });
270                        }
271                    }
272                    // Strip `/// ` or `///` prefix
273                    let line = text
274                        .strip_prefix("/// ")
275                        .or_else(|| text.strip_prefix("///"))
276                        .unwrap_or(text);
277                    self.event_queue.push_back(Event {
278                        span,
279                        kind: EventKind::DocComment { lines: vec![line] },
280                    });
281                    return self.event_queue.pop_front();
282                }
283                Lexeme::ObjectStart { span } => {
284                    // Explicit root object - after it closes, document is done
285                    self.state = ParserState::InObject {
286                        start_span: span,
287                        seen_keys: HashMap::new(),
288                        pending_doc_comment: None,
289                        parent: Box::new(ParserState::AfterDocument),
290                    };
291                    return Some(Event {
292                        span,
293                        kind: EventKind::ObjectStart,
294                    });
295                }
296                _ => {
297                    // Emit implicit ObjectStart on first content
298                    if let ParserState::DocumentRoot {
299                        pending_doc_comment,
300                        emitted_object_start,
301                        ..
302                    } = &mut self.state
303                    {
304                        *pending_doc_comment = None;
305                        if !*emitted_object_start {
306                            *emitted_object_start = true;
307                            self.event_queue.push_back(Event {
308                                span: Span::empty(0),
309                                kind: EventKind::ObjectStart,
310                            });
311                        }
312                    }
313                    let atoms = self.collect_entry_atoms(lexeme);
314                    if !atoms.is_empty() {
315                        self.emit_entry_at_root(&atoms);
316                    }
317                    return self.event_queue.pop_front();
318                }
319            }
320        }
321    }
322
323    /// Advance when in InObject state.
324    fn advance_in_object(&mut self) -> Option<Event<'src>> {
325        let start = if let ParserState::InObject { start_span, .. } = &self.state {
326            *start_span
327        } else {
328            return None;
329        };
330
331        loop {
332            let lexeme = self.source.next();
333            match lexeme {
334                Lexeme::Eof => {
335                    if let ParserState::InObject {
336                        pending_doc_comment,
337                        parent,
338                        ..
339                    } = &mut self.state
340                    {
341                        if let Some(span) = pending_doc_comment.take() {
342                            self.event_queue.push_back(Event {
343                                span,
344                                kind: EventKind::Error {
345                                    kind: ParseErrorKind::DanglingDocComment,
346                                },
347                            });
348                        }
349                        self.event_queue.push_back(Event {
350                            span: start,
351                            kind: EventKind::Error {
352                                kind: ParseErrorKind::UnclosedObject,
353                            },
354                        });
355                        self.event_queue.push_back(Event {
356                            span: start,
357                            kind: EventKind::ObjectEnd,
358                        });
359                        // If parent is AfterDocument, this was a top-level explicit object.
360                        // We need to emit DocumentEnd before transitioning to AfterDocument.
361                        if matches!(parent.as_ref(), ParserState::AfterDocument) {
362                            self.event_queue.push_back(Event {
363                                span: Span::empty(self.input.len() as u32),
364                                kind: EventKind::DocumentEnd,
365                            });
366                        }
367                    }
368                    self.pop_state();
369                    return self.event_queue.pop_front();
370                }
371                Lexeme::ObjectEnd { span } => {
372                    if let ParserState::InObject {
373                        pending_doc_comment,
374                        parent,
375                        ..
376                    } = &mut self.state
377                    {
378                        if let Some(doc_span) = pending_doc_comment.take() {
379                            self.event_queue.push_back(Event {
380                                span: doc_span,
381                                kind: EventKind::Error {
382                                    kind: ParseErrorKind::DanglingDocComment,
383                                },
384                            });
385                        }
386                        // If parent is AfterDocument, this was a top-level explicit object.
387                        // We need to emit DocumentEnd after ObjectEnd.
388                        if matches!(parent.as_ref(), ParserState::AfterDocument) {
389                            self.event_queue.push_back(Event {
390                                span: Span::empty(self.input.len() as u32),
391                                kind: EventKind::DocumentEnd,
392                            });
393                        }
394                    }
395                    self.pop_state();
396                    return Some(Event {
397                        span,
398                        kind: EventKind::ObjectEnd,
399                    });
400                }
401                Lexeme::Newline { .. } | Lexeme::Comma { .. } => continue,
402                Lexeme::Comment { span, text } => {
403                    return Some(Event {
404                        span,
405                        kind: EventKind::Comment { text },
406                    });
407                }
408                Lexeme::DocComment { span, text } => {
409                    if let ParserState::InObject {
410                        pending_doc_comment,
411                        ..
412                    } = &mut self.state
413                    {
414                        *pending_doc_comment = Some(span);
415                    }
416                    // Strip `/// ` or `///` prefix
417                    let line = text
418                        .strip_prefix("/// ")
419                        .or_else(|| text.strip_prefix("///"))
420                        .unwrap_or(text);
421                    return Some(Event {
422                        span,
423                        kind: EventKind::DocComment { lines: vec![line] },
424                    });
425                }
426                _ => {
427                    if let ParserState::InObject {
428                        pending_doc_comment,
429                        ..
430                    } = &mut self.state
431                    {
432                        *pending_doc_comment = None;
433                    }
434                    let atoms = self.collect_entry_atoms(lexeme);
435                    if !atoms.is_empty() {
436                        self.emit_entry_in_object(&atoms);
437                    }
438                    return self.event_queue.pop_front();
439                }
440            }
441        }
442    }
443
444    /// Pop the current state and restore parent.
445    fn pop_state(&mut self) {
446        let parent = match &mut self.state {
447            ParserState::InObject { parent, .. } => {
448                std::mem::replace(parent.as_mut(), ParserState::AfterDocument)
449            }
450            _ => ParserState::AfterDocument,
451        };
452        self.state = parent;
453    }
454
455    /// Emit entry at document root (with path state).
456    fn emit_entry_at_root(&mut self, atoms: &[Atom<'src>]) {
457        if atoms.is_empty() {
458            return;
459        }
460
461        let key_atom = &atoms[0];
462
463        // Check for invalid key types
464        if let AtomContent::Scalar {
465            kind: ScalarKind::Heredoc,
466            ..
467        } = &key_atom.content
468        {
469            // For heredocs, point at just the opening marker (<<TAG), not the whole content
470            let error_span = self.heredoc_start_span(key_atom.span);
471            self.event_queue.push_back(Event {
472                span: error_span,
473                kind: EventKind::Error {
474                    kind: ParseErrorKind::InvalidKey,
475                },
476            });
477        }
478
479        // Check for dotted path
480        if let AtomContent::Scalar {
481            value,
482            kind: ScalarKind::Bare,
483        } = &key_atom.content
484            && value.contains('.')
485        {
486            self.emit_dotted_path_entry(value.clone(), key_atom.span, atoms, true);
487            return;
488        }
489
490        // Simple key - use path state for duplicate detection at root level
491        // (path_state handles both simple and dotted paths uniformly)
492        let key_value = KeyValue::from_atom(key_atom);
493
494        if let ParserState::DocumentRoot { path_state, .. } = &mut self.state {
495            // Check path state - this handles duplicates for us
496            let key_text = key_value.to_key_string();
497            let path = vec![key_text];
498            let value_kind = if atoms.len() >= 2 {
499                match &atoms[1].content {
500                    AtomContent::Object { .. } | AtomContent::Attributes(_) => {
501                        PathValueKind::Object
502                    }
503                    _ => PathValueKind::Terminal,
504                }
505            } else {
506                PathValueKind::Terminal
507            };
508
509            if let Err(err) = path_state.check_and_update(&path, key_atom.span, value_kind) {
510                self.emit_path_error(err, key_atom.span);
511            }
512        }
513
514        self.emit_simple_entry(atoms);
515    }
516
517    /// Emit entry inside an object (no path state).
518    fn emit_entry_in_object(&mut self, atoms: &[Atom<'src>]) {
519        if atoms.is_empty() {
520            return;
521        }
522
523        let key_atom = &atoms[0];
524
525        // Check for invalid key types
526        if let AtomContent::Scalar {
527            kind: ScalarKind::Heredoc,
528            ..
529        } = &key_atom.content
530        {
531            self.event_queue.push_back(Event {
532                span: key_atom.span,
533                kind: EventKind::Error {
534                    kind: ParseErrorKind::InvalidKey,
535                },
536            });
537        }
538
539        // Check for dotted path (still allowed in nested objects)
540        if let AtomContent::Scalar {
541            value,
542            kind: ScalarKind::Bare,
543        } = &key_atom.content
544            && value.contains('.')
545        {
546            self.emit_dotted_path_entry(value.clone(), key_atom.span, atoms, false);
547            return;
548        }
549
550        // Simple key - check for duplicates
551        let key_value = KeyValue::from_atom(key_atom);
552
553        if let ParserState::InObject { seen_keys, .. } = &mut self.state {
554            if let Some(&original_span) = seen_keys.get(&key_value) {
555                self.event_queue.push_back(Event {
556                    span: key_atom.span,
557                    kind: EventKind::Error {
558                        kind: ParseErrorKind::DuplicateKey {
559                            original: original_span,
560                        },
561                    },
562                });
563            } else {
564                seen_keys.insert(key_value, key_atom.span);
565            }
566        }
567
568        self.emit_simple_entry(atoms);
569    }
570
571    /// Emit a simple (non-dotted) entry.
572    fn emit_simple_entry(&mut self, atoms: &[Atom<'src>]) {
573        let key_atom = &atoms[0];
574
575        self.event_queue.push_back(Event {
576            span: key_atom.span,
577            kind: EventKind::EntryStart,
578        });
579        self.emit_atom_as_key(key_atom);
580
581        if atoms.len() == 1 {
582            self.event_queue.push_back(Event {
583                span: key_atom.span,
584                kind: EventKind::Unit,
585            });
586        } else if atoms.len() >= 2 {
587            self.emit_atom_as_value(&atoms[1]);
588        }
589
590        if atoms.len() > 2 {
591            self.event_queue.push_back(Event {
592                span: atoms[2].span,
593                kind: EventKind::Error {
594                    kind: ParseErrorKind::TooManyAtoms,
595                },
596            });
597        }
598
599        self.event_queue.push_back(Event {
600            span: atoms.last().map(|a| a.span).unwrap_or(key_atom.span),
601            kind: EventKind::EntryEnd,
602        });
603    }
604
605    /// Collect atoms for an entry.
606    fn collect_entry_atoms(&mut self, first: Lexeme<'src>) -> Vec<Atom<'src>> {
607        let mut atoms = Vec::new();
608        let first_atom = self.parse_atom(first);
609        let first_atom_end = first_atom.span.end;
610        let first_is_bare = matches!(
611            &first_atom.content,
612            AtomContent::Scalar {
613                kind: ScalarKind::Bare,
614                ..
615            }
616        );
617        atoms.push(first_atom);
618
619        loop {
620            let lexeme = self.source.next();
621            match lexeme {
622                Lexeme::Eof
623                | Lexeme::Newline { .. }
624                | Lexeme::Comma { .. }
625                | Lexeme::ObjectEnd { .. }
626                | Lexeme::SeqEnd { .. } => {
627                    self.source.stash(lexeme);
628                    break;
629                }
630                Lexeme::Comment { span, text } => {
631                    self.event_queue.push_back(Event {
632                        span,
633                        kind: EventKind::Comment { text },
634                    });
635                    break;
636                }
637                Lexeme::DocComment { span, text } => {
638                    // Strip `/// ` or `///` prefix
639                    let line = text
640                        .strip_prefix("/// ")
641                        .or_else(|| text.strip_prefix("///"))
642                        .unwrap_or(text);
643                    self.event_queue.push_back(Event {
644                        span,
645                        kind: EventKind::DocComment { lines: vec![line] },
646                    });
647                    break;
648                }
649                Lexeme::ObjectStart { span } | Lexeme::SeqStart { span } => {
650                    // Check for MissingWhitespaceBeforeBlock: bare scalar immediately
651                    // followed by { or ( with no whitespace
652                    if atoms.len() == 1 && first_is_bare && first_atom_end == span.start {
653                        self.event_queue.push_back(Event {
654                            span,
655                            kind: EventKind::Error {
656                                kind: ParseErrorKind::MissingWhitespaceBeforeBlock,
657                            },
658                        });
659                    }
660                    let atom = self.parse_atom(lexeme);
661                    atoms.push(atom);
662                }
663                _ => {
664                    let atom = self.parse_atom(lexeme);
665                    atoms.push(atom);
666                }
667            }
668        }
669
670        atoms
671    }
672
673    /// Parse a single atom.
674    fn parse_atom(&mut self, lexeme: Lexeme<'src>) -> Atom<'src> {
675        match lexeme {
676            Lexeme::Scalar { span, value, kind } => Atom {
677                span,
678                content: AtomContent::Scalar { value, kind },
679            },
680            Lexeme::Unit { span } => {
681                // Check if this is an invalid tag like @.foo or @1digit
682                // The lexer produces Unit + Scalar when the tag name is invalid
683                let next = self.source.next();
684                if let Lexeme::Scalar {
685                    span: scalar_span,
686                    value,
687                    kind: ScalarKind::Bare,
688                } = &next
689                {
690                    // Adjacent spans = invalid tag (e.g., @.foo where @ is at 2 and .foo starts at 3)
691                    if scalar_span.start == span.end {
692                        return Atom {
693                            span: Span::new(span.start, scalar_span.end),
694                            content: AtomContent::Tag {
695                                name: "", // empty name signals invalid
696                                payload: Some(Box::new(Atom {
697                                    span: *scalar_span,
698                                    content: AtomContent::Scalar {
699                                        value: value.clone(),
700                                        kind: ScalarKind::Bare,
701                                    },
702                                })),
703                                invalid_name: true,
704                                error_span: Some(*scalar_span), // Error points at the name, not @
705                            },
706                        };
707                    }
708                }
709                // Not an invalid tag, stash and return unit
710                self.source.stash(next);
711                Atom {
712                    span,
713                    content: AtomContent::Unit,
714                }
715            }
716            Lexeme::Tag {
717                span,
718                name,
719                has_payload,
720            } => {
721                // Check if this tag is followed by an adjacent scalar starting with '.'
722                // This happens with @Some.Type where lexer produces Tag("Some") + Scalar(".Type")
723                if !has_payload {
724                    let next = self.source.next();
725                    if let Lexeme::Scalar {
726                        span: scalar_span,
727                        value,
728                        kind: ScalarKind::Bare,
729                    } = &next
730                        && scalar_span.start == span.end
731                        && value.starts_with('.')
732                    {
733                        // Combined invalid tag name like @Some.Type
734                        let combined_name_span = Span::new(span.start + 1, scalar_span.end);
735                        return Atom {
736                            span: Span::new(span.start, scalar_span.end),
737                            content: AtomContent::Tag {
738                                name,
739                                payload: None,
740                                invalid_name: true,
741                                error_span: Some(combined_name_span),
742                            },
743                        };
744                    }
745                    self.source.stash(next);
746                }
747
748                let invalid_name = !is_valid_tag_name(name);
749                let payload = if has_payload {
750                    let next = self.source.next();
751                    Some(Box::new(self.parse_atom(next)))
752                } else {
753                    None
754                };
755                let end = payload.as_ref().map(|p| p.span.end).unwrap_or(span.end);
756                // For invalid tags, error span includes the @ (it's part of the tag)
757                let error_span = if invalid_name { Some(span) } else { None };
758                Atom {
759                    span: Span::new(span.start, end),
760                    content: AtomContent::Tag {
761                        name,
762                        payload,
763                        invalid_name,
764                        error_span,
765                    },
766                }
767            }
768            Lexeme::ObjectStart { span } => self.parse_object_atom(span),
769            Lexeme::SeqStart { span } => self.parse_sequence_atom(span),
770            Lexeme::AttrKey { key_span, key, .. } => self.parse_attributes(key_span, key),
771            Lexeme::Error { span, message } => {
772                // Check if this is an invalid escape error from a quoted string
773                if message.contains("escape") {
774                    // Extract the raw text to find escape positions
775                    let raw_text = &self.input[span.start as usize..span.end as usize];
776                    // Strip quotes if present
777                    let inner = if raw_text.starts_with('"') && raw_text.ends_with('"') {
778                        &raw_text[1..raw_text.len() - 1]
779                    } else {
780                        raw_text
781                    };
782                    Atom {
783                        span,
784                        content: AtomContent::InvalidEscapeScalar {
785                            raw_inner: Cow::Borrowed(inner),
786                        },
787                    }
788                } else {
789                    Atom {
790                        span,
791                        content: AtomContent::Error { message },
792                    }
793                }
794            }
795            Lexeme::ObjectEnd { span }
796            | Lexeme::SeqEnd { span }
797            | Lexeme::Comma { span }
798            | Lexeme::Newline { span } => Atom {
799                span,
800                content: AtomContent::Error {
801                    message: "unexpected token",
802                },
803            },
804            Lexeme::Comment { span, .. } | Lexeme::DocComment { span, .. } => Atom {
805                span,
806                content: AtomContent::Error {
807                    message: "unexpected token",
808                },
809            },
810            Lexeme::Eof => Atom {
811                span: Span::new(self.input.len() as u32, self.input.len() as u32),
812                content: AtomContent::Error {
813                    message: "unexpected end of input",
814                },
815            },
816        }
817    }
818
819    /// Parse an object atom.
820    fn parse_object_atom(&mut self, start_span: Span) -> Atom<'src> {
821        let mut entries: Vec<ObjectEntry<'src>> = Vec::new();
822        let mut seen_keys: HashMap<KeyValue, Span> = HashMap::new();
823        let mut duplicate_key_spans: Vec<(Span, Span)> = Vec::new();
824        let mut dangling_doc_comment_spans: Vec<Span> = Vec::new();
825        let mut pending_doc_comments: Vec<(Span, &'src str)> = Vec::new();
826        let mut unclosed = false;
827        let mut end_span = start_span;
828
829        loop {
830            let lexeme = self.source.next();
831            match lexeme {
832                Lexeme::Eof => {
833                    unclosed = true;
834                    for (span, _) in &pending_doc_comments {
835                        dangling_doc_comment_spans.push(*span);
836                    }
837                    break;
838                }
839                Lexeme::ObjectEnd { span } => {
840                    for (s, _) in &pending_doc_comments {
841                        dangling_doc_comment_spans.push(*s);
842                    }
843                    end_span = span;
844                    break;
845                }
846                Lexeme::Newline { .. } | Lexeme::Comma { .. } => continue,
847                Lexeme::Comment { .. } => continue,
848                Lexeme::DocComment { span, text } => {
849                    pending_doc_comments.push((span, text));
850                }
851                _ => {
852                    let doc_comment = if pending_doc_comments.is_empty() {
853                        None
854                    } else {
855                        // Collect all doc comments, stripping the `/// ` prefix from each
856                        let first_span = pending_doc_comments.first().unwrap().0;
857                        let last_span = pending_doc_comments.last().unwrap().0;
858                        let combined_span = Span::new(first_span.start, last_span.end);
859                        let lines: Vec<&'src str> = pending_doc_comments
860                            .iter()
861                            .map(|(_, text)| {
862                                // Strip `/// ` or `///` prefix
863                                text.strip_prefix("/// ")
864                                    .or_else(|| text.strip_prefix("///"))
865                                    .unwrap_or(*text)
866                            })
867                            .collect();
868                        pending_doc_comments.clear();
869                        Some((combined_span, lines))
870                    };
871                    let entry_atoms = self.collect_entry_atoms(lexeme);
872
873                    if !entry_atoms.is_empty() {
874                        let key = entry_atoms[0].clone();
875                        let key_value = KeyValue::from_atom(&key);
876
877                        if let Some(&original_span) = seen_keys.get(&key_value) {
878                            duplicate_key_spans.push((original_span, key.span));
879                        } else {
880                            seen_keys.insert(key_value, key.span);
881                        }
882
883                        let (value, too_many_atoms_span) = if entry_atoms.len() == 1 {
884                            (
885                                Atom {
886                                    span: key.span,
887                                    content: AtomContent::Unit,
888                                },
889                                None,
890                            )
891                        } else if entry_atoms.len() == 2 {
892                            (entry_atoms[1].clone(), None)
893                        } else {
894                            (entry_atoms[1].clone(), Some(entry_atoms[2].span))
895                        };
896
897                        entries.push(ObjectEntry {
898                            key,
899                            value,
900                            doc_comment,
901                            too_many_atoms_span,
902                        });
903                    }
904                }
905            }
906        }
907
908        Atom {
909            span: Span::new(start_span.start, end_span.end),
910            content: AtomContent::Object {
911                entries,
912                duplicate_key_spans,
913                dangling_doc_comment_spans,
914                unclosed,
915            },
916        }
917    }
918
919    /// Parse a sequence atom.
920    fn parse_sequence_atom(&mut self, start_span: Span) -> Atom<'src> {
921        let mut elements: Vec<Atom<'src>> = Vec::new();
922        let mut unclosed = false;
923        let mut comma_spans: Vec<Span> = Vec::new();
924        let mut end_span = start_span;
925
926        loop {
927            let lexeme = self.source.next();
928            match lexeme {
929                Lexeme::Eof => {
930                    unclosed = true;
931                    break;
932                }
933                Lexeme::SeqEnd { span } => {
934                    end_span = span;
935                    break;
936                }
937                Lexeme::Newline { .. } => continue,
938                Lexeme::Comma { span } => {
939                    comma_spans.push(span);
940                    continue;
941                }
942                Lexeme::Comment { .. } | Lexeme::DocComment { .. } => continue,
943                _ => {
944                    let elem = self.parse_atom(lexeme);
945                    elements.push(elem);
946                }
947            }
948        }
949
950        Atom {
951            span: Span::new(start_span.start, end_span.end),
952            content: AtomContent::Sequence {
953                elements,
954                unclosed,
955                comma_spans,
956            },
957        }
958    }
959
960    /// Parse attributes.
961    fn parse_attributes(&mut self, first_span: Span, first_key: &'src str) -> Atom<'src> {
962        let mut attrs = Vec::new();
963        let first_value = self.parse_attribute_value();
964        attrs.push(AttributeEntry {
965            key: first_key,
966            key_span: first_span,
967            value: first_value,
968        });
969
970        loop {
971            let lexeme = self.source.next();
972            match lexeme {
973                Lexeme::AttrKey { key_span, key, .. } => {
974                    let value = self.parse_attribute_value();
975                    attrs.push(AttributeEntry {
976                        key,
977                        key_span,
978                        value,
979                    });
980                }
981                other => {
982                    self.source.stash(other);
983                    break;
984                }
985            }
986        }
987
988        let end = attrs
989            .last()
990            .map(|a| a.value.span.end)
991            .unwrap_or(first_span.end);
992        Atom {
993            span: Span::new(first_span.start, end),
994            content: AtomContent::Attributes(attrs),
995        }
996    }
997
998    /// Parse an attribute value.
999    fn parse_attribute_value(&mut self) -> Atom<'src> {
1000        let lexeme = self.source.next();
1001        self.parse_atom(lexeme)
1002    }
1003
1004    /// Emit dotted path entry.
1005    fn emit_dotted_path_entry(
1006        &mut self,
1007        path_text: Cow<'src, str>,
1008        path_span: Span,
1009        atoms: &[Atom<'src>],
1010        check_path_state: bool,
1011    ) {
1012        let segments: Vec<&str> = path_text.split('.').collect();
1013
1014        if segments.is_empty() || segments.iter().any(|s| s.is_empty()) {
1015            self.event_queue.push_back(Event {
1016                span: path_span,
1017                kind: EventKind::Error {
1018                    kind: ParseErrorKind::InvalidKey,
1019                },
1020            });
1021            self.event_queue.push_back(Event {
1022                span: path_span,
1023                kind: EventKind::EntryStart,
1024            });
1025            self.event_queue.push_back(Event {
1026                span: path_span,
1027                kind: EventKind::EntryEnd,
1028            });
1029            return;
1030        }
1031
1032        // Check path state at root
1033        if check_path_state
1034            && let ParserState::DocumentRoot {
1035                seen_keys,
1036                path_state,
1037                ..
1038            } = &mut self.state
1039        {
1040            let first_key_value = KeyValue::Scalar(segments[0].to_string());
1041            seen_keys.entry(first_key_value).or_insert(path_span);
1042
1043            let path: Vec<String> = segments.iter().map(|s| s.to_string()).collect();
1044            let value_kind = if atoms.len() >= 2 {
1045                match &atoms[1].content {
1046                    AtomContent::Object { .. } | AtomContent::Attributes(_) => {
1047                        PathValueKind::Object
1048                    }
1049                    _ => PathValueKind::Terminal,
1050                }
1051            } else {
1052                PathValueKind::Terminal
1053            };
1054
1055            if let Err(err) = path_state.check_and_update(&path, path_span, value_kind) {
1056                self.emit_path_error(err, path_span);
1057            }
1058        }
1059
1060        // Emit nested structure
1061        let depth = segments.len();
1062        let mut current_offset = path_span.start;
1063
1064        for (i, segment) in segments.iter().enumerate() {
1065            let segment_len = segment.len() as u32;
1066            let segment_span = Span::new(current_offset, current_offset + segment_len);
1067
1068            self.event_queue.push_back(Event {
1069                span: segment_span,
1070                kind: EventKind::EntryStart,
1071            });
1072            self.event_queue.push_back(Event {
1073                span: segment_span,
1074                kind: EventKind::Key {
1075                    tag: None,
1076                    payload: Some(Cow::Owned(segment.to_string())),
1077                    kind: ScalarKind::Bare,
1078                },
1079            });
1080
1081            if i < depth - 1 {
1082                self.event_queue.push_back(Event {
1083                    span: segment_span,
1084                    kind: EventKind::ObjectStart,
1085                });
1086            }
1087
1088            current_offset += segment_len + 1;
1089        }
1090
1091        // Emit value
1092        if atoms.len() == 1 {
1093            self.event_queue.push_back(Event {
1094                span: path_span,
1095                kind: EventKind::Unit,
1096            });
1097        } else if atoms.len() >= 2 {
1098            self.emit_atom_as_value(&atoms[1]);
1099        }
1100
1101        if atoms.len() > 2 {
1102            self.event_queue.push_back(Event {
1103                span: atoms[2].span,
1104                kind: EventKind::Error {
1105                    kind: ParseErrorKind::TooManyAtoms,
1106                },
1107            });
1108        }
1109
1110        // Close nested structures
1111        for i in (0..depth).rev() {
1112            if i < depth - 1 {
1113                self.event_queue.push_back(Event {
1114                    span: path_span,
1115                    kind: EventKind::ObjectEnd,
1116                });
1117            }
1118            self.event_queue.push_back(Event {
1119                span: path_span,
1120                kind: EventKind::EntryEnd,
1121            });
1122        }
1123    }
1124
1125    /// Emit path error.
1126    fn emit_path_error(&mut self, err: PathError, span: Span) {
1127        let kind = match err {
1128            PathError::Duplicate { original } => ParseErrorKind::DuplicateKey { original },
1129            PathError::Reopened { closed_path } => ParseErrorKind::ReopenedPath { closed_path },
1130            PathError::NestIntoTerminal { terminal_path } => {
1131                ParseErrorKind::NestIntoTerminal { terminal_path }
1132            }
1133        };
1134        self.event_queue.push_back(Event {
1135            span,
1136            kind: EventKind::Error { kind },
1137        });
1138    }
1139
1140    /// Get the span of just the heredoc opening marker (<<TAG\n).
1141    fn heredoc_start_span(&self, heredoc_span: Span) -> Span {
1142        let text = &self.input[heredoc_span.start as usize..heredoc_span.end as usize];
1143        // Find the first newline - that's the end of the opening marker
1144        let end_offset = text.find('\n').map(|i| i + 1).unwrap_or(text.len());
1145        Span::new(heredoc_span.start, heredoc_span.start + end_offset as u32)
1146    }
1147
1148    /// Emit atom as key.
1149    fn emit_atom_as_key(&mut self, atom: &Atom<'src>) {
1150        match &atom.content {
1151            AtomContent::Scalar { value, kind } => {
1152                // The lexer already processed escape sequences.
1153                self.event_queue.push_back(Event {
1154                    span: atom.span,
1155                    kind: EventKind::Key {
1156                        tag: None,
1157                        payload: Some(value.clone()),
1158                        kind: *kind,
1159                    },
1160                });
1161            }
1162            AtomContent::Unit => {
1163                self.event_queue.push_back(Event {
1164                    span: atom.span,
1165                    kind: EventKind::Key {
1166                        tag: None,
1167                        payload: None,
1168                        kind: ScalarKind::Bare,
1169                    },
1170                });
1171            }
1172            AtomContent::Tag {
1173                name,
1174                payload,
1175                invalid_name,
1176                error_span,
1177            } => {
1178                if *invalid_name {
1179                    self.event_queue.push_back(Event {
1180                        span: error_span.unwrap_or(atom.span),
1181                        kind: EventKind::Error {
1182                            kind: ParseErrorKind::InvalidTagName,
1183                        },
1184                    });
1185                }
1186                match payload {
1187                    None => {
1188                        self.event_queue.push_back(Event {
1189                            span: atom.span,
1190                            kind: EventKind::Key {
1191                                tag: Some(name),
1192                                payload: None,
1193                                kind: ScalarKind::Bare,
1194                            },
1195                        });
1196                    }
1197                    Some(inner) => match &inner.content {
1198                        AtomContent::Scalar { value, kind } => {
1199                            if *kind == ScalarKind::Quoted {
1200                                self.emit_escape_errors(value, inner.span);
1201                            }
1202                            self.event_queue.push_back(Event {
1203                                span: atom.span,
1204                                kind: EventKind::Key {
1205                                    tag: Some(name),
1206                                    payload: Some(value.clone()),
1207                                    kind: *kind,
1208                                },
1209                            });
1210                        }
1211                        AtomContent::Unit => {
1212                            self.event_queue.push_back(Event {
1213                                span: atom.span,
1214                                kind: EventKind::Key {
1215                                    tag: Some(name),
1216                                    payload: None,
1217                                    kind: ScalarKind::Bare,
1218                                },
1219                            });
1220                        }
1221                        _ => {
1222                            self.event_queue.push_back(Event {
1223                                span: inner.span,
1224                                kind: EventKind::Error {
1225                                    kind: ParseErrorKind::InvalidKey,
1226                                },
1227                            });
1228                        }
1229                    },
1230                }
1231            }
1232            AtomContent::InvalidEscapeScalar { raw_inner } => {
1233                // Emit the escape errors at their specific positions
1234                let inner_start = atom.span.start + 1;
1235                for (offset, seq) in validate_escapes(raw_inner) {
1236                    let error_start = inner_start + offset as u32;
1237                    let error_span = Span::new(error_start, error_start + seq.len() as u32);
1238                    self.event_queue.push_back(Event {
1239                        span: error_span,
1240                        kind: EventKind::Error {
1241                            kind: ParseErrorKind::InvalidEscape(seq),
1242                        },
1243                    });
1244                }
1245                // Still emit a key event (with the partially-processed value)
1246                self.event_queue.push_back(Event {
1247                    span: atom.span,
1248                    kind: EventKind::Key {
1249                        tag: None,
1250                        payload: Some(Cow::Owned(unescape_quoted(raw_inner).into_owned())),
1251                        kind: ScalarKind::Quoted,
1252                    },
1253                });
1254            }
1255            AtomContent::Error { message } => {
1256                let kind = if message.contains("invalid tag name") {
1257                    ParseErrorKind::InvalidTagName
1258                } else {
1259                    ParseErrorKind::InvalidKey
1260                };
1261                self.event_queue.push_back(Event {
1262                    span: atom.span,
1263                    kind: EventKind::Error { kind },
1264                });
1265            }
1266            _ => {
1267                self.event_queue.push_back(Event {
1268                    span: atom.span,
1269                    kind: EventKind::Error {
1270                        kind: ParseErrorKind::InvalidKey,
1271                    },
1272                });
1273            }
1274        }
1275    }
1276
1277    /// Emit atom as value.
1278    fn emit_atom_as_value(&mut self, atom: &Atom<'src>) {
1279        match &atom.content {
1280            AtomContent::Scalar { value, kind } => {
1281                // The lexer already processed escape sequences.
1282                self.event_queue.push_back(Event {
1283                    span: atom.span,
1284                    kind: EventKind::Scalar {
1285                        value: value.clone(),
1286                        kind: *kind,
1287                    },
1288                });
1289            }
1290            AtomContent::Unit => {
1291                self.event_queue.push_back(Event {
1292                    span: atom.span,
1293                    kind: EventKind::Unit,
1294                });
1295            }
1296            AtomContent::Tag {
1297                name,
1298                payload,
1299                invalid_name,
1300                error_span,
1301            } => {
1302                if *invalid_name {
1303                    self.event_queue.push_back(Event {
1304                        span: error_span.unwrap_or(atom.span),
1305                        kind: EventKind::Error {
1306                            kind: ParseErrorKind::InvalidTagName,
1307                        },
1308                    });
1309                }
1310                self.event_queue.push_back(Event {
1311                    span: atom.span,
1312                    kind: EventKind::TagStart { name },
1313                });
1314                if let Some(inner) = payload {
1315                    self.emit_atom_as_value(inner);
1316                }
1317                self.event_queue.push_back(Event {
1318                    span: atom.span,
1319                    kind: EventKind::TagEnd,
1320                });
1321            }
1322            AtomContent::Object {
1323                entries,
1324                duplicate_key_spans,
1325                dangling_doc_comment_spans,
1326                unclosed,
1327            } => {
1328                self.event_queue.push_back(Event {
1329                    span: atom.span,
1330                    kind: EventKind::ObjectStart,
1331                });
1332
1333                if *unclosed {
1334                    self.event_queue.push_back(Event {
1335                        span: atom.span,
1336                        kind: EventKind::Error {
1337                            kind: ParseErrorKind::UnclosedObject,
1338                        },
1339                    });
1340                }
1341
1342                for (original, dup) in duplicate_key_spans {
1343                    self.event_queue.push_back(Event {
1344                        span: *dup,
1345                        kind: EventKind::Error {
1346                            kind: ParseErrorKind::DuplicateKey {
1347                                original: *original,
1348                            },
1349                        },
1350                    });
1351                }
1352
1353                for span in dangling_doc_comment_spans {
1354                    self.event_queue.push_back(Event {
1355                        span: *span,
1356                        kind: EventKind::Error {
1357                            kind: ParseErrorKind::DanglingDocComment,
1358                        },
1359                    });
1360                }
1361
1362                for entry in entries {
1363                    if let Some((span, lines)) = &entry.doc_comment {
1364                        self.event_queue.push_back(Event {
1365                            span: *span,
1366                            kind: EventKind::DocComment {
1367                                lines: lines.clone(),
1368                            },
1369                        });
1370                    }
1371                    self.event_queue.push_back(Event {
1372                        span: entry.key.span,
1373                        kind: EventKind::EntryStart,
1374                    });
1375                    self.emit_atom_as_key(&entry.key);
1376                    self.emit_atom_as_value(&entry.value);
1377
1378                    let mut end_span = entry.value.span;
1379                    if let Some(span) = entry.too_many_atoms_span {
1380                        self.event_queue.push_back(Event {
1381                            span,
1382                            kind: EventKind::Error {
1383                                kind: ParseErrorKind::TooManyAtoms,
1384                            },
1385                        });
1386                        end_span = span;
1387                    }
1388                    self.event_queue.push_back(Event {
1389                        span: end_span,
1390                        kind: EventKind::EntryEnd,
1391                    });
1392                }
1393
1394                self.event_queue.push_back(Event {
1395                    span: atom.span,
1396                    kind: EventKind::ObjectEnd,
1397                });
1398            }
1399            AtomContent::Sequence {
1400                elements,
1401                unclosed,
1402                comma_spans,
1403            } => {
1404                self.event_queue.push_back(Event {
1405                    span: atom.span,
1406                    kind: EventKind::SequenceStart,
1407                });
1408
1409                if *unclosed {
1410                    self.event_queue.push_back(Event {
1411                        span: atom.span,
1412                        kind: EventKind::Error {
1413                            kind: ParseErrorKind::UnclosedSequence,
1414                        },
1415                    });
1416                }
1417
1418                for span in comma_spans {
1419                    self.event_queue.push_back(Event {
1420                        span: *span,
1421                        kind: EventKind::Error {
1422                            kind: ParseErrorKind::CommaInSequence,
1423                        },
1424                    });
1425                }
1426
1427                for elem in elements {
1428                    self.emit_atom_as_value(elem);
1429                }
1430
1431                self.event_queue.push_back(Event {
1432                    span: atom.span,
1433                    kind: EventKind::SequenceEnd,
1434                });
1435            }
1436            AtomContent::Attributes(attrs) => {
1437                self.event_queue.push_back(Event {
1438                    span: atom.span,
1439                    kind: EventKind::ObjectStart,
1440                });
1441
1442                for attr in attrs {
1443                    self.event_queue.push_back(Event {
1444                        span: attr.key_span,
1445                        kind: EventKind::EntryStart,
1446                    });
1447                    self.event_queue.push_back(Event {
1448                        span: attr.key_span,
1449                        kind: EventKind::Key {
1450                            tag: None,
1451                            payload: Some(Cow::Borrowed(attr.key)),
1452                            kind: ScalarKind::Bare,
1453                        },
1454                    });
1455                    self.emit_atom_as_value(&attr.value);
1456                    self.event_queue.push_back(Event {
1457                        span: attr.value.span,
1458                        kind: EventKind::EntryEnd,
1459                    });
1460                }
1461
1462                self.event_queue.push_back(Event {
1463                    span: atom.span,
1464                    kind: EventKind::ObjectEnd,
1465                });
1466            }
1467            AtomContent::InvalidEscapeScalar { raw_inner } => {
1468                // Emit the escape errors at their specific positions
1469                // The span includes quotes, so offset by 1 for the opening quote
1470                let inner_start = atom.span.start + 1;
1471                for (offset, seq) in validate_escapes(raw_inner) {
1472                    let error_start = inner_start + offset as u32;
1473                    let error_span = Span::new(error_start, error_start + seq.len() as u32);
1474                    self.event_queue.push_back(Event {
1475                        span: error_span,
1476                        kind: EventKind::Error {
1477                            kind: ParseErrorKind::InvalidEscape(seq),
1478                        },
1479                    });
1480                }
1481                // Also emit the scalar value (with invalid escapes replaced/kept)
1482                self.event_queue.push_back(Event {
1483                    span: atom.span,
1484                    kind: EventKind::Scalar {
1485                        value: Cow::Owned(unescape_quoted(raw_inner).into_owned()),
1486                        kind: ScalarKind::Quoted,
1487                    },
1488                });
1489            }
1490            AtomContent::Error { message } => {
1491                let kind = if message.contains("invalid tag name") {
1492                    ParseErrorKind::InvalidTagName
1493                } else if message.contains("expected a value") {
1494                    ParseErrorKind::ExpectedValue
1495                } else {
1496                    ParseErrorKind::UnexpectedToken
1497                };
1498                self.event_queue.push_back(Event {
1499                    span: atom.span,
1500                    kind: EventKind::Error { kind },
1501                });
1502            }
1503        }
1504    }
1505
1506    /// Emit escape errors.
1507    fn emit_escape_errors(&mut self, text: &str, span: Span) {
1508        for (offset, seq) in validate_escapes(text) {
1509            let error_start = span.start + offset as u32;
1510            let error_span = Span::new(error_start, error_start + seq.len() as u32);
1511            self.event_queue.push_back(Event {
1512                span: error_span,
1513                kind: EventKind::Error {
1514                    kind: ParseErrorKind::InvalidEscape(seq),
1515                },
1516            });
1517        }
1518    }
1519}
1520
1521// ============================================================================
1522// Atom types
1523// ============================================================================
1524
/// A parsed unit of input together with the source span it covers.
///
/// Atoms are built by the `parse_*` methods and later turned into events by
/// `emit_atom_as_key` / `emit_atom_as_value`.
#[derive(Debug, Clone)]
struct Atom<'src> {
    /// Source range covered by this atom.
    span: Span,
    /// What kind of atom this is, and its data.
    content: AtomContent<'src>,
}
1530
/// The payload of an [`Atom`].
#[derive(Debug, Clone)]
enum AtomContent<'src> {
    /// A scalar value. `value` is the text after the lexer's escape
    /// processing; `kind` records how the scalar was written.
    Scalar {
        value: Cow<'src, str>,
        kind: ScalarKind,
    },
    /// The unit value. Also used as the implicit value of an object entry
    /// that consists of a key only.
    Unit,
    /// A tag, optionally followed by a payload atom.
    Tag {
        name: &'src str,
        payload: Option<Box<Atom<'src>>>,
        /// Set when `name` is not a valid tag name; emitters then report
        /// `InvalidTagName`.
        invalid_name: bool,
        /// For invalid tags, the span to use for the error. Whether it
        /// includes the leading `@` depends on where the tag was parsed.
        /// If None, uses atom.span.
        error_span: Option<Span>,
    },
    /// An object with its entries and the problems found while parsing it.
    Object {
        entries: Vec<ObjectEntry<'src>>,
        /// `(original key span, duplicate key span)` for every repeated key.
        duplicate_key_spans: Vec<(Span, Span)>,
        /// Doc comments that were not followed by an entry before the object
        /// ended.
        dangling_doc_comment_spans: Vec<Span>,
        /// True when the input ended before the closing delimiter.
        unclosed: bool,
    },
    /// A sequence with its elements and the problems found while parsing it.
    Sequence {
        elements: Vec<Atom<'src>>,
        /// True when the input ended before the closing delimiter.
        unclosed: bool,
        /// Spans of commas found inside the sequence; each is reported as
        /// `CommaInSequence`.
        comma_spans: Vec<Span>,
    },
    /// One or more `key`/value attributes; emitted as an object.
    Attributes(Vec<AttributeEntry<'src>>),
    /// A quoted scalar with invalid escape sequences.
    /// We store the raw inner text (without quotes) to scan for escape errors.
    InvalidEscapeScalar {
        raw_inner: Cow<'src, str>,
    },
    /// An error atom: either an error reported by the lexer, or a token (or
    /// end of input) found where an atom was expected. Emitters pick the
    /// error kind by inspecting `message`.
    Error {
        message: &'src str,
    },
}
1568
/// One `key value` entry of a parsed object.
#[derive(Debug, Clone)]
struct ObjectEntry<'src> {
    /// The entry's first atom.
    key: Atom<'src>,
    /// The entry's second atom, or a `Unit` atom at the key's span when the
    /// entry has no value.
    value: Atom<'src>,
    /// Doc comments preceding the entry: the span from the first to the last
    /// comment, and each comment's text with its `///` prefix stripped.
    doc_comment: Option<(Span, Vec<&'src str>)>,
    /// Span of the entry's third atom, if it has more than two; reported as
    /// `TooManyAtoms`.
    too_many_atoms_span: Option<Span>,
}
1576
/// One entry of an attributes atom: a key, the key's span, and a value.
#[derive(Debug, Clone)]
struct AttributeEntry<'src> {
    /// The attribute key text.
    key: &'src str,
    /// Source span of the key.
    key_span: Span,
    /// The attribute's value atom.
    value: Atom<'src>,
}
1583
1584// ============================================================================
1585// Key comparison
1586// ============================================================================
1587
/// Owned, hashable form of a key, used to compare keys for equality.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum KeyValue {
    /// A key identified by its text.
    Scalar(String),
    /// The unit key.
    Unit,
    /// A tagged key: the tag name plus an optional nested payload key.
    Tagged {
        /// The tag name.
        name: String,
        /// The key form of the tag's payload, if it has one.
        payload: Option<Box<KeyValue>>,
    },
}
1597
1598impl KeyValue {
1599    fn from_atom(atom: &Atom<'_>) -> Self {
1600        match &atom.content {
1601            AtomContent::Scalar { value, .. } => KeyValue::Scalar(value.to_string()),
1602            AtomContent::Unit => KeyValue::Unit,
1603            AtomContent::Tag { name, payload, .. } => KeyValue::Tagged {
1604                name: (*name).to_string(),
1605                payload: payload.as_ref().map(|p| Box::new(KeyValue::from_atom(p))),
1606            },
1607            AtomContent::Object { .. } => KeyValue::Scalar("{}".into()),
1608            AtomContent::Sequence { .. } => KeyValue::Scalar("()".into()),
1609            AtomContent::Attributes(_) => KeyValue::Scalar("{}".into()),
1610            AtomContent::InvalidEscapeScalar { raw_inner } => {
1611                // This is raw text that failed escape processing - just use it as-is
1612                KeyValue::Scalar(raw_inner.to_string())
1613            }
1614            AtomContent::Error { .. } => KeyValue::Scalar("<error>".into()),
1615        }
1616    }
1617
1618    fn to_key_string(&self) -> String {
1619        match self {
1620            KeyValue::Scalar(s) => s.clone(),
1621            KeyValue::Unit => "@".to_string(),
1622            KeyValue::Tagged { name, .. } => format!("@{}", name),
1623        }
1624    }
1625}
1626
1627// ============================================================================
1628// Path tracking (O(depth) implementation)
1629// ============================================================================
1630
/// The kind of value held at a path segment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PathValueKind {
    /// The segment may have child segments nested under it. Intermediate
    /// segments of a path are always recorded with this kind.
    Object,
    /// The segment cannot be nested into; attempting to add a child below it
    /// is reported as `PathError::NestIntoTerminal`.
    Terminal,
}
1636
/// Errors reported by `PathState::check_and_update`.
#[derive(Debug, Clone)]
enum PathError {
    /// The given path is exactly the current path; `original` is the span
    /// stored on the existing last segment.
    Duplicate { original: Span },
    /// A key that was already closed under its parent appeared again;
    /// `closed_path` is the full path to that key.
    Reopened { closed_path: Vec<String> },
    /// A child was requested below a segment whose value kind is terminal;
    /// `terminal_path` lists the keys of the current segments, ending with
    /// the terminal one.
    NestIntoTerminal { terminal_path: Vec<String> },
}
1643
/// A single segment in the current path.
///
/// Each segment tracks:
/// - The key name and where it was defined
/// - Whether it has a terminal value (can't nest into it)
/// - Which child keys have been "closed" (can't be reopened)
#[derive(Debug, Clone)]
struct PathSegment {
    /// The key name of this segment.
    key: String,
    /// Span recorded when the segment was pushed; reported as the original
    /// location in duplicate errors and stored in the parent's
    /// `closed_children` once the segment is closed.
    span: Span,
    /// Whether children may be nested under this segment.
    value_kind: PathValueKind,
    /// Keys that have been closed at this level. When we move from a.b.c to a.b.d,
    /// we add "c" to the closed_children of the "b" segment. This is O(siblings at this level)
    /// rather than O(all paths ever seen).
    closed_children: HashMap<String, Span>,
}
1660
/// Path state tracker with O(depth) memory usage.
///
/// Instead of tracking all paths ever seen (O(total paths)), we only track:
/// - The current path as a stack of segments
/// - At each segment, which sibling keys have been closed
///
/// This works because we can never go back to a previous sibling in the file order.
///
/// The `Default` value has no segments, i.e. no path has been seen yet.
#[derive(Default, Clone)]
struct PathState {
    /// The current path, as a stack of segments. Length is O(max depth).
    segments: Vec<PathSegment>,
}
1673
1674impl PathState {
1675    fn check_and_update(
1676        &mut self,
1677        path: &[String],
1678        span: Span,
1679        value_kind: PathValueKind,
1680    ) -> Result<(), PathError> {
1681        if path.is_empty() {
1682            return Ok(());
1683        }
1684
1685        // Find common prefix length with current path
1686        let common_len = self
1687            .segments
1688            .iter()
1689            .zip(path.iter())
1690            .take_while(|(seg, key)| seg.key == **key)
1691            .count();
1692
1693        // Special case: if the entire path matches, check for duplicate
1694        // This happens when we see `a 1` then `a 2` - the path ["a"] fully matches
1695        if common_len == path.len()
1696            && common_len == self.segments.len()
1697            && !self.segments.is_empty()
1698        {
1699            // Exact same path - this is a duplicate
1700            return Err(PathError::Duplicate {
1701                original: self.segments.last().unwrap().span,
1702            });
1703        }
1704
1705        // Close segments beyond common prefix and check for reopening
1706        // We iterate from deepest to shallowest
1707        while self.segments.len() > common_len {
1708            let closed_segment = self.segments.pop().unwrap();
1709
1710            // Add this key to parent's closed_children (if there is a parent)
1711            if let Some(parent) = self.segments.last_mut() {
1712                parent
1713                    .closed_children
1714                    .insert(closed_segment.key, closed_segment.span);
1715            }
1716        }
1717
1718        // Now process each new segment of the path
1719        for (i, key) in path.iter().enumerate().skip(common_len) {
1720            let is_last = i == path.len() - 1;
1721            let segment_value_kind = if is_last {
1722                value_kind
1723            } else {
1724                PathValueKind::Object
1725            };
1726
1727            if i == common_len && common_len < self.segments.len() {
1728                // This case shouldn't happen after the while loop above, but handle defensively
1729                unreachable!("segments should have been truncated");
1730            }
1731
1732            if i < self.segments.len() {
1733                // We're on the same path segment - check for exact duplicate
1734                let existing = &self.segments[i];
1735                if existing.key == *key && is_last {
1736                    return Err(PathError::Duplicate {
1737                        original: existing.span,
1738                    });
1739                }
1740            } else if i == 0 {
1741                // Root level - no parent to check
1742                // Check if we already have a root segment with this key
1743                if !self.segments.is_empty() && self.segments[0].key == *key {
1744                    if is_last {
1745                        return Err(PathError::Duplicate {
1746                            original: self.segments[0].span,
1747                        });
1748                    }
1749                    // Continue using existing segment
1750                    continue;
1751                }
1752                // New root segment
1753                self.segments.push(PathSegment {
1754                    key: key.clone(),
1755                    span,
1756                    value_kind: segment_value_kind,
1757                    closed_children: HashMap::new(),
1758                });
1759            } else {
1760                // Check parent's closed_children for reopening
1761                let parent = &self.segments[i - 1];
1762
1763                // Check if parent is terminal (can't nest into it)
1764                if parent.value_kind == PathValueKind::Terminal {
1765                    return Err(PathError::NestIntoTerminal {
1766                        terminal_path: self.segments.iter().map(|s| s.key.clone()).collect(),
1767                    });
1768                }
1769
1770                // Check if this key was already closed at this level
1771                if parent.closed_children.contains_key(key) {
1772                    return Err(PathError::Reopened {
1773                        closed_path: self.segments[..i]
1774                            .iter()
1775                            .map(|s| s.key.clone())
1776                            .chain(std::iter::once(key.clone()))
1777                            .collect(),
1778                    });
1779                }
1780
1781                // Add new segment
1782                self.segments.push(PathSegment {
1783                    key: key.clone(),
1784                    span,
1785                    value_kind: segment_value_kind,
1786                    closed_children: HashMap::new(),
1787                });
1788            }
1789        }
1790
1791        // Update the value_kind of the last segment to match what was passed in
1792        if let Some(last) = self.segments.last_mut() {
1793            last.value_kind = value_kind;
1794        }
1795
1796        Ok(())
1797    }
1798}
1799
1800// ============================================================================
1801// Helpers
1802// ============================================================================
1803
/// Reports whether `name` is an acceptable tag name.
///
/// A valid name is non-empty, starts with an ASCII letter or `_`, and
/// continues with ASCII letters, ASCII digits, `_` or `-` only.
fn is_valid_tag_name(name: &str) -> bool {
    let mut rest = name.chars();
    let leading_ok = matches!(
        rest.next(),
        Some(first) if first == '_' || first.is_ascii_alphabetic()
    );
    leading_ok && rest.all(|ch| matches!(ch, '_' | '-') || ch.is_ascii_alphanumeric())
}
1812
/// Resolves backslash escapes in the inner text of a quoted scalar.
///
/// Text without any backslash is returned borrowed. Otherwise an owned
/// string is built in which:
///
/// * `\n`, `\r`, `\t`, `\\` and `\"` become the character they denote;
/// * `\u{…}` and `\uXXXX` (exactly four hex digits) become the denoted
///   character when the digits parse to a valid `char`, and are dropped
///   otherwise;
/// * `\u` followed by anything else is dropped (the following character is
///   kept as ordinary text);
/// * any other escape, and a trailing lone backslash, are kept verbatim.
fn unescape_quoted(text: &str) -> Cow<'_, str> {
    /// Parses `digits` as hexadecimal and converts the result to a `char`.
    fn decode(digits: &str) -> Option<char> {
        u32::from_str_radix(digits, 16).ok().and_then(char::from_u32)
    }

    // Nothing to unescape: hand the input back without allocating.
    if !text.contains('\\') {
        return Cow::Borrowed(text);
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text.chars().peekable();

    while let Some(current) = rest.next() {
        if current != '\\' {
            out.push(current);
            continue;
        }

        match rest.next() {
            None => out.push('\\'),
            Some('n') => out.push('\n'),
            Some('r') => out.push('\r'),
            Some('t') => out.push('\t'),
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('u') => {
                if rest.peek() == Some(&'{') {
                    rest.next();
                    // Everything up to the closing brace (which is consumed
                    // too) or the end of input.
                    let digits: String = rest.by_ref().take_while(|&ch| ch != '}').collect();
                    if let Some(decoded) = decode(&digits) {
                        out.push(decoded);
                    }
                } else if rest.peek().is_some_and(|ch| ch.is_ascii_hexdigit()) {
                    // Take up to four hex digits; a shorter run is consumed
                    // but produces nothing.
                    let mut digits = String::with_capacity(4);
                    while digits.len() < 4 {
                        match rest.next_if(|ch| ch.is_ascii_hexdigit()) {
                            Some(digit) => digits.push(digit),
                            None => break,
                        }
                    }
                    if digits.len() == 4 {
                        if let Some(decoded) = decode(&digits) {
                            out.push(decoded);
                        }
                    }
                }
            }
            Some(other) => {
                out.push('\\');
                out.push(other);
            }
        }
    }

    Cow::Owned(out)
}
1881
/// Scans the raw inner text of a quoted scalar for malformed escape sequences.
///
/// Returns one `(byte_offset, sequence)` pair per invalid escape, in source
/// order, where `byte_offset` is the position of the backslash within `text`
/// and `sequence` is the offending text starting at that backslash.
///
/// Accepted escapes are `\n`, `\r`, `\t`, `\\`, `\"`, `\uXXXX` with exactly
/// four hex digits, and `\u{…}` with a closing brace and only hex digits
/// inside. The braced form is not checked for digit count or for denoting a
/// valid character.
///
/// For a bad `\u{…}` the reported sequence is capped at 12 bytes, shortened
/// further if needed so the cut never falls inside a multi-byte character.
fn validate_escapes(text: &str) -> Vec<(usize, String)> {
    let mut errors = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((i, c)) = chars.next() {
        if c != '\\' {
            continue;
        }

        let escape_start = i;
        match chars.next() {
            Some((_, 'n' | 'r' | 't' | '\\' | '"')) => {}
            Some((_, 'u')) => match chars.peek() {
                Some((_, '{')) => {
                    chars.next();
                    let mut valid = true;
                    let mut found_close = false;
                    for (_, c) in chars.by_ref() {
                        if c == '}' {
                            found_close = true;
                            break;
                        }
                        if !c.is_ascii_hexdigit() {
                            valid = false;
                        }
                    }
                    if !found_close || !valid {
                        let end = chars.peek().map(|(i, _)| *i).unwrap_or(text.len());
                        // Cap the reported text, then back up to a character
                        // boundary: slicing mid-character would panic.
                        // `escape_start` is a boundary, so this terminates.
                        let mut cut = end.min(escape_start + 12);
                        while !text.is_char_boundary(cut) {
                            cut -= 1;
                        }
                        errors.push((escape_start, text[escape_start..cut].to_string()));
                    }
                }
                Some((_, c)) if c.is_ascii_hexdigit() => {
                    // The digit seen above was only peeked, not consumed, so
                    // counting starts at zero.
                    let mut count = 0;
                    while count < 4 {
                        match chars.peek() {
                            Some((_, c)) if c.is_ascii_hexdigit() => {
                                chars.next();
                                count += 1;
                            }
                            _ => break,
                        }
                    }
                    if count != 4 {
                        let end = chars.peek().map(|(i, _)| *i).unwrap_or(text.len());
                        // `end` comes from `char_indices` (or is the text
                        // length), so it is always a character boundary.
                        errors.push((escape_start, text[escape_start..end].to_string()));
                    }
                }
                _ => {
                    errors.push((escape_start, "\\u".to_string()));
                }
            },
            Some((_, c)) => {
                errors.push((escape_start, format!("\\{}", c)));
            }
            None => {
                errors.push((escape_start, "\\".to_string()));
            }
        }
    }

    errors
}
1944
1945#[cfg(test)]
1946mod tests;