Skip to main content

rlsp_yaml_parser/
loader.rs

1// SPDX-License-Identifier: MIT
2
3//! Event-to-AST loader.
4//!
5//! Consumes the event stream from [`crate::parse_events`] and builds a
6//! `Vec<Document<Span>>`.
7//!
8//! Two modes are available:
9//! - **Lossless** (default): alias references are kept as [`Node::Alias`]
10//!   nodes — no expansion, safe for untrusted input without any expansion
11//!   limit.
12//! - **Resolved**: aliases are expanded inline.  An expansion-node counter
13//!   guards against alias bombs (Billion Laughs attack).
14//!
15//! Security controls (all active in both modes unless noted):
16//! - `max_nesting_depth` — caps sequence/mapping nesting to prevent stack
17//!   exhaustion (default 512).
18//! - `max_anchors` — caps distinct anchor registrations to bound anchor-map
19//!   memory (default 10 000).
20//! - `max_expanded_nodes` — caps total nodes produced by alias expansion in
21//!   resolved mode only (default 1 000 000).
22//!
23//! # Accepted risks
24//!
25//! `expand_node` does not detect the case where an anchor-within-expansion
26//! references a previously defined anchor, forming an indirect cycle not
27//! caught by the `in_progress` set until the second traversal.  This
28//! limitation exists in the old loader and is acceptable in the LSP context
29//! where Lossless mode is the default.  The `expanded_nodes` volume limit
30//! provides the backstop.
31
32use std::borrow::Cow;
33use std::collections::{HashMap, HashSet};
34use std::iter::Peekable;
35
36use std::sync::Arc;
37
38use crate::error::Error;
39use crate::event::{Event, EventMeta, ScalarStyle};
40use crate::node::{Document, Node, NodeMeta};
41use crate::pos::{LineIndex, Pos, Span};
42use crate::schema::{CollectionKind, Schema, resolve_collection, resolve_scalar};
43
44use comments::{attach_leading_comments, attach_trailing_comment};
45use reloc::reloc;
46use stream::{
47    consume_leading_comments, consume_leading_doc_comments, next_from, peek_trailing_comment,
48    with_hash_prefix,
49};
50
51mod comments;
52mod reloc;
53mod stream;
54
55// ---------------------------------------------------------------------------
56// Public error type
57// ---------------------------------------------------------------------------
58
/// Errors produced by the loader.
///
/// Covers parse failures forwarded from the event stream, violations of the
/// configured security limits, alias-resolution failures, and plain scalars
/// that cannot be resolved under the JSON schema.  The `Display` text of each
/// variant comes from its `#[error(...)]` attribute.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum LoadError {
    /// The event stream contained a parse error.
    #[error("parse error at {pos:?}: {message}")]
    Parse {
        /// Source position where the parse error was detected.
        pos: Pos,
        /// Human-readable description of the error.
        message: String,
    },

    /// The event stream ended unexpectedly mid-document.
    #[error("unexpected end of event stream")]
    UnexpectedEndOfStream,

    /// Nesting depth exceeded the configured limit.
    #[error("nesting depth limit exceeded (max: {limit})")]
    NestingDepthLimitExceeded {
        /// The configured nesting depth limit that was exceeded.
        limit: usize,
    },

    /// Too many distinct anchor names were defined.
    #[error("anchor count limit exceeded (max: {limit})")]
    AnchorCountLimitExceeded {
        /// The configured anchor count limit that was exceeded.
        limit: usize,
    },

    /// Alias expansion produced more nodes than the configured limit.
    #[error("alias expansion node limit exceeded (max: {limit})")]
    AliasExpansionLimitExceeded {
        /// The configured expansion node limit that was exceeded.
        limit: usize,
    },

    /// A circular alias reference was detected.
    #[error("circular alias reference: '{name}'")]
    CircularAlias {
        /// The anchor name involved in the cycle.
        name: String,
    },

    /// An alias referred to an anchor that was never defined.
    #[error("undefined alias: '{name}'")]
    UndefinedAlias {
        /// The alias name that had no corresponding anchor definition.
        name: String,
    },

    /// A plain scalar could not be resolved under the JSON schema.
    ///
    /// The JSON schema has no fallback: every untagged plain scalar must match
    /// one of its patterns (null, bool, int, float).  If none match, the scalar
    /// is an error per YAML 1.2.2 §10.2.
    ///
    /// `value` is truncated to 128 Unicode scalar values and ASCII control
    /// characters (U+0000–U+001F, U+007F) are replaced with `\uXXXX` escapes
    /// to prevent log injection via the `Display` impl.
    // NOTE(review): the message below interpolates neither `value` nor `pos`,
    // so `Display` never prints the scalar itself — confirm this is intended
    // given the sanitization contract described above.
    #[error("JSON schema: plain scalar does not match any type pattern")]
    UnresolvedScalar {
        /// The sanitized, truncated scalar value that failed resolution.
        value: String,
        /// Source position of the scalar.
        pos: Pos,
    },
}
127
// Convenience alias used inside the module: every fallible loader routine
// reports its failure as a `LoadError`.
type Result<T> = std::result::Result<T, LoadError>;

// Type alias for the peekable event stream used throughout the loader.
// Each item is either an `(Event, Span)` pair or a parser `Error`.  The
// iterator is boxed as a trait object and wrapped in `Peekable` so the loader
// can inspect the next event without consuming it.
type EventStream<'a> =
    Peekable<Box<dyn Iterator<Item = std::result::Result<(Event<'a>, Span), Error>> + 'a>>;
134
135/// Unpack an `Option<Box<EventMeta>>` into its four constituent fields.
136#[expect(
137    clippy::type_complexity,
138    reason = "four-tuple mirrors EventMeta fields; extracting a type alias here would obscure the one-to-one correspondence"
139)]
140#[inline]
141fn unpack_meta(
142    meta: Option<Box<EventMeta<'_>>>,
143) -> (
144    Option<&'_ str>,
145    Option<Span>,
146    Option<std::borrow::Cow<'_, str>>,
147    Option<Span>,
148) {
149    meta.map_or((None, None, None, None), |m| {
150        (m.anchor, m.anchor_loc, m.tag, m.tag_loc)
151    })
152}
153
154// ---------------------------------------------------------------------------
155// Configuration
156// ---------------------------------------------------------------------------
157
/// Loader mode — controls how alias references are handled.
///
/// The mode is stored in [`LoaderOptions::mode`]; [`LoaderOptions::default`]
/// selects [`LoadMode::Lossless`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LoadMode {
    /// Preserve aliases as [`Node::Alias`] nodes (default, safe for LSP).
    Lossless,
    /// Expand aliases inline; subject to `max_expanded_nodes` limit.
    Resolved,
}
166
167/// Security and behaviour options for the loader.
168#[derive(Debug, Clone)]
169pub struct LoaderOptions {
170    /// Maximum mapping/sequence nesting depth before returning
171    /// [`LoadError::NestingDepthLimitExceeded`] (default: 512).
172    pub max_nesting_depth: usize,
173    /// Maximum number of distinct anchor names per document before returning
174    /// [`LoadError::AnchorCountLimitExceeded`] (default: 10 000).
175    pub max_anchors: usize,
176    /// Maximum total nodes produced by alias expansion in resolved mode before
177    /// returning [`LoadError::AliasExpansionLimitExceeded`] (default: 1 000 000).
178    pub max_expanded_nodes: usize,
179    /// Controls how alias references are handled during loading.
180    pub mode: LoadMode,
181    /// YAML 1.2.2 §10 schema to apply during loading (default: [`Schema::Core`]).
182    ///
183    /// Each node's tag is resolved according to this schema after the node is
184    /// constructed.  Nodes with explicit source tags are left unchanged.
185    pub schema: Schema,
186}
187
188impl Default for LoaderOptions {
189    fn default() -> Self {
190        Self {
191            max_nesting_depth: 512,
192            max_anchors: 10_000,
193            max_expanded_nodes: 1_000_000,
194            mode: LoadMode::Lossless,
195            schema: Schema::Core,
196        }
197    }
198}
199
200// ---------------------------------------------------------------------------
201// Builder
202// ---------------------------------------------------------------------------
203
204/// Builder for configuring and creating a [`Loader`].
205///
206/// ```
207/// use rlsp_yaml_parser::loader::LoaderBuilder;
208///
209/// let docs = LoaderBuilder::new().lossless().build().load("hello\n").unwrap();
210/// assert_eq!(docs.len(), 1);
211/// ```
212pub struct LoaderBuilder {
213    options: LoaderOptions,
214}
215
216impl LoaderBuilder {
217    /// Create a builder with default options (lossless mode, safe limits).
218    #[must_use]
219    pub fn new() -> Self {
220        Self {
221            options: LoaderOptions::default(),
222        }
223    }
224
225    /// Use lossless mode — aliases become [`Node::Alias`] nodes.
226    #[must_use]
227    pub const fn lossless(mut self) -> Self {
228        self.options.mode = LoadMode::Lossless;
229        self
230    }
231
232    /// Use resolved mode — aliases are expanded inline.
233    #[must_use]
234    pub const fn resolved(mut self) -> Self {
235        self.options.mode = LoadMode::Resolved;
236        self
237    }
238
239    /// Override the maximum nesting depth.
240    #[must_use]
241    pub const fn max_nesting_depth(mut self, limit: usize) -> Self {
242        self.options.max_nesting_depth = limit;
243        self
244    }
245
246    /// Override the maximum anchor count.
247    #[must_use]
248    pub const fn max_anchors(mut self, limit: usize) -> Self {
249        self.options.max_anchors = limit;
250        self
251    }
252
253    /// Override the maximum expanded-node count (resolved mode only).
254    #[must_use]
255    pub const fn max_expanded_nodes(mut self, limit: usize) -> Self {
256        self.options.max_expanded_nodes = limit;
257        self
258    }
259
260    /// Override the YAML 1.2.2 §10 schema used for tag resolution during loading.
261    ///
262    /// The default is [`Schema::Core`].  Untagged nodes receive resolved tag URIs
263    /// in the AST; nodes with explicit source tags are not modified.
264    #[must_use]
265    pub const fn schema(mut self, s: Schema) -> Self {
266        self.options.schema = s;
267        self
268    }
269
270    /// Consume the builder and produce a [`Loader`].
271    #[must_use]
272    pub const fn build(self) -> Loader {
273        Loader {
274            options: self.options,
275        }
276    }
277}
278
279impl Default for LoaderBuilder {
280    fn default() -> Self {
281        Self::new()
282    }
283}
284
285// ---------------------------------------------------------------------------
286// Loader
287// ---------------------------------------------------------------------------
288
289/// A configured YAML loader.
290pub struct Loader {
291    options: LoaderOptions,
292}
293
294impl Loader {
295    /// Load YAML text into a sequence of documents.
296    ///
297    /// # Errors
298    ///
299    /// Returns `Err` if the input contains a parse error, exceeds a configured
300    /// security limit, or (in resolved mode) references an undefined anchor.
301    pub fn load(&self, input: &str) -> std::result::Result<Vec<Document<Span>>, LoadError> {
302        let mut state = LoadState::new(&self.options, input);
303        let iter: Box<dyn Iterator<Item = std::result::Result<(Event<'_>, Span), Error>> + '_> =
304            Box::new(crate::parse_events(input));
305        state.run(iter.peekable())
306    }
307}
308
309// ---------------------------------------------------------------------------
310// Convenience entry point
311// ---------------------------------------------------------------------------
312
313/// Load YAML text using lossless mode, default security limits, and Core schema tag
314/// resolution (YAML 1.2.2 §10.3).
315///
316/// Returns one `Document<Span>` per YAML document in the stream.  Untagged nodes
317/// receive resolved tag URIs according to the Core schema; nodes with explicit source
318/// tags are left unchanged.
319///
320/// # Errors
321///
322/// Returns `Err` if the input contains a parse error or exceeds a security
323/// limit (nesting depth or anchor count).
324///
325/// ```
326/// use rlsp_yaml_parser::loader::load;
327/// use rlsp_yaml_parser::Node;
328///
329/// let docs = load("hello\n").unwrap();
330/// assert_eq!(docs.len(), 1);
331/// let Node::Scalar { tag, .. } = &docs[0].root else { panic!() };
332/// assert_eq!(tag.as_deref(), Some("tag:yaml.org,2002:str"));
333/// ```
334pub fn load(input: &str) -> std::result::Result<Vec<Document<Span>>, LoadError> {
335    LoaderBuilder::new().lossless().build().load(input)
336}
337
338// ---------------------------------------------------------------------------
339// Internal loader state
340// ---------------------------------------------------------------------------
341
/// Mutable bookkeeping for one `Loader::load` call.
///
/// Borrows the loader's options for `'opt` and tracks the counters that
/// enforce the configured limits.  `reset_for_document` clears the anchor
/// map, the anchor and expansion counters, and `pending_leading` at each
/// document start; `depth` and `line_index` are not reset there.
struct LoadState<'opt> {
    /// Options of the owning loader (limits, mode, schema).
    options: &'opt LoaderOptions,
    /// Anchors registered so far in the current document: name → node.
    /// In lossless mode the stored node is only a placeholder (see
    /// `register_anchor`).
    anchor_map: HashMap<String, Node<Span>>,
    /// Count of distinct anchors registered (resets per document).
    anchor_count: usize,
    /// Current nesting depth (incremented on `MappingStart`/`SequenceStart`,
    /// decremented once the matching collection has been parsed).
    depth: usize,
    /// Total nodes produced via alias expansion (resolved mode only).
    expanded_nodes: usize,
    /// Leading comments accumulated by `parse_node` when it encounters a
    /// `Comment` event between a mapping key and its value's collection start,
    /// or by a sequence/mapping loop when it hits End with leftover leading
    /// comments.  The next mapping/sequence loop iteration picks these up and
    /// prepends them to the next entry's leading comments.
    pending_leading: Vec<String>,
    /// Line index for the current document source; shared across all documents
    /// produced from the same input via `Arc` to avoid N full copies.
    line_index: Arc<LineIndex>,
}
362
363impl<'opt> LoadState<'opt> {
364    fn new(options: &'opt LoaderOptions, input: &str) -> Self {
365        Self {
366            options,
367            anchor_map: HashMap::new(),
368            anchor_count: 0,
369            depth: 0,
370            expanded_nodes: 0,
371            pending_leading: Vec::new(),
372            line_index: Arc::new(LineIndex::new(input)),
373        }
374    }
375
376    fn reset_for_document(&mut self) {
377        self.anchor_map.clear();
378        self.anchor_count = 0;
379        self.expanded_nodes = 0;
380        self.pending_leading.clear();
381    }
382
383    fn run(&mut self, mut stream: EventStream<'_>) -> Result<Vec<Document<Span>>> {
384        let mut docs: Vec<Document<Span>> = Vec::new();
385
386        // Skip StreamStart.
387        match stream.next() {
388            Some(Ok(_)) | None => {}
389            Some(Err(e)) => {
390                return Err(LoadError::Parse {
391                    pos: e.pos,
392                    message: e.message,
393                });
394            }
395        }
396
397        loop {
398            // Skip any leading comments or unknown events before a document.
399            match next_from(&mut stream)? {
400                None | Some((Event::StreamEnd, _)) => break,
401                Some((
402                    Event::DocumentStart {
403                        explicit,
404                        version,
405                        tag_directives,
406                    },
407                    _,
408                )) => {
409                    let doc_explicit_start = explicit;
410                    let doc_version = version;
411                    let doc_tags = tag_directives;
412                    self.reset_for_document();
413
414                    let mut doc_comments: Vec<String> = Vec::new();
415
416                    // Consume leading comments at document level.
417                    consume_leading_doc_comments(&mut stream, &mut doc_comments, &self.line_index)?;
418
419                    // Parse root node (may be absent for empty documents).
420                    let root = if is_document_end(stream.peek()) {
421                        // Empty document — emit an empty scalar as root.
422                        let mut node = empty_scalar();
423                        apply_schema_to_node(&mut node, self.options.schema, &self.line_index)?;
424                        node
425                    } else {
426                        self.parse_node(&mut stream)?
427                    };
428
429                    // Consume DocumentEnd if present and capture its explicit flag.
430                    let doc_explicit_end =
431                        if let Some(Ok((Event::DocumentEnd { explicit }, _))) = stream.peek() {
432                            let end_explicit = *explicit;
433                            let _ = stream.next();
434                            end_explicit
435                        } else {
436                            false
437                        };
438
439                    docs.push(Document {
440                        root,
441                        version: doc_version,
442                        tags: doc_tags,
443                        comments: doc_comments,
444                        explicit_start: doc_explicit_start,
445                        explicit_end: doc_explicit_end,
446                        line_index: Some(self.line_index.clone()),
447                    });
448                }
449                Some(_) => {
450                    // Comment or any other stray event outside a document — skip.
451                }
452            }
453        }
454
455        Ok(docs)
456    }
457
458    /// Parse a single node from the stream.
459    ///
460    /// Advances the stream past the node (including end-of-container events).
461    #[expect(
462        clippy::too_many_lines,
463        reason = "match-on-event-type; splitting would obscure flow"
464    )]
465    fn parse_node(&mut self, stream: &mut EventStream<'_>) -> Result<Node<Span>> {
466        // Structural end events close the caller's collection loop — do NOT
467        // consume them here.  Return an empty scalar and leave the event in
468        // the stream so the outer mapping/sequence loop can see and consume it.
469        if matches!(
470            stream.peek(),
471            Some(Ok((
472                Event::MappingEnd | Event::SequenceEnd | Event::DocumentEnd { .. },
473                _
474            )))
475        ) {
476            return Ok(empty_scalar());
477        }
478
479        let Some((event, span)) = next_from(stream)? else {
480            return Ok(empty_scalar());
481        };
482
483        match event {
484            Event::Scalar { value, style, meta } => {
485                let (anchor, anchor_loc, tag, tag_loc) = unpack_meta(meta);
486                let anchor = anchor.map(str::to_owned);
487                let mut node = Node::Scalar {
488                    value: value.into_owned(),
489                    style,
490                    tag: tag.map(|t| Cow::Owned(t.into_owned())),
491                    loc: span,
492                    meta: NodeMeta {
493                        anchor,
494                        anchor_loc,
495                        tag_loc,
496                        leading_comments: None,
497                        trailing_comment: None,
498                    }
499                    .into_option(),
500                };
501                apply_schema_to_node(&mut node, self.options.schema, &self.line_index)?;
502                if let Some(name) = node.anchor() {
503                    self.register_anchor(name.to_owned(), &node)?;
504                }
505                Ok(node)
506            }
507
508            Event::MappingStart { style, meta } => {
509                let (event_anchor, anchor_loc, event_tag, tag_loc) = unpack_meta(meta);
510                let anchor = event_anchor.map(str::to_owned);
511                let tag = event_tag.map(|t| Cow::Owned(t.into_owned()));
512                let anchor_for_registration = anchor.clone();
513
514                self.depth += 1;
515                if self.depth > self.options.max_nesting_depth {
516                    return Err(LoadError::NestingDepthLimitExceeded {
517                        limit: self.options.max_nesting_depth,
518                    });
519                }
520
521                let mut entries: Vec<(Node<Span>, Node<Span>)> = Vec::new();
522                let mut end_span = span;
523
524                loop {
525                    // Consume leading comments before the next key.  Also
526                    // collect any comments that spilled over from a sibling
527                    // value's collection end (stored in `pending_leading`).
528                    let raw_leading = consume_leading_comments(stream)?;
529                    let leading = if self.pending_leading.is_empty() {
530                        raw_leading
531                    } else {
532                        let mut combined = std::mem::take(&mut self.pending_leading);
533                        combined.extend(raw_leading);
534                        combined
535                    };
536
537                    match stream.peek() {
538                        None | Some(Ok((Event::MappingEnd | Event::StreamEnd, _))) => {
539                            // Save any collected leading comments so the next
540                            // sibling entry in the parent collection can inherit
541                            // them (e.g. a comment just before MappingEnd that
542                            // belongs to the following mapping entry).
543                            if !leading.is_empty() {
544                                self.pending_leading = leading;
545                            }
546                            break;
547                        }
548                        Some(Err(_)) => {
549                            // Consume the error.
550                            return Err(match stream.next() {
551                                Some(Err(e)) => LoadError::Parse {
552                                    pos: e.pos,
553                                    message: e.message,
554                                },
555                                _ => LoadError::UnexpectedEndOfStream,
556                            });
557                        }
558                        Some(Ok(_)) => {}
559                    }
560
561                    let mut key = self.parse_node(stream)?;
562                    attach_leading_comments(&mut key, leading);
563
564                    let mut value = self.parse_node(stream)?;
565
566                    // Trailing comment on the value — peek for inline comment.
567                    // Block scalars (literal `|` and folded `>`) consume trailing
568                    // blank lines as part of chomping; their span.end falls on the
569                    // first line after the scalar, which can coincide with the
570                    // next comment's line number.  That would falsely attach a
571                    // leading inter-node comment as a trailing inline comment.
572                    // Block scalars never have an inline comment on their content
573                    // lines, so skip trailing-comment detection for them.
574                    if !is_block_scalar(&value)
575                        && matches!(stream.peek(), Some(Ok((Event::Comment { .. }, _))))
576                    {
577                        let value_end_line = node_end_line(&value, &self.line_index);
578                        if let Some(trail) =
579                            peek_trailing_comment(stream, value_end_line, &self.line_index)?
580                        {
581                            attach_trailing_comment(&mut value, trail);
582                        }
583                    }
584
585                    entries.push((key, value));
586                }
587
588                // Consume MappingEnd and capture its span.
589                if let Some(Ok((Event::MappingEnd, end))) = stream.peek() {
590                    end_span = *end;
591                    let _ = stream.next();
592                }
593                self.depth -= 1;
594
595                let mut node = Node::Mapping {
596                    entries,
597                    style,
598                    tag,
599                    loc: Span {
600                        start: span.start,
601                        end: end_span.end,
602                    },
603                    meta: NodeMeta {
604                        anchor,
605                        anchor_loc,
606                        tag_loc,
607                        leading_comments: None,
608                        trailing_comment: None,
609                    }
610                    .into_option(),
611                };
612                apply_schema_to_node(&mut node, self.options.schema, &self.line_index)?;
613                if let Some(name) = anchor_for_registration {
614                    self.register_anchor(name, &node)?;
615                }
616                Ok(node)
617            }
618
619            Event::SequenceStart { style, meta } => {
620                let (event_anchor, anchor_loc, event_tag, tag_loc) = unpack_meta(meta);
621                let anchor = event_anchor.map(str::to_owned);
622                let tag = event_tag.map(|t| Cow::Owned(t.into_owned()));
623                let anchor_for_registration = anchor.clone();
624
625                self.depth += 1;
626                if self.depth > self.options.max_nesting_depth {
627                    return Err(LoadError::NestingDepthLimitExceeded {
628                        limit: self.options.max_nesting_depth,
629                    });
630                }
631
632                let mut items: Vec<Node<Span>> = Vec::new();
633                let mut end_span = span;
634
635                loop {
636                    // Collect leading comments before the next item.  Also
637                    // collect any comments that spilled over from a sibling
638                    // value's collection end (stored in `pending_leading`).
639                    let raw_leading = consume_leading_comments(stream)?;
640                    let leading = if self.pending_leading.is_empty() {
641                        raw_leading
642                    } else {
643                        let mut combined = std::mem::take(&mut self.pending_leading);
644                        combined.extend(raw_leading);
645                        combined
646                    };
647
648                    match stream.peek() {
649                        None | Some(Ok((Event::SequenceEnd | Event::StreamEnd, _))) => {
650                            // Save any collected leading comments so the next
651                            // sibling entry in the parent collection can inherit
652                            // them (e.g. a comment just before SequenceEnd that
653                            // belongs to the following sequence item or mapping
654                            // entry in the parent).
655                            if !leading.is_empty() {
656                                self.pending_leading = leading;
657                            }
658                            break;
659                        }
660                        Some(Err(_)) => {
661                            // Consume the error.
662                            return Err(match stream.next() {
663                                Some(Err(e)) => LoadError::Parse {
664                                    pos: e.pos,
665                                    message: e.message,
666                                },
667                                _ => LoadError::UnexpectedEndOfStream,
668                            });
669                        }
670                        Some(Ok(_)) => {}
671                    }
672
673                    let mut item = self.parse_node(stream)?;
674                    attach_leading_comments(&mut item, leading);
675
676                    // Trailing comment on the item — peek for inline comment.
677                    // Block scalars are excluded for the same reason as in the
678                    // mapping path: their span.end can coincide with the next
679                    // comment's line, falsely turning a leading comment into a
680                    // trailing one.
681                    if !is_block_scalar(&item)
682                        && matches!(stream.peek(), Some(Ok((Event::Comment { .. }, _))))
683                    {
684                        let item_end_line = node_end_line(&item, &self.line_index);
685                        if let Some(trail) =
686                            peek_trailing_comment(stream, item_end_line, &self.line_index)?
687                        {
688                            attach_trailing_comment(&mut item, trail);
689                        }
690                    }
691
692                    items.push(item);
693                }
694
695                // Consume SequenceEnd and capture its span.
696                if let Some(Ok((Event::SequenceEnd, end))) = stream.peek() {
697                    end_span = *end;
698                    let _ = stream.next();
699                }
700                self.depth -= 1;
701
702                let mut node = Node::Sequence {
703                    items,
704                    style,
705                    tag,
706                    loc: Span {
707                        start: span.start,
708                        end: end_span.end,
709                    },
710                    meta: NodeMeta {
711                        anchor,
712                        anchor_loc,
713                        tag_loc,
714                        leading_comments: None,
715                        trailing_comment: None,
716                    }
717                    .into_option(),
718                };
719                apply_schema_to_node(&mut node, self.options.schema, &self.line_index)?;
720                if let Some(name) = anchor_for_registration {
721                    self.register_anchor(name, &node)?;
722                }
723                Ok(node)
724            }
725
726            Event::Alias { name } => {
727                let name = name.to_owned();
728                self.resolve_alias(&name, span)
729            }
730
731            Event::Comment { text } => {
732                // Comment between a mapping key and its collection value (e.g.
733                // `key:\n  # comment\n  subkey: val`).  The comment appears
734                // after the key Scalar and before the MappingStart/SequenceStart
735                // that begins the value.  Save it in `pending_leading` so the
736                // first entry of the upcoming collection can inherit it.
737                self.pending_leading.push(with_hash_prefix(text));
738                self.parse_node(stream)
739            }
740
741            Event::StreamStart
742            | Event::StreamEnd
743            | Event::DocumentStart { .. }
744            | Event::DocumentEnd { .. }
745            | Event::MappingEnd
746            | Event::SequenceEnd => {
747                // Structural event where a node is expected — return empty scalar.
748                Ok(empty_scalar())
749            }
750        }
751    }
752
753    fn register_anchor(&mut self, name: String, node: &Node<Span>) -> Result<()> {
754        if !self.anchor_map.contains_key(&name) {
755            self.anchor_count += 1;
756            if self.anchor_count > self.options.max_anchors {
757                return Err(LoadError::AnchorCountLimitExceeded {
758                    limit: self.options.max_anchors,
759                });
760            }
761        }
762        // Count the anchor node itself toward the expansion budget in resolved
763        // mode so that the total reflects every node present in the expanded
764        // document (anchor definition + each alias expansion).
765        if self.options.mode == LoadMode::Resolved {
766            self.expanded_nodes += 1;
767            if self.expanded_nodes > self.options.max_expanded_nodes {
768                return Err(LoadError::AliasExpansionLimitExceeded {
769                    limit: self.options.max_expanded_nodes,
770                });
771            }
772            self.anchor_map.insert(name, node.clone());
773        } else {
774            // Lossless mode never reads anchor_map for expansion; store a
775            // zero-cost placeholder so contains_key still detects re-definitions.
776            self.anchor_map.insert(name, empty_scalar());
777        }
778        Ok(())
779    }
780
781    fn resolve_alias(&mut self, name: &str, loc: Span) -> Result<Node<Span>> {
782        match self.options.mode {
783            LoadMode::Lossless => Ok(Node::Alias {
784                name: name.to_owned(),
785                loc,
786                leading_comments: None,
787                trailing_comment: None,
788            }),
789            LoadMode::Resolved => {
790                let anchored = self.anchor_map.get(name).cloned().ok_or_else(|| {
791                    LoadError::UndefinedAlias {
792                        name: name.to_owned(),
793                    }
794                })?;
795                let mut in_progress: HashSet<String> = HashSet::new();
796                self.expand_node(anchored, &mut in_progress)
797            }
798        }
799    }
800
801    /// Recursively expand a node, counting every node produced against the
802    /// expansion limit and checking for cycles via `in_progress`.
803    fn expand_node(
804        &mut self,
805        node: Node<Span>,
806        in_progress: &mut HashSet<String>,
807    ) -> Result<Node<Span>> {
808        // Increment at the top — before child recursion — so every node
809        // (including non-alias nodes inside expanded trees) counts against the
810        // budget.
811        self.expanded_nodes += 1;
812        if self.expanded_nodes > self.options.max_expanded_nodes {
813            return Err(LoadError::AliasExpansionLimitExceeded {
814                limit: self.options.max_expanded_nodes,
815            });
816        }
817
818        match node {
819            Node::Alias { ref name, loc, .. } => {
820                if in_progress.contains(name) {
821                    return Err(LoadError::CircularAlias { name: name.clone() });
822                }
823                let target = self
824                    .anchor_map
825                    .get(name)
826                    .cloned()
827                    .ok_or_else(|| LoadError::UndefinedAlias { name: name.clone() })?;
828                in_progress.insert(name.clone());
829                let expanded = self.expand_node(target, in_progress)?;
830                in_progress.remove(name);
831                // Re-stamp with the alias site's location.
832                Ok(reloc(expanded, loc))
833            }
834            Node::Mapping {
835                entries,
836                style,
837                tag,
838                loc,
839                meta,
840            } => {
841                let mut expanded_entries = Vec::with_capacity(entries.len());
842                for (k, v) in entries {
843                    let ek = self.expand_node(k, in_progress)?;
844                    let ev = self.expand_node(v, in_progress)?;
845                    expanded_entries.push((ek, ev));
846                }
847                Ok(Node::Mapping {
848                    entries: expanded_entries,
849                    style,
850                    tag,
851                    loc,
852                    meta,
853                })
854            }
855            Node::Sequence {
856                items,
857                style,
858                tag,
859                loc,
860                meta,
861            } => {
862                let mut expanded_items = Vec::with_capacity(items.len());
863                for item in items {
864                    expanded_items.push(self.expand_node(item, in_progress)?);
865                }
866                Ok(Node::Sequence {
867                    items: expanded_items,
868                    style,
869                    tag,
870                    loc,
871                    meta,
872                })
873            }
874            // Scalars and already-resolved nodes — pass through.
875            scalar @ Node::Scalar { .. } => Ok(scalar),
876        }
877    }
878}
879
880/// Return `true` if the peeked item signals end of document (or stream).
881const fn is_document_end(peeked: Option<&std::result::Result<(Event<'_>, Span), Error>>) -> bool {
882    matches!(
883        peeked,
884        None | Some(Ok((Event::DocumentEnd { .. } | Event::StreamEnd, _)))
885    )
886}
887
888/// Convert a `Span.start` byte offset to a `Pos` with accurate line/column.
889#[inline]
890fn span_start_to_pos(offset: u32, line_index: &LineIndex) -> Pos {
891    let (line, column) = line_index.line_column(offset);
892    Pos {
893        byte_offset: offset as usize,
894        line: line as usize,
895        column: column as usize,
896    }
897}
898
899/// Return the line number of a node's span end position.
900///
901/// Used to determine whether the next `Comment` event is trailing (same line)
902/// or leading (different line).
903#[inline]
904fn node_end_line(node: &Node<Span>, line_index: &LineIndex) -> u32 {
905    let end_offset = match node {
906        Node::Scalar { loc, .. }
907        | Node::Mapping { loc, .. }
908        | Node::Sequence { loc, .. }
909        | Node::Alias { loc, .. } => loc.end,
910    };
911    line_index.line_column(end_offset).0
912}
913
914/// Return `true` if the node is a block scalar (literal `|` or folded `>`).
915///
916/// Block scalars consume trailing blank lines as part of chomping, so their
917/// `span.end` falls on the line *after* the last consumed line.  This means a
918/// comment on the immediately following line has the same line number as
919/// `span.end.line`, which would cause `peek_trailing_comment` to falsely
920/// classify it as an inline trailing comment.  The caller uses this predicate
921/// to skip trailing-comment detection for block scalars.
922#[inline]
923const fn is_block_scalar(node: &Node<Span>) -> bool {
924    matches!(
925        node,
926        Node::Scalar {
927            style: ScalarStyle::Literal(_) | ScalarStyle::Folded(_),
928            ..
929        }
930    )
931}
932
933// ---------------------------------------------------------------------------
934// Schema resolution helpers
935// ---------------------------------------------------------------------------
936
/// Upper bound, in Unicode scalar values, on how much of a raw scalar is
/// copied into the `value` field of [`LoadError::UnresolvedScalar`].  Keeps
/// error messages from growing without limit on user-supplied input.
const UNRESOLVED_VALUE_MAX_CHARS: usize = 128;

/// Make a raw scalar value safe to embed in an error message.
///
/// - At most [`UNRESOLVED_VALUE_MAX_CHARS`] Unicode scalar values of `raw`
///   are kept; if anything was cut off, `"..."` is appended.
/// - Each ASCII control character (U+0000–U+001F and U+007F) is written as a
///   `\uXXXX` hex escape so that the `Display` output cannot be used for log
///   injection.  An escaped character still counts as one character towards
///   the limit.
fn sanitize_scalar_for_error(raw: &str) -> String {
    let mut out = String::with_capacity(raw.len().min(UNRESOLVED_VALUE_MAX_CHARS * 2));
    let mut remaining = raw.chars();

    // `by_ref` leaves any characters beyond the limit in `remaining`, which
    // is what the truncation check below looks at.
    for ch in remaining.by_ref().take(UNRESOLVED_VALUE_MAX_CHARS) {
        if !ch.is_ascii_control() {
            out.push(ch);
            continue;
        }
        // Control character: emit a `\uXXXX` escape instead of the raw byte.
        let escape = format!("\\u{:04X}", ch as u32);
        out.push_str(&escape);
    }

    // Anything still unread means the input was longer than the limit.
    if remaining.next().is_some() {
        out.push_str("...");
    }
    out
}
971
972/// Apply schema tag resolution to a freshly-constructed node.
973///
974/// - For scalars: translates bare `!` to `None` (non-specific), then calls
975///   `resolve_scalar`.
976/// - For mappings/sequences: translates bare `!` to `None`, then calls
977///   `resolve_collection`.
978/// - On `Ok(Some(tag))`: overwrites `node.tag`; `tag_loc` is left `None`
979///   (no source position for a resolved tag).
980/// - On `Ok(None)` (explicit tag present): leaves `node.tag` unchanged.
981///
982/// # Errors
983///
984/// Returns [`LoadError::UnresolvedScalar`] when `schema` is [`Schema::Json`]
985/// and a plain scalar does not match any JSON type pattern.
986#[inline]
987fn apply_schema_to_node(
988    node: &mut Node<Span>,
989    schema: Schema,
990    line_index: &LineIndex,
991) -> Result<()> {
992    match node {
993        Node::Scalar {
994            value,
995            style,
996            tag,
997            loc,
998            meta,
999        } => {
1000            // Bare `!` on a scalar is the non-specific scalar tag — it resolves
1001            // unconditionally to !!str regardless of content (YAML 1.2.2 §10.2.1,
1002            // §10.3.2: "non-specific" tag for scalars = Failsafe str).  We handle
1003            // it before calling the schema resolver so Core doesn't pattern-match
1004            // the value.
1005            //
1006            // `tag_loc` is preserved here (NOT cleared) because `!` is explicitly
1007            // written in the source.  Preserving `tag_loc` lets downstream consumers
1008            // (e.g. the formatter) distinguish user-authored tags from resolver-injected
1009            // ones, which is critical for correct idempotent output.
1010            if tag.as_deref() == Some("!") {
1011                *tag = Some(Cow::Borrowed(crate::schema::ResolvedTag::Str.as_str()));
1012                return Ok(());
1013            }
1014            // All other tags: pass through as-is (Some(non-!) = explicit tag → Ok(None)).
1015            match resolve_scalar(schema, *style, value, tag.as_deref()) {
1016                Ok(Some(resolved)) => {
1017                    *tag = Some(Cow::Borrowed(resolved.as_str()));
1018                    // Clear tag_loc: resolver-injected tags have no source position.
1019                    if let Some(m) = meta.as_mut() {
1020                        m.tag_loc = None;
1021                        if m.is_all_none() {
1022                            *meta = None;
1023                        }
1024                    }
1025                }
1026                Ok(None) => {}
1027                Err(_) => {
1028                    return Err(LoadError::UnresolvedScalar {
1029                        value: sanitize_scalar_for_error(value),
1030                        pos: span_start_to_pos(loc.start, line_index),
1031                    });
1032                }
1033            }
1034        }
1035        Node::Mapping { tag, meta, .. } => {
1036            // Bare `!` on a collection means non-specific collection tag — translate
1037            // to None so the resolver returns the kind-based tag (!!map / !!seq).
1038            let effective_tag = tag.as_deref().filter(|t| *t != "!");
1039            if let Some(resolved) =
1040                resolve_collection(schema, CollectionKind::Mapping, effective_tag)
1041            {
1042                *tag = Some(Cow::Borrowed(resolved.as_str()));
1043                if let Some(m) = meta.as_mut() {
1044                    m.tag_loc = None;
1045                    if m.is_all_none() {
1046                        *meta = None;
1047                    }
1048                }
1049            }
1050        }
1051        Node::Sequence { tag, meta, .. } => {
1052            let effective_tag = tag.as_deref().filter(|t| *t != "!");
1053            if let Some(resolved) =
1054                resolve_collection(schema, CollectionKind::Sequence, effective_tag)
1055            {
1056                *tag = Some(Cow::Borrowed(resolved.as_str()));
1057                if let Some(m) = meta.as_mut() {
1058                    m.tag_loc = None;
1059                    if m.is_all_none() {
1060                        *meta = None;
1061                    }
1062                }
1063            }
1064        }
1065        Node::Alias { .. } => {}
1066    }
1067    Ok(())
1068}
1069
1070// ---------------------------------------------------------------------------
1071// Node helpers
1072// ---------------------------------------------------------------------------
1073
1074const fn empty_scalar() -> Node<Span> {
1075    Node::Scalar {
1076        value: String::new(),
1077        style: ScalarStyle::Plain,
1078        tag: None,
1079        loc: Span { start: 0, end: 0 },
1080        meta: None,
1081    }
1082}
1083
1084// ---------------------------------------------------------------------------
1085// Tests
1086// ---------------------------------------------------------------------------
1087
1088#[cfg(test)]
1089#[expect(
1090    clippy::expect_used,
1091    clippy::unwrap_used,
1092    clippy::indexing_slicing,
1093    clippy::panic,
1094    reason = "test code"
1095)]
1096mod tests {
1097    use super::*;
1098
1099    // UT-1: loader_state_resets_anchor_map_between_documents
1100    #[test]
1101    fn loader_state_resets_anchor_map_between_documents() {
1102        // In resolved mode: anchor defined in doc 1 must not be visible in doc 2.
1103        let result = LoaderBuilder::new()
1104            .resolved()
1105            .build()
1106            .load("---\n- &foo hello\n...\n---\n- *foo\n...\n");
1107        assert!(
1108            result.is_err(),
1109            "expected Err: *foo in doc 2 should be undefined"
1110        );
1111        assert!(matches!(
1112            result.unwrap_err(),
1113            LoadError::UndefinedAlias { .. }
1114        ));
1115    }
1116
1117    // UT-2: register_anchor_increments_count
1118    #[test]
1119    fn register_anchor_increments_count() {
1120        let options = LoaderOptions {
1121            max_anchors: 2,
1122            ..LoaderOptions::default()
1123        };
1124        let mut state = LoadState::new(&options, "");
1125        let node = Node::Scalar {
1126            value: "x".to_owned(),
1127            style: ScalarStyle::Plain,
1128            tag: None,
1129            loc: Span { start: 0, end: 0 },
1130            meta: None,
1131        };
1132        assert!(state.register_anchor("a".to_owned(), &node).is_ok());
1133        assert!(state.register_anchor("b".to_owned(), &node).is_ok());
1134        let err = state
1135            .register_anchor("c".to_owned(), &node)
1136            .expect_err("expected AnchorCountLimitExceeded");
1137        assert!(matches!(
1138            err,
1139            LoadError::AnchorCountLimitExceeded { limit: 2 }
1140        ));
1141    }
1142
1143    // UT-3: expand_node_detects_circular_alias
1144    #[test]
1145    fn expand_node_detects_circular_alias() {
1146        let options = LoaderOptions {
1147            mode: LoadMode::Resolved,
1148            ..LoaderOptions::default()
1149        };
1150        let mut state = LoadState::new(&options, "");
1151        // Insert a self-referential alias node.
1152        let alias_node = Node::Alias {
1153            name: "a".to_owned(),
1154            loc: Span { start: 0, end: 0 },
1155            leading_comments: None,
1156            trailing_comment: None,
1157        };
1158        state.anchor_map.insert("a".to_owned(), alias_node.clone());
1159        let mut in_progress = HashSet::new();
1160        let result = state.expand_node(alias_node, &mut in_progress);
1161        assert!(
1162            matches!(result, Err(LoadError::CircularAlias { .. })),
1163            "expected CircularAlias, got: {result:?}"
1164        );
1165    }
1166
1167    // -----------------------------------------------------------------------
1168    // Bug A: comment between mapping key and its collection value
1169    // -----------------------------------------------------------------------
1170
1171    // UT-A1: comment between key and nested mapping is attached to first entry.
1172    #[test]
1173    fn comment_between_key_and_nested_mapping_is_attached_to_first_key() {
1174        let docs = load("outer:\n  # Style 1\n  inner: val\n").unwrap();
1175        let root = &docs[0].root;
1176        // root is a mapping: outer -> { inner: val }
1177        // The comment "# Style 1" appears between "outer" key and the nested
1178        // MappingStart.  After the fix it must be attached to the "inner" key.
1179        let Node::Mapping { entries, .. } = root else {
1180            panic!("expected root mapping");
1181        };
1182        assert_eq!(entries.len(), 1);
1183        let (_outer_key, outer_value) = &entries[0];
1184        let Node::Mapping {
1185            entries: nested, ..
1186        } = outer_value
1187        else {
1188            panic!("expected nested mapping");
1189        };
1190        assert_eq!(nested.len(), 1);
1191        let (inner_key, _) = &nested[0];
1192        assert_eq!(
1193            inner_key.leading_comments(),
1194            &["# Style 1"],
1195            "comment should be attached to the first nested key"
1196        );
1197    }
1198
1199    // UT-A2: comment between key and nested sequence is attached to first item.
1200    #[test]
1201    fn comment_between_key_and_nested_sequence_is_attached_to_first_item() {
1202        let docs = load("key:\n  # leading\n  - item1\n  - item2\n").unwrap();
1203        let root = &docs[0].root;
1204        let Node::Mapping { entries, .. } = root else {
1205            panic!("expected root mapping");
1206        };
1207        let (_key, seq_value) = &entries[0];
1208        let Node::Sequence { items, .. } = seq_value else {
1209            panic!("expected sequence value");
1210        };
1211        // The comment "# leading" appears before the sequence items; after
1212        // the fix it is attached to the first item.
1213        assert_eq!(
1214            items[0].leading_comments(),
1215            &["# leading"],
1216            "comment should be attached to first sequence item"
1217        );
1218    }
1219
1220    // UT-A3: multiple consecutive comments before a collection are all preserved.
1221    #[test]
1222    fn multiple_comments_between_key_and_collection_all_preserved() {
1223        let docs = load("key:\n  # first\n  # second\n  - item\n").unwrap();
1224        let root = &docs[0].root;
1225        let Node::Mapping { entries, .. } = root else {
1226            panic!("expected root mapping");
1227        };
1228        let (_key, seq_value) = &entries[0];
1229        let Node::Sequence { items, .. } = seq_value else {
1230            panic!("expected sequence value");
1231        };
1232        assert_eq!(
1233            items[0].leading_comments(),
1234            &["# first", "# second"],
1235            "both comments should be on first item"
1236        );
1237    }
1238
1239    // UT-A4: the KEY node itself has no leading comments from Bug-A fix.
1240    #[test]
1241    fn comment_between_key_and_collection_does_not_corrupt_key_node() {
1242        let docs = load("outer:\n  # Style 1\n  inner: val\n").unwrap();
1243        let root = &docs[0].root;
1244        let Node::Mapping { entries, .. } = root else {
1245            panic!("expected root mapping");
1246        };
1247        let (outer_key, _) = &entries[0];
1248        assert!(
1249            outer_key.leading_comments().is_empty(),
1250            "outer key should have no leading comments"
1251        );
1252        assert!(
1253            outer_key.trailing_comment().is_none(),
1254            "outer key should have no trailing comment"
1255        );
1256    }
1257
1258    // UT-A5: no comment between key and value leaves leading_comments empty.
1259    #[test]
1260    fn no_comment_between_key_and_value_leaves_leading_comments_empty() {
1261        let docs = load("key:\n  inner: val\n").unwrap();
1262        let root = &docs[0].root;
1263        let Node::Mapping { entries, .. } = root else {
1264            panic!("expected root mapping");
1265        };
1266        let (_key, nested) = &entries[0];
1267        let Node::Mapping {
1268            entries: nested_entries,
1269            ..
1270        } = nested
1271        else {
1272            panic!("expected nested mapping");
1273        };
1274        let (inner_key, _) = &nested_entries[0];
1275        assert!(
1276            inner_key.leading_comments().is_empty(),
1277            "inner key should have no leading comments when there is no comment"
1278        );
1279    }
1280
1281    // -----------------------------------------------------------------------
1282    // Bug B: comment at end of collection preserved as leading on next sibling
1283    // -----------------------------------------------------------------------
1284
1285    // UT-B1: comment before SequenceEnd becomes leading on next mapping entry.
1286    #[test]
1287    fn trailing_comment_of_sequence_preserved_as_leading_on_next_sibling() {
1288        let input =
1289            "Lists:\n  list-a:\n    - item1\n    - item2\n\n  # Style 2\n  list-b:\n    - item1\n";
1290        let docs = load(input).unwrap();
1291        let root = &docs[0].root;
1292        let Node::Mapping { entries, .. } = root else {
1293            panic!("expected root mapping");
1294        };
1295        let (_lists_key, nested) = &entries[0];
1296        let Node::Mapping {
1297            entries: nested_entries,
1298            ..
1299        } = nested
1300        else {
1301            panic!("expected nested mapping");
1302        };
1303        assert_eq!(nested_entries.len(), 2);
1304        let (list_b_key, _) = &nested_entries[1];
1305        assert_eq!(
1306            list_b_key.leading_comments(),
1307            &["# Style 2"],
1308            "# Style 2 should be leading comment on list-b key"
1309        );
1310    }
1311
1312    // UT-B2: comment at end of nested sequence propagates to next mapping entry.
1313    #[test]
1314    fn overflow_comments_from_nested_sequence_end_reach_next_mapping_entry() {
1315        let input = "outer:\n  a:\n    - x\n    # between\n  b: y\n";
1316        let docs = load(input).unwrap();
1317        let root = &docs[0].root;
1318        let Node::Mapping { entries, .. } = root else {
1319            panic!("expected root mapping");
1320        };
1321        let (_outer_key, outer_val) = &entries[0];
1322        let Node::Mapping {
1323            entries: nested, ..
1324        } = outer_val
1325        else {
1326            panic!("expected nested mapping");
1327        };
1328        assert_eq!(nested.len(), 2);
1329        let (b_key, _) = &nested[1];
1330        assert_eq!(
1331            b_key.leading_comments(),
1332            &["# between"],
1333            "# between should be leading comment on b key"
1334        );
1335    }
1336
1337    // UT-B3: comment at end of nested mapping propagates to next sibling.
1338    #[test]
1339    fn overflow_comments_from_nested_mapping_end_reach_next_sibling() {
1340        let input = "parent:\n  child1:\n    k: v\n    # end-of-child1\n  child2: val\n";
1341        let docs = load(input).unwrap();
1342        let root = &docs[0].root;
1343        let Node::Mapping { entries, .. } = root else {
1344            panic!("expected root mapping");
1345        };
1346        let (_parent_key, parent_val) = &entries[0];
1347        let Node::Mapping {
1348            entries: siblings, ..
1349        } = parent_val
1350        else {
1351            panic!("expected parent mapping value");
1352        };
1353        assert_eq!(siblings.len(), 2);
1354        let (child2_key, _) = &siblings[1];
1355        assert_eq!(
1356            child2_key.leading_comments(),
1357            &["# end-of-child1"],
1358            "# end-of-child1 should be leading comment on child2 key"
1359        );
1360    }
1361
1362    // UT-B4: overflow comment at top-level sequence end is not silently dropped.
1363    #[test]
1364    fn overflow_comments_at_top_level_sequence_end_are_not_lost() {
1365        // The comment "# tail" appears before SequenceEnd of the top-level items
1366        // sequence.  The fix saves it to pending_leading; since there is no next
1367        // sibling, it ends up in the document's root mapping's pending state and
1368        // is not lost.  We assert it appears somewhere reachable in the AST rather
1369        // than disappearing entirely.
1370        let input = "items:\n  - a\n  - b\n  # tail\n";
1371        let docs = load(input).unwrap();
1372        // The document must parse successfully (no panic, no error).
1373        assert!(!docs.is_empty(), "document should parse without error");
1374        // The # tail comment must not cause data loss — the sequence items are intact.
1375        let root = &docs[0].root;
1376        let Node::Mapping { entries, .. } = root else {
1377            panic!("expected root mapping");
1378        };
1379        let (_items_key, seq_val) = &entries[0];
1380        let Node::Sequence { items, .. } = seq_val else {
1381            panic!("expected sequence value");
1382        };
1383        assert_eq!(items.len(), 2, "sequence items must not be lost");
1384    }
1385
1386    // UT-B5: no overflow comments when collection ends cleanly.
1387    #[test]
1388    fn no_overflow_comments_when_collection_ends_cleanly() {
1389        let docs = load("key:\n  - item1\n  - item2\n").unwrap();
1390        let root = &docs[0].root;
1391        let Node::Mapping { entries, .. } = root else {
1392            panic!("expected root mapping");
1393        };
1394        let (_key, seq_val) = &entries[0];
1395        let Node::Sequence { items, .. } = seq_val else {
1396            panic!("expected sequence value");
1397        };
1398        for item in items {
1399            assert!(
1400                item.leading_comments().is_empty(),
1401                "items should have no leading comments"
1402            );
1403        }
1404    }
1405
1406    // -----------------------------------------------------------------------
1407    // Combined scenarios
1408    // -----------------------------------------------------------------------
1409
1410    // UT-C1: exact bug-report input — both comments survive.
1411    #[test]
1412    fn original_bug_report_input_preserves_both_comments() {
1413        let input = "Lists:\n  # Style 1\n  list-a:\n    - item1\n    - item2\n\n  # Style 2\n  list-b:\n  - item1\n  - item2\n";
1414        let docs = load(input).unwrap();
1415        let root = &docs[0].root;
1416        let Node::Mapping { entries, .. } = root else {
1417            panic!("expected root mapping");
1418        };
1419        let (_lists_key, nested) = &entries[0];
1420        let Node::Mapping {
1421            entries: nested_entries,
1422            ..
1423        } = nested
1424        else {
1425            panic!("expected nested mapping");
1426        };
1427        assert_eq!(nested_entries.len(), 2);
1428        let (first_key, _) = &nested_entries[0];
1429        let (second_key, _) = &nested_entries[1];
1430        assert_eq!(
1431            first_key.leading_comments(),
1432            &["# Style 1"],
1433            "list-a should have # Style 1 as leading comment"
1434        );
1435        assert_eq!(
1436            second_key.leading_comments(),
1437            &["# Style 2"],
1438            "list-b should have # Style 2 as leading comment"
1439        );
1440    }
1441
1442    // UT-C2: leading and trailing comments on sibling entries both preserved.
1443    #[test]
1444    fn leading_and_trailing_comments_both_preserved_on_sibling_entries() {
1445        let input = "map:\n  # leading\n  key: value  # trailing\n  # next-leading\n  key2: v2\n";
1446        let docs = load(input).unwrap();
1447        let root = &docs[0].root;
1448        let Node::Mapping { entries, .. } = root else {
1449            panic!("expected root mapping");
1450        };
1451        let (_map_key, map_val) = &entries[0];
1452        let Node::Mapping {
1453            entries: siblings, ..
1454        } = map_val
1455        else {
1456            panic!("expected mapping value");
1457        };
1458        assert_eq!(siblings.len(), 2);
1459        let (key1, val1) = &siblings[0];
1460        let (key2, _) = &siblings[1];
1461        assert_eq!(key1.leading_comments(), &["# leading"]);
1462        assert_eq!(val1.trailing_comment(), Some("# trailing"));
1463        assert_eq!(key2.leading_comments(), &["# next-leading"]);
1464    }
1465
1466    // UT-C3: deeply nested overflow comments propagate to correct sibling.
1467    #[test]
1468    fn deeply_nested_overflow_comments_reach_correct_sibling() {
1469        let input = "top:\n  mid:\n    - x\n    # deep-overflow\n  next: y\n";
1470        let docs = load(input).unwrap();
1471        let root = &docs[0].root;
1472        let Node::Mapping { entries, .. } = root else {
1473            panic!("expected root mapping");
1474        };
1475        let (_top_key, top_val) = &entries[0];
1476        let Node::Mapping {
1477            entries: top_entries,
1478            ..
1479        } = top_val
1480        else {
1481            panic!("expected top-level mapping");
1482        };
1483        assert_eq!(top_entries.len(), 2);
1484        let (next_key, _) = &top_entries[1];
1485        assert_eq!(
1486            next_key.leading_comments(),
1487            &["# deep-overflow"],
1488            "# deep-overflow should propagate from nested sequence to next sibling"
1489        );
1490    }
1491
1492    // -----------------------------------------------------------------------
1493    // UT-D: Document marker flags (explicit_start / explicit_end)
1494    // -----------------------------------------------------------------------
1495
1496    // UT-D1: Bare document (no markers) → both flags false
1497    #[test]
1498    fn bare_document_has_both_flags_false() {
1499        let docs = load("key: value\n").expect("load failed");
1500        assert_eq!(docs.len(), 1);
1501        assert!(!docs[0].explicit_start, "expected explicit_start=false");
1502        assert!(!docs[0].explicit_end, "expected explicit_end=false");
1503    }
1504
1505    // UT-D2: Document with `---` start marker → explicit_start true, explicit_end false
1506    #[test]
1507    fn document_with_start_marker_has_explicit_start_true() {
1508        let docs = load("---\nkey: value\n").expect("load failed");
1509        assert_eq!(docs.len(), 1);
1510        assert!(docs[0].explicit_start, "expected explicit_start=true");
1511        assert!(!docs[0].explicit_end, "expected explicit_end=false");
1512    }
1513
1514    // UT-D3: Document with `...` end marker → explicit_start false, explicit_end true
1515    #[test]
1516    fn document_with_end_marker_has_explicit_end_true() {
1517        let docs = load("key: value\n...\n").expect("load failed");
1518        assert_eq!(docs.len(), 1);
1519        assert!(!docs[0].explicit_start, "expected explicit_start=false");
1520        assert!(docs[0].explicit_end, "expected explicit_end=true");
1521    }
1522
1523    // UT-D4: Document with both `---` and `...` → both flags true
1524    #[test]
1525    fn document_with_both_markers_has_both_flags_true() {
1526        let docs = load("---\nkey: value\n...\n").expect("load failed");
1527        assert_eq!(docs.len(), 1);
1528        assert!(docs[0].explicit_start, "expected explicit_start=true");
1529        assert!(docs[0].explicit_end, "expected explicit_end=true");
1530    }
1531
1532    // UT-D5: Multi-document — each document's flags are independent
1533    #[test]
1534    fn multi_document_flags_are_independent() {
1535        // doc1: no explicit start/end (bare)
1536        // doc2: explicit start (---), explicit end (...)
1537        // doc3: explicit start (---), no explicit end
1538        let docs = load("doc1: a\n---\ndoc2: b\n...\n---\ndoc3: c\n").expect("load failed");
1539        assert_eq!(docs.len(), 3);
1540        assert!(!docs[0].explicit_start, "doc1 explicit_start");
1541        assert!(!docs[0].explicit_end, "doc1 explicit_end");
1542        assert!(docs[1].explicit_start, "doc2 explicit_start");
1543        assert!(docs[1].explicit_end, "doc2 explicit_end");
1544        assert!(docs[2].explicit_start, "doc3 explicit_start");
1545        assert!(!docs[2].explicit_end, "doc3 explicit_end");
1546    }
1547
1548    // UT-D6: Empty document with explicit markers → flags are set
1549    #[test]
1550    fn empty_document_with_explicit_markers_has_both_flags_true() {
1551        let docs = load("---\n...\n").expect("load failed");
1552        assert_eq!(docs.len(), 1);
1553        assert!(docs[0].explicit_start, "expected explicit_start=true");
1554        assert!(docs[0].explicit_end, "expected explicit_end=true");
1555    }
1556
1557    // -----------------------------------------------------------------------
1558    // UT-S: sanitize_scalar_for_error unit tests
1559    // -----------------------------------------------------------------------
1560
1561    // UT-S1: newline replaced with \u000A escape (no raw newline in output)
1562    #[test]
1563    fn sanitize_newline_replaced_with_escape() {
1564        let result = sanitize_scalar_for_error("foo\nbar");
1565        assert!(
1566            !result.contains('\n'),
1567            "output must not contain a raw newline"
1568        );
1569        assert!(
1570            result.contains("\\u000A"),
1571            "output must contain \\u000A escape, got: {result:?}"
1572        );
1573        assert_eq!(result, "foo\\u000Abar");
1574    }
1575
1576    // UT-S2: carriage return replaced with \u000D escape
1577    #[test]
1578    fn sanitize_carriage_return_replaced_with_escape() {
1579        let result = sanitize_scalar_for_error("foo\rbar");
1580        assert!(
1581            !result.contains('\r'),
1582            "output must not contain a raw carriage return"
1583        );
1584        assert!(
1585            result.contains("\\u000D"),
1586            "output must contain \\u000D escape, got: {result:?}"
1587        );
1588        assert_eq!(result, "foo\\u000Dbar");
1589    }
1590
1591    // UT-S3: null byte replaced with \u0000 escape
1592    #[test]
1593    fn sanitize_null_byte_replaced_with_escape() {
1594        let result = sanitize_scalar_for_error("foo\0bar");
1595        assert!(
1596            !result.contains('\0'),
1597            "output must not contain a raw null byte"
1598        );
1599        assert!(
1600            result.contains("\\u0000"),
1601            "output must contain \\u0000 escape, got: {result:?}"
1602        );
1603        assert_eq!(result, "foo\\u0000bar");
1604    }
1605
1606    // UT-S4: short value (≤128 chars) stored verbatim without ellipsis
1607    #[test]
1608    fn sanitize_short_value_stored_verbatim() {
1609        let input = "hello";
1610        let result = sanitize_scalar_for_error(input);
1611        assert_eq!(result, "hello");
1612        assert!(
1613            !result.ends_with("..."),
1614            "short value must not be truncated"
1615        );
1616    }
1617
1618    // UT-S5: value at exactly 128 chars stored verbatim, no ellipsis
1619    #[test]
1620    fn sanitize_value_at_exact_limit_not_truncated() {
1621        let input = "a".repeat(128);
1622        let result = sanitize_scalar_for_error(&input);
1623        assert_eq!(
1624            result.len(),
1625            128,
1626            "128-char input must produce 128-char output"
1627        );
1628        assert!(
1629            !result.ends_with("..."),
1630            "value at exact limit must not be truncated"
1631        );
1632    }
1633
1634    // UT-S6: value of 129 chars truncated to 128 chars + "..."
1635    #[test]
1636    fn sanitize_value_over_limit_truncated() {
1637        let input = "a".repeat(129);
1638        let result = sanitize_scalar_for_error(&input);
1639        assert!(
1640            result.ends_with("..."),
1641            "value over limit must end with '...'"
1642        );
1643        assert_eq!(
1644            result.len(),
1645            128 + 3,
1646            "truncated output must be 128 chars + 3 ellipsis chars"
1647        );
1648    }
1649
1650    // UT-S7: multibyte chars are counted by Unicode scalar value, not bytes;
1651    // truncation at 128 chars does not split a multibyte sequence or produce invalid UTF-8.
1652    #[test]
1653    fn sanitize_multibyte_char_boundary_not_split() {
1654        // Each '中' is 3 bytes. 127 of them = 127 Unicode scalar values, under limit.
1655        // Adding one more ASCII char pushes to 128 (at limit, no truncation).
1656        // Adding yet another pushes to 129 → truncation after 128 chars.
1657        let input: String = "中".repeat(127) + "ab"; // 129 chars total
1658        let result = sanitize_scalar_for_error(&input);
1659        // Must be valid UTF-8 (String guarantees this if we don't split bytes).
1660        assert!(
1661            result.ends_with("..."),
1662            "129-char multibyte input should be truncated"
1663        );
1664        // The result up to the ellipsis must be valid UTF-8 — verified by the
1665        // fact that it's a String. Also check char count = 128.
1666        let char_count = result.trim_end_matches("...").chars().count();
1667        assert_eq!(
1668            char_count, 128,
1669            "truncated portion must be exactly 128 chars"
1670        );
1671    }
1672
1673    // -----------------------------------------------------------------------
1674    // COW-UT: Cow variant identity for resolver-injected vs user-authored tags
1675    // -----------------------------------------------------------------------
1676
1677    fn load_root(input: &str) -> Node<Span> {
1678        load(input).expect("load failed").remove(0).root
1679    }
1680
1681    // COW-UT-1: resolver-injected !!str tag is Cow::Borrowed
1682    #[test]
1683    fn resolver_injected_str_tag_is_borrowed() {
1684        let Node::Scalar { tag, .. } = load_root("hello\n") else {
1685            panic!("expected scalar");
1686        };
1687        assert!(
1688            matches!(tag, Some(Cow::Borrowed(_))),
1689            "resolver-injected !!str must be Borrowed, got: {tag:?}"
1690        );
1691    }
1692
1693    // COW-UT-2: resolver-injected !!int tag is Cow::Borrowed
1694    #[test]
1695    fn resolver_injected_int_tag_is_borrowed() {
1696        let Node::Scalar { tag, .. } = load_root("42\n") else {
1697            panic!("expected scalar");
1698        };
1699        assert!(
1700            matches!(tag, Some(Cow::Borrowed(_))),
1701            "resolver-injected !!int must be Borrowed, got: {tag:?}"
1702        );
1703    }
1704
1705    // COW-UT-3: resolver-injected !!null tag is Cow::Borrowed
1706    #[test]
1707    fn resolver_injected_null_tag_is_borrowed() {
1708        let Node::Scalar { tag, .. } = load_root("null\n") else {
1709            panic!("expected scalar");
1710        };
1711        assert!(
1712            matches!(tag, Some(Cow::Borrowed(_))),
1713            "resolver-injected !!null must be Borrowed, got: {tag:?}"
1714        );
1715    }
1716
1717    // COW-UT-4: resolver-injected !!map tag is Cow::Borrowed
1718    #[test]
1719    fn resolver_injected_map_tag_is_borrowed() {
1720        let Node::Mapping { tag, .. } = load_root("a: 1\n") else {
1721            panic!("expected mapping");
1722        };
1723        assert!(
1724            matches!(tag, Some(Cow::Borrowed(_))),
1725            "resolver-injected !!map must be Borrowed, got: {tag:?}"
1726        );
1727    }
1728
1729    // COW-UT-5: resolver-injected !!seq tag is Cow::Borrowed
1730    #[test]
1731    fn resolver_injected_seq_tag_is_borrowed() {
1732        let Node::Sequence { tag, .. } = load_root("- a\n") else {
1733            panic!("expected sequence");
1734        };
1735        assert!(
1736            matches!(tag, Some(Cow::Borrowed(_))),
1737            "resolver-injected !!seq must be Borrowed, got: {tag:?}"
1738        );
1739    }
1740
1741    // COW-UT-6: user-authored tag on scalar is Cow::Owned
1742    #[test]
1743    fn user_authored_tag_on_scalar_is_owned() {
1744        let Node::Scalar { tag, .. } = load_root("!!str hello\n") else {
1745            panic!("expected scalar");
1746        };
1747        assert!(
1748            matches!(tag, Some(Cow::Owned(_))),
1749            "user-authored !!str must be Owned, got: {tag:?}"
1750        );
1751    }
1752
1753    // COW-UT-7: user-authored tag on mapping is Cow::Owned
1754    #[test]
1755    fn user_authored_tag_on_mapping_is_owned() {
1756        let Node::Mapping { tag, .. } = load_root("!!map\na: 1\n") else {
1757            panic!("expected mapping");
1758        };
1759        assert!(
1760            matches!(tag, Some(Cow::Owned(_))),
1761            "user-authored !!map must be Owned, got: {tag:?}"
1762        );
1763    }
1764
1765    // COW-UT-8: user-authored tag on sequence is Cow::Owned
1766    #[test]
1767    fn user_authored_tag_on_sequence_is_owned() {
1768        let Node::Sequence { tag, .. } = load_root("!!seq\n- a\n") else {
1769            panic!("expected sequence");
1770        };
1771        assert!(
1772            matches!(tag, Some(Cow::Owned(_))),
1773            "user-authored !!seq must be Owned, got: {tag:?}"
1774        );
1775    }
1776
1777    // COW-UT-9: bare `!` on a scalar is handled inside apply_schema_to_node
1778    // and must produce Cow::Borrowed (not Cow::Owned)
1779    #[test]
1780    fn bare_excl_tag_resolver_path_is_borrowed() {
1781        let Node::Scalar { tag, .. } = load_root("! hello\n") else {
1782            panic!("expected scalar");
1783        };
1784        assert!(
1785            matches!(tag, Some(Cow::Borrowed(_))),
1786            "bare-! path in apply_schema_to_node must be Borrowed, got: {tag:?}"
1787        );
1788    }
1789
1790    // COW-UT-10: alias node in lossless mode has no tag field (Alias variant)
1791    #[test]
1792    fn alias_node_has_no_tag_field() {
1793        let docs = LoaderBuilder::new()
1794            .build()
1795            .load("- &a x\n- *a\n")
1796            .expect("load failed");
1797        let Node::Sequence { items, .. } = &docs[0].root else {
1798            panic!("expected root sequence");
1799        };
1800        // items[1] is the alias in lossless mode
1801        assert!(
1802            matches!(items[1], Node::Alias { .. }),
1803            "second item must be Alias in lossless mode"
1804        );
1805    }
1806
1807    // COW-UT-11: Deref coercion still works for string content comparison
1808    #[test]
1809    fn tag_value_content_preserved_across_cow_variants() {
1810        // Resolver-injected: Borrowed — value preserved via as_deref()
1811        let Node::Scalar {
1812            tag: tag_resolver, ..
1813        } = load_root("hello\n")
1814        else {
1815            panic!("expected scalar");
1816        };
1817        assert_eq!(tag_resolver.as_deref(), Some("tag:yaml.org,2002:str"));
1818
1819        // User-authored with a single-! local tag: Owned — value preserved via as_deref().
1820        // `!custom` is a local tag (single !), not a handle-expanded URI.
1821        let Node::Scalar { tag: tag_user, .. } = load_root("!custom hello\n") else {
1822            panic!("expected scalar");
1823        };
1824        assert_eq!(tag_user.as_deref(), Some("!custom"));
1825    }
1826
1827    // -----------------------------------------------------------------------
1828    // LOAD-META: loader correctly gates NodeMeta construction
1829    // -----------------------------------------------------------------------
1830
1831    // LOAD-META-1: loaded_plain_scalar_has_no_meta
1832    #[test]
1833    fn loaded_plain_scalar_has_no_meta() {
1834        let docs = load("hello\n").unwrap();
1835        let root = &docs[0].root;
1836        // Plain scalar with no anchor, no user-authored tag_loc, no comments → meta: None
1837        assert!(
1838            matches!(root, Node::Scalar { meta: None, .. }),
1839            "plain scalar must have meta: None, got: {root:?}"
1840        );
1841    }
1842
1843    // LOAD-META-2: loaded_anchored_scalar_has_meta_some
1844    #[test]
1845    fn loaded_anchored_scalar_has_meta_some() {
1846        let docs = load("- &foo bar\n").unwrap();
1847        let Node::Sequence { items, .. } = &docs[0].root else {
1848            panic!("expected root Sequence");
1849        };
1850        let item = &items[0];
1851        assert!(
1852            matches!(item, Node::Scalar { meta: Some(_), .. }),
1853            "anchored scalar must have meta: Some, got: {item:?}"
1854        );
1855        assert_eq!(item.anchor(), Some("foo"));
1856    }
1857
1858    // LOAD-META-3: loaded_mapping_with_no_meta_fields_has_meta_none
1859    #[test]
1860    fn loaded_mapping_with_no_meta_fields_has_meta_none() {
1861        let docs = load("a: 1\n").unwrap();
1862        let root = &docs[0].root;
1863        assert!(
1864            matches!(root, Node::Mapping { meta: None, .. }),
1865            "plain mapping must have meta: None, got: {root:?}"
1866        );
1867    }
1868
1869    // LOAD-META-4: loaded_sequence_with_no_meta_fields_has_meta_none
1870    #[test]
1871    fn loaded_sequence_with_no_meta_fields_has_meta_none() {
1872        let docs = load("- a\n").unwrap();
1873        let root = &docs[0].root;
1874        assert!(
1875            matches!(root, Node::Sequence { meta: None, .. }),
1876            "plain sequence must have meta: None, got: {root:?}"
1877        );
1878    }
1879
1880    // LOAD-META-5: loaded_scalar_with_anchor_has_meta_some_with_anchor_loc
1881    #[test]
1882    fn loaded_scalar_with_anchor_has_meta_some_with_anchor_loc() {
1883        let docs = load("&tag hello\n").unwrap();
1884        let root = &docs[0].root;
1885        assert!(
1886            matches!(root, Node::Scalar { meta: Some(_), .. }),
1887            "anchored scalar must have meta: Some"
1888        );
1889        assert!(
1890            root.anchor_loc().is_some(),
1891            "anchor_loc() must be Some for anchored scalar"
1892        );
1893    }
1894}