Skip to main content

rlsp_yaml_parser/
loader.rs

1// SPDX-License-Identifier: MIT
2
3//! Event-to-AST loader.
4//!
5//! Consumes the event stream from [`crate::parse_events`] and builds a
6//! `Vec<Document<Span>>`.
7//!
8//! Two modes are available:
9//! - **Lossless** (default): alias references are kept as [`Node::Alias`]
10//!   nodes — no expansion, safe for untrusted input without any expansion
11//!   limit.
12//! - **Resolved**: aliases are expanded inline.  An expansion-node counter
13//!   guards against alias bombs (Billion Laughs attack).
14//!
15//! Security controls (all active in both modes unless noted):
16//! - `max_nesting_depth` — caps sequence/mapping nesting to prevent stack
17//!   exhaustion (default 512).
18//! - `max_anchors` — caps distinct anchor registrations to bound anchor-map
19//!   memory (default 10 000).
20//! - `max_expanded_nodes` — caps total nodes produced by alias expansion in
21//!   resolved mode only (default 1 000 000).
22//!
23//! # Accepted risks
24//!
25//! `expand_node` does not detect the case where an anchor-within-expansion
26//! references a previously defined anchor, forming an indirect cycle not
27//! caught by the `in_progress` set until the second traversal.  This
28//! limitation exists in the old loader and is acceptable in the LSP context
29//! where Lossless mode is the default.  The `expanded_nodes` volume limit
30//! provides the backstop.
31
32use std::collections::{HashMap, HashSet};
33use std::iter::Peekable;
34
35use crate::error::Error;
36use crate::event::{Event, ScalarStyle};
37use crate::node::{Document, Node};
38use crate::pos::{Pos, Span};
39use crate::schema::{CollectionKind, Schema, resolve_collection, resolve_scalar};
40
41use comments::{attach_leading_comments, attach_trailing_comment};
42use reloc::reloc;
43use stream::{
44    consume_leading_comments, consume_leading_doc_comments, next_from, peek_trailing_comment,
45    with_hash_prefix,
46};
47
48mod comments;
49mod reloc;
50mod stream;
51
52// ---------------------------------------------------------------------------
53// Public error type
54// ---------------------------------------------------------------------------
55
56/// Errors produced by the loader.
57#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
58pub enum LoadError {
59    /// The event stream contained a parse error.
60    #[error("parse error at {pos:?}: {message}")]
61    Parse {
62        /// Source position where the parse error was detected.
63        pos: Pos,
64        /// Human-readable description of the error.
65        message: String,
66    },
67
68    /// The event stream ended unexpectedly mid-document.
69    #[error("unexpected end of event stream")]
70    UnexpectedEndOfStream,
71
72    /// Nesting depth exceeded the configured limit.
73    #[error("nesting depth limit exceeded (max: {limit})")]
74    NestingDepthLimitExceeded {
75        /// The configured nesting depth limit that was exceeded.
76        limit: usize,
77    },
78
79    /// Too many distinct anchor names were defined.
80    #[error("anchor count limit exceeded (max: {limit})")]
81    AnchorCountLimitExceeded {
82        /// The configured anchor count limit that was exceeded.
83        limit: usize,
84    },
85
86    /// Alias expansion produced more nodes than the configured limit.
87    #[error("alias expansion node limit exceeded (max: {limit})")]
88    AliasExpansionLimitExceeded {
89        /// The configured expansion node limit that was exceeded.
90        limit: usize,
91    },
92
93    /// A circular alias reference was detected.
94    #[error("circular alias reference: '{name}'")]
95    CircularAlias {
96        /// The anchor name involved in the cycle.
97        name: String,
98    },
99
100    /// An alias referred to an anchor that was never defined.
101    #[error("undefined alias: '{name}'")]
102    UndefinedAlias {
103        /// The alias name that had no corresponding anchor definition.
104        name: String,
105    },
106
107    /// A plain scalar could not be resolved under the JSON schema.
108    ///
109    /// The JSON schema has no fallback: every untagged plain scalar must match
110    /// one of its patterns (null, bool, int, float).  If none match, the scalar
111    /// is an error per YAML 1.2.2 §10.2.
112    ///
113    /// `value` is truncated to 128 Unicode scalar values and ASCII control
114    /// characters (U+0000–U+001F, U+007F) are replaced with `\uXXXX` escapes
115    /// to prevent log injection via the `Display` impl.
116    #[error("JSON schema: plain scalar does not match any type pattern")]
117    UnresolvedScalar {
118        /// The sanitized, truncated scalar value that failed resolution.
119        value: String,
120        /// Source position of the scalar.
121        pos: Pos,
122    },
123}
124
125// Convenience alias used inside the module.
126type Result<T> = std::result::Result<T, LoadError>;
127
128// Type alias for the peekable event stream used throughout the loader.
129type EventStream<'a> =
130    Peekable<Box<dyn Iterator<Item = std::result::Result<(Event<'a>, Span), Error>> + 'a>>;
131
132// ---------------------------------------------------------------------------
133// Configuration
134// ---------------------------------------------------------------------------
135
/// How the loader treats alias references.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LoadMode {
    /// Keep every alias as a [`Node::Alias`] node (the default; safe for LSP use).
    Lossless,
    /// Replace every alias with a copy of its anchored node, bounded by
    /// `max_expanded_nodes`.
    Resolved,
}
144
145/// Security and behaviour options for the loader.
146#[derive(Debug, Clone)]
147pub struct LoaderOptions {
148    /// Maximum mapping/sequence nesting depth before returning
149    /// [`LoadError::NestingDepthLimitExceeded`] (default: 512).
150    pub max_nesting_depth: usize,
151    /// Maximum number of distinct anchor names per document before returning
152    /// [`LoadError::AnchorCountLimitExceeded`] (default: 10 000).
153    pub max_anchors: usize,
154    /// Maximum total nodes produced by alias expansion in resolved mode before
155    /// returning [`LoadError::AliasExpansionLimitExceeded`] (default: 1 000 000).
156    pub max_expanded_nodes: usize,
157    /// Controls how alias references are handled during loading.
158    pub mode: LoadMode,
159    /// YAML 1.2.2 §10 schema to apply during loading (default: [`Schema::Core`]).
160    ///
161    /// Each node's tag is resolved according to this schema after the node is
162    /// constructed.  Nodes with explicit source tags are left unchanged.
163    pub schema: Schema,
164}
165
166impl Default for LoaderOptions {
167    fn default() -> Self {
168        Self {
169            max_nesting_depth: 512,
170            max_anchors: 10_000,
171            max_expanded_nodes: 1_000_000,
172            mode: LoadMode::Lossless,
173            schema: Schema::Core,
174        }
175    }
176}
177
178// ---------------------------------------------------------------------------
179// Builder
180// ---------------------------------------------------------------------------
181
182/// Builder for configuring and creating a [`Loader`].
183///
184/// ```
185/// use rlsp_yaml_parser::loader::LoaderBuilder;
186///
187/// let docs = LoaderBuilder::new().lossless().build().load("hello\n").unwrap();
188/// assert_eq!(docs.len(), 1);
189/// ```
190pub struct LoaderBuilder {
191    options: LoaderOptions,
192}
193
194impl LoaderBuilder {
195    /// Create a builder with default options (lossless mode, safe limits).
196    #[must_use]
197    pub fn new() -> Self {
198        Self {
199            options: LoaderOptions::default(),
200        }
201    }
202
203    /// Use lossless mode — aliases become [`Node::Alias`] nodes.
204    #[must_use]
205    pub const fn lossless(mut self) -> Self {
206        self.options.mode = LoadMode::Lossless;
207        self
208    }
209
210    /// Use resolved mode — aliases are expanded inline.
211    #[must_use]
212    pub const fn resolved(mut self) -> Self {
213        self.options.mode = LoadMode::Resolved;
214        self
215    }
216
217    /// Override the maximum nesting depth.
218    #[must_use]
219    pub const fn max_nesting_depth(mut self, limit: usize) -> Self {
220        self.options.max_nesting_depth = limit;
221        self
222    }
223
224    /// Override the maximum anchor count.
225    #[must_use]
226    pub const fn max_anchors(mut self, limit: usize) -> Self {
227        self.options.max_anchors = limit;
228        self
229    }
230
231    /// Override the maximum expanded-node count (resolved mode only).
232    #[must_use]
233    pub const fn max_expanded_nodes(mut self, limit: usize) -> Self {
234        self.options.max_expanded_nodes = limit;
235        self
236    }
237
238    /// Override the YAML 1.2.2 §10 schema used for tag resolution during loading.
239    ///
240    /// The default is [`Schema::Core`].  Untagged nodes receive resolved tag URIs
241    /// in the AST; nodes with explicit source tags are not modified.
242    #[must_use]
243    pub const fn schema(mut self, s: Schema) -> Self {
244        self.options.schema = s;
245        self
246    }
247
248    /// Consume the builder and produce a [`Loader`].
249    #[must_use]
250    pub const fn build(self) -> Loader {
251        Loader {
252            options: self.options,
253        }
254    }
255}
256
257impl Default for LoaderBuilder {
258    fn default() -> Self {
259        Self::new()
260    }
261}
262
263// ---------------------------------------------------------------------------
264// Loader
265// ---------------------------------------------------------------------------
266
267/// A configured YAML loader.
268pub struct Loader {
269    options: LoaderOptions,
270}
271
272impl Loader {
273    /// Load YAML text into a sequence of documents.
274    ///
275    /// # Errors
276    ///
277    /// Returns `Err` if the input contains a parse error, exceeds a configured
278    /// security limit, or (in resolved mode) references an undefined anchor.
279    pub fn load(&self, input: &str) -> std::result::Result<Vec<Document<Span>>, LoadError> {
280        let mut state = LoadState::new(&self.options);
281        let iter: Box<dyn Iterator<Item = std::result::Result<(Event<'_>, Span), Error>> + '_> =
282            Box::new(crate::parse_events(input));
283        state.run(iter.peekable())
284    }
285}
286
287// ---------------------------------------------------------------------------
288// Convenience entry point
289// ---------------------------------------------------------------------------
290
291/// Load YAML text using lossless mode, default security limits, and Core schema tag
292/// resolution (YAML 1.2.2 §10.3).
293///
294/// Returns one `Document<Span>` per YAML document in the stream.  Untagged nodes
295/// receive resolved tag URIs according to the Core schema; nodes with explicit source
296/// tags are left unchanged.
297///
298/// # Errors
299///
300/// Returns `Err` if the input contains a parse error or exceeds a security
301/// limit (nesting depth or anchor count).
302///
303/// ```
304/// use rlsp_yaml_parser::loader::load;
305/// use rlsp_yaml_parser::Node;
306///
307/// let docs = load("hello\n").unwrap();
308/// assert_eq!(docs.len(), 1);
309/// let Node::Scalar { tag, .. } = &docs[0].root else { panic!() };
310/// assert_eq!(tag.as_deref(), Some("tag:yaml.org,2002:str"));
311/// ```
312pub fn load(input: &str) -> std::result::Result<Vec<Document<Span>>, LoadError> {
313    LoaderBuilder::new().lossless().build().load(input)
314}
315
316// ---------------------------------------------------------------------------
317// Internal loader state
318// ---------------------------------------------------------------------------
319
320struct LoadState<'opt> {
321    options: &'opt LoaderOptions,
322    /// Anchors registered so far in the current document: name → node.
323    anchor_map: HashMap<String, Node<Span>>,
324    /// Count of distinct anchors registered (resets per document).
325    anchor_count: usize,
326    /// Current nesting depth (incremented on Begin, decremented on End).
327    depth: usize,
328    /// Total nodes produced via alias expansion (resolved mode only).
329    expanded_nodes: usize,
330    /// Leading comments accumulated by `parse_node` when it encounters a
331    /// `Comment` event between a mapping key and its value's collection start,
332    /// or by a sequence/mapping loop when it hits End with leftover leading
333    /// comments.  The next mapping/sequence loop iteration picks these up and
334    /// prepends them to the next entry's leading comments.
335    pending_leading: Vec<String>,
336}
337
338impl<'opt> LoadState<'opt> {
339    fn new(options: &'opt LoaderOptions) -> Self {
340        Self {
341            options,
342            anchor_map: HashMap::new(),
343            anchor_count: 0,
344            depth: 0,
345            expanded_nodes: 0,
346            pending_leading: Vec::new(),
347        }
348    }
349
350    fn reset_for_document(&mut self) {
351        self.anchor_map.clear();
352        self.anchor_count = 0;
353        self.expanded_nodes = 0;
354        self.pending_leading.clear();
355    }
356
357    fn run(&mut self, mut stream: EventStream<'_>) -> Result<Vec<Document<Span>>> {
358        let mut docs: Vec<Document<Span>> = Vec::new();
359
360        // Skip StreamStart.
361        match stream.next() {
362            Some(Ok(_)) | None => {}
363            Some(Err(e)) => {
364                return Err(LoadError::Parse {
365                    pos: e.pos,
366                    message: e.message,
367                });
368            }
369        }
370
371        loop {
372            // Skip any leading comments or unknown events before a document.
373            match next_from(&mut stream)? {
374                None | Some((Event::StreamEnd, _)) => break,
375                Some((
376                    Event::DocumentStart {
377                        explicit,
378                        version,
379                        tag_directives,
380                    },
381                    _,
382                )) => {
383                    let doc_explicit_start = explicit;
384                    let doc_version = version;
385                    let doc_tags = tag_directives;
386                    self.reset_for_document();
387
388                    let mut doc_comments: Vec<String> = Vec::new();
389
390                    // Consume leading comments at document level.
391                    consume_leading_doc_comments(&mut stream, &mut doc_comments)?;
392
393                    // Parse root node (may be absent for empty documents).
394                    let root = if is_document_end(stream.peek()) {
395                        // Empty document — emit an empty scalar as root.
396                        let mut node = empty_scalar();
397                        apply_schema_to_node(&mut node, self.options.schema)?;
398                        node
399                    } else {
400                        self.parse_node(&mut stream)?
401                    };
402
403                    // Consume DocumentEnd if present and capture its explicit flag.
404                    let doc_explicit_end =
405                        if let Some(Ok((Event::DocumentEnd { explicit }, _))) = stream.peek() {
406                            let end_explicit = *explicit;
407                            let _ = stream.next();
408                            end_explicit
409                        } else {
410                            false
411                        };
412
413                    docs.push(Document {
414                        root,
415                        version: doc_version,
416                        tags: doc_tags,
417                        comments: doc_comments,
418                        explicit_start: doc_explicit_start,
419                        explicit_end: doc_explicit_end,
420                    });
421                }
422                Some(_) => {
423                    // Comment or any other stray event outside a document — skip.
424                }
425            }
426        }
427
428        Ok(docs)
429    }
430
431    /// Parse a single node from the stream.
432    ///
433    /// Advances the stream past the node (including end-of-container events).
434    #[expect(
435        clippy::too_many_lines,
436        reason = "match-on-event-type; splitting would obscure flow"
437    )]
438    fn parse_node(&mut self, stream: &mut EventStream<'_>) -> Result<Node<Span>> {
439        // Structural end events close the caller's collection loop — do NOT
440        // consume them here.  Return an empty scalar and leave the event in
441        // the stream so the outer mapping/sequence loop can see and consume it.
442        if matches!(
443            stream.peek(),
444            Some(Ok((
445                Event::MappingEnd | Event::SequenceEnd | Event::DocumentEnd { .. },
446                _
447            )))
448        ) {
449            return Ok(empty_scalar());
450        }
451
452        let Some((event, span)) = next_from(stream)? else {
453            return Ok(empty_scalar());
454        };
455
456        match event {
457            Event::Scalar {
458                value,
459                style,
460                anchor,
461                anchor_loc,
462                tag,
463                tag_loc,
464                ..
465            } => {
466                let mut node = Node::Scalar {
467                    value: value.into_owned(),
468                    style,
469                    anchor: anchor.map(str::to_owned),
470                    anchor_loc,
471                    tag: tag.map(std::borrow::Cow::into_owned),
472                    tag_loc,
473                    loc: span,
474                    leading_comments: None,
475                    trailing_comment: None,
476                };
477                apply_schema_to_node(&mut node, self.options.schema)?;
478                if let Some(name) = node.anchor() {
479                    self.register_anchor(name.to_owned(), &node)?;
480                }
481                Ok(node)
482            }
483
484            Event::MappingStart {
485                anchor,
486                anchor_loc: mapping_anchor_loc,
487                tag,
488                tag_loc: mapping_tag_loc,
489                style,
490                ..
491            } => {
492                let anchor = anchor.map(str::to_owned);
493                let anchor_loc = mapping_anchor_loc;
494                let tag_loc = mapping_tag_loc;
495                let tag = tag.map(std::borrow::Cow::into_owned);
496
497                self.depth += 1;
498                if self.depth > self.options.max_nesting_depth {
499                    return Err(LoadError::NestingDepthLimitExceeded {
500                        limit: self.options.max_nesting_depth,
501                    });
502                }
503
504                let mut entries: Vec<(Node<Span>, Node<Span>)> = Vec::new();
505                let mut end_span = span;
506
507                loop {
508                    // Consume leading comments before the next key.  Also
509                    // collect any comments that spilled over from a sibling
510                    // value's collection end (stored in `pending_leading`).
511                    let raw_leading = consume_leading_comments(stream)?;
512                    let leading = if self.pending_leading.is_empty() {
513                        raw_leading
514                    } else {
515                        let mut combined = std::mem::take(&mut self.pending_leading);
516                        combined.extend(raw_leading);
517                        combined
518                    };
519
520                    match stream.peek() {
521                        None | Some(Ok((Event::MappingEnd | Event::StreamEnd, _))) => {
522                            // Save any collected leading comments so the next
523                            // sibling entry in the parent collection can inherit
524                            // them (e.g. a comment just before MappingEnd that
525                            // belongs to the following mapping entry).
526                            if !leading.is_empty() {
527                                self.pending_leading = leading;
528                            }
529                            break;
530                        }
531                        Some(Err(_)) => {
532                            // Consume the error.
533                            return Err(match stream.next() {
534                                Some(Err(e)) => LoadError::Parse {
535                                    pos: e.pos,
536                                    message: e.message,
537                                },
538                                _ => LoadError::UnexpectedEndOfStream,
539                            });
540                        }
541                        Some(Ok(_)) => {}
542                    }
543
544                    let mut key = self.parse_node(stream)?;
545                    attach_leading_comments(&mut key, leading);
546
547                    let mut value = self.parse_node(stream)?;
548
549                    // Trailing comment on the value — peek for inline comment.
550                    // Block scalars (literal `|` and folded `>`) consume trailing
551                    // blank lines as part of chomping; their span.end falls on the
552                    // first line after the scalar, which can coincide with the
553                    // next comment's line number.  That would falsely attach a
554                    // leading inter-node comment as a trailing inline comment.
555                    // Block scalars never have an inline comment on their content
556                    // lines, so skip trailing-comment detection for them.
557                    if !is_block_scalar(&value)
558                        && matches!(stream.peek(), Some(Ok((Event::Comment { .. }, _))))
559                    {
560                        let value_end_line = node_end_line(&value);
561                        if let Some(trail) = peek_trailing_comment(stream, value_end_line)? {
562                            attach_trailing_comment(&mut value, trail);
563                        }
564                    }
565
566                    entries.push((key, value));
567                }
568
569                // Consume MappingEnd and capture its span.
570                if let Some(Ok((Event::MappingEnd, end))) = stream.peek() {
571                    end_span = *end;
572                    let _ = stream.next();
573                }
574                self.depth -= 1;
575
576                let mut node = Node::Mapping {
577                    entries,
578                    style,
579                    anchor: anchor.clone(),
580                    anchor_loc,
581                    tag,
582                    tag_loc,
583                    loc: Span {
584                        start: span.start,
585                        end: end_span.end,
586                    },
587                    leading_comments: None,
588                    trailing_comment: None,
589                };
590                apply_schema_to_node(&mut node, self.options.schema)?;
591                if let Some(name) = anchor {
592                    self.register_anchor(name, &node)?;
593                }
594                Ok(node)
595            }
596
597            Event::SequenceStart {
598                anchor,
599                anchor_loc: sequence_anchor_loc,
600                tag,
601                tag_loc: sequence_tag_loc,
602                style,
603                ..
604            } => {
605                let anchor = anchor.map(str::to_owned);
606                let anchor_loc = sequence_anchor_loc;
607                let tag_loc = sequence_tag_loc;
608                let tag = tag.map(std::borrow::Cow::into_owned);
609
610                self.depth += 1;
611                if self.depth > self.options.max_nesting_depth {
612                    return Err(LoadError::NestingDepthLimitExceeded {
613                        limit: self.options.max_nesting_depth,
614                    });
615                }
616
617                let mut items: Vec<Node<Span>> = Vec::new();
618                let mut end_span = span;
619
620                loop {
621                    // Collect leading comments before the next item.  Also
622                    // collect any comments that spilled over from a sibling
623                    // value's collection end (stored in `pending_leading`).
624                    let raw_leading = consume_leading_comments(stream)?;
625                    let leading = if self.pending_leading.is_empty() {
626                        raw_leading
627                    } else {
628                        let mut combined = std::mem::take(&mut self.pending_leading);
629                        combined.extend(raw_leading);
630                        combined
631                    };
632
633                    match stream.peek() {
634                        None | Some(Ok((Event::SequenceEnd | Event::StreamEnd, _))) => {
635                            // Save any collected leading comments so the next
636                            // sibling entry in the parent collection can inherit
637                            // them (e.g. a comment just before SequenceEnd that
638                            // belongs to the following sequence item or mapping
639                            // entry in the parent).
640                            if !leading.is_empty() {
641                                self.pending_leading = leading;
642                            }
643                            break;
644                        }
645                        Some(Err(_)) => {
646                            // Consume the error.
647                            return Err(match stream.next() {
648                                Some(Err(e)) => LoadError::Parse {
649                                    pos: e.pos,
650                                    message: e.message,
651                                },
652                                _ => LoadError::UnexpectedEndOfStream,
653                            });
654                        }
655                        Some(Ok(_)) => {}
656                    }
657
658                    let mut item = self.parse_node(stream)?;
659                    attach_leading_comments(&mut item, leading);
660
661                    // Trailing comment on the item — peek for inline comment.
662                    // Block scalars are excluded for the same reason as in the
663                    // mapping path: their span.end can coincide with the next
664                    // comment's line, falsely turning a leading comment into a
665                    // trailing one.
666                    if !is_block_scalar(&item)
667                        && matches!(stream.peek(), Some(Ok((Event::Comment { .. }, _))))
668                    {
669                        let item_end_line = node_end_line(&item);
670                        if let Some(trail) = peek_trailing_comment(stream, item_end_line)? {
671                            attach_trailing_comment(&mut item, trail);
672                        }
673                    }
674
675                    items.push(item);
676                }
677
678                // Consume SequenceEnd and capture its span.
679                if let Some(Ok((Event::SequenceEnd, end))) = stream.peek() {
680                    end_span = *end;
681                    let _ = stream.next();
682                }
683                self.depth -= 1;
684
685                let mut node = Node::Sequence {
686                    items,
687                    style,
688                    anchor: anchor.clone(),
689                    anchor_loc,
690                    tag,
691                    tag_loc,
692                    loc: Span {
693                        start: span.start,
694                        end: end_span.end,
695                    },
696                    leading_comments: None,
697                    trailing_comment: None,
698                };
699                apply_schema_to_node(&mut node, self.options.schema)?;
700                if let Some(name) = anchor {
701                    self.register_anchor(name, &node)?;
702                }
703                Ok(node)
704            }
705
706            Event::Alias { name } => {
707                let name = name.to_owned();
708                self.resolve_alias(&name, span)
709            }
710
711            Event::Comment { text } => {
712                // Comment between a mapping key and its collection value (e.g.
713                // `key:\n  # comment\n  subkey: val`).  The comment appears
714                // after the key Scalar and before the MappingStart/SequenceStart
715                // that begins the value.  Save it in `pending_leading` so the
716                // first entry of the upcoming collection can inherit it.
717                self.pending_leading.push(with_hash_prefix(text));
718                self.parse_node(stream)
719            }
720
721            Event::StreamStart
722            | Event::StreamEnd
723            | Event::DocumentStart { .. }
724            | Event::DocumentEnd { .. }
725            | Event::MappingEnd
726            | Event::SequenceEnd => {
727                // Structural event where a node is expected — return empty scalar.
728                Ok(empty_scalar())
729            }
730        }
731    }
732
733    fn register_anchor(&mut self, name: String, node: &Node<Span>) -> Result<()> {
734        if !self.anchor_map.contains_key(&name) {
735            self.anchor_count += 1;
736            if self.anchor_count > self.options.max_anchors {
737                return Err(LoadError::AnchorCountLimitExceeded {
738                    limit: self.options.max_anchors,
739                });
740            }
741        }
742        // Count the anchor node itself toward the expansion budget in resolved
743        // mode so that the total reflects every node present in the expanded
744        // document (anchor definition + each alias expansion).
745        if self.options.mode == LoadMode::Resolved {
746            self.expanded_nodes += 1;
747            if self.expanded_nodes > self.options.max_expanded_nodes {
748                return Err(LoadError::AliasExpansionLimitExceeded {
749                    limit: self.options.max_expanded_nodes,
750                });
751            }
752            self.anchor_map.insert(name, node.clone());
753        } else {
754            // Lossless mode never reads anchor_map for expansion; store a
755            // zero-cost placeholder so contains_key still detects re-definitions.
756            self.anchor_map.insert(name, empty_scalar());
757        }
758        Ok(())
759    }
760
761    fn resolve_alias(&mut self, name: &str, loc: Span) -> Result<Node<Span>> {
762        match self.options.mode {
763            LoadMode::Lossless => Ok(Node::Alias {
764                name: name.to_owned(),
765                loc,
766                leading_comments: None,
767                trailing_comment: None,
768            }),
769            LoadMode::Resolved => {
770                let anchored = self.anchor_map.get(name).cloned().ok_or_else(|| {
771                    LoadError::UndefinedAlias {
772                        name: name.to_owned(),
773                    }
774                })?;
775                let mut in_progress: HashSet<String> = HashSet::new();
776                self.expand_node(anchored, &mut in_progress)
777            }
778        }
779    }
780
781    /// Recursively expand a node, counting every node produced against the
782    /// expansion limit and checking for cycles via `in_progress`.
783    fn expand_node(
784        &mut self,
785        node: Node<Span>,
786        in_progress: &mut HashSet<String>,
787    ) -> Result<Node<Span>> {
788        // Increment at the top — before child recursion — so every node
789        // (including non-alias nodes inside expanded trees) counts against the
790        // budget.
791        self.expanded_nodes += 1;
792        if self.expanded_nodes > self.options.max_expanded_nodes {
793            return Err(LoadError::AliasExpansionLimitExceeded {
794                limit: self.options.max_expanded_nodes,
795            });
796        }
797
798        match node {
799            Node::Alias { ref name, loc, .. } => {
800                if in_progress.contains(name) {
801                    return Err(LoadError::CircularAlias { name: name.clone() });
802                }
803                let target = self
804                    .anchor_map
805                    .get(name)
806                    .cloned()
807                    .ok_or_else(|| LoadError::UndefinedAlias { name: name.clone() })?;
808                in_progress.insert(name.clone());
809                let expanded = self.expand_node(target, in_progress)?;
810                in_progress.remove(name);
811                // Re-stamp with the alias site's location.
812                Ok(reloc(expanded, loc))
813            }
814            Node::Mapping {
815                entries,
816                style,
817                anchor,
818                anchor_loc,
819                tag,
820                tag_loc,
821                loc,
822                leading_comments,
823                trailing_comment,
824            } => {
825                let mut expanded_entries = Vec::with_capacity(entries.len());
826                for (k, v) in entries {
827                    let ek = self.expand_node(k, in_progress)?;
828                    let ev = self.expand_node(v, in_progress)?;
829                    expanded_entries.push((ek, ev));
830                }
831                Ok(Node::Mapping {
832                    entries: expanded_entries,
833                    style,
834                    anchor,
835                    anchor_loc,
836                    tag,
837                    tag_loc,
838                    loc,
839                    leading_comments,
840                    trailing_comment,
841                })
842            }
843            Node::Sequence {
844                items,
845                style,
846                anchor,
847                anchor_loc,
848                tag,
849                tag_loc,
850                loc,
851                leading_comments,
852                trailing_comment,
853            } => {
854                let mut expanded_items = Vec::with_capacity(items.len());
855                for item in items {
856                    expanded_items.push(self.expand_node(item, in_progress)?);
857                }
858                Ok(Node::Sequence {
859                    items: expanded_items,
860                    style,
861                    anchor,
862                    anchor_loc,
863                    tag,
864                    tag_loc,
865                    loc,
866                    leading_comments,
867                    trailing_comment,
868                })
869            }
870            // Scalars and already-resolved nodes — pass through.
871            scalar @ Node::Scalar { .. } => Ok(scalar),
872        }
873    }
874}
875
876/// Return `true` if the peeked item signals end of document (or stream).
877const fn is_document_end(peeked: Option<&std::result::Result<(Event<'_>, Span), Error>>) -> bool {
878    matches!(
879        peeked,
880        None | Some(Ok((Event::DocumentEnd { .. } | Event::StreamEnd, _)))
881    )
882}
883
884/// Return the line number of a node's span end position.
885///
886/// Used to determine whether the next `Comment` event is trailing (same line)
887/// or leading (different line).
888#[inline]
889const fn node_end_line(node: &Node<Span>) -> usize {
890    match node {
891        Node::Scalar { loc, .. }
892        | Node::Mapping { loc, .. }
893        | Node::Sequence { loc, .. }
894        | Node::Alias { loc, .. } => loc.end.line,
895    }
896}
897
898/// Return `true` if the node is a block scalar (literal `|` or folded `>`).
899///
900/// Block scalars consume trailing blank lines as part of chomping, so their
901/// `span.end` falls on the line *after* the last consumed line.  This means a
902/// comment on the immediately following line has the same line number as
903/// `span.end.line`, which would cause `peek_trailing_comment` to falsely
904/// classify it as an inline trailing comment.  The caller uses this predicate
905/// to skip trailing-comment detection for block scalars.
906#[inline]
907const fn is_block_scalar(node: &Node<Span>) -> bool {
908    matches!(
909        node,
910        Node::Scalar {
911            style: ScalarStyle::Literal(_) | ScalarStyle::Folded(_),
912            ..
913        }
914    )
915}
916
917// ---------------------------------------------------------------------------
918// Schema resolution helpers
919// ---------------------------------------------------------------------------
920
/// Upper bound on the number of Unicode scalar values copied from a raw scalar
/// into the `value` field of [`LoadError::UnresolvedScalar`], so that
/// user-supplied input cannot grow an error message without limit.
const UNRESOLVED_VALUE_MAX_CHARS: usize = 128;

/// Make a raw scalar value safe to embed in an error message.
///
/// - At most [`UNRESOLVED_VALUE_MAX_CHARS`] Unicode scalar values of `raw` are
///   kept; when any input is left over, `"..."` is appended.
/// - Each ASCII control character (U+0000–U+001F and U+007F) among the kept
///   characters is written as a `\uXXXX` escape with four uppercase hex
///   digits, so that the `Display` output cannot be used for log injection
///   through raw ASCII control characters.
fn sanitize_scalar_for_error(raw: &str) -> String {
    let capacity = raw.len().min(UNRESOLVED_VALUE_MAX_CHARS * 2);
    let mut sanitized = String::with_capacity(capacity);

    let mut remaining = raw.chars();
    for ch in remaining.by_ref().take(UNRESOLVED_VALUE_MAX_CHARS) {
        if !ch.is_ascii_control() {
            sanitized.push(ch);
            continue;
        }
        let escape = format!("\\u{:04X}", u32::from(ch));
        sanitized.push_str(&escape);
    }

    // Whatever is still left in the iterator was cut off by the limit.
    if remaining.next().is_some() {
        sanitized.push_str("...");
    }
    sanitized
}
955
956/// Apply schema tag resolution to a freshly-constructed node.
957///
958/// - For scalars: translates bare `!` to `None` (non-specific), then calls
959///   `resolve_scalar`.
960/// - For mappings/sequences: translates bare `!` to `None`, then calls
961///   `resolve_collection`.
962/// - On `Ok(Some(tag))`: overwrites `node.tag`; `tag_loc` is left `None`
963///   (no source position for a resolved tag).
964/// - On `Ok(None)` (explicit tag present): leaves `node.tag` unchanged.
965///
966/// # Errors
967///
968/// Returns [`LoadError::UnresolvedScalar`] when `schema` is [`Schema::Json`]
969/// and a plain scalar does not match any JSON type pattern.
970fn apply_schema_to_node(node: &mut Node<Span>, schema: Schema) -> Result<()> {
971    match node {
972        Node::Scalar {
973            value,
974            style,
975            tag,
976            tag_loc,
977            loc,
978            ..
979        } => {
980            // Bare `!` on a scalar is the non-specific scalar tag — it resolves
981            // unconditionally to !!str regardless of content (YAML 1.2.2 §10.2.1,
982            // §10.3.2: "non-specific" tag for scalars = Failsafe str).  We handle
983            // it before calling the schema resolver so Core doesn't pattern-match
984            // the value.
985            //
986            // `tag_loc` is preserved here (NOT cleared) because `!` is explicitly
987            // written in the source.  Preserving `tag_loc` lets downstream consumers
988            // (e.g. the formatter) distinguish user-authored tags from resolver-injected
989            // ones, which is critical for correct idempotent output.
990            if tag.as_deref() == Some("!") {
991                *tag = Some(crate::schema::ResolvedTag::Str.as_str().to_owned());
992                return Ok(());
993            }
994            // All other tags: pass through as-is (Some(non-!) = explicit tag → Ok(None)).
995            match resolve_scalar(schema, *style, value, tag.as_deref()) {
996                Ok(Some(resolved)) => {
997                    *tag = Some(resolved.as_str().to_owned());
998                    *tag_loc = None;
999                }
1000                Ok(None) => {}
1001                Err(_) => {
1002                    return Err(LoadError::UnresolvedScalar {
1003                        value: sanitize_scalar_for_error(value),
1004                        pos: loc.start,
1005                    });
1006                }
1007            }
1008        }
1009        Node::Mapping { tag, tag_loc, .. } => {
1010            // Bare `!` on a collection means non-specific collection tag — translate
1011            // to None so the resolver returns the kind-based tag (!!map / !!seq).
1012            let effective_tag = tag.as_deref().filter(|t| *t != "!");
1013            if let Some(resolved) =
1014                resolve_collection(schema, CollectionKind::Mapping, effective_tag)
1015            {
1016                *tag = Some(resolved.as_str().to_owned());
1017                *tag_loc = None;
1018            }
1019        }
1020        Node::Sequence { tag, tag_loc, .. } => {
1021            let effective_tag = tag.as_deref().filter(|t| *t != "!");
1022            if let Some(resolved) =
1023                resolve_collection(schema, CollectionKind::Sequence, effective_tag)
1024            {
1025                *tag = Some(resolved.as_str().to_owned());
1026                *tag_loc = None;
1027            }
1028        }
1029        Node::Alias { .. } => {}
1030    }
1031    Ok(())
1032}
1033
1034// ---------------------------------------------------------------------------
1035// Node helpers
1036// ---------------------------------------------------------------------------
1037
1038const fn empty_scalar() -> Node<Span> {
1039    Node::Scalar {
1040        value: String::new(),
1041        style: ScalarStyle::Plain,
1042        anchor: None,
1043        anchor_loc: None,
1044        tag: None,
1045        tag_loc: None,
1046        loc: Span {
1047            start: Pos::ORIGIN,
1048            end: Pos::ORIGIN,
1049        },
1050        leading_comments: None,
1051        trailing_comment: None,
1052    }
1053}
1054
1055// ---------------------------------------------------------------------------
1056// Tests
1057// ---------------------------------------------------------------------------
1058
1059#[cfg(test)]
1060#[expect(
1061    clippy::expect_used,
1062    clippy::unwrap_used,
1063    clippy::indexing_slicing,
1064    clippy::panic,
1065    reason = "test code"
1066)]
1067mod tests {
1068    use super::*;
1069
1070    // UT-1: loader_state_resets_anchor_map_between_documents
1071    #[test]
1072    fn loader_state_resets_anchor_map_between_documents() {
1073        // In resolved mode: anchor defined in doc 1 must not be visible in doc 2.
1074        let result = LoaderBuilder::new()
1075            .resolved()
1076            .build()
1077            .load("---\n- &foo hello\n...\n---\n- *foo\n...\n");
1078        assert!(
1079            result.is_err(),
1080            "expected Err: *foo in doc 2 should be undefined"
1081        );
1082        assert!(matches!(
1083            result.unwrap_err(),
1084            LoadError::UndefinedAlias { .. }
1085        ));
1086    }
1087
1088    // UT-2: register_anchor_increments_count
1089    #[test]
1090    fn register_anchor_increments_count() {
1091        let options = LoaderOptions {
1092            max_anchors: 2,
1093            ..LoaderOptions::default()
1094        };
1095        let mut state = LoadState::new(&options);
1096        let node = Node::Scalar {
1097            value: "x".to_owned(),
1098            style: ScalarStyle::Plain,
1099            anchor: None,
1100            anchor_loc: None,
1101            tag: None,
1102            tag_loc: None,
1103            loc: Span {
1104                start: Pos::ORIGIN,
1105                end: Pos::ORIGIN,
1106            },
1107            leading_comments: None,
1108            trailing_comment: None,
1109        };
1110        assert!(state.register_anchor("a".to_owned(), &node).is_ok());
1111        assert!(state.register_anchor("b".to_owned(), &node).is_ok());
1112        let err = state
1113            .register_anchor("c".to_owned(), &node)
1114            .expect_err("expected AnchorCountLimitExceeded");
1115        assert!(matches!(
1116            err,
1117            LoadError::AnchorCountLimitExceeded { limit: 2 }
1118        ));
1119    }
1120
1121    // UT-3: expand_node_detects_circular_alias
1122    #[test]
1123    fn expand_node_detects_circular_alias() {
1124        let options = LoaderOptions {
1125            mode: LoadMode::Resolved,
1126            ..LoaderOptions::default()
1127        };
1128        let mut state = LoadState::new(&options);
1129        // Insert a self-referential alias node.
1130        let alias_node = Node::Alias {
1131            name: "a".to_owned(),
1132            loc: Span {
1133                start: Pos::ORIGIN,
1134                end: Pos::ORIGIN,
1135            },
1136            leading_comments: None,
1137            trailing_comment: None,
1138        };
1139        state.anchor_map.insert("a".to_owned(), alias_node.clone());
1140        let mut in_progress = HashSet::new();
1141        let result = state.expand_node(alias_node, &mut in_progress);
1142        assert!(
1143            matches!(result, Err(LoadError::CircularAlias { .. })),
1144            "expected CircularAlias, got: {result:?}"
1145        );
1146    }
1147
1148    // -----------------------------------------------------------------------
1149    // Bug A: comment between mapping key and its collection value
1150    // -----------------------------------------------------------------------
1151
1152    // UT-A1: comment between key and nested mapping is attached to first entry.
1153    #[test]
1154    fn comment_between_key_and_nested_mapping_is_attached_to_first_key() {
1155        let docs = load("outer:\n  # Style 1\n  inner: val\n").unwrap();
1156        let root = &docs[0].root;
1157        // root is a mapping: outer -> { inner: val }
1158        // The comment "# Style 1" appears between "outer" key and the nested
1159        // MappingStart.  After the fix it must be attached to the "inner" key.
1160        let Node::Mapping { entries, .. } = root else {
1161            panic!("expected root mapping");
1162        };
1163        assert_eq!(entries.len(), 1);
1164        let (_outer_key, outer_value) = &entries[0];
1165        let Node::Mapping {
1166            entries: nested, ..
1167        } = outer_value
1168        else {
1169            panic!("expected nested mapping");
1170        };
1171        assert_eq!(nested.len(), 1);
1172        let (inner_key, _) = &nested[0];
1173        assert_eq!(
1174            inner_key.leading_comments(),
1175            &["# Style 1"],
1176            "comment should be attached to the first nested key"
1177        );
1178    }
1179
1180    // UT-A2: comment between key and nested sequence is attached to first item.
1181    #[test]
1182    fn comment_between_key_and_nested_sequence_is_attached_to_first_item() {
1183        let docs = load("key:\n  # leading\n  - item1\n  - item2\n").unwrap();
1184        let root = &docs[0].root;
1185        let Node::Mapping { entries, .. } = root else {
1186            panic!("expected root mapping");
1187        };
1188        let (_key, seq_value) = &entries[0];
1189        let Node::Sequence { items, .. } = seq_value else {
1190            panic!("expected sequence value");
1191        };
1192        // The comment "# leading" appears before the sequence items; after
1193        // the fix it is attached to the first item.
1194        assert_eq!(
1195            items[0].leading_comments(),
1196            &["# leading"],
1197            "comment should be attached to first sequence item"
1198        );
1199    }
1200
1201    // UT-A3: multiple consecutive comments before a collection are all preserved.
1202    #[test]
1203    fn multiple_comments_between_key_and_collection_all_preserved() {
1204        let docs = load("key:\n  # first\n  # second\n  - item\n").unwrap();
1205        let root = &docs[0].root;
1206        let Node::Mapping { entries, .. } = root else {
1207            panic!("expected root mapping");
1208        };
1209        let (_key, seq_value) = &entries[0];
1210        let Node::Sequence { items, .. } = seq_value else {
1211            panic!("expected sequence value");
1212        };
1213        assert_eq!(
1214            items[0].leading_comments(),
1215            &["# first", "# second"],
1216            "both comments should be on first item"
1217        );
1218    }
1219
1220    // UT-A4: the KEY node itself has no leading comments from Bug-A fix.
1221    #[test]
1222    fn comment_between_key_and_collection_does_not_corrupt_key_node() {
1223        let docs = load("outer:\n  # Style 1\n  inner: val\n").unwrap();
1224        let root = &docs[0].root;
1225        let Node::Mapping { entries, .. } = root else {
1226            panic!("expected root mapping");
1227        };
1228        let (outer_key, _) = &entries[0];
1229        assert!(
1230            outer_key.leading_comments().is_empty(),
1231            "outer key should have no leading comments"
1232        );
1233        assert!(
1234            outer_key.trailing_comment().is_none(),
1235            "outer key should have no trailing comment"
1236        );
1237    }
1238
1239    // UT-A5: no comment between key and value leaves leading_comments empty.
1240    #[test]
1241    fn no_comment_between_key_and_value_leaves_leading_comments_empty() {
1242        let docs = load("key:\n  inner: val\n").unwrap();
1243        let root = &docs[0].root;
1244        let Node::Mapping { entries, .. } = root else {
1245            panic!("expected root mapping");
1246        };
1247        let (_key, nested) = &entries[0];
1248        let Node::Mapping {
1249            entries: nested_entries,
1250            ..
1251        } = nested
1252        else {
1253            panic!("expected nested mapping");
1254        };
1255        let (inner_key, _) = &nested_entries[0];
1256        assert!(
1257            inner_key.leading_comments().is_empty(),
1258            "inner key should have no leading comments when there is no comment"
1259        );
1260    }
1261
1262    // -----------------------------------------------------------------------
1263    // Bug B: comment at end of collection preserved as leading on next sibling
1264    // -----------------------------------------------------------------------
1265
1266    // UT-B1: comment before SequenceEnd becomes leading on next mapping entry.
1267    #[test]
1268    fn trailing_comment_of_sequence_preserved_as_leading_on_next_sibling() {
1269        let input =
1270            "Lists:\n  list-a:\n    - item1\n    - item2\n\n  # Style 2\n  list-b:\n    - item1\n";
1271        let docs = load(input).unwrap();
1272        let root = &docs[0].root;
1273        let Node::Mapping { entries, .. } = root else {
1274            panic!("expected root mapping");
1275        };
1276        let (_lists_key, nested) = &entries[0];
1277        let Node::Mapping {
1278            entries: nested_entries,
1279            ..
1280        } = nested
1281        else {
1282            panic!("expected nested mapping");
1283        };
1284        assert_eq!(nested_entries.len(), 2);
1285        let (list_b_key, _) = &nested_entries[1];
1286        assert_eq!(
1287            list_b_key.leading_comments(),
1288            &["# Style 2"],
1289            "# Style 2 should be leading comment on list-b key"
1290        );
1291    }
1292
1293    // UT-B2: comment at end of nested sequence propagates to next mapping entry.
1294    #[test]
1295    fn overflow_comments_from_nested_sequence_end_reach_next_mapping_entry() {
1296        let input = "outer:\n  a:\n    - x\n    # between\n  b: y\n";
1297        let docs = load(input).unwrap();
1298        let root = &docs[0].root;
1299        let Node::Mapping { entries, .. } = root else {
1300            panic!("expected root mapping");
1301        };
1302        let (_outer_key, outer_val) = &entries[0];
1303        let Node::Mapping {
1304            entries: nested, ..
1305        } = outer_val
1306        else {
1307            panic!("expected nested mapping");
1308        };
1309        assert_eq!(nested.len(), 2);
1310        let (b_key, _) = &nested[1];
1311        assert_eq!(
1312            b_key.leading_comments(),
1313            &["# between"],
1314            "# between should be leading comment on b key"
1315        );
1316    }
1317
1318    // UT-B3: comment at end of nested mapping propagates to next sibling.
1319    #[test]
1320    fn overflow_comments_from_nested_mapping_end_reach_next_sibling() {
1321        let input = "parent:\n  child1:\n    k: v\n    # end-of-child1\n  child2: val\n";
1322        let docs = load(input).unwrap();
1323        let root = &docs[0].root;
1324        let Node::Mapping { entries, .. } = root else {
1325            panic!("expected root mapping");
1326        };
1327        let (_parent_key, parent_val) = &entries[0];
1328        let Node::Mapping {
1329            entries: siblings, ..
1330        } = parent_val
1331        else {
1332            panic!("expected parent mapping value");
1333        };
1334        assert_eq!(siblings.len(), 2);
1335        let (child2_key, _) = &siblings[1];
1336        assert_eq!(
1337            child2_key.leading_comments(),
1338            &["# end-of-child1"],
1339            "# end-of-child1 should be leading comment on child2 key"
1340        );
1341    }
1342
1343    // UT-B4: overflow comment at top-level sequence end is not silently dropped.
1344    #[test]
1345    fn overflow_comments_at_top_level_sequence_end_are_not_lost() {
1346        // The comment "# tail" appears before SequenceEnd of the top-level items
1347        // sequence.  The fix saves it to pending_leading; since there is no next
1348        // sibling, it ends up in the document's root mapping's pending state and
1349        // is not lost.  We assert it appears somewhere reachable in the AST rather
1350        // than disappearing entirely.
1351        let input = "items:\n  - a\n  - b\n  # tail\n";
1352        let docs = load(input).unwrap();
1353        // The document must parse successfully (no panic, no error).
1354        assert!(!docs.is_empty(), "document should parse without error");
1355        // The # tail comment must not cause data loss — the sequence items are intact.
1356        let root = &docs[0].root;
1357        let Node::Mapping { entries, .. } = root else {
1358            panic!("expected root mapping");
1359        };
1360        let (_items_key, seq_val) = &entries[0];
1361        let Node::Sequence { items, .. } = seq_val else {
1362            panic!("expected sequence value");
1363        };
1364        assert_eq!(items.len(), 2, "sequence items must not be lost");
1365    }
1366
1367    // UT-B5: no overflow comments when collection ends cleanly.
1368    #[test]
1369    fn no_overflow_comments_when_collection_ends_cleanly() {
1370        let docs = load("key:\n  - item1\n  - item2\n").unwrap();
1371        let root = &docs[0].root;
1372        let Node::Mapping { entries, .. } = root else {
1373            panic!("expected root mapping");
1374        };
1375        let (_key, seq_val) = &entries[0];
1376        let Node::Sequence { items, .. } = seq_val else {
1377            panic!("expected sequence value");
1378        };
1379        for item in items {
1380            assert!(
1381                item.leading_comments().is_empty(),
1382                "items should have no leading comments"
1383            );
1384        }
1385    }
1386
1387    // -----------------------------------------------------------------------
1388    // Combined scenarios
1389    // -----------------------------------------------------------------------
1390
1391    // UT-C1: exact bug-report input — both comments survive.
1392    #[test]
1393    fn original_bug_report_input_preserves_both_comments() {
1394        let input = "Lists:\n  # Style 1\n  list-a:\n    - item1\n    - item2\n\n  # Style 2\n  list-b:\n  - item1\n  - item2\n";
1395        let docs = load(input).unwrap();
1396        let root = &docs[0].root;
1397        let Node::Mapping { entries, .. } = root else {
1398            panic!("expected root mapping");
1399        };
1400        let (_lists_key, nested) = &entries[0];
1401        let Node::Mapping {
1402            entries: nested_entries,
1403            ..
1404        } = nested
1405        else {
1406            panic!("expected nested mapping");
1407        };
1408        assert_eq!(nested_entries.len(), 2);
1409        let (first_key, _) = &nested_entries[0];
1410        let (second_key, _) = &nested_entries[1];
1411        assert_eq!(
1412            first_key.leading_comments(),
1413            &["# Style 1"],
1414            "list-a should have # Style 1 as leading comment"
1415        );
1416        assert_eq!(
1417            second_key.leading_comments(),
1418            &["# Style 2"],
1419            "list-b should have # Style 2 as leading comment"
1420        );
1421    }
1422
1423    // UT-C2: leading and trailing comments on sibling entries both preserved.
1424    #[test]
1425    fn leading_and_trailing_comments_both_preserved_on_sibling_entries() {
1426        let input = "map:\n  # leading\n  key: value  # trailing\n  # next-leading\n  key2: v2\n";
1427        let docs = load(input).unwrap();
1428        let root = &docs[0].root;
1429        let Node::Mapping { entries, .. } = root else {
1430            panic!("expected root mapping");
1431        };
1432        let (_map_key, map_val) = &entries[0];
1433        let Node::Mapping {
1434            entries: siblings, ..
1435        } = map_val
1436        else {
1437            panic!("expected mapping value");
1438        };
1439        assert_eq!(siblings.len(), 2);
1440        let (key1, val1) = &siblings[0];
1441        let (key2, _) = &siblings[1];
1442        assert_eq!(key1.leading_comments(), &["# leading"]);
1443        assert_eq!(val1.trailing_comment(), Some("# trailing"));
1444        assert_eq!(key2.leading_comments(), &["# next-leading"]);
1445    }
1446
1447    // UT-C3: deeply nested overflow comments propagate to correct sibling.
1448    #[test]
1449    fn deeply_nested_overflow_comments_reach_correct_sibling() {
1450        let input = "top:\n  mid:\n    - x\n    # deep-overflow\n  next: y\n";
1451        let docs = load(input).unwrap();
1452        let root = &docs[0].root;
1453        let Node::Mapping { entries, .. } = root else {
1454            panic!("expected root mapping");
1455        };
1456        let (_top_key, top_val) = &entries[0];
1457        let Node::Mapping {
1458            entries: top_entries,
1459            ..
1460        } = top_val
1461        else {
1462            panic!("expected top-level mapping");
1463        };
1464        assert_eq!(top_entries.len(), 2);
1465        let (next_key, _) = &top_entries[1];
1466        assert_eq!(
1467            next_key.leading_comments(),
1468            &["# deep-overflow"],
1469            "# deep-overflow should propagate from nested sequence to next sibling"
1470        );
1471    }
1472
1473    // -----------------------------------------------------------------------
1474    // UT-D: Document marker flags (explicit_start / explicit_end)
1475    // -----------------------------------------------------------------------
1476
1477    // UT-D1: Bare document (no markers) → both flags false
1478    #[test]
1479    fn bare_document_has_both_flags_false() {
1480        let docs = load("key: value\n").expect("load failed");
1481        assert_eq!(docs.len(), 1);
1482        assert!(!docs[0].explicit_start, "expected explicit_start=false");
1483        assert!(!docs[0].explicit_end, "expected explicit_end=false");
1484    }
1485
1486    // UT-D2: Document with `---` start marker → explicit_start true, explicit_end false
1487    #[test]
1488    fn document_with_start_marker_has_explicit_start_true() {
1489        let docs = load("---\nkey: value\n").expect("load failed");
1490        assert_eq!(docs.len(), 1);
1491        assert!(docs[0].explicit_start, "expected explicit_start=true");
1492        assert!(!docs[0].explicit_end, "expected explicit_end=false");
1493    }
1494
1495    // UT-D3: Document with `...` end marker → explicit_start false, explicit_end true
1496    #[test]
1497    fn document_with_end_marker_has_explicit_end_true() {
1498        let docs = load("key: value\n...\n").expect("load failed");
1499        assert_eq!(docs.len(), 1);
1500        assert!(!docs[0].explicit_start, "expected explicit_start=false");
1501        assert!(docs[0].explicit_end, "expected explicit_end=true");
1502    }
1503
1504    // UT-D4: Document with both `---` and `...` → both flags true
1505    #[test]
1506    fn document_with_both_markers_has_both_flags_true() {
1507        let docs = load("---\nkey: value\n...\n").expect("load failed");
1508        assert_eq!(docs.len(), 1);
1509        assert!(docs[0].explicit_start, "expected explicit_start=true");
1510        assert!(docs[0].explicit_end, "expected explicit_end=true");
1511    }
1512
1513    // UT-D5: Multi-document — each document's flags are independent
1514    #[test]
1515    fn multi_document_flags_are_independent() {
1516        // doc1: no explicit start/end (bare)
1517        // doc2: explicit start (---), explicit end (...)
1518        // doc3: explicit start (---), no explicit end
1519        let docs = load("doc1: a\n---\ndoc2: b\n...\n---\ndoc3: c\n").expect("load failed");
1520        assert_eq!(docs.len(), 3);
1521        assert!(!docs[0].explicit_start, "doc1 explicit_start");
1522        assert!(!docs[0].explicit_end, "doc1 explicit_end");
1523        assert!(docs[1].explicit_start, "doc2 explicit_start");
1524        assert!(docs[1].explicit_end, "doc2 explicit_end");
1525        assert!(docs[2].explicit_start, "doc3 explicit_start");
1526        assert!(!docs[2].explicit_end, "doc3 explicit_end");
1527    }
1528
1529    // UT-D6: Empty document with explicit markers → flags are set
1530    #[test]
1531    fn empty_document_with_explicit_markers_has_both_flags_true() {
1532        let docs = load("---\n...\n").expect("load failed");
1533        assert_eq!(docs.len(), 1);
1534        assert!(docs[0].explicit_start, "expected explicit_start=true");
1535        assert!(docs[0].explicit_end, "expected explicit_end=true");
1536    }
1537
1538    // -----------------------------------------------------------------------
1539    // UT-S: sanitize_scalar_for_error unit tests
1540    // -----------------------------------------------------------------------
1541
1542    // UT-S1: newline replaced with \u000A escape (no raw newline in output)
#[test]
fn sanitize_newline_replaced_with_escape() {
    let result = sanitize_scalar_for_error("foo\nbar");

    // The raw line feed must not survive into the sanitized text.
    let raw_newline_present = result.contains('\n');
    assert!(
        !raw_newline_present,
        "output must not contain a raw newline"
    );

    // In its place we expect the literal text backslash-u-0-0-0-A.
    let escape_present = result.contains(r"\u000A");
    assert!(
        escape_present,
        "output must contain \\u000A escape, got: {result:?}"
    );

    // Pin the complete output so nothing else was altered.
    assert_eq!(result, r"foo\u000Abar");
}
1556
1557    // UT-S2: carriage return replaced with \u000D escape
#[test]
fn sanitize_carriage_return_replaced_with_escape() {
    let result = sanitize_scalar_for_error("foo\rbar");

    // The raw carriage return must not survive into the sanitized text.
    let raw_cr_present = result.contains('\r');
    assert!(
        !raw_cr_present,
        "output must not contain a raw carriage return"
    );

    // In its place we expect the literal text backslash-u-0-0-0-D.
    let escape_present = result.contains(r"\u000D");
    assert!(
        escape_present,
        "output must contain \\u000D escape, got: {result:?}"
    );

    // Pin the complete output so nothing else was altered.
    assert_eq!(result, r"foo\u000Dbar");
}
1571
1572    // UT-S3: null byte replaced with \u0000 escape
#[test]
fn sanitize_null_byte_replaced_with_escape() {
    let result = sanitize_scalar_for_error("foo\0bar");

    // The raw NUL must not survive into the sanitized text.
    let raw_nul_present = result.contains('\0');
    assert!(
        !raw_nul_present,
        "output must not contain a raw null byte"
    );

    // In its place we expect the literal text backslash-u-0-0-0-0.
    let escape_present = result.contains(r"\u0000");
    assert!(
        escape_present,
        "output must contain \\u0000 escape, got: {result:?}"
    );

    // Pin the complete output so nothing else was altered.
    assert_eq!(result, r"foo\u0000bar");
}
1586
1587    // UT-S4: short value (≤128 chars) stored verbatim without ellipsis
#[test]
fn sanitize_short_value_stored_verbatim() {
    // Five plain ASCII characters: nothing to escape.
    let result = sanitize_scalar_for_error("hello");

    // The text must come back unchanged ...
    assert_eq!(result, "hello");

    // ... and, in particular, without an ellipsis tacked on.
    let has_ellipsis = result.ends_with("...");
    assert!(!has_ellipsis, "short value must not be truncated");
}
1598
1599    // UT-S5: value at exactly 128 chars stored verbatim, no ellipsis
#[test]
fn sanitize_value_at_exact_limit_not_truncated() {
    // 128 ASCII characters: byte length and character count coincide.
    let input = "a".repeat(128);
    let result = sanitize_scalar_for_error(&input);

    assert_eq!(
        result.len(),
        128,
        "128-char input must produce 128-char output"
    );
    assert!(
        !result.ends_with("..."),
        "value at exact limit must not be truncated"
    );

    // Length and suffix checks alone would accept a same-length string with
    // altered characters; "verbatim" means the output equals the input.
    assert_eq!(
        result, input,
        "value at exact limit must be returned unchanged"
    );
}
1614
1615    // UT-S6: value of 129 chars truncated to 128 chars + "..."
#[test]
fn sanitize_value_over_limit_truncated() {
    // 129 ASCII characters: one more than the test above, which is not truncated.
    let oversized = "a".repeat(129);
    let result = sanitize_scalar_for_error(&oversized);

    // Truncation is signalled by a trailing ellipsis.
    let has_ellipsis = result.ends_with("...");
    assert!(has_ellipsis, "value over limit must end with '...'");

    // 128 retained characters followed by the three dots.
    let expected_len = 128 + 3;
    assert_eq!(
        result.len(),
        expected_len,
        "truncated output must be 128 chars + 3 ellipsis chars"
    );
}
1630
1631    // UT-S7: multibyte chars are counted by Unicode scalar value, not bytes;
1632    // truncation at 128 chars does not split a multibyte sequence or produce invalid UTF-8.
#[test]
fn sanitize_multibyte_char_boundary_not_split() {
    // 127 copies of '中' (3 bytes each in UTF-8) plus "ab" gives 129 chars,
    // one more than the 128 chars this test expects to be retained.
    let input: String = "中".repeat(127) + "ab";
    let result = sanitize_scalar_for_error(&input);

    assert!(
        result.ends_with("..."),
        "129-char multibyte input should be truncated"
    );

    // Remove exactly ONE ellipsis. `trim_end_matches` would strip every
    // repeated trailing "...", so a doubled ellipsis would go unnoticed.
    let kept = result
        .strip_suffix("...")
        .expect("suffix presence asserted above");

    // Counting `char`s (not bytes) shows the cut was made on Unicode scalar
    // value boundaries.
    assert_eq!(
        kept.chars().count(),
        128,
        "truncated portion must be exactly 128 chars"
    );
}
1653}