rlsp_yaml_parser/lib.rs
1// SPDX-License-Identifier: MIT
2
3//! A spec-faithful streaming YAML 1.2 parser.
4//!
5//! Use [`parse_events`] for a lazy event stream, or the [`loader`] module to
6//! build a full AST.
7
8mod chars;
9/// Encoding detection and UTF-8 decoding for YAML byte streams.
10pub mod encoding;
11mod error;
12mod event;
13mod event_iter;
14pub(crate) mod lexer;
15/// Security limit constants for the parser and loader.
16pub mod limits;
17mod lines;
18/// Event-to-AST loader that builds a `Vec<Document<Span>>`.
19pub mod loader;
20pub mod node;
21mod pos;
22pub use error::Error;
23pub use event::{Chomp, CollectionStyle, Event, ScalarStyle};
24pub use lines::{BreakType, Line, LineBuffer};
25pub use loader::{LoadError, LoadMode, Loader, LoaderBuilder, LoaderOptions, load};
26pub use node::{Document, Node};
27pub use pos::{Pos, Span};
28
29pub use limits::{
30 MAX_ANCHOR_NAME_BYTES, MAX_COLLECTION_DEPTH, MAX_COMMENT_LEN, MAX_DIRECTIVES_PER_DOC,
31 MAX_RESOLVED_TAG_LEN, MAX_TAG_HANDLE_BYTES, MAX_TAG_LEN,
32};
33use std::collections::VecDeque;
34
35use event_iter::{CollectionEntry, DirectiveScope, IterState, PendingAnchor, PendingTag};
36
37use lexer::Lexer;
38
39/// Parse a YAML string into a lazy event stream.
40///
41/// The iterator yields <code>Result<([Event], [Span]), [Error]></code> items.
42/// The first event is always [`Event::StreamStart`] and the last is always
43/// [`Event::StreamEnd`].
44///
45/// # Example
46///
47/// ```
48/// use rlsp_yaml_parser::{parse_events, Event};
49///
50/// let events: Vec<_> = parse_events("").collect();
51/// assert!(matches!(events.first(), Some(Ok((Event::StreamStart, _)))));
52/// assert!(matches!(events.last(), Some(Ok((Event::StreamEnd, _)))));
53/// ```
54pub fn parse_events(input: &str) -> impl Iterator<Item = Result<(Event<'_>, Span), Error>> + '_ {
55 EventIter::new(input)
56}
57
58// ---------------------------------------------------------------------------
59// Iterator implementation
60// ---------------------------------------------------------------------------
61
62/// Lazy iterator that yields events by walking a [`Lexer`].
63struct EventIter<'input> {
64 lexer: Lexer<'input>,
65 state: IterState,
66 /// Queued events to emit before resuming normal state dispatch.
67 ///
68 /// Used when a single parse step must produce multiple consecutive events —
69 /// e.g. `SequenceStart` before the first item, or multiple close events
70 /// when a dedent closes several nested collections at once.
71 queue: VecDeque<(Event<'input>, Span)>,
72 /// Stack of open block collections (sequences and mappings).
73 ///
74 /// Each entry records whether the open collection is a sequence or a
75 /// mapping, its indentation column, and (for mappings) whether the next
76 /// expected node is a key or a value. The combined length of this stack
77 /// is bounded by [`MAX_COLLECTION_DEPTH`].
78 coll_stack: Vec<CollectionEntry>,
79 /// A pending anchor that has been scanned but not yet attached to a node
80 /// event. The [`PendingAnchor`] variant encodes both the anchor name and
81 /// whether it was standalone (applies to the next node of any type) or
82 /// inline (applies to the key scalar, not the enclosing mapping).
83 pending_anchor: Option<PendingAnchor<'input>>,
84 /// A pending tag that has been scanned but not yet attached to a node event.
85 ///
86 /// Tags in YAML precede the node they annotate (YAML 1.2 §6.8.1). After
87 /// scanning `!tag`, `!!tag`, `!<uri>`, or `!`, the parser stores the tag
88 /// here and attaches it to the next `Scalar`, `SequenceStart`, or
89 /// `MappingStart` event.
90 ///
91 /// Tags are resolved against the current directive scope at scan time:
92 /// - `!<URI>` → stored as `Cow::Borrowed("URI")` (verbatim, no change)
93 /// - `!!suffix` → resolved via `!!` handle (default: `tag:yaml.org,2002:suffix`)
94 /// - `!suffix` → stored as `Cow::Borrowed("!suffix")` (local tag, no expansion)
95 /// - `!` → stored as `Cow::Borrowed("!")`
96 /// - `!handle!suffix` → resolved via `%TAG !handle! prefix` directive
97 ///
98 /// The [`PendingTag`] variant encodes both the resolved tag string and
99 /// whether it was standalone (applies to the next node of any type) or
100 /// inline (applies to the key scalar, not the enclosing mapping).
101 pending_tag: Option<PendingTag<'input>>,
102 /// When a `Standalone` anchor is displaced by an `Inline` anchor for a
103 /// mapping key on the same physical line (e.g. `&node1\n&k1 key: v`), the
104 /// standalone anchor is saved here and consumed by the next
105 /// `MappingStart`/`SequenceStart` event.
106 ///
107 /// Normal anchor delivery uses `pending_anchor`; this field only holds the
108 /// collection-level anchor when both a collection property and a key
109 /// property must be delivered simultaneously.
110 pending_collection_anchor: Option<&'input str>,
111 /// Span of the anchor token stored in `pending_collection_anchor`.
112 pending_collection_anchor_loc: Option<crate::pos::Span>,
113 /// Parallel to `pending_collection_anchor` but for tags.
114 ///
115 /// When a `Standalone` tag is displaced by an `Inline` tag for a mapping
116 /// key (e.g. `!!map\n!!str key: v`), the standalone tag is saved here and
117 /// consumed by the next `MappingStart`/`SequenceStart` event.
118 pending_collection_tag: Option<std::borrow::Cow<'input, str>>,
119 /// Span of the tag token stored in `pending_collection_tag`.
120 pending_collection_tag_loc: Option<crate::pos::Span>,
121 /// Directive scope for the current document.
122 ///
123 /// Accumulated from `%YAML` and `%TAG` directives seen in `BetweenDocs`
124 /// state. Reset at document boundaries.
125 directive_scope: DirectiveScope,
126 /// Set to `true` once the root node of the current document has been
127 /// fully emitted (a scalar at the top level, or a collection after its
128 /// closing event empties `coll_stack`).
129 ///
130 /// Used to detect invalid extra content after the document root, such as
131 /// `foo:\n bar\ninvalid` where `invalid` appears after the root mapping
132 /// closes. Reset to `false` at each document boundary.
133 root_node_emitted: bool,
134 /// Set to `true` after consuming a `? ` explicit key indicator whose key
135 /// content will appear on the NEXT line (i.e., `had_key_inline = false`).
136 /// Cleared when the key content is processed.
137 ///
138 /// Used to allow a block sequence indicator on a line following `? ` to be
139 /// treated as the explicit key's content rather than triggering the
140 /// "invalid block sequence entry" guard.
141 explicit_key_pending: bool,
142 /// When `Some(indent)`, a `? inline-content` explicit key was consumed for the
143 /// mapping at column `indent`. The inline content opens (or is) a complex node
144 /// (sub-mapping or sub-sequence), so the outer mapping stays in Key phase after
145 /// the complex node closes. The stored indent distinguishes the outer mapping
146 /// from any inner mappings that advance to Value phase while the flag is active.
147 /// Used to allow the subsequent `:` value-indicator line to be recognised as
148 /// the explicit value indicator (rather than as an implicit empty-key entry).
149 /// Cleared when the outer mapping at `indent` advances to Value phase.
150 complex_key_inline: Option<usize>,
151 /// When a tag or anchor appears inline on a physical line (e.g. `!!str &a key:`),
152 /// the key content is prepended as a synthetic line with the key's column as its
153 /// indent. This field records the indent of the ORIGINAL physical line so that
154 /// `handle_mapping_entry` can open the mapping at the correct (original) indent
155 /// rather than the synthetic line's offset.
156 property_origin_indent: Option<usize>,
157}
158
159impl EventIter<'_> {
160 /// Current combined collection depth (sequences + mappings).
161 const fn collection_depth(&self) -> usize {
162 self.coll_stack.len()
163 }
164}
165
166/// Build an empty plain scalar event.
167pub(crate) const fn empty_scalar_event<'input>() -> Event<'input> {
168 Event::Scalar {
169 value: std::borrow::Cow::Borrowed(""),
170 style: ScalarStyle::Plain,
171 anchor: None,
172 anchor_loc: None,
173 tag: None,
174 tag_loc: None,
175 }
176}
177
178/// Build a span that covers exactly the 3-byte document marker at `marker_pos`.
179pub(crate) const fn marker_span(marker_pos: Pos) -> Span {
180 Span {
181 start: marker_pos,
182 end: Pos {
183 byte_offset: marker_pos.byte_offset + 3,
184 line: marker_pos.line,
185 column: marker_pos.column + 3,
186 },
187 }
188}
189
190/// Build a zero-width span at `pos`.
191pub(crate) const fn zero_span(pos: Pos) -> Span {
192 Span {
193 start: pos,
194 end: pos,
195 }
196}