Skip to main content

rlsp_yaml_parser/
lib.rs

1// SPDX-License-Identifier: MIT
2
3//! A spec-faithful streaming YAML 1.2 parser.
4//!
5//! Use [`parse_events`] for a lazy event stream, or the [`loader`] module to
6//! build a full AST.
7
8mod chars;
9/// Encoding detection and UTF-8 decoding for YAML byte streams.
10pub mod encoding;
11mod error;
12mod event;
13mod event_iter;
14pub(crate) mod lexer;
15/// Security limit constants for the parser and loader.
16pub mod limits;
17mod lines;
18/// Event-to-AST loader that builds a `Vec<Document<Span>>`.
19pub mod loader;
20pub mod node;
21mod pos;
22/// YAML 1.2.2 §10 schema tag resolution.
23pub mod schema;
24pub use error::Error;
25pub use event::{Chomp, CollectionStyle, Event, EventMeta, ScalarStyle};
26pub use lines::{BreakType, Line, LineBuffer};
27pub use loader::{LoadError, LoadMode, Loader, LoaderBuilder, LoaderOptions, load};
28pub use node::{Document, Node};
29pub use pos::{LineIndex, Pos, Span};
30pub use schema::{ResolvedTag, Schema};
31
32pub use limits::{
33    MAX_ANCHOR_NAME_BYTES, MAX_COLLECTION_DEPTH, MAX_COMMENT_LEN, MAX_DIRECTIVES_PER_DOC,
34    MAX_RESOLVED_TAG_LEN, MAX_TAG_HANDLE_BYTES, MAX_TAG_LEN,
35};
36use std::collections::VecDeque;
37
38use event_iter::{CollectionEntry, DirectiveScope, IterState, PendingAnchor, PendingTag};
39
40use lexer::Lexer;
41
42/// Parse a YAML string into a lazy event stream.
43///
44/// The iterator yields <code>Result<([Event], [Span]), [Error]></code> items.
45/// The first event is always [`Event::StreamStart`] and the last is always
46/// [`Event::StreamEnd`].
47///
48/// # Example
49///
50/// ```
51/// use rlsp_yaml_parser::{parse_events, Event};
52///
53/// let events: Vec<_> = parse_events("").collect();
54/// assert!(matches!(events.first(), Some(Ok((Event::StreamStart, _)))));
55/// assert!(matches!(events.last(), Some(Ok((Event::StreamEnd, _)))));
56/// ```
57pub fn parse_events(input: &str) -> impl Iterator<Item = Result<(Event<'_>, Span), Error>> + '_ {
58    EventIter::new(input)
59}
60
61// ---------------------------------------------------------------------------
62// Iterator implementation
63// ---------------------------------------------------------------------------
64
65/// Lazy iterator that yields events by walking a [`Lexer`].
66struct EventIter<'input> {
67    lexer: Lexer<'input>,
68    /// Full input slice, used for byte-range-to-char-count conversion in the
69    /// flow implicit-key length check (YAML 1.2 §7.4.3).
70    input: &'input str,
71    state: IterState,
72    /// Queued events to emit before resuming normal state dispatch.
73    ///
74    /// Used when a single parse step must produce multiple consecutive events —
75    /// e.g. `SequenceStart` before the first item, or multiple close events
76    /// when a dedent closes several nested collections at once.
77    queue: VecDeque<(Event<'input>, Span)>,
78    /// Stack of open block collections (sequences and mappings).
79    ///
80    /// Each entry records whether the open collection is a sequence or a
81    /// mapping, its indentation column, and (for mappings) whether the next
82    /// expected node is a key or a value.  The combined length of this stack
83    /// is bounded by [`MAX_COLLECTION_DEPTH`].
84    coll_stack: Vec<CollectionEntry>,
85    /// A pending anchor that has been scanned but not yet attached to a node
86    /// event.  The [`PendingAnchor`] variant encodes both the anchor name and
87    /// whether it was standalone (applies to the next node of any type) or
88    /// inline (applies to the key scalar, not the enclosing mapping).
89    pending_anchor: Option<PendingAnchor<'input>>,
90    /// A pending tag that has been scanned but not yet attached to a node event.
91    ///
92    /// Tags in YAML precede the node they annotate (YAML 1.2 §6.8.1).  After
93    /// scanning `!tag`, `!!tag`, `!<uri>`, or `!`, the parser stores the tag
94    /// here and attaches it to the next `Scalar`, `SequenceStart`, or
95    /// `MappingStart` event.
96    ///
97    /// Tags are resolved against the current directive scope at scan time:
98    /// - `!<URI>`  → stored as `Cow::Borrowed("URI")` (verbatim, no change)
99    /// - `!!suffix` → resolved via `!!` handle (default: `tag:yaml.org,2002:suffix`)
100    /// - `!suffix` → stored as `Cow::Borrowed("!suffix")` (local tag, no expansion)
101    /// - `!`       → stored as `Cow::Borrowed("!")`
102    /// - `!handle!suffix` → resolved via `%TAG !handle! prefix` directive
103    ///
104    /// The [`PendingTag`] variant encodes both the resolved tag string and
105    /// whether it was standalone (applies to the next node of any type) or
106    /// inline (applies to the key scalar, not the enclosing mapping).
107    pending_tag: Option<PendingTag<'input>>,
108    /// When a `Standalone` anchor is displaced by an `Inline` anchor for a
109    /// mapping key on the same physical line (e.g. `&node1\n&k1 key: v`), the
110    /// standalone anchor is saved here and consumed by the next
111    /// `MappingStart`/`SequenceStart` event.
112    ///
113    /// Normal anchor delivery uses `pending_anchor`; this field only holds the
114    /// collection-level anchor when both a collection property and a key
115    /// property must be delivered simultaneously.
116    pending_collection_anchor: Option<&'input str>,
117    /// Span of the anchor token stored in `pending_collection_anchor`.
118    pending_collection_anchor_loc: Option<crate::pos::Span>,
119    /// Parallel to `pending_collection_anchor` but for tags.
120    ///
121    /// When a `Standalone` tag is displaced by an `Inline` tag for a mapping
122    /// key (e.g. `!!map\n!!str key: v`), the standalone tag is saved here and
123    /// consumed by the next `MappingStart`/`SequenceStart` event.
124    pending_collection_tag: Option<std::borrow::Cow<'input, str>>,
125    /// Span of the tag token stored in `pending_collection_tag`.
126    pending_collection_tag_loc: Option<crate::pos::Span>,
127    /// Directive scope for the current document.
128    ///
129    /// Accumulated from `%YAML` and `%TAG` directives seen in `BetweenDocs`
130    /// state.  Reset at document boundaries.
131    directive_scope: DirectiveScope,
132    /// Set to `true` once the root node of the current document has been
133    /// fully emitted (a scalar at the top level, or a collection after its
134    /// closing event empties `coll_stack`).
135    ///
136    /// Used to detect invalid extra content after the document root, such as
137    /// `foo:\n  bar\ninvalid` where `invalid` appears after the root mapping
138    /// closes.  Reset to `false` at each document boundary.
139    root_node_emitted: bool,
140    /// Set to `true` after consuming a `? ` explicit key indicator whose key
141    /// content will appear on the NEXT line (i.e., `had_key_inline = false`).
142    /// Cleared when the key content is processed.
143    ///
144    /// Used to allow a block sequence indicator on a line following `? ` to be
145    /// treated as the explicit key's content rather than triggering the
146    /// "invalid block sequence entry" guard.
147    explicit_key_pending: bool,
148    /// When `Some(indent)`, a `? inline-content` explicit key was consumed for the
149    /// mapping at column `indent`.  The inline content opens (or is) a complex node
150    /// (sub-mapping or sub-sequence), so the outer mapping stays in Key phase after
151    /// the complex node closes.  The stored indent distinguishes the outer mapping
152    /// from any inner mappings that advance to Value phase while the flag is active.
153    /// Used to allow the subsequent `:` value-indicator line to be recognised as
154    /// the explicit value indicator (rather than as an implicit empty-key entry).
155    /// Cleared when the outer mapping at `indent` advances to Value phase.
156    complex_key_inline: Option<usize>,
157    /// When a tag or anchor appears inline on a physical line (e.g. `!!str &a key:`),
158    /// the key content is prepended as a synthetic line with the key's column as its
159    /// indent.  This field records the indent of the ORIGINAL physical line so that
160    /// `handle_mapping_entry` can open the mapping at the correct (original) indent
161    /// rather than the synthetic line's offset.
162    property_origin_indent: Option<usize>,
163}
164
165impl EventIter<'_> {
166    /// Current combined collection depth (sequences + mappings).
167    const fn collection_depth(&self) -> usize {
168        self.coll_stack.len()
169    }
170}
171
172/// Build an empty plain scalar event.
173pub(crate) const fn empty_scalar_event<'input>() -> Event<'input> {
174    Event::Scalar {
175        value: std::borrow::Cow::Borrowed(""),
176        style: ScalarStyle::Plain,
177        meta: None,
178    }
179}
180
181/// Build a span that covers exactly the 3-byte document marker at `marker_pos`.
182#[expect(
183    clippy::cast_possible_truncation,
184    reason = "YAML files <= 4 GB; u32 offset is sufficient"
185)]
186pub(crate) const fn marker_span(marker_pos: Pos) -> Span {
187    Span {
188        start: marker_pos.byte_offset as u32,
189        end: (marker_pos.byte_offset + 3) as u32,
190    }
191}
192
193/// Build a zero-width span at `pos`.
194#[expect(
195    clippy::cast_possible_truncation,
196    reason = "YAML files <= 4 GB; u32 offset is sufficient"
197)]
198pub(crate) const fn zero_span(pos: Pos) -> Span {
199    let offset = pos.byte_offset as u32;
200    Span {
201        start: offset,
202        end: offset,
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    // ES-1: the event built by `empty_scalar_event` must carry `meta: None`.
    #[test]
    fn empty_scalar_event_has_no_meta() {
        assert!(
            matches!(empty_scalar_event(), Event::Scalar { meta: None, .. }),
            "empty_scalar_event must produce meta: None, not an allocated EventMeta box"
        );
    }
}