Skip to main content

panache_parser/parser/yaml/
scanner.rs

1//! Streaming, char-by-char YAML scanner (libyaml/PyYAML-style).
2//!
3//! Replaces the line-based `lexer.rs` once parity is reached. The plan
4//! and resolved design decisions live in
5//! `.claude/skills/yaml-shadow-expand/scanner-rewrite.md`.
6//!
7//! Currently implements: trivia, document markers, directives, flow
8//! indicators, block indicators (`-`/`?`/`:`) with the simple-key
9//! table, plain scalars (with internal whitespace and multi-line
10//! continuation), quoted scalars (`'…'`, `"…"`) with escape
11//! diagnostics, and block scalars (`|` literal, `>` folded). Anchors,
12//! tags, and aliases land alongside the parser cutover (step 12).
13
14// No production callers yet — the line-based lexer remains the live
15// path until step 12. Remove once the scanner is wired into parsing.
16#![allow(dead_code)]
17
18use std::collections::VecDeque;
19
20use super::model::{YamlDiagnostic, diagnostic_codes};
21
/// A location in the scanned input.
///
/// `line` and `column` are 0-indexed, following the PyYAML / libyaml
/// convention. The derived `Default` is the all-zero mark, i.e. the very
/// start of the input.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct Mark {
    /// Byte offset into the input string (used to slice the input and as
    /// the `byte_start` / `byte_end` of diagnostics).
    pub index: usize,
    /// 0-indexed line number.
    pub line: usize,
    /// 0-indexed column within `line`.
    pub column: usize,
}
30
31/// A simple-key candidate awaiting confirmation by a downstream `:`.
32///
33/// `token_number` records the non-trivia token count at the moment the
34/// candidate was registered, so the parser can splice
35/// `BlockMappingStart` / `FlowMappingStart` before the candidate when
36/// the `:` arrives.
37#[derive(Debug, Clone, Copy, PartialEq, Eq)]
38pub(crate) struct SimpleKey {
39    pub token_number: usize,
40    pub required: bool,
41    pub mark: Mark,
42}
43
/// How a scalar was written in the source.
///
/// The scanner only tags the style and emits the raw source span;
/// folding and escape decoding are left to the projection layer.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ScalarStyle {
    /// Unquoted scalar.
    Plain,
    /// `'…'` scalar.
    SingleQuoted,
    /// `"…"` scalar.
    DoubleQuoted,
    /// `|` block scalar.
    Literal,
    /// `>` block scalar.
    Folded,
}
54
/// Kinds of non-semantic input that are still kept in the token queue.
///
/// Keeping trivia as tokens lets the parser walk one stream instead of
/// re-scanning the input for the bytes between meaningful tokens.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum TriviaKind {
    /// Horizontal whitespace.
    Whitespace,
    /// A line break.
    Newline,
    /// A `#` comment.
    Comment,
}
63
64#[derive(Debug, Clone, Copy, PartialEq, Eq)]
65pub(crate) enum TokenKind {
66    StreamStart,
67    StreamEnd,
68    DocumentStart,
69    DocumentEnd,
70    Directive,
71    BlockSequenceStart,
72    BlockMappingStart,
73    BlockEnd,
74    FlowSequenceStart,
75    FlowSequenceEnd,
76    FlowMappingStart,
77    FlowMappingEnd,
78    BlockEntry,
79    FlowEntry,
80    Key,
81    Value,
82    Alias,
83    Anchor,
84    Tag,
85    Scalar(ScalarStyle),
86    Trivia(TriviaKind),
87}
88
89#[derive(Debug, Clone, Copy, PartialEq, Eq)]
90pub(crate) struct Token {
91    pub kind: TokenKind,
92    pub start: Mark,
93    pub end: Mark,
94}
95
/// Streaming scanner state: the input, the read position, the pending
/// token queue, and the indent / simple-key bookkeeping needed to emit
/// block-structure tokens.
#[derive(Debug)]
pub(crate) struct Scanner<'a> {
    /// The text being scanned.
    input: &'a str,
    /// Current read position in `input`.
    cursor: Mark,
    /// Tokens produced but not yet handed out by `next_token`. Tokens may
    /// be inserted mid-queue when a simple key is confirmed.
    tokens: VecDeque<Token>,
    /// Count of tokens that have been popped via `next_token`. Together
    /// with `tokens.len()` it gives the global index of the next token
    /// that will be added to the queue — the value `save_simple_key`
    /// records so `fetch_value` can splice `Key`/`BlockMappingStart`
    /// before the candidate even after intervening trivia is popped.
    tokens_taken: usize,
    /// Current block-context indent column. `-1` represents "before the
    /// first block container" and matches PyYAML's sentinel.
    indent: i32,
    /// Stack of prior `indent` values; popped during `unwind_indent`.
    indent_stack: Vec<i32>,
    /// Per-flow-level simple-key candidate slot. Index 0 is block
    /// context; each `[`/`{` pushes a new slot.
    simple_keys: Vec<Option<SimpleKey>>,
    /// Current flow nesting depth: incremented on `[`/`{`, decremented on
    /// `]`/`}` (never below 0). Zero means block context.
    flow_level: usize,
    /// Whether the next non-trivia token may register a simple-key
    /// candidate. Reset by indicators that close key candidacy
    /// (`fetch_value`, plain/quoted scalar emission) and reopened by
    /// indicators that re-enable it (`fetch_key`, `fetch_block_entry`,
    /// `fetch_flow_entry`, line breaks in block context).
    allow_simple_key: bool,
    /// Diagnostics collected while scanning; scanning continues after
    /// recording one.
    diagnostics: Vec<YamlDiagnostic>,
    /// Once true, `need_more_tokens` stops requesting further fetches.
    /// NOTE(review): presumably set by `fetch_stream_end` — confirm.
    stream_end_emitted: bool,
}
125
126impl<'a> Scanner<'a> {
127    pub(crate) fn new(input: &'a str) -> Self {
128        let mut scanner = Self {
129            input,
130            cursor: Mark::default(),
131            tokens: VecDeque::new(),
132            tokens_taken: 0,
133            indent: -1,
134            indent_stack: Vec::new(),
135            // Slot for the implicit block-context level (flow_level 0).
136            // Each flow open pushes another slot; flow close pops.
137            simple_keys: vec![None],
138            flow_level: 0,
139            allow_simple_key: true,
140            diagnostics: Vec::new(),
141            stream_end_emitted: false,
142        };
143        let mark = scanner.cursor;
144        scanner.tokens.push_back(Token {
145            kind: TokenKind::StreamStart,
146            start: mark,
147            end: mark,
148        });
149        scanner
150    }
151
152    pub(crate) fn next_token(&mut self) -> Option<Token> {
153        while self.need_more_tokens() {
154            self.fetch_more_tokens();
155        }
156        let tok = self.tokens.pop_front();
157        if tok.is_some() {
158            self.tokens_taken += 1;
159        }
160        tok
161    }
162
163    /// Should the caller fetch more tokens before popping the queue
164    /// head? True when the queue is empty (and the stream is still
165    /// open), or when the queue head is itself a registered simple-key
166    /// candidate that may still be spliced before. The latter is what
167    /// makes `Key` / `BlockMappingStart` splicing work — we keep
168    /// fetching past the candidate until either a `:` confirms it
169    /// (cancelling the slot) or a stale check expires it.
170    fn need_more_tokens(&mut self) -> bool {
171        if self.stream_end_emitted {
172            return false;
173        }
174        if self.tokens.is_empty() {
175            return true;
176        }
177        self.stale_simple_keys();
178        matches!(
179            self.next_possible_simple_key_index(),
180            Some(min) if min == self.tokens_taken
181        )
182    }
183
184    fn next_possible_simple_key_index(&self) -> Option<usize> {
185        self.simple_keys
186            .iter()
187            .filter_map(|slot| slot.as_ref().map(|k| k.token_number))
188            .min()
189    }
190
191    /// Drain trivia and one meaningful token into the queue. Called
192    /// repeatedly from `next_token` while `need_more_tokens` is true.
193    fn fetch_more_tokens(&mut self) {
194        self.scan_trivia();
195        self.stale_simple_keys();
196        self.unwind_indent(self.cursor.column as i32);
197        if self.at_eof() {
198            self.fetch_stream_end();
199            return;
200        }
201        // Document markers and directives only apply at column 0 in
202        // block context. Flow context (inside `[]` / `{}`) ignores them.
203        if self.flow_level == 0 && self.cursor.column == 0 {
204            if self.check_document_indicator(b"---") {
205                self.fetch_document_marker(TokenKind::DocumentStart);
206                return;
207            }
208            if self.check_document_indicator(b"...") {
209                self.fetch_document_marker(TokenKind::DocumentEnd);
210                return;
211            }
212            if self.peek_char() == Some('%') {
213                self.fetch_directive();
214                return;
215            }
216        }
217        match self.peek_char() {
218            Some('[') => {
219                self.fetch_flow_collection_start(TokenKind::FlowSequenceStart);
220                return;
221            }
222            Some('{') => {
223                self.fetch_flow_collection_start(TokenKind::FlowMappingStart);
224                return;
225            }
226            Some(']') => {
227                self.fetch_flow_collection_end(TokenKind::FlowSequenceEnd);
228                return;
229            }
230            Some('}') => {
231                self.fetch_flow_collection_end(TokenKind::FlowMappingEnd);
232                return;
233            }
234            Some(',') if self.flow_level > 0 => {
235                self.fetch_flow_entry();
236                return;
237            }
238            Some('-') if self.check_block_entry() => {
239                self.fetch_block_entry();
240                return;
241            }
242            Some('?') if self.check_key() => {
243                self.fetch_key();
244                return;
245            }
246            Some(':') if self.check_value() => {
247                self.fetch_value();
248                return;
249            }
250            Some('\'') => {
251                self.fetch_flow_scalar(ScalarStyle::SingleQuoted);
252                return;
253            }
254            Some('"') => {
255                self.fetch_flow_scalar(ScalarStyle::DoubleQuoted);
256                return;
257            }
258            Some('|') if self.flow_level == 0 => {
259                self.fetch_block_scalar(ScalarStyle::Literal);
260                return;
261            }
262            Some('>') if self.flow_level == 0 => {
263                self.fetch_block_scalar(ScalarStyle::Folded);
264                return;
265            }
266            Some('&') => {
267                self.fetch_anchor();
268                return;
269            }
270            Some('*') => {
271                self.fetch_alias();
272                return;
273            }
274            Some('!') => {
275                self.fetch_tag();
276                return;
277            }
278            _ => {}
279        }
280        // Default: anything else opens a plain scalar.
281        self.fetch_plain_scalar();
282    }
283
284    fn fetch_flow_collection_start(&mut self, kind: TokenKind) {
285        // Register the flow collection as a simple-key candidate at the
286        // current (outer) flow level. A subsequent `:` on the same line
287        // (e.g. `{x: y}: value`, `[a, b]: value`, `{[a,b]: c}`) splices
288        // a Key marker before the flow start — and a BlockMappingStart
289        // earlier when entering block context from level 0.
290        self.save_simple_key();
291        let start = self.cursor;
292        self.advance();
293        let end = self.cursor;
294        self.flow_level += 1;
295        // New nest: a flow scalar can immediately register as the
296        // inner level's simple-key candidate (e.g. `a` in `{a: b}`).
297        self.allow_simple_key = true;
298        self.simple_keys.push(None);
299        self.tokens.push_back(Token { kind, start, end });
300    }
301
302    fn fetch_flow_collection_end(&mut self, kind: TokenKind) {
303        let start = self.cursor;
304        self.advance();
305        let end = self.cursor;
306        if self.flow_level > 0 {
307            self.flow_level -= 1;
308            self.simple_keys.pop();
309        }
310        self.tokens.push_back(Token { kind, start, end });
311    }
312
313    fn fetch_flow_entry(&mut self) {
314        // `,` separates flow items. Subsequent entries can be implicit
315        // keys, so re-open candidacy and clear the current slot.
316        self.allow_simple_key = true;
317        self.remove_simple_key();
318        let start = self.cursor;
319        self.advance();
320        let end = self.cursor;
321        self.tokens.push_back(Token {
322            kind: TokenKind::FlowEntry,
323            start,
324            end,
325        });
326    }
327
328    fn fetch_block_entry(&mut self) {
329        if self.flow_level == 0 {
330            if !self.allow_simple_key {
331                self.push_diagnostic(
332                    diagnostic_codes::LEX_BLOCK_ENTRY_NOT_ALLOWED,
333                    "block sequence entry not allowed here",
334                );
335            }
336            if self.add_indent(self.cursor.column as i32) {
337                let mark = self.cursor;
338                self.tokens.push_back(Token {
339                    kind: TokenKind::BlockSequenceStart,
340                    start: mark,
341                    end: mark,
342                });
343            }
344        }
345        self.allow_simple_key = true;
346        self.remove_simple_key();
347        let start = self.cursor;
348        self.advance();
349        let end = self.cursor;
350        self.tokens.push_back(Token {
351            kind: TokenKind::BlockEntry,
352            start,
353            end,
354        });
355    }
356
357    fn fetch_key(&mut self) {
358        if self.flow_level == 0 {
359            if !self.allow_simple_key {
360                self.push_diagnostic(
361                    diagnostic_codes::LEX_KEY_INDICATOR_NOT_ALLOWED,
362                    "explicit key indicator not allowed here",
363                );
364            }
365            if self.add_indent(self.cursor.column as i32) {
366                let mark = self.cursor;
367                self.tokens.push_back(Token {
368                    kind: TokenKind::BlockMappingStart,
369                    start: mark,
370                    end: mark,
371                });
372            }
373        }
374        // After `?`, the next thing in block context can itself be an
375        // implicit key (the explicit-key path opens a fresh entry).
376        self.allow_simple_key = self.flow_level == 0;
377        self.remove_simple_key();
378        let start = self.cursor;
379        self.advance();
380        let end = self.cursor;
381        self.tokens.push_back(Token {
382            kind: TokenKind::Key,
383            start,
384            end,
385        });
386    }
387
388    fn fetch_value(&mut self) {
389        if let Some(key) = self.simple_keys[self.flow_level].take() {
390            // Implicit key confirmed: splice `Key` (and possibly
391            // `BlockMappingStart`) before the candidate token in the
392            // queue. Both go at the same queue index, with
393            // `BlockMappingStart` inserted last so it ends up first.
394            let queue_pos = key.token_number.saturating_sub(self.tokens_taken);
395            self.tokens.insert(
396                queue_pos,
397                Token {
398                    kind: TokenKind::Key,
399                    start: key.mark,
400                    end: key.mark,
401                },
402            );
403            if self.flow_level == 0 && self.add_indent(key.mark.column as i32) {
404                self.tokens.insert(
405                    queue_pos,
406                    Token {
407                        kind: TokenKind::BlockMappingStart,
408                        start: key.mark,
409                        end: key.mark,
410                    },
411                );
412            }
413            self.allow_simple_key = false;
414        } else {
415            // No candidate: explicit `:` (e.g. `? key\n: value`) or
416            // an empty-key shorthand. In block context this needs to
417            // be at a position where a fresh key could appear.
418            if self.flow_level == 0 {
419                if !self.allow_simple_key {
420                    self.push_diagnostic(
421                        diagnostic_codes::LEX_VALUE_INDICATOR_NOT_ALLOWED,
422                        "value indicator not allowed here",
423                    );
424                }
425                if self.add_indent(self.cursor.column as i32) {
426                    let mark = self.cursor;
427                    self.tokens.push_back(Token {
428                        kind: TokenKind::BlockMappingStart,
429                        start: mark,
430                        end: mark,
431                    });
432                }
433            }
434            self.allow_simple_key = self.flow_level == 0;
435            self.remove_simple_key();
436        }
437        let start = self.cursor;
438        self.advance();
439        let end = self.cursor;
440        self.tokens.push_back(Token {
441            kind: TokenKind::Value,
442            start,
443            end,
444        });
445    }
446
447    /// Plain scalar with internal whitespace and multi-line
448    /// continuation (YAML 1.2 §7.3.3). Each iteration reads a
449    /// non-whitespace "chunk", then peeks past trailing whitespace
450    /// and line breaks to decide whether the scalar continues. A
451    /// scalar terminates on:
452    /// - EOF or a `#` after whitespace (comment),
453    /// - dedent below `parent_indent + 1` after a line break,
454    /// - a column-0 document marker (`---` / `...`) on a continuation
455    ///   line, or a block indicator (`-`/`?`/`:` followed by EOL/space)
456    ///   at the head of a continuation line in block context,
457    /// - in flow context, a flow indicator (`,`/`[`/`]`/`{`/`}`/`?`).
458    ///
459    /// Trailing whitespace that does NOT lead to continuation is left
460    /// unconsumed so the next fetch can emit it as trivia.
461    fn fetch_plain_scalar(&mut self) {
462        self.save_simple_key();
463        self.allow_simple_key = false;
464        let start = self.cursor;
465        let min_indent = self.indent + 1;
466        // Bridge for absent tag tokenization: a plain scalar that begins
467        // with `!` is an emulation placeholder for a tag. Keep a following
468        // block-indicator line (`-`/`?`) separate so the projection can
469        // attach the placeholder to the collection that follows (e.g.
470        // J7PZ `--- !!omap\n- ...`). Genuine plain scalars instead fold
471        // such lines per libyaml (AB8U). `&`/`*` are dispatched separately
472        // by `fetch_anchor`/`fetch_alias` and never reach this path at
473        // start-of-token. Remove this guard once the scanner emits real
474        // tag tokens.
475        let placeholder = matches!(self.input[start.index..].chars().next(), Some('!'));
476        loop {
477            let chunk_start = self.cursor.index;
478            self.consume_plain_chunk();
479            if self.cursor.index == chunk_start {
480                break;
481            }
482            // Peek past inter-chunk whitespace and any line break to
483            // determine if the scalar continues. If not, rewind so
484            // the trailing whitespace becomes trivia.
485            let saved = self.cursor;
486            while matches!(self.peek_char(), Some(' ' | '\t')) {
487                self.advance();
488            }
489            match self.peek_char() {
490                None | Some('#') => {
491                    self.cursor = saved;
492                    break;
493                }
494                Some('\n' | '\r') => {
495                    if !self.try_consume_plain_line_break(min_indent, placeholder) {
496                        self.cursor = saved;
497                        break;
498                    }
499                }
500                Some(_) => {
501                    // Same-line continuation: the consumed spaces are
502                    // internal whitespace; keep going.
503                }
504            }
505        }
506        let end = self.cursor;
507        if start.index == end.index {
508            // Pathological: dispatch landed here on a char we can't
509            // consume (a stray `?`/`-`/`:` not followed by whitespace
510            // at EOF, etc.). Advance one codepoint so the loop makes
511            // progress.
512            self.advance();
513            let end = self.cursor;
514            self.tokens.push_back(Token {
515                kind: TokenKind::Scalar(ScalarStyle::Plain),
516                start,
517                end,
518            });
519            return;
520        }
521        self.tokens.push_back(Token {
522            kind: TokenKind::Scalar(ScalarStyle::Plain),
523            start,
524            end,
525        });
526    }
527
528    /// Consume one run of non-whitespace, non-special chars belonging
529    /// to a plain scalar. Stops at whitespace/break, at `: ` (value
530    /// indicator), and — in flow context — at `,`/`[`/`]`/`{`/`}`/`?`.
531    fn consume_plain_chunk(&mut self) {
532        loop {
533            match self.peek_char() {
534                None | Some('\n' | '\r' | ' ' | '\t') => break,
535                Some(':') => {
536                    let next = self.peek_at(1);
537                    if matches!(next, None | Some(' ' | '\t' | '\n' | '\r')) {
538                        break;
539                    }
540                    if self.flow_level > 0 && matches!(next, Some(',' | ']' | '}')) {
541                        break;
542                    }
543                    self.advance();
544                }
545                Some(',' | '[' | ']' | '{' | '}') if self.flow_level > 0 => break,
546                _ => {
547                    self.advance();
548                }
549            }
550        }
551    }
552
553    /// Try to consume a line break plus any blank lines and the
554    /// leading whitespace of the next non-empty line, leaving the
555    /// cursor at the next chunk if continuation is allowed. Returns
556    /// false (without modifying the cursor) if the scalar must
557    /// terminate at the line break. The caller is responsible for
558    /// rewinding to a saved cursor in that case.
559    fn try_consume_plain_line_break(&mut self, min_indent: i32, placeholder: bool) -> bool {
560        let saved = self.cursor;
561        self.consume_one_line_break();
562        loop {
563            while matches!(self.peek_char(), Some(' ' | '\t')) {
564                self.advance();
565            }
566            match self.peek_char() {
567                None => {
568                    self.cursor = saved;
569                    return false;
570                }
571                Some('\n' | '\r') => {
572                    self.consume_one_line_break();
573                    continue;
574                }
575                Some('#') => {
576                    self.cursor = saved;
577                    return false;
578                }
579                Some(_) => {
580                    let col = self.cursor.column as i32;
581                    if col < min_indent {
582                        self.cursor = saved;
583                        return false;
584                    }
585                    if self.flow_level == 0 {
586                        // Document marker at column 0 ends the scalar.
587                        if col == 0
588                            && (self.check_document_indicator(b"---")
589                                || self.check_document_indicator(b"..."))
590                        {
591                            self.cursor = saved;
592                            return false;
593                        }
594                        // A value indicator (`:` followed by EOL or
595                        // whitespace) at the head of the next line always
596                        // aborts the plain scalar: `consume_plain_chunk`
597                        // refuses to consume it, which would otherwise
598                        // leave the cursor stranded past the line break
599                        // with an empty chunk. `-`/`?` only abort for
600                        // anchor/tag/alias placeholders (see `placeholder`
601                        // above); for genuine plain scalars they fold in
602                        // as content per libyaml (yaml-test-suite AB8U).
603                        let aborts = if placeholder {
604                            matches!(self.peek_char(), Some('-' | '?' | ':'))
605                        } else {
606                            self.peek_char() == Some(':')
607                        };
608                        if aborts
609                            && matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
610                        {
611                            self.cursor = saved;
612                            return false;
613                        }
614                    } else if matches!(self.peek_char(), Some(',' | ']' | '}')) {
615                        // In flow context, a flow terminator/separator
616                        // at the head of the next line closes the
617                        // surrounding container — it doesn't continue
618                        // the scalar.
619                        self.cursor = saved;
620                        return false;
621                    }
622                    return true;
623                }
624            }
625        }
626    }
627
628    /// Quoted scalar (`'...'` or `"..."`). Both styles can span
629    /// multiple lines and can be implicit keys; the scanner emits the
630    /// raw source span and surfaces escape/termination diagnostics.
631    /// Cooking (escape decoding, line folding) is the projection
632    /// layer's job.
633    fn fetch_flow_scalar(&mut self, style: ScalarStyle) {
634        self.save_simple_key();
635        self.allow_simple_key = false;
636        let start = self.cursor;
637        let quote = match style {
638            ScalarStyle::SingleQuoted => '\'',
639            ScalarStyle::DoubleQuoted => '"',
640            _ => unreachable!("fetch_flow_scalar called with non-quoted style"),
641        };
642        // Opening quote.
643        self.advance();
644        let mut closed = false;
645        while let Some(c) = self.peek_char() {
646            if c == quote {
647                if style == ScalarStyle::SingleQuoted && self.peek_at(1) == Some('\'') {
648                    // `''` is a literal single quote inside a
649                    // single-quoted scalar — not a terminator.
650                    self.advance();
651                    self.advance();
652                    continue;
653                }
654                self.advance();
655                closed = true;
656                break;
657            }
658            if style == ScalarStyle::DoubleQuoted && c == '\\' {
659                self.advance();
660                self.consume_double_quoted_escape();
661                continue;
662            }
663            // Document markers at column 0 inside an unterminated
664            // quoted scalar abort the scalar (libyaml convention) so
665            // we don't swallow the next document. Bail out before
666            // consuming the marker.
667            if self.flow_level == 0
668                && self.cursor.column == 0
669                && (self.check_document_indicator(b"---") || self.check_document_indicator(b"..."))
670            {
671                break;
672            }
673            self.advance();
674        }
675        if !closed {
676            self.diagnostics.push(YamlDiagnostic {
677                code: diagnostic_codes::LEX_UNTERMINATED_QUOTED_SCALAR,
678                message: "unterminated quoted scalar",
679                byte_start: start.index,
680                byte_end: self.cursor.index,
681            });
682        }
683        let end = self.cursor;
684        self.tokens.push_back(Token {
685            kind: TokenKind::Scalar(style),
686            start,
687            end,
688        });
689    }
690
691    /// Consume one escape sequence inside a double-quoted scalar,
692    /// starting AFTER the introducing `\`. Recognised escapes follow
693    /// YAML 1.2 §5.7 (`\0`, `\a`, …, `\xHH`, `\uHHHH`, `\UHHHHHHHH`,
694    /// and `\<line-break>` for continuation). Unrecognised escapes
695    /// emit a diagnostic; the cursor still advances by one codepoint
696    /// to make progress.
697    fn consume_double_quoted_escape(&mut self) {
698        // The backslash is already past the cursor; record its index
699        // for diagnostic spans (one byte before).
700        let backslash_index = self.cursor.index.saturating_sub(1);
701        match self.peek_char() {
702            None => {
703                // EOF after backslash; the unterminated-scalar branch
704                // will fire.
705            }
706            Some('\n') => {
707                self.advance();
708            }
709            Some('\r') => {
710                self.advance();
711                if self.peek_char() == Some('\n') {
712                    self.advance();
713                }
714            }
715            Some('x') => {
716                self.advance();
717                self.consume_hex_digits(2, backslash_index);
718            }
719            Some('u') => {
720                self.advance();
721                self.consume_hex_digits(4, backslash_index);
722            }
723            Some('U') => {
724                self.advance();
725                self.consume_hex_digits(8, backslash_index);
726            }
727            Some(c) if Self::is_double_quoted_single_byte_escape(c) => {
728                self.advance();
729            }
730            Some(_) => {
731                let invalid_end = self.cursor.index + self.peek_char().unwrap().len_utf8();
732                self.diagnostics.push(YamlDiagnostic {
733                    code: diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
734                    message: "invalid double-quoted escape",
735                    byte_start: backslash_index,
736                    byte_end: invalid_end,
737                });
738                self.advance();
739            }
740        }
741    }
742
743    fn consume_hex_digits(&mut self, count: usize, backslash_index: usize) {
744        let mut consumed = 0;
745        while consumed < count {
746            match self.peek_char() {
747                Some(c) if c.is_ascii_hexdigit() => {
748                    self.advance();
749                    consumed += 1;
750                }
751                _ => break,
752            }
753        }
754        if consumed < count {
755            self.diagnostics.push(YamlDiagnostic {
756                code: diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
757                message: "incomplete hex escape in double-quoted scalar",
758                byte_start: backslash_index,
759                byte_end: self.cursor.index,
760            });
761        }
762    }
763
764    fn is_double_quoted_single_byte_escape(c: char) -> bool {
765        // YAML 1.2 §5.7 escape characters that take no payload.
766        matches!(
767            c,
768            '0' | 'a'
769                | 'b'
770                | 't'
771                | '\t'
772                | 'n'
773                | 'v'
774                | 'f'
775                | 'r'
776                | 'e'
777                | ' '
778                | '"'
779                | '/'
780                | '\\'
781                | 'N'
782                | '_'
783                | 'L'
784                | 'P'
785        )
786    }
787
788    /// Block scalar (`|` literal, `>` folded). The header is `|`/`>`
789    /// optionally followed by an indent indicator (`1`–`9`) and/or a
790    /// chomping indicator (`+`/`-`), then trailing spaces/comment, then
791    /// a line break. Content lines whose indentation falls below the
792    /// resolved minimum terminate the scalar — at which point the
793    /// cursor is left at the start of the dedented line so the main
794    /// loop can pick up the next token.
795    ///
796    /// As with quoted scalars, the source span is emitted raw; folding
797    /// and chomping live in projection.
798    fn fetch_block_scalar(&mut self, style: ScalarStyle) {
799        // Block scalars are values, not keys, so they don't register
800        // a simple-key candidate; but they DO close any pending
801        // candidate at the current level (e.g. `key: |` confirms `key`
802        // as the candidate before we get here).
803        self.allow_simple_key = true;
804        self.remove_simple_key();
805        let start = self.cursor;
806        let parent_indent = self.indent;
807        // Header indicator (`|` or `>`).
808        self.advance();
809        // Optional indent + chomping indicators (in either order).
810        let mut explicit_increment: Option<u32> = None;
811        for _ in 0..2 {
812            match self.peek_char() {
813                Some('+' | '-') => {
814                    self.advance();
815                }
816                Some(d @ '1'..='9') if explicit_increment.is_none() => {
817                    explicit_increment = Some(d.to_digit(10).expect("hex digit"));
818                    self.advance();
819                }
820                _ => break,
821            }
822        }
823        // Header trailing whitespace.
824        while matches!(self.peek_char(), Some(' ' | '\t')) {
825            self.advance();
826        }
827        // Optional trailing comment on the header line.
828        if self.peek_char() == Some('#') {
829            while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
830                self.advance();
831            }
832        }
833        // The header must end at a line break (or EOF, for an empty
834        // body). Non-blank trailing content is malformed; libyaml
835        // diagnoses but we just consume to end-of-line for resilience.
836        match self.peek_char() {
837            Some('\n') => {
838                self.advance();
839            }
840            Some('\r') => {
841                self.advance();
842                if self.peek_char() == Some('\n') {
843                    self.advance();
844                }
845            }
846            None => {
847                // Empty body at EOF.
848                let end = self.cursor;
849                self.tokens.push_back(Token {
850                    kind: TokenKind::Scalar(style),
851                    start,
852                    end,
853                });
854                return;
855            }
856            Some(_) => {
857                // Trailing junk on header — skip to end of line.
858                while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
859                    self.advance();
860                }
861                match self.peek_char() {
862                    Some('\n') => {
863                        self.advance();
864                    }
865                    Some('\r') => {
866                        self.advance();
867                        if self.peek_char() == Some('\n') {
868                            self.advance();
869                        }
870                    }
871                    _ => {}
872                }
873            }
874        }
875        // Determine the minimum content indent. Per YAML 1.2 §8.1.1.1
876        // content indent must be strictly greater than the parent's
877        // indent. At doc root parent_indent = -1, so column-0 content
878        // is permitted (floor = 0). Otherwise the floor is parent+1.
879        // An explicit indicator m gives content indent max(parent,0)+m.
880        let base = parent_indent.max(0);
881        let auto_floor = (parent_indent + 1).max(0);
882        let min_indent = match explicit_increment {
883            Some(m) => base + m as i32,
884            None => self
885                .auto_detect_block_scalar_indent()
886                .unwrap_or(auto_floor)
887                .max(auto_floor),
888        };
889        // Walk content lines via lookahead so a dedented line stays
890        // unconsumed and the main fetch loop sees it.
891        loop {
892            let line_start = self.cursor.index;
893            let bytes = self.input.as_bytes();
894            let mut probe = line_start;
895            while bytes.get(probe) == Some(&b' ') {
896                probe += 1;
897            }
898            let leading_spaces = probe - line_start;
899            match bytes.get(probe) {
900                None => break,
901                Some(b'\n' | b'\r') => {
902                    // Blank line — entirely whitespace. Consume the
903                    // spaces and the line break as content.
904                    while self.cursor.index < probe {
905                        self.advance();
906                    }
907                    self.consume_one_line_break();
908                    continue;
909                }
910                _ => {}
911            }
912            if (leading_spaces as i32) < min_indent {
913                // Dedent below content — terminate without consuming.
914                break;
915            }
916            if leading_spaces == 0
917                && (bytes.get(probe..probe + 3) == Some(b"---")
918                    || bytes.get(probe..probe + 3) == Some(b"..."))
919                && matches!(
920                    bytes.get(probe + 3),
921                    None | Some(b' ' | b'\t' | b'\n' | b'\r')
922                )
923            {
924                // Document marker terminates the scalar.
925                break;
926            }
927            // Consume the rest of the line as content.
928            while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
929                self.advance();
930            }
931            self.consume_one_line_break();
932            if self.at_eof() {
933                break;
934            }
935        }
936        let end = self.cursor;
937        self.tokens.push_back(Token {
938            kind: TokenKind::Scalar(style),
939            start,
940            end,
941        });
942    }
943
944    /// Look ahead through blank lines to find the first non-blank
945    /// content line, returning its leading-space count. Pure peek;
946    /// the cursor does not move.
947    fn auto_detect_block_scalar_indent(&self) -> Option<i32> {
948        let bytes = self.input.as_bytes();
949        let mut i = self.cursor.index;
950        while i < bytes.len() {
951            let line_start = i;
952            while bytes.get(i) == Some(&b' ') {
953                i += 1;
954            }
955            match bytes.get(i) {
956                None => return None,
957                Some(b'\n') => {
958                    i += 1;
959                    continue;
960                }
961                Some(b'\r') => {
962                    i += 1;
963                    if bytes.get(i) == Some(&b'\n') {
964                        i += 1;
965                    }
966                    continue;
967                }
968                _ => {
969                    return Some((i - line_start) as i32);
970                }
971            }
972        }
973        None
974    }
975
976    fn consume_one_line_break(&mut self) {
977        match self.peek_char() {
978            Some('\n') => {
979                self.advance();
980            }
981            Some('\r') => {
982                self.advance();
983                if self.peek_char() == Some('\n') {
984                    self.advance();
985                }
986            }
987            _ => {}
988        }
989    }
990
991    fn fetch_stream_end(&mut self) {
992        if self.stream_end_emitted {
993            return;
994        }
995        self.unwind_indent(-1);
996        // Drain any pending simple-key candidates. Required candidates
997        // that never met a `:` are diagnosed; non-required ones are
998        // dropped silently.
999        for slot in self.simple_keys.iter_mut() {
1000            if let Some(key) = slot.take()
1001                && key.required
1002            {
1003                self.diagnostics.push(YamlDiagnostic {
1004                    code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1005                    message: "could not find expected ':' for required simple key",
1006                    byte_start: key.mark.index,
1007                    byte_end: key.mark.index,
1008                });
1009            }
1010        }
1011        self.allow_simple_key = false;
1012        self.stream_end_emitted = true;
1013        let mark = self.cursor;
1014        self.tokens.push_back(Token {
1015            kind: TokenKind::StreamEnd,
1016            start: mark,
1017            end: mark,
1018        });
1019    }
1020
1021    fn check_block_entry(&self) -> bool {
1022        matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1023    }
1024
1025    /// `?` opens an explicit key only when followed by whitespace,
1026    /// end-of-input, or end-of-line — in both block and flow context.
1027    /// A `?` that's followed by any other character is plain-scalar
1028    /// text (e.g. `value?`, `another ? string`, `?key`). yaml-test-suite
1029    /// JR7V pins this for flow context; libyaml `check_key` agrees.
1030    fn check_key(&self) -> bool {
1031        matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1032    }
1033
1034    /// `:` is a value indicator in the same conditions as `?`. In flow
1035    /// context it's always structural; in block context only when
1036    /// followed by whitespace/EOL (otherwise it's part of a plain
1037    /// scalar like `https://example.com`).
1038    fn check_value(&self) -> bool {
1039        if self.flow_level > 0 {
1040            return true;
1041        }
1042        matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1043    }
1044
1045    /// Push a new indent level if `column` exceeds the current one.
1046    /// Returns true if the level was newly opened, signalling the
1047    /// caller should emit a `BlockSequenceStart` / `BlockMappingStart`.
1048    fn add_indent(&mut self, column: i32) -> bool {
1049        if self.indent < column {
1050            self.indent_stack.push(self.indent);
1051            self.indent = column;
1052            true
1053        } else {
1054            false
1055        }
1056    }
1057
1058    /// Pop indent levels above `column`, emitting `BlockEnd` for each.
1059    /// Flow context never owns indent levels, so this is a no-op there.
1060    fn unwind_indent(&mut self, column: i32) {
1061        if self.flow_level > 0 {
1062            return;
1063        }
1064        while self.indent > column {
1065            let mark = self.cursor;
1066            self.indent = self.indent_stack.pop().unwrap_or(-1);
1067            self.tokens.push_back(Token {
1068                kind: TokenKind::BlockEnd,
1069                start: mark,
1070                end: mark,
1071            });
1072        }
1073    }
1074
1075    /// Tentatively register a simple-key candidate at the current flow
1076    /// level. The candidate's `token_number` is the global index where
1077    /// the next token will be appended — i.e. the scalar/anchor that
1078    /// triggered registration. A subsequent `:` confirms the candidate
1079    /// (splicing `Key` before that token); a line break or required
1080    /// expiration cancels it.
1081    fn save_simple_key(&mut self) {
1082        if !self.allow_simple_key {
1083            return;
1084        }
1085        let required = self.flow_level == 0 && self.indent == self.cursor.column as i32;
1086        self.remove_simple_key();
1087        let token_number = self.tokens_taken + self.tokens.len();
1088        self.simple_keys[self.flow_level] = Some(SimpleKey {
1089            token_number,
1090            required,
1091            mark: self.cursor,
1092        });
1093    }
1094
1095    /// Cancel the simple-key candidate at the current flow level. If it
1096    /// was required, surface a diagnostic — required candidates that
1097    /// fail to confirm indicate malformed YAML (e.g. an indent change
1098    /// before the expected `:`).
1099    fn remove_simple_key(&mut self) {
1100        if let Some(key) = self.simple_keys[self.flow_level].take()
1101            && key.required
1102        {
1103            self.diagnostics.push(YamlDiagnostic {
1104                code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1105                message: "could not find expected ':' for required simple key",
1106                byte_start: key.mark.index,
1107                byte_end: key.mark.index,
1108            });
1109        }
1110    }
1111
1112    /// Expire candidates whose registration line lies behind the
1113    /// cursor — a simple key cannot span a line break. Required
1114    /// candidates that age out get a diagnostic; others are dropped
1115    /// silently.
1116    fn stale_simple_keys(&mut self) {
1117        let line = self.cursor.line;
1118        for slot in self.simple_keys.iter_mut() {
1119            let stale = match slot {
1120                Some(key) => key.mark.line != line,
1121                None => false,
1122            };
1123            if stale
1124                && let Some(key) = slot.take()
1125                && key.required
1126            {
1127                self.diagnostics.push(YamlDiagnostic {
1128                    code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1129                    message: "could not find expected ':' for required simple key",
1130                    byte_start: key.mark.index,
1131                    byte_end: key.mark.index,
1132                });
1133            }
1134        }
1135    }
1136
1137    fn push_diagnostic(&mut self, code: &'static str, message: &'static str) {
1138        self.diagnostics.push(YamlDiagnostic {
1139            code,
1140            message,
1141            byte_start: self.cursor.index,
1142            byte_end: self.cursor.index,
1143        });
1144    }
1145
1146    /// `---` / `...` are document markers only at column 0 followed by
1147    /// whitespace, newline, or end-of-input. `---abc` is a plain
1148    /// scalar, not a marker.
1149    fn check_document_indicator(&self, marker: &[u8; 3]) -> bool {
1150        let bytes = self.input.as_bytes();
1151        let i = self.cursor.index;
1152        if bytes.get(i..i + 3) != Some(marker.as_slice()) {
1153            return false;
1154        }
1155        matches!(bytes.get(i + 3), None | Some(b' ' | b'\t' | b'\n' | b'\r'))
1156    }
1157
1158    fn fetch_document_marker(&mut self, kind: TokenKind) {
1159        // A document marker terminates the previous document's block
1160        // structure: any indent levels held by an open block map or
1161        // sequence must close before the marker so the next document
1162        // starts from a clean indent stack. Without this, a
1163        // multi-document stream where doc N closed at column 0 leaves
1164        // `self.indent == 0`, which prevents `add_indent(0)` from
1165        // emitting a fresh `BlockMappingStart` / `BlockSequenceStart`
1166        // for doc N+1's body — its content lands at document level
1167        // instead of inside a container. Mirrors libyaml/PyYAML's
1168        // `fetch_document_indicator`.
1169        self.unwind_indent(-1);
1170        self.remove_simple_key();
1171        self.allow_simple_key = false;
1172        let start = self.cursor;
1173        self.advance();
1174        self.advance();
1175        self.advance();
1176        let end = self.cursor;
1177        self.tokens.push_back(Token { kind, start, end });
1178    }
1179
1180    /// A directive is `%name args` running to end-of-line. Trailing
1181    /// whitespace/comment/newline emit as separate trivia on the next
1182    /// fetch.
1183    fn fetch_directive(&mut self) {
1184        let start = self.cursor;
1185        debug_assert_eq!(self.peek_char(), Some('%'));
1186        self.advance();
1187        while let Some(c) = self.peek_char() {
1188            if c == '\n' || c == '\r' {
1189                break;
1190            }
1191            self.advance();
1192        }
1193        let end = self.cursor;
1194        self.tokens.push_back(Token {
1195            kind: TokenKind::Directive,
1196            start,
1197            end,
1198        });
1199    }
1200
1201    /// Anchor (`&name`) at start-of-token. Anchors can occupy the
1202    /// implicit-key slot (e.g. `&b *alias : value` in SU74), so we
1203    /// `save_simple_key` first, then close key candidacy for this
1204    /// token.
1205    fn fetch_anchor(&mut self) {
1206        self.save_simple_key();
1207        self.allow_simple_key = false;
1208        let start = self.cursor;
1209        debug_assert_eq!(self.peek_char(), Some('&'));
1210        self.advance();
1211        self.scan_anchor_name();
1212        let end = self.cursor;
1213        self.tokens.push_back(Token {
1214            kind: TokenKind::Anchor,
1215            start,
1216            end,
1217        });
1218    }
1219
1220    /// Alias (`*name`) at start-of-token. Like an anchor, an alias can
1221    /// be the implicit-key slot's candidate.
1222    fn fetch_alias(&mut self) {
1223        self.save_simple_key();
1224        self.allow_simple_key = false;
1225        let start = self.cursor;
1226        debug_assert_eq!(self.peek_char(), Some('*'));
1227        self.advance();
1228        self.scan_anchor_name();
1229        let end = self.cursor;
1230        self.tokens.push_back(Token {
1231            kind: TokenKind::Alias,
1232            start,
1233            end,
1234        });
1235    }
1236
1237    /// Tag (`!handle suffix`, `!!type`, or `!<verbatim>`) at start-of-token.
1238    /// Tags annotate the *next* node, so they're emitted as a separate
1239    /// token (decoration) and `parser_v2` carries them through without
1240    /// closing the implicit-key candidate slot. Like an anchor, a tag
1241    /// can occupy the implicit-key position (e.g. `!!str key: value`).
1242    fn fetch_tag(&mut self) {
1243        self.save_simple_key();
1244        self.allow_simple_key = false;
1245        let start = self.cursor;
1246        debug_assert_eq!(self.peek_char(), Some('!'));
1247        self.advance();
1248        if self.peek_char() == Some('<') {
1249            // Verbatim form `!<uri>`: consume up to and including the
1250            // closing `>`. We don't validate the URI body — projection
1251            // and downstream tools surface tag-shape diagnostics.
1252            self.advance();
1253            while let Some(c) = self.peek_char() {
1254                self.advance();
1255                if c == '>' {
1256                    break;
1257                }
1258            }
1259        } else {
1260            // Handle + suffix: any non-whitespace, non-flow-indicator
1261            // char. Relaxed (libyaml/PyYAML) name class so suffix chars
1262            // like `:`, `/`, `!`, `%` all land inside the tag token.
1263            while let Some(c) = self.peek_char() {
1264                match c {
1265                    ' ' | '\t' | '\n' | '\r' => break,
1266                    ',' | '[' | ']' | '{' | '}' if self.flow_level > 0 => break,
1267                    _ => {
1268                        self.advance();
1269                    }
1270                }
1271            }
1272        }
1273        let end = self.cursor;
1274        self.tokens.push_back(Token {
1275            kind: TokenKind::Tag,
1276            start,
1277            end,
1278        });
1279    }
1280
1281    /// Consume an anchor/alias name. Relaxed (libyaml/PyYAML) name
1282    /// class: any non-whitespace, non-flow-indicator codepoint. This
1283    /// lets `&a:` in 2SXE land as an anchor with name `a:` rather than
1284    /// splitting on the colon.
1285    fn scan_anchor_name(&mut self) {
1286        while let Some(c) = self.peek_char() {
1287            match c {
1288                ' ' | '\t' | '\n' | '\r' => break,
1289                ',' | '[' | ']' | '{' | '}' => break,
1290                _ => {
1291                    self.advance();
1292                }
1293            }
1294        }
1295    }
1296
1297    /// Consume runs of whitespace, newlines, and comments, emitting
1298    /// one `Trivia` token per run. Stops at the first meaningful char
1299    /// or EOF.
1300    fn scan_trivia(&mut self) {
1301        while !self.at_eof() {
1302            match self.peek_char() {
1303                Some(' ' | '\t') => self.scan_whitespace_run(),
1304                Some('\n' | '\r') => self.scan_newline(),
1305                Some('#') => self.scan_comment(),
1306                _ => break,
1307            }
1308        }
1309    }
1310
1311    fn scan_whitespace_run(&mut self) {
1312        let start = self.cursor;
1313        while matches!(self.peek_char(), Some(' ' | '\t')) {
1314            self.advance();
1315        }
1316        let end = self.cursor;
1317        self.tokens.push_back(Token {
1318            kind: TokenKind::Trivia(TriviaKind::Whitespace),
1319            start,
1320            end,
1321        });
1322    }
1323
1324    fn scan_newline(&mut self) {
1325        let start = self.cursor;
1326        match self.peek_char() {
1327            Some('\n') => {
1328                self.advance();
1329            }
1330            Some('\r') => {
1331                self.advance();
1332                if self.peek_char() == Some('\n') {
1333                    self.advance();
1334                }
1335            }
1336            _ => unreachable!("scan_newline called on non-newline char"),
1337        }
1338        let end = self.cursor;
1339        // Line breaks in block context re-open simple-key candidacy:
1340        // the next non-trivia token starts a fresh line and may be a
1341        // key. Flow context ignores indentation, so candidacy is
1342        // governed by `,`/`[`/`{` instead.
1343        if self.flow_level == 0 {
1344            self.allow_simple_key = true;
1345        }
1346        self.tokens.push_back(Token {
1347            kind: TokenKind::Trivia(TriviaKind::Newline),
1348            start,
1349            end,
1350        });
1351    }
1352
1353    fn scan_comment(&mut self) {
1354        let start = self.cursor;
1355        debug_assert_eq!(self.peek_char(), Some('#'));
1356        self.advance();
1357        while let Some(c) = self.peek_char() {
1358            if c == '\n' || c == '\r' {
1359                break;
1360            }
1361            self.advance();
1362        }
1363        let end = self.cursor;
1364        self.tokens.push_back(Token {
1365            kind: TokenKind::Trivia(TriviaKind::Comment),
1366            start,
1367            end,
1368        });
1369    }
1370
1371    pub(crate) fn diagnostics(&self) -> &[YamlDiagnostic] {
1372        &self.diagnostics
1373    }
1374
    /// Current scan position, returned by value (`Mark` is `Copy`).
    pub(crate) fn cursor(&self) -> Mark {
        self.cursor
    }
1378
1379    pub(crate) fn at_eof(&self) -> bool {
1380        self.cursor.index >= self.input.len()
1381    }
1382
    /// The not-yet-consumed tail of the input, starting at the
    /// cursor's byte index.
    fn remaining(&self) -> &str {
        &self.input[self.cursor.index..]
    }
1386
1387    pub(crate) fn peek_char(&self) -> Option<char> {
1388        self.remaining().chars().next()
1389    }
1390
    /// Look ahead `offset` codepoints from the cursor. `offset == 0`
    /// returns the same as `peek_char`. Returns `None` when fewer than
    /// `offset + 1` codepoints remain. The cursor does not move.
    pub(crate) fn peek_at(&self, offset: usize) -> Option<char> {
        self.remaining().chars().nth(offset)
    }
1396
1397    /// Consume one codepoint and advance the cursor. Line/column
1398    /// tracking treats `\n`, `\r\n`, and lone `\r` each as one logical
1399    /// line break (YAML 1.2 §5.4).
1400    pub(crate) fn advance(&mut self) -> Option<char> {
1401        let c = self.peek_char()?;
1402        self.cursor.index += c.len_utf8();
1403        match c {
1404            '\n' => {
1405                self.cursor.line += 1;
1406                self.cursor.column = 0;
1407            }
1408            '\r' => {
1409                // CRLF: defer the line break to the following '\n' so
1410                // each byte updates the cursor exactly once. Lone '\r'
1411                // takes the line break itself.
1412                if self.peek_char() != Some('\n') {
1413                    self.cursor.line += 1;
1414                    self.cursor.column = 0;
1415                }
1416            }
1417            _ => {
1418                self.cursor.column += 1;
1419            }
1420        }
1421        Some(c)
1422    }
1423}
1424
/// Byte-completeness report from running the streaming scanner over an
/// input. Used by the integration harness to gate the cutover (step 12)
/// — until every allowlisted fixture is covered byte-completely with no
/// overlaps or gaps, the new scanner cannot replace the line-based
/// lexer.
#[derive(Debug, Clone)]
pub struct ShadowScannerReport {
    /// True when token spans cover the entire input contiguously and
    /// no two non-synthetic tokens overlap.
    pub byte_complete: bool,
    /// Total tokens emitted (including trivia and stream markers).
    pub token_count: usize,
    /// Diagnostic codes emitted during scanning, in order.
    pub diagnostic_codes: Vec<&'static str>,
    /// Highest end-index reached across non-synthetic tokens.
    pub last_token_end: usize,
    /// Length of the scanned input, in bytes.
    pub input_len: usize,
    /// First byte index where coverage is missing, if any.
    pub gap_at: Option<usize>,
    /// True if any non-synthetic token's start index is below the
    /// preceding token's end (a regression in the splice/queue logic).
    pub overlapping: bool,
}
1448
1449/// Drive the streaming scanner to completion over `input` and return a
1450/// byte-completeness report. This is exposed so the integration harness
1451/// in `tests/yaml.rs` can run the scanner over every allowlisted
1452/// fixture without depending on internal `Token`/`Scanner` types.
1453pub fn shadow_scanner_check(input: &str) -> ShadowScannerReport {
1454    let mut scanner = Scanner::new(input);
1455    let mut tokens = Vec::new();
1456    while let Some(tok) = scanner.next_token() {
1457        tokens.push(tok);
1458    }
1459    let mut cursor = 0usize;
1460    let mut overlapping = false;
1461    let mut gap_at: Option<usize> = None;
1462    for tok in &tokens {
1463        match tok.kind {
1464            TokenKind::StreamStart | TokenKind::StreamEnd => {}
1465            _ => {
1466                if tok.start.index < cursor {
1467                    overlapping = true;
1468                } else if tok.start.index > cursor && gap_at.is_none() {
1469                    gap_at = Some(cursor);
1470                }
1471                if tok.end.index > cursor {
1472                    cursor = tok.end.index;
1473                }
1474            }
1475        }
1476    }
1477    let byte_complete = !overlapping && gap_at.is_none() && cursor == input.len();
1478    ShadowScannerReport {
1479        byte_complete,
1480        token_count: tokens.len(),
1481        diagnostic_codes: scanner.diagnostics.iter().map(|d| d.code).collect(),
1482        last_token_end: cursor,
1483        input_len: input.len(),
1484        gap_at,
1485        overlapping,
1486    }
1487}
1488
1489#[cfg(test)]
1490mod tests {
1491    use super::*;
1492
1493    #[test]
1494    fn empty_input_emits_stream_start_then_stream_end() {
1495        let mut scanner = Scanner::new("");
1496        assert_eq!(
1497            scanner.next_token().map(|t| t.kind),
1498            Some(TokenKind::StreamStart)
1499        );
1500        assert_eq!(
1501            scanner.next_token().map(|t| t.kind),
1502            Some(TokenKind::StreamEnd)
1503        );
1504        assert_eq!(scanner.next_token(), None);
1505    }
1506
1507    #[test]
1508    fn first_and_last_tokens_are_always_stream_markers() {
1509        let mut scanner = Scanner::new("foo: bar\n");
1510        assert_eq!(
1511            scanner.next_token().map(|t| t.kind),
1512            Some(TokenKind::StreamStart)
1513        );
1514        let mut last = None;
1515        while let Some(tok) = scanner.next_token() {
1516            last = Some(tok);
1517        }
1518        assert_eq!(last.map(|t| t.kind), Some(TokenKind::StreamEnd));
1519    }
1520
1521    #[test]
1522    fn stream_end_marks_cursor_position_after_trivia_only_input() {
1523        let input = "   \n";
1524        let mut scanner = Scanner::new(input);
1525        // StreamStart, Whitespace, Newline, StreamEnd
1526        let mut last = None;
1527        while let Some(tok) = scanner.next_token() {
1528            last = Some(tok);
1529        }
1530        let end = last.expect("stream end");
1531        assert_eq!(end.kind, TokenKind::StreamEnd);
1532        assert_eq!(end.start.index, input.len());
1533        assert_eq!(end.end.index, input.len());
1534    }
1535
1536    #[test]
1537    fn diagnostics_start_empty() {
1538        let scanner = Scanner::new("");
1539        assert!(scanner.diagnostics().is_empty());
1540    }
1541
1542    #[test]
1543    fn cursor_starts_at_origin() {
1544        let scanner = Scanner::new("anything");
1545        assert_eq!(
1546            scanner.cursor(),
1547            Mark {
1548                index: 0,
1549                line: 0,
1550                column: 0
1551            }
1552        );
1553    }
1554
1555    #[test]
1556    fn at_eof_is_true_for_empty_input() {
1557        let scanner = Scanner::new("");
1558        assert!(scanner.at_eof());
1559        assert_eq!(scanner.peek_char(), None);
1560    }
1561
1562    #[test]
1563    fn peek_does_not_advance_cursor() {
1564        let scanner = Scanner::new("abc");
1565        assert_eq!(scanner.peek_char(), Some('a'));
1566        assert_eq!(scanner.peek_at(1), Some('b'));
1567        assert_eq!(scanner.peek_at(2), Some('c'));
1568        assert_eq!(scanner.peek_at(3), None);
1569        assert_eq!(scanner.cursor().index, 0);
1570    }
1571
1572    #[test]
1573    fn advance_moves_through_ascii_one_column_per_char() {
1574        let mut scanner = Scanner::new("abc");
1575        assert_eq!(scanner.advance(), Some('a'));
1576        assert_eq!(
1577            scanner.cursor(),
1578            Mark {
1579                index: 1,
1580                line: 0,
1581                column: 1
1582            }
1583        );
1584        assert_eq!(scanner.advance(), Some('b'));
1585        assert_eq!(
1586            scanner.cursor(),
1587            Mark {
1588                index: 2,
1589                line: 0,
1590                column: 2
1591            }
1592        );
1593        assert_eq!(scanner.advance(), Some('c'));
1594        assert_eq!(
1595            scanner.cursor(),
1596            Mark {
1597                index: 3,
1598                line: 0,
1599                column: 3
1600            }
1601        );
1602        assert_eq!(scanner.advance(), None);
1603        assert!(scanner.at_eof());
1604    }
1605
1606    #[test]
1607    fn lf_increments_line_and_resets_column() {
1608        let mut scanner = Scanner::new("a\nb");
1609        scanner.advance(); // 'a'
1610        scanner.advance(); // '\n'
1611        assert_eq!(
1612            scanner.cursor(),
1613            Mark {
1614                index: 2,
1615                line: 1,
1616                column: 0
1617            }
1618        );
1619        scanner.advance(); // 'b'
1620        assert_eq!(
1621            scanner.cursor(),
1622            Mark {
1623                index: 3,
1624                line: 1,
1625                column: 1
1626            }
1627        );
1628    }
1629
#[test]
fn crlf_counts_as_one_line_break() {
    let mut scanner = Scanner::new("a\r\nb");
    scanner.advance(); // `a`
    scanner.advance(); // `\r`: consumed, but the line bump is deferred to the `\n`
    let mid_break = scanner.cursor();
    assert_eq!(mid_break.line, 0);
    assert_eq!(mid_break.index, 2);
    // The `\n` completes the CRLF pair: exactly one line increment.
    scanner.advance();
    let line_start = Mark {
        index: 3,
        line: 1,
        column: 0,
    };
    assert_eq!(scanner.cursor(), line_start);
    scanner.advance(); // `b`
    let after_b = Mark {
        index: 4,
        line: 1,
        column: 1,
    };
    assert_eq!(scanner.cursor(), after_b);
}
1656
#[test]
fn lone_cr_takes_its_own_line_break() {
    let mut scanner = Scanner::new("a\rb");
    // Consume `a`, then a `\r` that has no `\n` behind it.
    for _ in 0..2 {
        scanner.advance();
    }
    // The bare CR is expected to count as a full line break on its own.
    let line_start = Mark {
        index: 2,
        line: 1,
        column: 0,
    };
    assert_eq!(scanner.cursor(), line_start);
    scanner.advance(); // `b`
    let after_b = Mark {
        index: 3,
        line: 1,
        column: 1,
    };
    assert_eq!(scanner.cursor(), after_b);
}
1680
#[test]
fn multibyte_utf8_advances_index_by_byte_length_and_column_by_one() {
    // `é` encodes as two UTF-8 bytes (0xC3 0xA9) but is a single codepoint,
    // so `index` should jump by 2 while `column` moves by 1.
    let mut scanner = Scanner::new("é!");
    scanner.advance();
    let after_e_acute = Mark {
        index: 2,
        line: 0,
        column: 1,
    };
    assert_eq!(scanner.cursor(), after_e_acute);
    // `!` is one byte, one column.
    scanner.advance();
    let after_bang = Mark {
        index: 3,
        line: 0,
        column: 2,
    };
    assert_eq!(scanner.cursor(), after_bang);
}
1704
#[test]
fn mixed_line_endings_track_correctly() {
    // One LF, one CRLF and one lone CR: three logical breaks in total.
    let mut scanner = Scanner::new("a\nb\r\nc\rd");
    loop {
        if scanner.advance().is_none() {
            break;
        }
    }
    let Mark {
        index,
        line,
        column,
    } = scanner.cursor();
    assert_eq!(line, 3);
    assert_eq!(column, 1);
    assert_eq!(index, 8);
}
1714
1715    fn collect_tokens(input: &str) -> Vec<Token> {
1716        let mut scanner = Scanner::new(input);
1717        let mut out = Vec::new();
1718        while let Some(tok) = scanner.next_token() {
1719            out.push(tok);
1720        }
1721        out
1722    }
1723
1724    fn trivia_kinds(tokens: &[Token]) -> Vec<TriviaKind> {
1725        tokens
1726            .iter()
1727            .filter_map(|t| match t.kind {
1728                TokenKind::Trivia(k) => Some(k),
1729                _ => None,
1730            })
1731            .collect()
1732    }
1733
1734    fn assert_byte_complete(input: &str, tokens: &[Token]) {
1735        // Synthetic StreamStart/StreamEnd carry zero-width spans; trivia
1736        // tokens between them must cover the full input contiguously.
1737        let mut cursor = 0usize;
1738        for tok in tokens {
1739            match tok.kind {
1740                TokenKind::StreamStart | TokenKind::StreamEnd => {
1741                    assert_eq!(tok.start.index, tok.end.index, "synthetic token has extent");
1742                }
1743                _ => {
1744                    assert_eq!(tok.start.index, cursor, "token starts at expected position");
1745                    assert!(tok.end.index >= tok.start.index);
1746                    cursor = tok.end.index;
1747                }
1748            }
1749        }
1750        assert_eq!(cursor, input.len(), "all bytes covered");
1751    }
1752
#[test]
fn pure_whitespace_yields_one_whitespace_trivia_token() {
    // Spaces and a tab mixed together should still form one trivia run.
    let input = "   \t  ";
    let tokens = collect_tokens(input);
    let expected = [TriviaKind::Whitespace];
    assert_eq!(
        trivia_kinds(&tokens),
        expected,
        "whitespace coalesces into a single run"
    );
    assert_byte_complete(input, &tokens);
}
1763
#[test]
fn newline_emits_one_newline_per_logical_break() {
    // LF, CRLF, lone CR: four bytes but only three logical breaks.
    let input = "\n\r\n\r";
    let tokens = collect_tokens(input);
    let expected = [TriviaKind::Newline; 3];
    assert_eq!(trivia_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1778
#[test]
fn comment_runs_to_end_of_line_excluding_break() {
    let input = "# hello\n# next\n";
    let tokens = collect_tokens(input);
    let expected = [
        TriviaKind::Comment,
        TriviaKind::Newline,
        TriviaKind::Comment,
        TriviaKind::Newline,
    ];
    assert_eq!(trivia_kinds(&tokens), expected);
    // The first comment's span must stop before the line break.
    let first_comment = tokens
        .iter()
        .find(|t| t.kind == TokenKind::Trivia(TriviaKind::Comment))
        .unwrap();
    let comment_text = &input[first_comment.start.index..first_comment.end.index];
    assert_eq!(comment_text, "# hello");
    assert_byte_complete(input, &tokens);
}
1803
#[test]
fn whitespace_then_comment_then_newline_separates_into_three_tokens() {
    // Indentation, comment text and the line break are distinct trivia.
    let input = "   # comment\n";
    let tokens = collect_tokens(input);
    let expected = [
        TriviaKind::Whitespace,
        TriviaKind::Comment,
        TriviaKind::Newline,
    ];
    assert_eq!(trivia_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1818
#[test]
fn pure_trivia_input_round_trips_byte_complete() {
    // A blend of whitespace, comments and all three break styles (CRLF,
    // LF, lone CR), representative of the filler expected between
    // meaningful tokens.
    let input = " \t# c1\r\n\n  # c2\n\r";
    let tokens = collect_tokens(input);
    assert_byte_complete(input, &tokens);
    // The stream must still be closed by a StreamEnd token.
    let last_kind = tokens.last().map(|t| t.kind);
    assert!(matches!(last_kind, Some(TokenKind::StreamEnd)));
}
1832
#[test]
fn empty_input_emits_only_stream_markers() {
    // With nothing to scan, only the two stream bracket tokens appear.
    let tokens = collect_tokens("");
    assert_eq!(tokens.len(), 2);
    let (first, second) = (&tokens[0], &tokens[1]);
    assert_eq!(first.kind, TokenKind::StreamStart);
    assert_eq!(second.kind, TokenKind::StreamEnd);
}
1840
1841    fn meaningful_kinds(tokens: &[Token]) -> Vec<TokenKind> {
1842        tokens
1843            .iter()
1844            .map(|t| t.kind)
1845            .filter(|k| !matches!(k, TokenKind::Trivia(_)))
1846            .collect()
1847    }
1848
#[test]
fn document_start_marker_at_column_zero_emits_token() {
    // `---` on its own line at column 0 is a document-start marker.
    let input = "---\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::DocumentStart,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1863
#[test]
fn document_end_marker_at_column_zero_emits_token() {
    // `...` on its own line at column 0 is a document-end marker.
    let input = "...\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::DocumentEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1878
#[test]
fn document_marker_at_eof_without_trailing_break_still_emits() {
    // End of input right after `---` counts as a valid marker terminator.
    let tokens = collect_tokens("---");
    let expected = [
        TokenKind::StreamStart,
        TokenKind::DocumentStart,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
}
1892
#[test]
fn three_dashes_followed_by_non_break_is_not_a_marker() {
    // At column 0, `---abc` should scan as the start of a plain scalar
    // rather than as a document-start marker.
    let kinds = meaningful_kinds(&collect_tokens("---abc\n"));
    let saw_marker = kinds.contains(&TokenKind::DocumentStart);
    assert!(!saw_marker, "got {kinds:?}");
    let saw_plain = kinds.contains(&TokenKind::Scalar(ScalarStyle::Plain));
    assert!(saw_plain, "got {kinds:?}");
}
1904
#[test]
fn three_dashes_indented_is_not_a_marker() {
    // One leading space puts `---` at column 1, where it must not be
    // treated as a document marker.
    let kinds = meaningful_kinds(&collect_tokens(" ---\n"));
    let saw_marker = kinds.contains(&TokenKind::DocumentStart);
    assert!(!saw_marker, "got {kinds:?}");
}
1912
#[test]
fn directive_at_column_zero_emits_directive_token() {
    let input = "%YAML 1.2\n";
    let tokens = collect_tokens(input);
    // The directive's span covers the whole line except the break.
    let directive = tokens
        .iter()
        .find(|t| t.kind == TokenKind::Directive)
        .expect("directive token");
    let directive_text = &input[directive.start.index..directive.end.index];
    assert_eq!(directive_text, "%YAML 1.2");
    assert_byte_complete(input, &tokens);
}
1927
#[test]
fn directive_indented_is_not_recognized() {
    // A directive is only valid at column 0, so the leading space here
    // must prevent a Directive token from being emitted.
    let kinds = meaningful_kinds(&collect_tokens(" %YAML 1.2\n"));
    let saw_directive = kinds.contains(&TokenKind::Directive);
    assert!(!saw_directive, "got {kinds:?}");
}
1935
#[test]
fn document_start_then_marker_on_new_line() {
    // A start marker and an end marker on consecutive lines must both
    // be recognised.
    let input = "---\n...\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::DocumentStart,
        TokenKind::DocumentEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1952
#[test]
fn directive_followed_by_doc_start_emits_both_in_order() {
    // Directive line first, then the `---` that opens the document.
    let input = "%YAML 1.2\n---\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::Directive,
        TokenKind::DocumentStart,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1968
#[test]
fn document_marker_followed_by_space_emits_marker_then_content_scalar() {
    let input = "--- foo\n";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    // The separating space is whitespace trivia (filtered out here), so
    // the marker is immediately followed by the plain scalar `foo`.
    let leading = [
        TokenKind::StreamStart,
        TokenKind::DocumentStart,
        TokenKind::Scalar(ScalarStyle::Plain),
    ];
    for (slot, want) in leading.iter().copied().enumerate() {
        assert_eq!(kinds[slot], want);
    }
    assert_eq!(*kinds.last().unwrap(), TokenKind::StreamEnd);
    assert_byte_complete(input, &tokens);
}
1981
#[test]
fn empty_flow_sequence_emits_start_then_end() {
    // `[]` is just an opening and a closing bracket, nothing between.
    let input = "[]";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowSequenceStart,
        TokenKind::FlowSequenceEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
1997
#[test]
fn empty_flow_mapping_emits_start_then_end() {
    // `{}` is just an opening and a closing brace, nothing between.
    let input = "{}";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowMappingStart,
        TokenKind::FlowMappingEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2013
#[test]
fn nested_flow_sequence_brackets_emit_in_order() {
    // Two opens followed by two closes, mirroring the source order.
    let input = "[[]]";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowSequenceStart,
        TokenKind::FlowSequenceStart,
        TokenKind::FlowSequenceEnd,
        TokenKind::FlowSequenceEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2031
#[test]
fn nested_flow_mixed_brackets_emit_in_order() {
    // An empty flow mapping nested inside a flow sequence.
    let input = "[{}]";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowSequenceStart,
        TokenKind::FlowMappingStart,
        TokenKind::FlowMappingEnd,
        TokenKind::FlowSequenceEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2049
#[test]
fn comma_inside_flow_emits_flow_entry() {
    // Each `,` between the brackets becomes its own FlowEntry token.
    let input = "[,,]";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowSequenceStart,
        TokenKind::FlowEntry,
        TokenKind::FlowEntry,
        TokenKind::FlowSequenceEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2067
#[test]
fn comma_outside_flow_is_not_a_flow_entry() {
    // With no enclosing `[`/`{`, a `,` is ordinary text and must not be
    // reported as a flow indicator.
    let kinds = meaningful_kinds(&collect_tokens(","));
    let saw_entry = kinds.contains(&TokenKind::FlowEntry);
    assert!(!saw_entry, "got {kinds:?}");
}
2075
#[test]
fn doc_markers_inside_flow_context_are_not_recognized() {
    // Within `[...]` the `---` is plain text; only the bracket itself
    // should show up as an indicator right after StreamStart.
    let kinds = meaningful_kinds(&collect_tokens("[---]"));
    let saw_marker = kinds.contains(&TokenKind::DocumentStart);
    assert!(!saw_marker, "got {kinds:?}");
    assert_eq!(kinds[1], TokenKind::FlowSequenceStart);
}
2085
#[test]
fn flow_brackets_with_whitespace_emit_trivia_between() {
    let input = "[ , ]";
    let tokens = collect_tokens(input);
    // Drop only the stream markers so the whitespace trivia stays
    // visible between the flow indicators.
    let inner: Vec<_> = tokens
        .iter()
        .filter(|t| !matches!(t.kind, TokenKind::StreamStart | TokenKind::StreamEnd))
        .map(|t| t.kind)
        .collect();
    let expected = [
        TokenKind::FlowSequenceStart,
        TokenKind::Trivia(TriviaKind::Whitespace),
        TokenKind::FlowEntry,
        TokenKind::Trivia(TriviaKind::Whitespace),
        TokenKind::FlowSequenceEnd,
    ];
    assert_eq!(inner, expected);
    assert_byte_complete(input, &tokens);
}
2107
#[test]
fn block_mapping_implicit_key_splices_block_mapping_start_and_key() {
    // Canonical implicit key: `key` is first recorded as a simple-key
    // candidate, and the later `:` confirms it. BlockMappingStart and
    // Key are then expected ahead of the key scalar in the stream.
    let input = "key: value";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::BlockMappingStart,
        TokenKind::Key,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::Value,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::BlockEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2130
#[test]
fn block_sequence_emits_block_sequence_start_then_entries() {
    // Two `- item` lines: one sequence start, then an entry token
    // ahead of each scalar, and a BlockEnd at the end of the stream.
    let input = "- a\n- b\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::BlockSequenceStart,
        TokenKind::BlockEntry,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::BlockEntry,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::BlockEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2150
#[test]
fn explicit_key_indicator_emits_key_and_value_without_splice() {
    // With `? a\n: b`, the `?` starts an explicit-key entry. The
    // candidate recorded for `a` is expected to have expired at the
    // line break, so the `:` has nothing implicit to confirm and the
    // stream carries exactly one Key.
    let input = "? a\n: b\n";
    let tokens = collect_tokens(input);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::BlockMappingStart,
        TokenKind::Key,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::Value,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::BlockEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(meaningful_kinds(&tokens), expected);
    assert_byte_complete(input, &tokens);
}
2174
#[test]
fn multi_line_plain_scalar_does_not_confirm_simple_key_on_next_line() {
    // For `a\nb: c\n`, multi-line plain rules make `a\nb` a single
    // continuation scalar that ends at `: `. Its simple-key candidate
    // was recorded on line 0 and must have expired by the time the `:`
    // shows up on line 1, so no Key may be spliced ahead of the scalar.
    let input = "a\nb: c\n";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let plain = TokenKind::Scalar(ScalarStyle::Plain);
    let scalar_pos = kinds
        .iter()
        .position(|&k| k == plain)
        .expect("plain scalar present");
    // A Key is tolerated only if it comes after the first plain scalar.
    let scalar_leads = kinds
        .iter()
        .position(|&k| k == TokenKind::Key)
        .map_or(true, |key_pos| scalar_pos < key_pos);
    assert!(
        scalar_leads,
        "multi-line scalar must precede any key: {kinds:?}",
    );
    // Both source lines belong to the one scalar token.
    let scalar = tokens.iter().find(|t| t.kind == plain).unwrap();
    let scalar_text = &input[scalar.start.index..scalar.end.index];
    assert_eq!(scalar_text, "a\nb");
}
2204
#[test]
fn flow_mapping_with_implicit_key_emits_only_flow_indicators() {
    // In flow context the simple-key splice for `a` still yields a Key,
    // but no BlockMappingStart may accompany it.
    let input = "{a: b}";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let expected = [
        TokenKind::StreamStart,
        TokenKind::FlowMappingStart,
        TokenKind::Key,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::Value,
        TokenKind::Scalar(ScalarStyle::Plain),
        TokenKind::FlowMappingEnd,
        TokenKind::StreamEnd,
    ];
    assert_eq!(kinds, expected);
    let saw_block_start = kinds.contains(&TokenKind::BlockMappingStart);
    assert!(!saw_block_start, "got {kinds:?}");
    assert_byte_complete(input, &tokens);
}
2231
#[test]
fn flow_explicit_key_indicator_emits_key_token() {
    // In flow context `?` is unconditionally a key indicator, so Key
    // must directly follow the opening brace.
    let input = "{? a: b}";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let leading = [
        TokenKind::StreamStart,
        TokenKind::FlowMappingStart,
        TokenKind::Key,
    ];
    for (slot, want) in leading.iter().copied().enumerate() {
        assert_eq!(kinds[slot], want);
    }
    // What follows the `?` behaves like an implicit key: `a` is a
    // candidate that the `:` confirms, so a Value token must exist.
    assert!(kinds.contains(&TokenKind::Value));
    assert_byte_complete(input, &tokens);
}
2247
#[test]
fn nested_block_mapping_emits_block_end_on_dedent() {
    // Input layout:
    //
    //   outer:
    //     inner: x
    //   y: z
    //
    // Returning to column 0 for `y` has to close the inner mapping, and
    // the outer mapping closes when the stream ends: two BlockEnds.
    let input = "outer:\n  inner: x\ny: z\n";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let block_ends = kinds
        .iter()
        .filter(|k| matches!(**k, TokenKind::BlockEnd))
        .count();
    assert_eq!(block_ends, 2, "got {kinds:?}");
    assert_byte_complete(input, &tokens);
}
2264
#[test]
fn nested_block_sequence_inside_mapping_unwinds_correctly() {
    // Input layout:
    //
    //   items:
    //     - a
    //     - b
    //   status: ok
    //
    // Dropping back to column 0 for `status:` has to close the nested
    // sequence, so a BlockEnd must appear ahead of that second key.
    let input = "items:\n  - a\n  - b\nstatus: ok\n";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    // Indices of every Key; the second one belongs to `status`.
    let key_positions: Vec<usize> = kinds
        .iter()
        .enumerate()
        .filter(|(_, k)| **k == TokenKind::Key)
        .map(|(i, _)| i)
        .collect();
    assert_eq!(key_positions.len(), 2, "expected 2 keys: {kinds:?}");
    let second_key = key_positions[1];
    let closed_before_key = kinds[..second_key].contains(&TokenKind::BlockEnd);
    assert!(
        closed_before_key,
        "BlockEnd must precede second key: {kinds:?}",
    );
    // The stream finishes with the outer mapping's BlockEnd, then StreamEnd.
    let len = kinds.len();
    assert_eq!(kinds[len - 1], TokenKind::StreamEnd);
    assert_eq!(kinds[len - 2], TokenKind::BlockEnd);
    assert_byte_complete(input, &tokens);
}
2299
#[test]
fn value_indicator_with_no_simple_key_emits_block_mapping_start() {
    // `: value` at column 0 is the empty-key shorthand. It should open
    // a block mapping and emit Value directly; no Key is spliced, which
    // leaves the empty implicit key for the parser to interpret.
    let input = ": value\n";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let leading = [
        TokenKind::StreamStart,
        TokenKind::BlockMappingStart,
        TokenKind::Value,
    ];
    for (slot, want) in leading.iter().copied().enumerate() {
        assert_eq!(kinds[slot], want);
    }
    // Nothing up to and including the Value may be a Key.
    let key_in_prefix = kinds[..3].contains(&TokenKind::Key);
    assert!(!key_in_prefix, "got {kinds:?}");
    assert_byte_complete(input, &tokens);
}
2315
#[test]
fn block_mapping_unwinds_indents_at_stream_end() {
    // Input layout (note: no trailing newline):
    //
    //   a:
    //     b: c
    //
    // Both open mappings must be closed as the indent stack unwinds,
    // so the stream ends with BlockEnd, BlockEnd, StreamEnd.
    let input = "a:\n  b: c";
    let tokens = collect_tokens(input);
    let kinds = meaningful_kinds(&tokens);
    let len = kinds.len();
    assert_eq!(kinds[len - 1], TokenKind::StreamEnd);
    assert_eq!(kinds[len - 2], TokenKind::BlockEnd);
    assert_eq!(kinds[len - 3], TokenKind::BlockEnd);
    assert_byte_complete(input, &tokens);
}
2332
#[test]
fn colon_inside_plain_scalar_token_does_not_break_scalar() {
    // In a URL the `:` is followed by `/`, not whitespace, so it must
    // remain part of the plain scalar.
    let input = "https://example.com";
    let tokens = collect_tokens(input);
    let scalar = tokens
        .iter()
        .find(|t| matches!(t.kind, TokenKind::Scalar(_)))
        .expect("plain scalar token");
    let scalar_text = &input[scalar.start.index..scalar.end.index];
    assert_eq!(scalar_text, "https://example.com");
    assert_byte_complete(input, &tokens);
}
2349
#[test]
fn diagnostics_remain_empty_for_well_formed_inputs() {
    // One sample each of a block mapping, a block sequence, a flow
    // mapping and an explicit key; none should produce a diagnostic.
    let well_formed = ["key: value", "- a\n- b\n", "{a: b, c: d}", "? k\n: v\n"];
    for input in well_formed {
        let mut scanner = Scanner::new(input);
        // Drain the scanner so every diagnostic has been recorded.
        std::iter::from_fn(|| scanner.next_token()).for_each(drop);
        let diagnostics = scanner.diagnostics();
        assert!(
            diagnostics.is_empty(),
            "{input:?} produced unexpected diagnostics: {:?}",
            diagnostics,
        );
    }
}
2362
2363    fn find_scalar(tokens: &[Token]) -> &Token {
2364        tokens
2365            .iter()
2366            .find(|t| matches!(t.kind, TokenKind::Scalar(_)))
2367            .expect("expected scalar token")
2368    }
2369
#[test]
fn single_quoted_scalar_emits_token_spanning_quotes() {
    let input = "'hello'";
    let tokens = collect_tokens(input);
    let scalar = find_scalar(&tokens);
    assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::SingleQuoted));
    // The token's span takes in both delimiting quotes.
    let scalar_text = &input[scalar.start.index..scalar.end.index];
    assert_eq!(scalar_text, "'hello'");
    assert_byte_complete(input, &tokens);
}
2379
#[test]
fn double_quoted_scalar_emits_token_spanning_quotes() {
    let input = "\"hello\"";
    let tokens = collect_tokens(input);
    let scalar = find_scalar(&tokens);
    assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
    // The token's span takes in both delimiting quotes.
    let scalar_text = &input[scalar.start.index..scalar.end.index];
    assert_eq!(scalar_text, "\"hello\"");
    assert_byte_complete(input, &tokens);
}
2389
#[test]
fn single_quoted_scalar_treats_doubled_quote_as_escape() {
    // `'it''s'` denotes the text `it's`: the inner `''` is an escaped
    // quote and must not end the scalar early.
    let input = "'it''s'";
    let tokens = collect_tokens(input);
    let scalars: Vec<&Token> = tokens
        .iter()
        .filter(|t| matches!(t.kind, TokenKind::Scalar(_)))
        .collect();
    assert_eq!(scalars.len(), 1, "got {:?}", tokens);
    let only = scalars[0];
    let scalar_text = &input[only.start.index..only.end.index];
    assert_eq!(scalar_text, "'it''s'");
}
2406
#[test]
fn double_quoted_scalar_with_escaped_quote_does_not_terminate_early() {
    // In `"a\"b"` the inner `\"` is an escape; only the last `"`
    // closes the scalar.
    let input = "\"a\\\"b\"";
    let tokens = collect_tokens(input);
    let scalars: Vec<&Token> = tokens
        .iter()
        .filter(|t| matches!(t.kind, TokenKind::Scalar(_)))
        .collect();
    assert_eq!(scalars.len(), 1, "got {tokens:?}");
    let only = scalars[0];
    let scalar_text = &input[only.start.index..only.end.index];
    assert_eq!(scalar_text, "\"a\\\"b\"");
    assert_byte_complete(input, &tokens);
}
2424
#[test]
fn double_quoted_scalar_recognises_common_single_byte_escapes() {
    // Covers `\n`, `\t`, `\r`, `\0`, `\\` and `\"`: escapes made of the
    // backslash plus exactly one further character.
    let input = "\"\\n\\t\\r\\0\\\\\\\"\"";
    let tokens = collect_tokens(input);
    let scalar = find_scalar(&tokens);
    assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
    // The scalar must span the input from first byte to last.
    let span = (scalar.start.index, scalar.end.index);
    assert_eq!(span.0, 0);
    assert_eq!(span.1, input.len());
    // A second, independent scan checks that no escape was flagged.
    let mut scanner = Scanner::new(input);
    std::iter::from_fn(|| scanner.next_token()).for_each(drop);
    assert!(scanner.diagnostics().is_empty());
}
2439
#[test]
fn double_quoted_scalar_recognises_hex_escapes() {
    // `\x41` is `A`; `\u00E9` is `é`; `\U0001F600` is 😀. All three
    // escape widths (2, 4 and 8 hex digits) must scan cleanly.
    let input = "\"\\x41\\u00E9\\U0001F600\"";
    let mut scanner = Scanner::new(input);
    std::iter::from_fn(|| scanner.next_token()).for_each(drop);
    let diagnostics = scanner.diagnostics();
    assert!(diagnostics.is_empty(), "got {:?}", diagnostics);
}
2452
#[test]
fn double_quoted_scalar_with_invalid_escape_emits_diagnostic() {
    // `\q` is expected to be reported exactly once, with the
    // invalid-escape diagnostic code.
    let input = "\"\\q\"";
    let mut scanner = Scanner::new(input);
    std::iter::from_fn(|| scanner.next_token()).for_each(drop);
    let diagnostics = scanner.diagnostics();
    assert_eq!(diagnostics.len(), 1, "got {:?}", diagnostics);
    assert_eq!(
        diagnostics[0].code,
        diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
    );
}
2469
#[test]
fn double_quoted_scalar_with_short_hex_escape_emits_diagnostic() {
    // `\x4` is one hex digit short. The following `"` still closes the
    // scalar, but the truncated escape has to be reported.
    let input = "\"\\x4\"";
    let mut scanner = Scanner::new(input);
    std::iter::from_fn(|| scanner.next_token()).for_each(drop);
    let diagnostics = scanner.diagnostics();
    let reported = diagnostics
        .iter()
        .any(|d| d.code == diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE);
    assert!(reported, "got {:?}", diagnostics);
}
2486
2487    #[test]
2488    fn double_quoted_scalar_spans_multiple_lines() {
2489        // A literal newline inside the quotes is part of the scalar.
2490        let input = "\"line1\nline2\"";
2491        let tokens = collect_tokens(input);
2492        let scalar = find_scalar(&tokens);
2493        assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2494        // The entire input is the scalar (no Newline trivia between
2495        // the two lines — line breaks inside quoted scalars belong to
2496        // the scalar's source span).
2497        assert_eq!(scalar.start.index, 0);
2498        assert_eq!(scalar.end.index, input.len());
2499    }
2500
2501    #[test]
2502    fn line_continuation_escape_consumes_newline_inside_quoted_scalar() {
2503        // `\<newline>` is a folding line break: the `\` plus the
2504        // following newline are together one escape.
2505        let input = "\"a\\\nb\"";
2506        let mut scanner = Scanner::new(input);
2507        while scanner.next_token().is_some() {}
2508        assert!(
2509            scanner.diagnostics().is_empty(),
2510            "got {:?}",
2511            scanner.diagnostics(),
2512        );
2513    }
2514
2515    #[test]
2516    fn unterminated_quoted_scalar_emits_diagnostic() {
2517        for input in ["'oops", "\"oops"] {
2518            let mut scanner = Scanner::new(input);
2519            while scanner.next_token().is_some() {}
2520            assert!(
2521                scanner
2522                    .diagnostics()
2523                    .iter()
2524                    .any(|d| d.code == diagnostic_codes::LEX_UNTERMINATED_QUOTED_SCALAR),
2525                "{input:?} produced {:?}",
2526                scanner.diagnostics(),
2527            );
2528        }
2529    }
2530
2531    #[test]
2532    fn quoted_scalar_can_be_implicit_key() {
2533        let input = "\"key\": value";
2534        let tokens = collect_tokens(input);
2535        let kinds = meaningful_kinds(&tokens);
2536        assert_eq!(
2537            kinds,
2538            vec![
2539                TokenKind::StreamStart,
2540                TokenKind::BlockMappingStart,
2541                TokenKind::Key,
2542                TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2543                TokenKind::Value,
2544                TokenKind::Scalar(ScalarStyle::Plain),
2545                TokenKind::BlockEnd,
2546                TokenKind::StreamEnd,
2547            ],
2548        );
2549        assert_byte_complete(input, &tokens);
2550    }
2551
2552    #[test]
2553    fn multi_line_quoted_scalar_cannot_be_implicit_key() {
2554        // The scalar opens on line 0; the simple-key candidate's mark
2555        // is on line 0. After scanning across the line break the
2556        // cursor is on line 1, so stale_simple_keys removes the
2557        // candidate before the `:` arrives — no Key splice.
2558        let input = "\"line1\nline2\": value\n";
2559        let tokens = collect_tokens(input);
2560        let kinds = meaningful_kinds(&tokens);
2561        // Expected: StreamStart, Scalar(DoubleQuoted), BlockMappingStart,
2562        // Value, Scalar(Plain), BlockEnd, StreamEnd. The Scalar comes
2563        // BEFORE BlockMappingStart/Value, demonstrating no key splice.
2564        assert_eq!(kinds[0], TokenKind::StreamStart);
2565        assert_eq!(kinds[1], TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2566        assert_eq!(kinds[2], TokenKind::BlockMappingStart);
2567        assert_eq!(kinds[3], TokenKind::Value);
2568        assert!(!kinds[..3].contains(&TokenKind::Key), "got {kinds:?}",);
2569    }
2570
2571    #[test]
2572    fn quoted_scalar_inside_flow_mapping_terminates_at_closing_quote() {
2573        let input = "{\"a\": \"b\"}";
2574        let tokens = collect_tokens(input);
2575        let kinds = meaningful_kinds(&tokens);
2576        assert_eq!(
2577            kinds,
2578            vec![
2579                TokenKind::StreamStart,
2580                TokenKind::FlowMappingStart,
2581                TokenKind::Key,
2582                TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2583                TokenKind::Value,
2584                TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2585                TokenKind::FlowMappingEnd,
2586                TokenKind::StreamEnd,
2587            ],
2588        );
2589        assert_byte_complete(input, &tokens);
2590    }
2591
2592    #[test]
2593    fn literal_block_scalar_at_top_level_spans_to_eof() {
2594        let input = "|\n  hello\n  world\n";
2595        let tokens = collect_tokens(input);
2596        let scalar = tokens
2597            .iter()
2598            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2599            .expect("literal scalar");
2600        // The scalar covers the header `|`, line break, both content
2601        // lines, and their trailing newlines.
2602        assert_eq!(scalar.start.index, 0);
2603        assert_eq!(scalar.end.index, input.len());
2604        assert_byte_complete(input, &tokens);
2605    }
2606
2607    #[test]
2608    fn folded_block_scalar_emits_folded_style() {
2609        let input = ">\n  hello\n";
2610        let tokens = collect_tokens(input);
2611        assert!(
2612            tokens
2613                .iter()
2614                .any(|t| t.kind == TokenKind::Scalar(ScalarStyle::Folded)),
2615            "got {tokens:?}",
2616        );
2617    }
2618
2619    #[test]
2620    fn block_scalar_terminates_on_dedent_to_parent_indent() {
2621        // key: |
2622        //   line1
2623        //   line2
2624        // next: x
2625        //
2626        // The block scalar's content indent is 2; `next:` at column 0
2627        // is below that, so the scalar terminates without consuming
2628        // `next` and the outer mapping continues.
2629        let input = "key: |\n  line1\n  line2\nnext: x\n";
2630        let tokens = collect_tokens(input);
2631        let kinds = meaningful_kinds(&tokens);
2632        // Find the block scalar's span; everything before "next" must
2633        // be inside it.
2634        let scalar = tokens
2635            .iter()
2636            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2637            .expect("literal scalar");
2638        let next_idx = input.find("next:").expect("next key in fixture");
2639        assert!(
2640            scalar.end.index <= next_idx,
2641            "scalar should end before `next:` at {next_idx}: scalar ends at {}",
2642            scalar.end.index,
2643        );
2644        // The outer mapping must produce two key/value pairs.
2645        let key_count = kinds.iter().filter(|&&k| k == TokenKind::Key).count();
2646        assert_eq!(key_count, 2, "got {kinds:?}");
2647    }
2648
2649    #[test]
2650    fn block_scalar_with_keep_chomping_indicator_in_header() {
2651        let input = "|+\n  text\n\n";
2652        let tokens = collect_tokens(input);
2653        let scalar = tokens
2654            .iter()
2655            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2656            .expect("literal scalar");
2657        // The header `|+` and the empty trailing line are part of the
2658        // scalar's source span.
2659        assert_eq!(scalar.start.index, 0);
2660        assert_eq!(scalar.end.index, input.len());
2661        assert_byte_complete(input, &tokens);
2662    }
2663
2664    #[test]
2665    fn block_scalar_with_explicit_indent_indicator_uses_that_indent() {
2666        // `|2` declares the content indent is 2. Lines at less than
2667        // 2 spaces terminate. The single content line at indent 2
2668        // is included; `bye` at indent 0 is not.
2669        let input = "key: |2\n  hi\nbye: x\n";
2670        let tokens = collect_tokens(input);
2671        let scalar = tokens
2672            .iter()
2673            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2674            .expect("literal scalar");
2675        let bye_idx = input.find("bye:").expect("bye key in fixture");
2676        assert!(
2677            scalar.end.index <= bye_idx,
2678            "scalar must end before `bye`: {} vs {}",
2679            scalar.end.index,
2680            bye_idx,
2681        );
2682        assert_byte_complete(input, &tokens);
2683    }
2684
2685    #[test]
2686    fn block_scalar_at_eof_without_trailing_newline_still_emits() {
2687        let input = "|\n  text";
2688        let tokens = collect_tokens(input);
2689        let scalar = tokens
2690            .iter()
2691            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2692            .expect("literal scalar");
2693        assert_eq!(scalar.end.index, input.len());
2694    }
2695
2696    #[test]
2697    fn block_scalar_with_internal_blank_lines_includes_them() {
2698        // Blank lines inside the block scalar are part of content.
2699        let input = "|\n  a\n\n  b\n";
2700        let tokens = collect_tokens(input);
2701        let scalar = tokens
2702            .iter()
2703            .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2704            .expect("literal scalar");
2705        assert_eq!(scalar.end.index, input.len());
2706        assert_byte_complete(input, &tokens);
2707    }
2708
2709    #[test]
2710    fn pipe_inside_flow_context_is_part_of_plain_scalar_not_block() {
2711        // `[|]` — `|` in flow context is plain text.
2712        let input = "[|]";
2713        let tokens = collect_tokens(input);
2714        let kinds = meaningful_kinds(&tokens);
2715        // Should NOT see a Literal-style scalar — flow context disables
2716        // the block-scalar dispatch.
2717        assert!(
2718            !kinds.contains(&TokenKind::Scalar(ScalarStyle::Literal)),
2719            "got {kinds:?}",
2720        );
2721        assert_eq!(kinds[1], TokenKind::FlowSequenceStart);
2722        assert!(kinds.contains(&TokenKind::Scalar(ScalarStyle::Plain)));
2723    }
2724
2725    #[test]
2726    fn block_scalar_terminates_on_document_marker() {
2727        let input = "|\n  text\n---\nnext\n";
2728        let tokens = collect_tokens(input);
2729        let kinds = meaningful_kinds(&tokens);
2730        // The scalar must NOT swallow the `---` marker.
2731        assert!(kinds.contains(&TokenKind::DocumentStart), "got {kinds:?}");
2732    }
2733
2734    #[test]
2735    fn plain_scalar_with_internal_whitespace_is_one_token() {
2736        let input = "hello world";
2737        let tokens = collect_tokens(input);
2738        let scalars: Vec<_> = tokens
2739            .iter()
2740            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2741            .collect();
2742        assert_eq!(scalars.len(), 1, "got {tokens:?}");
2743        assert_eq!(
2744            &input[scalars[0].start.index..scalars[0].end.index],
2745            "hello world",
2746        );
2747        assert_byte_complete(input, &tokens);
2748    }
2749
2750    #[test]
2751    fn plain_scalar_with_multiple_internal_spaces_is_one_token() {
2752        let input = "a   b   c";
2753        let tokens = collect_tokens(input);
2754        let scalars: Vec<_> = tokens
2755            .iter()
2756            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2757            .collect();
2758        assert_eq!(scalars.len(), 1, "got {tokens:?}");
2759        assert_eq!(
2760            &input[scalars[0].start.index..scalars[0].end.index],
2761            "a   b   c",
2762        );
2763    }
2764
2765    #[test]
2766    fn plain_scalar_drops_trailing_whitespace_before_eof() {
2767        // Trailing spaces on the same line are not part of the scalar.
2768        let input = "hello   ";
2769        let tokens = collect_tokens(input);
2770        let scalar = tokens
2771            .iter()
2772            .find(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2773            .expect("plain scalar");
2774        assert_eq!(&input[scalar.start.index..scalar.end.index], "hello");
2775        // The trailing spaces become a Whitespace trivia token.
2776        assert!(
2777            tokens
2778                .iter()
2779                .any(|t| t.kind == TokenKind::Trivia(TriviaKind::Whitespace)),
2780            "expected trailing whitespace as trivia: {tokens:?}",
2781        );
2782        assert_byte_complete(input, &tokens);
2783    }
2784
2785    #[test]
2786    fn plain_scalar_drops_trailing_whitespace_before_comment() {
2787        // `hello # comment` — the scalar is `hello`; the `# comment`
2788        // is a comment trivia (and the spaces between are whitespace).
2789        let input = "hello # comment";
2790        let tokens = collect_tokens(input);
2791        let scalar = tokens
2792            .iter()
2793            .find(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2794            .expect("plain scalar");
2795        assert_eq!(&input[scalar.start.index..scalar.end.index], "hello");
2796        assert!(
2797            tokens
2798                .iter()
2799                .any(|t| t.kind == TokenKind::Trivia(TriviaKind::Comment)),
2800            "expected comment trivia: {tokens:?}",
2801        );
2802    }
2803
2804    #[test]
2805    fn colon_inside_url_does_not_break_plain_scalar() {
2806        // `https://example.com` — `:` followed by `/` stays inside the
2807        // scalar (regression of step-6 behaviour after the rewrite).
2808        let input = "url: https://example.com\n";
2809        let tokens = collect_tokens(input);
2810        let scalars: Vec<_> = tokens
2811            .iter()
2812            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2813            .map(|t| &input[t.start.index..t.end.index])
2814            .collect();
2815        assert_eq!(scalars, vec!["url", "https://example.com"]);
2816    }
2817
2818    #[test]
2819    fn multi_line_plain_scalar_continues_under_indent() {
2820        // `key: hello\n  world\n` — the `world` line is indented past
2821        // the parent indent (0+1=1), so it continues the scalar.
2822        let input = "key: hello\n  world\n";
2823        let tokens = collect_tokens(input);
2824        let plain_scalars: Vec<_> = tokens
2825            .iter()
2826            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2827            .collect();
2828        // Two plain scalars: `key`, and the multi-line value.
2829        assert_eq!(plain_scalars.len(), 2, "got {tokens:?}");
2830        // The value scalar spans both lines.
2831        let value = plain_scalars[1];
2832        assert!(
2833            input[value.start.index..value.end.index].contains("hello"),
2834            "scalar text: {:?}",
2835            &input[value.start.index..value.end.index],
2836        );
2837        assert!(
2838            input[value.start.index..value.end.index].contains("world"),
2839            "scalar text: {:?}",
2840            &input[value.start.index..value.end.index],
2841        );
2842    }
2843
2844    #[test]
2845    fn plain_scalar_terminates_at_blank_line_continuation() {
2846        // A blank line between content terminates the plain scalar.
2847        let input = "key: hello\n\n  world\n";
2848        let tokens = collect_tokens(input);
2849        let plain_scalars: Vec<_> = tokens
2850            .iter()
2851            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2852            .map(|t| &input[t.start.index..t.end.index])
2853            .collect();
2854        // Hmm — actually a blank line in YAML plain-scalar continuation
2855        // is allowed as folding whitespace. Verify what we emit: at
2856        // minimum, `hello` and `world` should both be present, but we
2857        // accept either (one merged scalar OR separate). Check both.
2858        let merged = plain_scalars.iter().any(|s| s.contains("world"));
2859        assert!(
2860            merged || plain_scalars.contains(&"world"),
2861            "got {plain_scalars:?}"
2862        );
2863    }
2864
2865    #[test]
2866    fn plain_scalar_terminates_on_dedent() {
2867        // `outer:\n  hello\nnext: x` — `next:` at column 0 is below
2868        // the continuation indent (parent=2, min=3), so the value
2869        // scalar ends at end-of-line-1 and `next:` opens a new entry.
2870        let input = "outer:\n  hello\nnext: x\n";
2871        let tokens = collect_tokens(input);
2872        let kinds = meaningful_kinds(&tokens);
2873        // Two Key tokens (outer, next).
2874        let key_count = kinds.iter().filter(|&&k| k == TokenKind::Key).count();
2875        assert_eq!(key_count, 2, "got {kinds:?}");
2876        // Three plain scalars: `outer`, `hello`, `next`, `x`.
2877        let plain_count = kinds
2878            .iter()
2879            .filter(|&&k| k == TokenKind::Scalar(ScalarStyle::Plain))
2880            .count();
2881        assert_eq!(plain_count, 4, "got {kinds:?}");
2882    }
2883
2884    #[test]
2885    fn plain_scalar_terminates_on_following_block_entry_indicator() {
2886        // `outer:\n  - a` — under the value `outer:` we have a block
2887        // sequence whose first entry `- a` is on line 1. The (empty)
2888        // value of `outer:` must NOT swallow `- a` as a continuation.
2889        let input = "outer:\n  - a\n  - b\n";
2890        let tokens = collect_tokens(input);
2891        let kinds = meaningful_kinds(&tokens);
2892        // Should see at least one BlockEntry (we'd see two for the
2893        // two items, but the bigger point is that `- a` was NOT
2894        // absorbed into the plain-scalar continuation).
2895        let block_entry_count = kinds
2896            .iter()
2897            .filter(|&&k| k == TokenKind::BlockEntry)
2898            .count();
2899        assert!(block_entry_count >= 1, "got {kinds:?}");
2900    }
2901
2902    #[test]
2903    fn more_indented_dash_line_folds_into_plain_scalar() {
2904        // yaml-test-suite AB8U: `- single multiline\n - sequence entry\n`.
2905        // The second line's `-` sits at column 1, deeper than the
2906        // sequence indent (0), so per libyaml it folds into the plain
2907        // scalar rather than opening a nested sequence. Expect a single
2908        // BlockEntry and a single plain scalar spanning both lines.
2909        let input = "- single multiline\n - sequence entry\n";
2910        let tokens = collect_tokens(input);
2911        let kinds = meaningful_kinds(&tokens);
2912        let block_entry_count = kinds
2913            .iter()
2914            .filter(|&&k| k == TokenKind::BlockEntry)
2915            .count();
2916        assert_eq!(block_entry_count, 1, "got {kinds:?}");
2917        let plain_scalars: Vec<_> = tokens
2918            .iter()
2919            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2920            .collect();
2921        assert_eq!(plain_scalars.len(), 1, "got {tokens:?}");
2922        let value = plain_scalars[0];
2923        assert_eq!(
2924            &input[value.start.index..value.end.index],
2925            "single multiline\n - sequence entry",
2926        );
2927    }
2928
2929    #[test]
2930    fn flow_context_plain_scalar_does_not_absorb_terminator_line_break() {
2931        // `{a: 42\n}\n` — the `\n` between `42` and `}` must NOT be
2932        // swallowed into the scalar's continuation. The plain scalar
2933        // ends at `42`; the line break is trivia between scalar and
2934        // closer.
2935        let input = "{a: 42\n}\n";
2936        let tokens = collect_tokens(input);
2937        let scalars: Vec<_> = tokens
2938            .iter()
2939            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2940            .map(|t| &input[t.start.index..t.end.index])
2941            .collect();
2942        assert!(scalars.contains(&"42"), "got {scalars:?}");
2943        assert_byte_complete(input, &tokens);
2944    }
2945
2946    #[test]
2947    fn plain_scalar_in_flow_context_terminates_on_flow_indicators() {
2948        let input = "[a b, c]";
2949        let tokens = collect_tokens(input);
2950        let plain_scalars: Vec<_> = tokens
2951            .iter()
2952            .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2953            .map(|t| &input[t.start.index..t.end.index])
2954            .collect();
2955        // `a b` is one scalar (internal whitespace allowed); `c` is
2956        // another. The `,` separates them.
2957        assert_eq!(plain_scalars, vec!["a b", "c"]);
2958    }
2959
2960    #[test]
2961    fn multi_line_plain_scalar_does_not_register_as_simple_key() {
2962        // `hello\n  world: value\n` — after the multi-line plain
2963        // scalar emerges, a `:` would be on a different line from the
2964        // candidate's mark.line. stale_simple_keys must drop the
2965        // candidate so the `:` does NOT splice a Key before
2966        // `hello\n  world`.
2967        //
2968        // This is the case that motivated the scanner rewrite.
2969        let input = "hello\n  world: value\n";
2970        let tokens = collect_tokens(input);
2971        let kinds = meaningful_kinds(&tokens);
2972        // Find positions of the first plain Scalar and the first Key.
2973        let scalar_pos = kinds
2974            .iter()
2975            .position(|&k| k == TokenKind::Scalar(ScalarStyle::Plain));
2976        let key_pos = kinds.iter().position(|&k| k == TokenKind::Key);
2977        assert!(scalar_pos.is_some(), "no scalar: {kinds:?}");
2978        // If there is a Key, the multi-line scalar must NOT be its
2979        // body (i.e., the Scalar must not appear AFTER Key without
2980        // first having been emitted standalone). The simplest check:
2981        // the first scalar must come before any Key — because the
2982        // multi-line scalar is committed to the queue before the `:`
2983        // would even be reached.
2984        if let Some(k) = key_pos {
2985            let s = scalar_pos.unwrap();
2986            assert!(s < k, "multi-line scalar must precede any key: {kinds:?}",);
2987        }
2988    }
2989
2990    #[test]
2991    fn plain_scalar_preserves_single_line_simple_key_behaviour() {
2992        // Single-line `hello world: value` — the scalar `hello world`
2993        // (with internal space) IS still a valid implicit key because
2994        // it stays on one line.
2995        let input = "hello world: value\n";
2996        let tokens = collect_tokens(input);
2997        let kinds = meaningful_kinds(&tokens);
2998        assert_eq!(
2999            kinds,
3000            vec![
3001                TokenKind::StreamStart,
3002                TokenKind::BlockMappingStart,
3003                TokenKind::Key,
3004                TokenKind::Scalar(ScalarStyle::Plain),
3005                TokenKind::Value,
3006                TokenKind::Scalar(ScalarStyle::Plain),
3007                TokenKind::BlockEnd,
3008                TokenKind::StreamEnd,
3009            ],
3010        );
3011    }
3012}