Skip to main content

rust_yaml/scanner/
mod.rs

1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
15/// Trait for YAML scanners that convert character streams to tokens
16pub trait Scanner {
17    /// Check if there are more tokens available
18    fn check_token(&self) -> bool;
19
20    /// Peek at the next token without consuming it
21    fn peek_token(&self) -> Result<Option<&Token>>;
22
23    /// Get the next token, consuming it
24    fn get_token(&mut self) -> Result<Option<Token>>;
25
26    /// Reset the scanner state
27    fn reset(&mut self);
28
29    /// Get the current position in the input
30    fn position(&self) -> Position;
31
32    /// Get the input text for error reporting
33    fn input(&self) -> &str;
34}
35
/// How the tail of a block scalar is treated (YAML 1.2 §8.1.1.2).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    /// `-` indicator: the final line break and any trailing empty lines are
    /// dropped.
    Strip,
    /// No indicator (the default): exactly one final line break is kept and
    /// trailing empty lines are dropped.
    Clip,
    /// `+` indicator: the final line break and all trailing empty lines are
    /// preserved.
    Keep,
}
47
48/// Apply chomping mode to a block-scalar tail.
49///
50/// The collectors emit a `\n` for every line (content or blank). This helper
51/// trims that tail according to spec §8.1.1.2:
52///
53/// - **Strip:** remove every trailing `\n`.
54/// - **Clip:** keep exactly one trailing `\n` if content exists; drop the rest.
55///   Empty input stays empty.
56/// - **Keep:** preserve everything.
57fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
58    match mode {
59        ChompingMode::Keep => s,
60        ChompingMode::Strip => {
61            while s.ends_with('\n') {
62                s.pop();
63            }
64            s
65        }
66        ChompingMode::Clip => {
67            // Strip trailing newlines. If anything remains, restore one.
68            // §8.1.1.2: clip keeps the final line break only when the
69            // scalar has actual content (yaml-test-suite K858: an empty
70            // clip scalar `>` is `""`, not `"\n"`).
71            while s.ends_with('\n') {
72                s.pop();
73            }
74            if !s.is_empty() {
75                s.push('\n');
76            }
77            s
78        }
79    }
80}
81
82/// A basic scanner implementation for YAML tokenization
83#[derive(Debug)]
84#[allow(dead_code)]
85pub struct BasicScanner {
86    input: String,
87    position: Position,
88    current_char: Option<char>,
89    tokens: Vec<Token>,
90    token_index: usize,
91    done: bool,
92    indent_stack: Vec<usize>,
93    current_indent: usize,
94    allow_simple_key: bool,
95    simple_key_allowed: bool,
96    flow_level: usize,
97    preserve_comments: bool,
98    // Indentation style detection
99    detected_indent_style: Option<crate::value::IndentStyle>,
100    indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
101    previous_indent_level: usize,       // Track the previous indentation for style detection
102    // Performance optimizations
103    buffer: String,                   // Reusable string buffer for token values
104    char_cache: Vec<char>,            // Cached characters for faster access
105    char_indices: Vec<(usize, char)>, // Cached character indices for O(1) lookups
106    current_char_index: usize,        // Current index in char_cache
107    profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
108    // Error tracking
109    scanning_error: Option<Error>, // Store scanning errors for later retrieval
110    // Resource tracking
111    limits: Limits,
112    resource_tracker: ResourceTracker,
113    // Track inline nested sequences that need closing
114    inline_sequence_depth: usize,
115    // Track compact-notation sequences (where `-` is at the same indent as
116    // the parent mapping keys). These are NOT on indent_stack, so we need
117    // separate tracking to know when to emit BlockEnd for them.
118    compact_sequence_indents: Vec<usize>,
119    // Parallel to indent_stack: true when the entry was pushed by a block
120    // sequence, false when by a mapping. Lets us distinguish "continuing a
121    // regular sequence" from "starting a compact sequence at same indent".
122    indent_is_sequence: Vec<bool>,
123}
124
125impl BasicScanner {
126    /// Create a new scanner from input string
127    pub fn new(input: String) -> Self {
128        Self::with_limits(input, Limits::default())
129    }
130
131    /// Create a new scanner with custom resource limits
132    pub fn with_limits(input: String, limits: Limits) -> Self {
133        let char_cache: Vec<char> = input.chars().collect();
134        let char_indices: Vec<(usize, char)> = input.char_indices().collect();
135        let current_char = char_cache.first().copied();
136
137        // Track document size for resource limits
138        let mut resource_tracker = ResourceTracker::new();
139        if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
140            // If the input is too large, create scanner with error state
141            return Self {
142                current_char: None,
143                input,
144                position: Position::start(),
145                tokens: Vec::new(),
146                token_index: 0,
147                done: true,
148                indent_stack: vec![0],
149                current_indent: 0,
150                allow_simple_key: false,
151                simple_key_allowed: false,
152                flow_level: 0,
153                preserve_comments: false,
154                detected_indent_style: None,
155                indent_samples: Vec::new(),
156                previous_indent_level: 0,
157                buffer: String::new(),
158                char_cache: Vec::new(),
159                char_indices: Vec::new(),
160                current_char_index: 0,
161                profiler: None,
162                scanning_error: Some(e),
163                limits,
164                resource_tracker,
165                inline_sequence_depth: 0,
166                compact_sequence_indents: Vec::new(),
167                indent_is_sequence: vec![false],
168            };
169        }
170
171        Self {
172            current_char,
173            input,
174            position: Position::start(),
175            tokens: Vec::new(),
176            token_index: 0,
177            done: false,
178            indent_stack: vec![0], // Always start with base indentation
179            current_indent: 0,
180            allow_simple_key: true,
181            simple_key_allowed: true,
182            flow_level: 0,
183            preserve_comments: false,
184            detected_indent_style: None,
185            indent_samples: Vec::new(),
186            previous_indent_level: 0,
187            buffer: String::with_capacity(64), // Pre-allocate buffer
188            char_cache,
189            char_indices,
190            current_char_index: 0,
191            profiler: std::env::var("RUST_YAML_PROFILE")
192                .ok()
193                .map(|_| crate::profiling::YamlProfiler::new()),
194            scanning_error: None,
195            limits,
196            resource_tracker,
197            inline_sequence_depth: 0,
198            compact_sequence_indents: Vec::new(),
199            indent_is_sequence: vec![false],
200        }
201    }
202
203    /// Create a new scanner with eager token scanning (for compatibility)
204    pub fn new_eager(input: String) -> Self {
205        Self::new_eager_with_limits(input, Limits::default())
206    }
207
208    /// Create a new scanner with eager token scanning and custom limits
209    pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
210        let mut scanner = Self::with_limits(input, limits);
211        // Store any scanning errors for later retrieval
212        if let Err(error) = scanner.scan_all_tokens() {
213            scanner.scanning_error = Some(error);
214        }
215        scanner
216    }
217
218    /// Create a new scanner with comment preservation enabled
219    pub fn new_with_comments(input: String) -> Self {
220        let mut scanner = Self::new(input);
221        scanner.preserve_comments = true;
222        scanner
223    }
224
225    /// Create a new scanner with comments and custom limits
226    pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
227        let mut scanner = Self::with_limits(input, limits);
228        scanner.preserve_comments = true;
229        scanner
230    }
231
232    /// Create a new scanner with eager scanning and comment preservation
233    pub fn new_eager_with_comments(input: String) -> Self {
234        let mut scanner = Self::new_with_comments(input);
235        scanner.scan_all_tokens().unwrap_or(());
236        scanner
237    }
238
239    /// Get the detected indentation style from the document
240    pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
241        self.detected_indent_style.as_ref()
242    }
243
244    /// Check if there was a scanning error
245    pub const fn has_scanning_error(&self) -> bool {
246        self.scanning_error.is_some()
247    }
248
249    /// Get the scanning error if any
250    #[allow(clippy::missing_const_for_fn)]
251    pub fn take_scanning_error(&mut self) -> Option<Error> {
252        self.scanning_error.take()
253    }
254
255    /// Advance to the next character
256    fn advance(&mut self) -> Option<char> {
257        if let Some(ch) = self.current_char {
258            self.position = self.position.advance(ch);
259            self.current_char_index += 1;
260
261            if self.current_char_index < self.char_cache.len() {
262                self.current_char = Some(self.char_cache[self.current_char_index]);
263            } else {
264                self.current_char = None;
265            }
266        }
267
268        self.current_char
269    }
270
271    /// Skip whitespace characters (excluding newlines)
272    fn skip_whitespace(&mut self) {
273        while let Some(ch) = self.current_char {
274            if ch == ' ' || ch == '\t' {
275                self.advance();
276            } else {
277                break;
278            }
279        }
280    }
281
282    /// Handle indentation and produce block tokens if necessary
283    fn handle_indentation(&mut self) -> Result<()> {
284        // In flow context: if there is a non-trivial enclosing block
285        // (indent_stack has more than the implicit root level), each
286        // continuation line that has content must be indented MORE than
287        // that enclosing block's indent. \`flow: [a,\\nb,c]\` with \`b\`
288        // at col 1 violates this rule because the block mapping enclosing
289        // \`flow:\` sits at indent 0 (yaml-test-suite 9C9N).
290        //
291        // Top-level flow (no enclosing block; indent_stack is just \[0\])
292        // is exempt — `[a,\\nb]` is fine there because the flow content
293        // isn't nested inside any block (yaml-test-suite 4ZYM).
294        if self.flow_level > 0 {
295            if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
296                let mut probe = 0usize;
297                let mut i = self.current_char_index;
298                while i < self.char_cache.len() {
299                    match self.char_cache[i] {
300                        ' ' => {
301                            probe += 1;
302                            i += 1;
303                        }
304                        '\t' => i += 1,
305                        _ => break,
306                    }
307                }
308                let has_content = self
309                    .char_cache
310                    .get(i)
311                    .map_or(false, |c| !matches!(c, '\n' | '\r'));
312                // A line that begins with the matching flow closer
313                // (\`]\` / \`}\`) is allowed at the parent indent — it
314                // closes the flow collection, not adds content
315                // (yaml-test-suite NKF9 trailing-line \`}\` at col 1).
316                let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
317                if has_content && !is_closer {
318                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
319                    if probe <= parent_indent {
320                        return Err(Error::scan(
321                            self.position,
322                            "Flow content line is not indented enough".to_string(),
323                        ));
324                    }
325                }
326            }
327            return Ok(());
328        }
329
330        let line_start_pos = self.position;
331        let mut indent = 0;
332        let mut has_tabs = false;
333        let mut has_spaces = false;
334        let _indent_start_pos = self.position;
335
336        // Count indentation and detect style
337        while let Some(ch) = self.current_char {
338            if ch == ' ' {
339                indent += 1;
340                has_spaces = true;
341                self.advance();
342            } else if ch == '\t' {
343                indent += 8; // Tab counts as 8 spaces for indentation calculation
344                has_tabs = true;
345                self.advance();
346            } else {
347                break;
348            }
349        }
350
351        // Analyze indentation pattern for style detection
352        // Only analyze if there's actual content after the indentation (not just whitespace)
353        if indent > 0
354            && self.current_char.is_some()
355            && !matches!(self.current_char, Some('\n' | '\r'))
356        {
357            self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
358        }
359
360        // YAML 1.2 §6.1 does NOT require all indents to be multiples
361        // of a single "indent width". Siblings must share a column;
362        // children must indent further; but any positive amount works
363        // (e.g. `key:\n  child:\n   grandchild:` with widths 2, 1
364        // is legal). The earlier strict-multiple-of-N check rejected
365        // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
366        // UGM3. We rely on the indent_stack-driven open/close logic
367        // (and the per-block "more than parent" rule enforced
368        // elsewhere) to catch genuine mis-indentation.
369
370        // Update previous indentation level for future comparisons
371        if indent > 0 {
372            self.previous_indent_level = indent;
373        }
374
375        // Update current indentation level
376        self.current_indent = indent;
377
378        // Close compact-notation sequences whose scope ends at this line.
379        // A compact sequence (where `-` shares the indent of the parent
380        // mapping keys) ends when the next content line at that indent is
381        // NOT a block entry (`- `).  We must emit the sequence's BlockEnd
382        // BEFORE popping the indent_stack so that the nesting order is
383        // correct (sequence closes before its parent mapping).
384        let has_content =
385            self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
386        if has_content {
387            let is_block_entry = self.current_char == Some('-')
388                && self.peek_char(1).map_or(true, |c| c.is_whitespace());
389            while let Some(&seq_indent) = self.compact_sequence_indents.last() {
390                if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
391                    self.compact_sequence_indents.pop();
392                    self.tokens
393                        .push(Token::simple(TokenType::BlockEnd, line_start_pos));
394                } else {
395                    break;
396                }
397            }
398        }
399
400        // Check if we need to emit block end tokens for decreased indentation
401        let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
402        while let Some(&last_indent) = self.indent_stack.last() {
403            if indent < last_indent && last_indent > 0 {
404                self.indent_stack.pop();
405                self.indent_is_sequence.pop();
406                self.tokens
407                    .push(Token::simple(TokenType::BlockEnd, line_start_pos));
408            } else {
409                break;
410            }
411        }
412
413        // §6.1: after a dedent, the new line's indent must match some
414        // existing container level — keys/items at a sibling level
415        // must share a column. Landing at a column that is between
416        // two stack levels (e.g. parent at 0, just-closed at 3, new
417        // line at 1) is invalid because no open mapping/sequence sits
418        // at indent 1 (yaml-test-suite DMG6, N4JP).
419        //
420        // The check applies only when:
421        //   * we actually dedented (pre-pop top was deeper than now),
422        //   * the new line has content (the next char is not blank /
423        //     newline / EOF / comment),
424        //   * indent doesn't match the new top.
425        if pre_pop_top > 0
426            && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
427            && self
428                .current_char
429                .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
430            && indent != self.indent_stack.last().copied().unwrap_or(0)
431        {
432            // Allow if indent is a valid deeper level — e.g.
433            // sibling at depth then deeper child — but for the
434            // dedent path indent must equal a known stack level.
435            return Err(Error::scan(
436                self.position,
437                format!(
438                    "Indentation {indent} doesn't match any open container (expected {} or deeper)",
439                    self.indent_stack.last().copied().unwrap_or(0)
440                ),
441            ));
442        }
443
444        Ok(())
445    }
446
447    /// Analyze indentation pattern to detect the document's indentation style
448    fn analyze_indentation_pattern(
449        &mut self,
450        current_indent: usize,
451        has_tabs: bool,
452        has_spaces: bool,
453    ) -> Result<()> {
454        // Prevent mixed indentation (tabs + spaces on same line).
455        // Carve-out: a tab AFTER one or more spaces and BEFORE
456        // value-position content (not a key) is content-area
457        // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
458        // space is indent, the tab is a separator before \`bar\`
459        // which is the value of \`foo:\` (yaml-test-suite DK95/00).
460        if has_tabs && has_spaces {
461            // Peek ahead: if the content after the tab+spaces area
462            // contains a key marker (`: ` or `:`+EOL), treat as
463            // indentation (invalid). Otherwise it's a value line.
464            let looks_like_key = self.line_after_indent_is_implicit_key();
465            if looks_like_key {
466                let context =
467                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
468                        .with_suggestion(
469                            "Use either tabs OR spaces for indentation, not both".to_string(),
470                        );
471                return Err(Error::invalid_character_with_context(
472                    self.position,
473                    '\t',
474                    "mixed indentation",
475                    context,
476                ));
477            }
478        }
479        // §6.1: indentation must be space characters only. Pure-tab
480        // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
481        // 4EJS). Two carve-outs:
482        //   * The mixed case is caught by the earlier branch.
483        //   * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
484        //     at the root are not "block indentation" — there's no
485        //     enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
486        //     them.
487        if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
488            let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
489                .with_suggestion("Use space characters for indentation".to_string());
490            return Err(Error::invalid_character_with_context(
491                self.position,
492                '\t',
493                "indentation",
494                context,
495            ));
496        }
497
498        // If we detected tabs, check for mixed indentation across lines
499        if has_tabs {
500            match self.detected_indent_style {
501                None => {
502                    // First time detecting indentation style - set to tabs
503                    self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
504                }
505                Some(crate::value::IndentStyle::Spaces(_)) => {
506                    // Previously detected spaces, now seeing tabs - mixed indentation error
507                    let context =
508                        crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
509                            .with_suggestion(
510                                "Use consistent indentation style throughout the document"
511                                    .to_string(),
512                            );
513                    return Err(Error::invalid_character_with_context(
514                        self.position,
515                        '\t',
516                        "mixed indentation",
517                        context,
518                    ));
519                }
520                Some(crate::value::IndentStyle::Tabs) => {
521                    // Already using tabs - this is consistent
522                }
523            }
524            return Ok(());
525        }
526
527        // For spaces, check for mixed indentation across lines first
528        if has_spaces {
529            // Check if we previously detected tabs
530            if matches!(
531                self.detected_indent_style,
532                Some(crate::value::IndentStyle::Tabs)
533            ) {
534                let context =
535                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
536                        .with_suggestion(
537                            "Use consistent indentation style throughout the document".to_string(),
538                        );
539                return Err(Error::invalid_character_with_context(
540                    self.position,
541                    ' ',
542                    "mixed indentation",
543                    context,
544                ));
545            }
546
547            // Calculate the indentation level difference
548            if current_indent > self.previous_indent_level {
549                let indent_diff = current_indent - self.previous_indent_level;
550
551                // Store this sample for analysis (but only meaningful differences)
552                if indent_diff > 0 && indent_diff <= 8 {
553                    // Reasonable indentation range
554                    self.indent_samples.push((indent_diff, false));
555
556                    // Try to determine consistent indentation width
557                    if self.detected_indent_style.is_none() {
558                        self.detect_space_indentation_width();
559                    }
560                }
561            }
562
563            // YAML 1.2 §6.1 does NOT require all indents to be multiples
564            // of a single "indent width". Sibling lines must share a
565            // column and children must indent deeper than parents, but
566            // any positive amount works. The "multiple of N" check
567            // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
568            // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
569            // open/close logic for genuine mis-indentation. The detected
570            // style is still recorded for later style-preservation use
571            // (e.g. emitter), it just no longer drives validation.
572            // self.validate_indentation_consistency(current_indent)?;
573        }
574
575        Ok(())
576    }
577
578    /// Detect the consistent space indentation width from samples
579    fn detect_space_indentation_width(&mut self) {
580        if self.indent_samples.is_empty() {
581            return; // Need at least 1 sample
582        }
583
584        // Find the most common indentation width
585        let mut width_counts = std::collections::HashMap::new();
586
587        for &(width, is_tabs) in &self.indent_samples {
588            if !is_tabs && width > 0 {
589                *width_counts.entry(width).or_insert(0) += 1;
590            }
591        }
592
593        // Find the most frequent width - be more aggressive and detect early
594        if let Some((&most_common_width, &_count)) =
595            width_counts.iter().max_by_key(|&(_, count)| count)
596        {
597            // Set on first consistent sample to enable stricter validation
598            self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
599        }
600    }
601
602    /// Check if the given indentation level is valid based on current context
603    #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
604    fn is_valid_indentation_level(&self, indent: usize) -> bool {
605        // For now, allow any indentation that could represent valid nesting
606        // In the future, this could be made more strict by checking against
607        // the current indent_stack to ensure proper nesting
608        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
609            // Must be a multiple of the detected width
610            indent % width == 0
611        } else {
612            // If no style detected yet, allow any indentation
613            true
614        }
615    }
616
617    /// Validate that current indentation is consistent with detected style
618    fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
619        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
620            // Check if current indentation is a multiple of the detected width
621            if current_indent > 0 && current_indent % width != 0 {
622                let lower_level = (current_indent / width) * width;
623                let higher_level = lower_level + width;
624                let suggestion = format!(
625                    "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
626                    width, lower_level, higher_level, current_indent
627                );
628                let context =
629                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
630                        .with_suggestion(suggestion);
631                return Err(Error::indentation_with_context(
632                    self.position,
633                    (current_indent / width) * width, // expected (nearest valid level)
634                    current_indent,                   // found
635                    context,
636                ));
637            }
638        }
639        Ok(())
640    }
641
642    /// Check if current position starts a plain scalar
643    fn is_plain_scalar_start(&self) -> bool {
644        self.current_char.map_or(false, |ch| match ch {
645            // Pure indicators — never start a plain scalar.
646            ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
647            | '@' | '`' => false,
648            // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
649            // the next character is non-whitespace (and, in flow context,
650            // not a flow indicator). Otherwise they act as indicators
651            // (complex-key marker / value separator / block-entry marker).
652            '?' | ':' | '-' => match self.peek_char(1) {
653                None => false,
654                Some(c) if c.is_whitespace() => false,
655                Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
656                Some(_) => true,
657            },
658            _ => !ch.is_whitespace(),
659        })
660    }
661
662    /// Check if the value is a YAML boolean
663    fn is_yaml_bool(value: &str) -> bool {
664        matches!(
665            value,
666            "true"
667                | "false"
668                | "True"
669                | "False"
670                | "TRUE"
671                | "FALSE"
672                | "yes"
673                | "no"
674                | "Yes"
675                | "No"
676                | "YES"
677                | "NO"
678                | "on"
679                | "off"
680                | "On"
681                | "Off"
682                | "ON"
683                | "OFF"
684        )
685    }
686
687    /// Check if the value is a YAML null
688    fn is_yaml_null(value: &str) -> bool {
689        matches!(value, "null" | "Null" | "NULL" | "~" | "")
690    }
691
692    /// Normalize a scalar value based on YAML rules.
693    ///
694    /// The scanner preserves the original text of plain scalars. Type
695    /// resolution (including version-aware bool/null mapping) happens in
696    /// the composer (see `crate::resolver::resolve_plain_scalar`). This
697    /// preserves enough information for the composer to apply the
698    /// YAML 1.1 vs 1.2 distinction and for round-trip emitters to
699    /// recover the original spelling.
700    fn normalize_scalar(value: String) -> String {
701        value
702    }
703
704    /// Scan a number token
705    fn scan_number(&mut self) -> Result<Token> {
706        let start_pos = self.position;
707        let mut value = String::new();
708
709        // Handle negative numbers
710        if self.current_char == Some('-') {
711            value.push('-');
712            self.advance();
713        }
714
715        // Scan digits
716        while let Some(ch) = self.current_char {
717            if ch.is_ascii_digit() {
718                value.push(ch);
719                self.advance();
720            } else if ch == '.' {
721                value.push(ch);
722                self.advance();
723                // Scan fractional part
724                while let Some(ch) = self.current_char {
725                    if ch.is_ascii_digit() {
726                        value.push(ch);
727                        self.advance();
728                    } else {
729                        break;
730                    }
731                }
732                break;
733            } else {
734                break;
735            }
736        }
737
738        Ok(Token::new(
739            TokenType::Scalar(value, tokens::QuoteStyle::Plain),
740            start_pos,
741            self.position,
742        ))
743    }
744
745    /// Scan a plain scalar (unquoted string)
746    fn scan_plain_scalar(&mut self) -> Result<Token> {
747        let start_pos = self.position;
748        let start_col = start_pos.column;
749        let mut value = String::new();
750        let mut multi_line = false;
751
752        loop {
753            // Scan content on the current line until we hit a stop condition.
754            while let Some(ch) = self.current_char {
755                if self.flow_level == 0 {
756                    match ch {
757                        '\n' | '\r' => break,
758                        ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
759                        '#' if value.is_empty()
760                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
761                        {
762                            break;
763                        }
764                        _ => {}
765                    }
766                } else {
767                    match ch {
768                        // Same line-break handling as block context: stop
769                        // collecting raw content at `\n`/`\r`, then let the
770                        // outer fold logic decide whether the next line
771                        // continues this scalar (yaml-test-suite 8KB6,
772                        // 8UDB, 9BXH).
773                        '\n' | '\r' => break,
774                        ',' | '[' | ']' | '{' | '}' => break,
775                        // In flow context, `:` is a key-value separator
776                        // when followed by whitespace OR any flow indicator
777                        // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
778                        // suite FRK4 (`{ ? foo :, ... }`).
779                        ':' if self
780                            .peek_char(1)
781                            .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
782                        {
783                            break;
784                        }
785                        '#' if value.is_empty()
786                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
787                        {
788                            break;
789                        }
790                        _ => {}
791                    }
792                }
793                value.push(ch);
794                self.advance();
795            }
796
797            // If we didn't stop at a newline, this scalar is complete.
798            if !matches!(self.current_char, Some('\n' | '\r')) {
799                break;
800            }
801
802            // Per §6.5 line folding, trailing whitespace on the line is
803            // dropped (it gets replaced by the fold separator that the
804            // next continuation block emits).
805            while matches!(value.chars().last(), Some(' ' | '\t')) {
806                value.pop();
807            }
808
809            // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
810            // the same plain scalar. A continuation line must be:
811            //   * indented strictly more than the scalar's start column,
812            //   * not a document marker (`---` / `...`),
813            //   * not a comment-only line,
814            //   * not empty-with-EOF.
815            // Save state for backtracking if continuation isn't allowed.
816            let saved_position = self.position;
817            let saved_index = self.current_char_index;
818            let saved_char = self.current_char;
819
820            // Count physical newlines we skip; whitespace within the lines
821            // is also consumed.
822            let mut newlines = 0usize;
823            loop {
824                match self.current_char {
825                    Some('\n') => {
826                        newlines += 1;
827                        self.advance();
828                    }
829                    Some('\r') => {
830                        self.advance();
831                    }
832                    Some(' ' | '\t') => {
833                        self.advance();
834                    }
835                    _ => break,
836                }
837            }
838
839            let next_col = self.position.column;
840            let next_ch = self.current_char;
841            let is_doc_marker = matches!(next_ch, Some('-' | '.'))
842                && self.peek_char(1) == next_ch
843                && self.peek_char(2) == next_ch
844                && self.peek_char(3).map_or(true, |c| c.is_whitespace());
845
846            // Continuation column rule:
847            //   * Flow context: no column rule, only flow indicators
848            //     terminate (8KB6, 8UDB, 9BXH).
849            //   * Block context: must be strictly deeper than the parent
850            //     block's key column. The parent indent is the max of
851            //     `indent_stack.last()` (block mapping/sequence indent)
852            //     and `compact_sequence_indents.last()` — the latter
853            //     tracks sequences opened compactly (e.g. `? - x` where
854            //     the dash didn't push to indent_stack). Without the
855            //     compact-stack check, `? - Detroit Tigers\n  - Chicago`
856            //     would fold both lines into one scalar (yaml-test-
857            //     suite M5DY).
858            //     Fall back to `next_col >= start_col` for top-level
859            //     scalars where there's no enclosing block.
860            let column_ok = if self.flow_level > 0 {
861                true
862            } else {
863                let block_indent = self.indent_stack.last().copied().unwrap_or(0);
864                let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
865                let parent_indent = block_indent.max(compact_indent);
866                next_col >= parent_indent + 2 || next_col >= start_col
867            };
868            let can_continue = next_ch.is_some()
869                && !matches!(next_ch, Some('\n' | '\r' | '#'))
870                && column_ok
871                && !is_doc_marker
872                && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));
873
874            if !can_continue {
875                self.position = saved_position;
876                self.current_char_index = saved_index;
877                self.current_char = saved_char;
878                break;
879            }
880
881            // Append fold separator: single newline → space; N>1 newlines
882            // collapse to N-1 retained newlines (YAML §6.5 line folding).
883            if newlines <= 1 {
884                value.push(' ');
885            } else {
886                for _ in 0..(newlines - 1) {
887                    value.push('\n');
888                }
889            }
890            multi_line = true;
891        }
892
893        // YAML 1.2 §8.1.3: implicit keys must be on a single line. If the
894        // plain scalar folded across line breaks AND the next non-
895        // whitespace char is `:` (key-value separator), it's about to be
896        // used as an implicit key — reject (yaml-test-suite G7JE).
897        if multi_line && self.flow_level == 0 {
898            let mut off = 0isize;
899            while matches!(self.peek_char(off), Some(' ' | '\t')) {
900                off += 1;
901            }
902            if self.peek_char(off) == Some(':') {
903                return Err(Error::scan(
904                    self.position,
905                    "Multi-line plain scalar may not be used as an implicit key".to_string(),
906                ));
907            }
908        }
909
910        self.resource_tracker
911            .check_string_length(&self.limits, value.len())?;
912
913        let value = value.trim_end().to_string();
914        let normalized_value = Self::normalize_scalar(value);
915
916        Ok(Token::new(
917            TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
918            start_pos,
919            self.position,
920        ))
921    }
922
923    /// Scan a quoted string
924    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
925        let start_pos = self.position;
926        let mut value = String::new();
927
928        // Determine quote style based on quote character
929        let quote_style = match quote_char {
930            '\'' => tokens::QuoteStyle::Single,
931            '"' => tokens::QuoteStyle::Double,
932            _ => tokens::QuoteStyle::Plain,
933        };
934
935        self.advance(); // Skip opening quote
936        let mut closed = false;
937        let mut multi_line = false;
938        // High-water mark of bytes contributed by escape sequences. The
939        // trailing-whitespace strip at fold time must not pop past it,
940        // because an escape-produced \t / space is literal content
941        // (yaml-test-suite DE56/00, DE56/01).
942        let mut escape_end: usize = 0;
943
944        while let Some(ch) = self.current_char {
945            if ch == quote_char {
946                // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
947                // collapsing to a single `'`. Detect that here BEFORE
948                // treating the quote as the closing delimiter.
949                if quote_char == '\'' && self.peek_char(1) == Some('\'') {
950                    value.push('\'');
951                    self.advance();
952                    self.advance();
953                    continue;
954                }
955                self.advance(); // Skip closing quote
956                closed = true;
957                break;
958            } else if ch == '\\' && quote_char == '"' {
959                self.advance();
960                if let Some(escaped) = self.current_char {
961                    match escaped {
962                        // YAML 1.2 §5.7 double-quoted escape allowlist.
963                        'n' => value.push('\n'),
964                        't' => value.push('\t'),
965                        'r' => value.push('\r'),
966                        '\\' => value.push('\\'),
967                        '"' => value.push('"'),
968                        '0' => value.push('\0'),
969                        'a' => value.push('\x07'),
970                        'b' => value.push('\x08'),
971                        'f' => value.push('\x0C'),
972                        'v' => value.push('\x0B'),
973                        'e' => value.push('\x1B'),
974                        ' ' => value.push(' '),
975                        '/' => value.push('/'),
976                        'N' => value.push('\u{0085}'),
977                        '_' => value.push('\u{00A0}'),
978                        'L' => value.push('\u{2028}'),
979                        'P' => value.push('\u{2029}'),
980                        '\n' => {
981                            // Escaped line break (§7.3.2): the newline is
982                            // dropped AND leading whitespace on the next
983                            // line is excluded from the content.
984                            self.advance();
985                            while matches!(self.current_char, Some(' ' | '\t')) {
986                                self.advance();
987                            }
988                            continue;
989                        }
990                        '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
991                        // Hex / Unicode escapes per YAML 1.2 §5.7:
992                        //   \xNN     — 2 hex digits, codepoint  ≤ 0xFF
993                        //   \uNNNN   — 4 hex digits, codepoint  ≤ 0xFFFF
994                        //   \UNNNNNNNN — 8 hex digits, full Unicode codepoint
995                        'x' | 'u' | 'U' => {
996                            let n = match escaped {
997                                'x' => 2,
998                                'u' => 4,
999                                _ => 8,
1000                            };
1001                            self.advance(); // consume the x/u/U
1002                            let mut codepoint: u32 = 0;
1003                            for _ in 0..n {
1004                                let c = self.current_char.ok_or_else(|| {
1005                                    Error::scan(
1006                                        self.position,
1007                                        format!("Truncated \\{escaped} escape"),
1008                                    )
1009                                })?;
1010                                let d = c.to_digit(16).ok_or_else(|| {
1011                                    Error::scan(
1012                                        self.position,
1013                                        format!("Invalid hex digit `{c}` in \\{escaped} escape"),
1014                                    )
1015                                })?;
1016                                codepoint = (codepoint << 4) | d;
1017                                self.advance();
1018                            }
1019                            let ch = char::from_u32(codepoint).ok_or_else(|| {
1020                                Error::scan(
1021                                    self.position,
1022                                    format!("Invalid Unicode codepoint U+{codepoint:X}"),
1023                                )
1024                            })?;
1025                            value.push(ch);
1026                            escape_end = value.len();
1027                            continue; // already advanced past hex digits
1028                        }
1029                        // Everything else is invalid per spec.
1030                        _ => {
1031                            return Err(Error::scan(
1032                                self.position,
1033                                format!("Invalid escape sequence: \\{escaped}"),
1034                            ));
1035                        }
1036                    }
1037                    escape_end = value.len();
1038                    self.advance();
1039                }
1040            } else if ch == '\\' {
1041                // Single-quoted strings have no backslash escapes — `\` is
1042                // a literal character. (Single-quote escape is `''`.)
1043                value.push(ch);
1044                self.advance();
1045            } else if ch == '\n' || ch == '\r' {
1046                // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
1047                // line folding: a single newline within a quoted scalar
1048                // folds to a space; N>1 consecutive newlines retain N-1;
1049                // leading whitespace on the continuation line is excluded.
1050                let mut newlines = 0usize;
1051                // §6.1: tabs cannot be indentation. A continuation
1052                // line that BEGINS with a tab (no leading spaces) in
1053                // an enclosing block context is invalid (yaml-test-
1054                // suite DK95/01). Tabs that appear AFTER spaces in
1055                // the same indent area are content, not indentation.
1056                let mut just_after_newline = false;
1057                while let Some(c) = self.current_char {
1058                    match c {
1059                        '\n' => {
1060                            newlines += 1;
1061                            multi_line = true;
1062                            self.advance();
1063                            just_after_newline = true;
1064                        }
1065                        '\r' => {
1066                            self.advance();
1067                        }
1068                        ' ' => {
1069                            self.advance();
1070                            just_after_newline = false;
1071                        }
1072                        '\t' if just_after_newline
1073                            && self.flow_level == 0
1074                            && (self.indent_stack.len() > 1
1075                                || !self.compact_sequence_indents.is_empty()) =>
1076                        {
1077                            return Err(Error::scan(
1078                                self.position,
1079                                "Tab cannot serve as indentation of quoted scalar continuation"
1080                                    .to_string(),
1081                            ));
1082                        }
1083                        '\t' => {
1084                            self.advance();
1085                        }
1086                        _ => break,
1087                    }
1088                }
1089                // §8.1.4: a multi-line quoted scalar inside a block
1090                // context must indent each continuation more than the
1091                // enclosing block. \`quoted: "a\\nb"\` with \`b\` at col 1
1092                // violates the rule because \`quoted:\` sits at indent 0
1093                // (yaml-test-suite QB6E). Only fires when there IS an
1094                // enclosing block (indent_stack > [0] or compact-seq
1095                // active) — top-level quoted scalars with continuation
1096                // at col 1 are legal.
1097                if newlines > 0
1098                    && self.flow_level == 0
1099                    && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
1100                    && !matches!(self.current_char, None | Some('\n' | '\r'))
1101                {
1102                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
1103                    let indent = self.position.column.saturating_sub(1);
1104                    if indent <= parent_indent {
1105                        return Err(Error::scan(
1106                            self.position,
1107                            "Quoted scalar continuation line is not indented enough".to_string(),
1108                        ));
1109                    }
1110                }
1111                // §6.8: a doc-start/end marker (`---` or `...`) at
1112                // column 1 always terminates the current document.
1113                // Encountering one inside an unterminated quoted
1114                // scalar is invalid — the quote escapes nothing past
1115                // the doc boundary (yaml-test-suite 5TRB, RXY3,
1116                // 9MQT/01).
1117                if self.position.column == 1 {
1118                    let next3: String = self
1119                        .char_cache
1120                        .get(self.current_char_index..self.current_char_index + 3)
1121                        .map(|s| s.iter().collect())
1122                        .unwrap_or_default();
1123                    if (next3 == "---" || next3 == "...")
1124                        && self
1125                            .char_cache
1126                            .get(self.current_char_index + 3)
1127                            .map_or(true, |c| c.is_whitespace())
1128                    {
1129                        return Err(Error::scan(
1130                            self.position,
1131                            format!(
1132                                "Document {} marker `{}` inside quoted scalar",
1133                                if next3 == "---" { "start" } else { "end" },
1134                                next3
1135                            ),
1136                        ));
1137                    }
1138                }
1139                // Drop trailing whitespace on the prior line (the bytes
1140                // we already pushed) before applying the fold. Don't
1141                // strip past `escape_end` — escape-produced whitespace
1142                // is literal content, not "trailing" line whitespace.
1143                while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
1144                    value.pop();
1145                }
1146                if newlines <= 1 {
1147                    value.push(' ');
1148                } else {
1149                    for _ in 0..(newlines - 1) {
1150                        value.push('\n');
1151                    }
1152                }
1153            } else {
1154                value.push(ch);
1155                self.advance();
1156
1157                // Check string length periodically to fail fast
1158                if value.len() > self.limits.max_string_length {
1159                    return Err(Error::limit_exceeded(format!(
1160                        "String length {} exceeds maximum {}",
1161                        value.len(),
1162                        self.limits.max_string_length
1163                    )));
1164                }
1165            }
1166        }
1167
1168        // Check string length limit
1169        if !closed {
1170            return Err(Error::scan(
1171                self.position,
1172                format!(
1173                    "Unclosed {} quoted string",
1174                    if quote_char == '"' {
1175                        "double"
1176                    } else {
1177                        "single"
1178                    }
1179                ),
1180            ));
1181        }
1182
1183        self.resource_tracker
1184            .check_string_length(&self.limits, value.len())?;
1185
1186        // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
1187        // the line (or sub-expression in flow context) must be empty save
1188        // for a separator. Skip horizontal whitespace and look at the next
1189        // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
1190        // newline/EOF, it's a trailing-content error (yaml-test-suite
1191        // Q4CL: `"quoted2" trailing content`).
1192        {
1193            let mut offset = 0isize;
1194            let mut saw_space = false;
1195            while matches!(self.peek_char(offset), Some(' ' | '\t')) {
1196                saw_space = true;
1197                offset += 1;
1198            }
1199            let next = self.peek_char(offset);
1200            // A `#` is a comment indicator ONLY when preceded by whitespace
1201            // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
1202            let ok = match next {
1203                None => true,
1204                Some('#') => saw_space,
1205                Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
1206            };
1207            if !ok {
1208                return Err(Error::scan(
1209                    self.position,
1210                    format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
1211                ));
1212            }
1213            // YAML 1.2 §8.1.3: implicit keys must be on a single line.
1214            // If the scalar folded across line breaks AND the next non-
1215            // whitespace char is `:` (key-value separator), the scalar
1216            // is being used as an implicit key — error.
1217            if multi_line && self.flow_level == 0 && next == Some(':') {
1218                return Err(Error::scan(
1219                    self.position,
1220                    "Multi-line quoted scalar may not be used as an implicit key".to_string(),
1221                ));
1222            }
1223        }
1224
1225        Ok(Token::new(
1226            TokenType::Scalar(value, quote_style),
1227            start_pos,
1228            self.position,
1229        ))
1230    }
1231
1232    /// Scan document start marker (---)
1233    fn scan_document_start(&mut self) -> Result<Option<Token>> {
1234        if self.current_char == Some('-')
1235            && self.peek_char(1) == Some('-')
1236            && self.peek_char(2) == Some('-')
1237            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1238        {
1239            // Doc markers are invalid inside flow collections.
1240            if self.flow_level > 0 {
1241                return Err(Error::scan(
1242                    self.position,
1243                    "`---` document-start marker is not allowed inside a flow collection"
1244                        .to_string(),
1245                ));
1246            }
1247            let start_pos = self.position;
1248            self.advance(); // -
1249            self.advance(); // -
1250            self.advance(); // -
1251
1252            Ok(Some(Token::new(
1253                TokenType::DocumentStart,
1254                start_pos,
1255                self.position,
1256            )))
1257        } else {
1258            Ok(None)
1259        }
1260    }
1261
1262    /// Scan YAML version directive (%YAML)
1263    fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1264        if self.current_char != Some('%') {
1265            return Ok(None);
1266        }
1267
1268        let start_pos = self.position;
1269        let saved_position = self.position;
1270        self.advance(); // Skip '%'
1271
1272        // Check for "YAML"
1273        if self.current_char == Some('Y')
1274            && self.peek_char(1) == Some('A')
1275            && self.peek_char(2) == Some('M')
1276            && self.peek_char(3) == Some('L')
1277            && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1278        {
1279            self.advance(); // Y
1280            self.advance(); // A
1281            self.advance(); // M
1282            self.advance(); // L
1283
1284            // Skip whitespace
1285            self.skip_whitespace();
1286
1287            // Parse version number (e.g., "1.2")
1288            let major = if let Some(ch) = self.current_char {
1289                if ch.is_ascii_digit() {
1290                    let digit = ch.to_digit(10).unwrap() as u8;
1291                    self.advance();
1292                    digit
1293                } else {
1294                    return Err(Error::scan(
1295                        self.position,
1296                        "Expected major version number after %YAML".to_string(),
1297                    ));
1298                }
1299            } else {
1300                return Err(Error::scan(
1301                    self.position,
1302                    "Expected version after %YAML directive".to_string(),
1303                ));
1304            };
1305
1306            // Expect '.'
1307            if self.current_char != Some('.') {
1308                return Err(Error::scan(
1309                    self.position,
1310                    "Expected '.' in YAML version".to_string(),
1311                ));
1312            }
1313            self.advance();
1314
1315            // Parse minor version
1316            let minor = if let Some(ch) = self.current_char {
1317                if ch.is_ascii_digit() {
1318                    let digit = ch.to_digit(10).unwrap() as u8;
1319                    self.advance();
1320                    digit
1321                } else {
1322                    return Err(Error::scan(
1323                        self.position,
1324                        "Expected minor version number after '.'".to_string(),
1325                    ));
1326                }
1327            } else {
1328                return Err(Error::scan(
1329                    self.position,
1330                    "Expected minor version number".to_string(),
1331                ));
1332            };
1333
1334            // YAML 1.2 §6.8.1: the directive line must end after the
1335            // version (modulo whitespace and an optional comment). Extra
1336            // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1337            // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1338            // whitespace before `#`.
1339            let mut saw_space = false;
1340            while matches!(self.current_char, Some(' ' | '\t')) {
1341                saw_space = true;
1342                self.advance();
1343            }
1344            match self.current_char {
1345                None | Some('\n' | '\r') => {}
1346                Some('#') if saw_space => {
1347                    while let Some(ch) = self.current_char {
1348                        if ch == '\n' || ch == '\r' {
1349                            break;
1350                        }
1351                        self.advance();
1352                    }
1353                }
1354                Some(c) => {
1355                    return Err(Error::scan(
1356                        self.position,
1357                        format!("Unexpected `{c}` after %YAML directive"),
1358                    ));
1359                }
1360            }
1361
1362            Ok(Some(Token::new(
1363                TokenType::YamlDirective(major, minor),
1364                start_pos,
1365                self.position,
1366            )))
1367        } else {
1368            // Not a YAML directive, reset position
1369            self.position = saved_position;
1370            // Properly reset current_char based on saved position
1371            self.current_char = self
1372                .char_indices
1373                .iter()
1374                .find(|(i, _)| *i == saved_position.index)
1375                .map(|(_, ch)| *ch);
1376            // Reset the current_char_index
1377            self.current_char_index = self
1378                .char_indices
1379                .iter()
1380                .position(|(i, _)| *i == saved_position.index)
1381                .unwrap_or(0);
1382            Ok(None)
1383        }
1384    }
1385
1386    /// Scan TAG directive (%TAG)
1387    fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1388        if self.current_char != Some('%') {
1389            return Ok(None);
1390        }
1391
1392        let start_pos = self.position;
1393        let saved_position = self.position;
1394        self.advance(); // Skip '%'
1395
1396        // Check for "TAG"
1397        if self.current_char == Some('T')
1398            && self.peek_char(1) == Some('A')
1399            && self.peek_char(2) == Some('G')
1400            && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1401        {
1402            self.advance(); // T
1403            self.advance(); // A
1404            self.advance(); // G
1405
1406            // Skip whitespace
1407            self.skip_whitespace();
1408
1409            // Parse handle (e.g., "!" or "!!")
1410            let handle = self.scan_tag_handle()?;
1411
1412            // Skip whitespace
1413            self.skip_whitespace();
1414
1415            // Parse prefix (URI)
1416            let prefix = self.scan_tag_prefix()?;
1417
1418            Ok(Some(Token::new(
1419                TokenType::TagDirective(handle, prefix),
1420                start_pos,
1421                self.position,
1422            )))
1423        } else {
1424            // Reset position if not a TAG directive
1425            self.position = saved_position;
1426            // Properly reset current_char based on saved position
1427            self.current_char = self
1428                .char_indices
1429                .iter()
1430                .find(|(i, _)| *i == saved_position.index)
1431                .map(|(_, ch)| *ch);
1432            // Reset the current_char_index
1433            self.current_char_index = self
1434                .char_indices
1435                .iter()
1436                .position(|(i, _)| *i == saved_position.index)
1437                .unwrap_or(0);
1438            Ok(None)
1439        }
1440    }
1441
1442    /// Scan a tag handle for TAG directive
1443    fn scan_tag_handle(&mut self) -> Result<String> {
1444        let mut handle = String::new();
1445
1446        if self.current_char != Some('!') {
1447            return Err(Error::scan(
1448                self.position,
1449                "Expected '!' at start of tag handle".to_string(),
1450            ));
1451        }
1452
1453        handle.push('!');
1454        self.advance();
1455
1456        // Handle can be "!" or "!!" or "!name!"
1457        if self.current_char == Some('!') {
1458            // Secondary handle "!!"
1459            handle.push('!');
1460            self.advance();
1461        } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1462            // Named handle like "!name!"
1463            while let Some(ch) = self.current_char {
1464                if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1465                    handle.push(ch);
1466                    self.advance();
1467                } else if ch == '!' {
1468                    handle.push(ch);
1469                    self.advance();
1470                    break;
1471                } else {
1472                    break;
1473                }
1474            }
1475        }
1476        // else just "!" primary handle
1477
1478        Ok(handle)
1479    }
1480
1481    /// Scan a tag prefix (URI) for TAG directive
1482    fn scan_tag_prefix(&mut self) -> Result<String> {
1483        let mut prefix = String::new();
1484
1485        // Read until end of line or comment
1486        while let Some(ch) = self.current_char {
1487            if ch == '\n' || ch == '\r' || ch == '#' {
1488                break;
1489            }
1490            if ch.is_whitespace() && prefix.is_empty() {
1491                self.advance();
1492                continue;
1493            }
1494            if ch.is_whitespace() && !prefix.is_empty() {
1495                // Trailing whitespace, we're done
1496                break;
1497            }
1498            prefix.push(ch);
1499            self.advance();
1500        }
1501
1502        if prefix.is_empty() {
1503            return Err(Error::scan(
1504                self.position,
1505                "Expected tag prefix after tag handle".to_string(),
1506            ));
1507        }
1508
1509        Ok(prefix.trim().to_string())
1510    }
1511
1512    /// Check if current position might be a directive
1513    fn is_directive(&self) -> bool {
1514        self.current_char == Some('%') && self.position.column == 1
1515    }
1516
1517    /// Scan document end marker (...)
1518    fn scan_document_end(&mut self) -> Result<Option<Token>> {
1519        if self.current_char == Some('.')
1520            && self.peek_char(1) == Some('.')
1521            && self.peek_char(2) == Some('.')
1522            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1523        {
1524            // Doc markers are invalid inside flow collections.
1525            if self.flow_level > 0 {
1526                return Err(Error::scan(
1527                    self.position,
1528                    "`...` document-end marker is not allowed inside a flow collection".to_string(),
1529                ));
1530            }
1531            let start_pos = self.position;
1532            self.advance(); // .
1533            self.advance(); // .
1534            self.advance(); // .
1535
1536            // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1537            // end-of-line (comments allowed). Inline content after `...`
1538            // is invalid (yaml-test-suite 3HFZ).
1539            while let Some(ch) = self.current_char {
1540                match ch {
1541                    ' ' | '\t' => {
1542                        self.advance();
1543                    }
1544                    '\n' | '\r' | '#' => break,
1545                    _ => {
1546                        return Err(Error::scan(
1547                            self.position,
1548                            "Content after `...` document-end marker is invalid".to_string(),
1549                        ));
1550                    }
1551                }
1552            }
1553
1554            Ok(Some(Token::new(
1555                TokenType::DocumentEnd,
1556                start_pos,
1557                self.position,
1558            )))
1559        } else {
1560            Ok(None)
1561        }
1562    }
1563
1564    /// Scan a comment token
1565    fn scan_comment(&mut self) -> Result<Token> {
1566        let start_pos = self.position;
1567        let mut comment_text = String::new();
1568
1569        // Skip the '#' character
1570        if self.current_char == Some('#') {
1571            self.advance();
1572        }
1573
1574        // Collect the comment text
1575        while let Some(ch) = self.current_char {
1576            if ch == '\n' || ch == '\r' {
1577                break;
1578            }
1579            comment_text.push(ch);
1580            self.advance();
1581        }
1582
1583        // Trim leading whitespace from comment text
1584        let comment_text = comment_text.trim_start().to_string();
1585
1586        Ok(Token::new(
1587            TokenType::Comment(comment_text),
1588            start_pos,
1589            self.position,
1590        ))
1591    }
1592
1593    /// Process a line and generate appropriate tokens
1594    #[allow(clippy::cognitive_complexity)]
1595    fn process_line(&mut self) -> Result<()> {
1596        // Check for directives at start of line
1597        if self.position.column == 1 && self.current_char == Some('%') {
1598            // Try to scan YAML directive
1599            if let Some(token) = self.scan_yaml_directive()? {
1600                self.tokens.push(token);
1601                return Ok(());
1602            }
1603
1604            // Try to scan TAG directive
1605            if let Some(token) = self.scan_tag_directive()? {
1606                self.tokens.push(token);
1607                return Ok(());
1608            }
1609
1610            // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1611            // does not recognize. Skip the line silently — parsing continues
1612            // with whatever follows on the next line.
1613            if self.current_char == Some('%') {
1614                while let Some(ch) = self.current_char {
1615                    if ch == '\n' || ch == '\r' {
1616                        break;
1617                    }
1618                    self.advance();
1619                }
1620                return Ok(());
1621            }
1622        }
1623
1624        // Check for document markers at start of line
1625        if self.position.column == 1 {
1626            // Check for document start marker
1627            if let Some(token) = self.scan_document_start()? {
1628                self.tokens.push(token);
1629                return Ok(());
1630            }
1631
1632            // Check for document end marker
1633            if let Some(token) = self.scan_document_end()? {
1634                self.tokens.push(token);
1635                return Ok(());
1636            }
1637        }
1638
1639        // Handle indentation at start of line
1640        if self.position.column == 1 {
1641            self.handle_indentation()?;
1642        }
1643
1644        // Skip empty lines and comments
1645        self.skip_whitespace();
1646
1647        match self.current_char {
1648            None => return Ok(()),
1649            Some('#') => {
1650                if self.preserve_comments {
1651                    // Create a comment token
1652                    let comment_token = self.scan_comment()?;
1653                    self.tokens.push(comment_token);
1654                } else {
1655                    // Skip comment lines
1656                    while let Some(ch) = self.current_char {
1657                        if ch == '\n' || ch == '\r' {
1658                            break;
1659                        }
1660                        self.advance();
1661                    }
1662                }
1663                return Ok(());
1664            }
1665            Some('\n' | '\r') => {
1666                self.advance();
1667                return Ok(());
1668            }
1669            _ => {}
1670        }
1671
1672        // Process tokens on this line
1673        while let Some(ch) = self.current_char {
1674            match ch {
1675                '\n' | '\r' => break,
1676                ' ' | '\t' => {
1677                    self.skip_whitespace();
1678                }
1679                '#' => {
1680                    // YAML 1.2 §6.6: a comment must be preceded by whitespace
1681                    // OR be at the start of a line. Inputs like `,#invalid`
1682                    // (yaml-test-suite CVW2) are not valid comments.
1683                    let prev = self.peek_char(-1);
1684                    let at_line_start = self.position.column == 1;
1685                    let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1686                    if !at_line_start && !preceded_by_space {
1687                        return Err(Error::scan(
1688                            self.position,
1689                            "Comment `#` must be preceded by whitespace".to_string(),
1690                        ));
1691                    }
1692                    if self.preserve_comments {
1693                        let comment_token = self.scan_comment()?;
1694                        self.tokens.push(comment_token);
1695                    } else {
1696                        while let Some(ch) = self.current_char {
1697                            if ch == '\n' || ch == '\r' {
1698                                break;
1699                            }
1700                            self.advance();
1701                        }
1702                    }
1703                    break;
1704                }
1705
1706                // Flow indicators. §7.4 allows a flow collection as
1707                // the implicit key of a block mapping (`[a]: b`,
1708                // `{x: y}: z`). When the flow-open is at line-start
1709                // (block context) and a `:` follows on the same line,
1710                // open the wrapping block mapping at the column of the
1711                // flow-open token, just as we do for line-start
1712                // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1713                '[' => {
1714                    if self.flow_level == 0
1715                        && self.position.column == self.current_indent + 1
1716                        && self.check_for_mapping_ahead()
1717                    {
1718                        self.maybe_open_block_mapping_for_key()?;
1719                    }
1720                    let pos = self.position;
1721                    self.advance();
1722                    self.flow_level += 1;
1723                    // Check depth limit
1724                    self.resource_tracker
1725                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1726                    self.tokens
1727                        .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1728                }
1729                ']' => {
1730                    // YAML 1.2 §7.4: `]` is only valid inside an open
1731                    // flow sequence. Stray `]` is a syntax error
1732                    // (yaml-test-suite 4H7K).
1733                    if self.flow_level == 0 {
1734                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1735                            .with_suggestion(
1736                                "Remove the extra `]` or open a flow sequence with `[` first"
1737                                    .to_string(),
1738                            );
1739                        return Err(Error::scan_with_context(
1740                            self.position,
1741                            "Unexpected `]` outside flow context",
1742                            context,
1743                        ));
1744                    }
1745                    let pos = self.position;
1746                    self.advance();
1747                    self.flow_level -= 1;
1748                    self.tokens
1749                        .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1750                }
1751                '{' => {
1752                    if self.flow_level == 0
1753                        && self.position.column == self.current_indent + 1
1754                        && self.check_for_mapping_ahead()
1755                    {
1756                        self.maybe_open_block_mapping_for_key()?;
1757                    }
1758                    let pos = self.position;
1759                    self.advance();
1760                    self.flow_level += 1;
1761                    // Check depth limit
1762                    self.resource_tracker
1763                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1764                    self.tokens
1765                        .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1766                }
1767                '}' => {
1768                    if self.flow_level == 0 {
1769                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1770                            .with_suggestion(
1771                                "Remove the extra `}` or open a flow mapping with `{` first"
1772                                    .to_string(),
1773                            );
1774                        return Err(Error::scan_with_context(
1775                            self.position,
1776                            "Unexpected `}` outside flow context",
1777                            context,
1778                        ));
1779                    }
1780                    let pos = self.position;
1781                    self.advance();
1782                    self.flow_level -= 1;
1783                    self.tokens
1784                        .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1785                }
1786                ',' => {
1787                    // §7.4: \`,\` is a flow indicator. Outside flow
1788                    // context it's not meaningful as a structural
1789                    // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1790                    // — the comma after a tag in block context is
1791                    // invalid).
1792                    if self.flow_level == 0 {
1793                        return Err(Error::scan(
1794                            self.position,
1795                            "Unexpected `,` outside flow context".to_string(),
1796                        ));
1797                    }
1798                    let pos = self.position;
1799                    self.advance();
1800                    self.tokens
1801                        .push(Token::new(TokenType::FlowEntry, pos, self.position));
1802                }
1803
1804                // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1805                //   * Block context: `:` separates key from value only when
1806                //     followed by whitespace / EOF — otherwise it's part of
1807                //     a plain scalar (e.g. `:foo`, `URL://path`).
1808                //   * Flow context: same, plus `:` may be adjacent to a
1809                //     value when the previous token completed a key node
1810                //     (quoted/plain scalar, alias, or closed flow
1811                //     collection) — see yaml-test-suite 5MUD, 5T43.
1812                ':' if self.peek_char(1).map_or(true, |c| {
1813                    c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1814                }) || (self.flow_level > 0
1815                    && matches!(
1816                        self.tokens.last().map(|t| &t.token_type),
1817                        Some(
1818                            TokenType::Scalar(_, _)
1819                                | TokenType::Alias(_)
1820                                | TokenType::FlowMappingEnd
1821                                | TokenType::FlowSequenceEnd
1822                        )
1823                    )) =>
1824                {
1825                    // §6.2: a \`:\` at line-start (the explicit-value
1826                    // counterpart of an explicit \`?\` key) must be
1827                    // followed by a SPACE — a tab as separator is
1828                    // invalid (yaml-test-suite Y79Y/007, /009).
1829                    if self.flow_level == 0
1830                        && self.position.column == self.current_indent + 1
1831                        && self.peek_char(1) == Some('\t')
1832                    {
1833                        return Err(Error::scan(
1834                            self.position,
1835                            "Tab cannot follow line-start `:` as explicit-value separator"
1836                                .to_string(),
1837                        ));
1838                    }
1839                    // §8.22: an implicit key in block context must fit
1840                    // on a single line. If the previous token is a
1841                    // flow-collection close whose matching open is on
1842                    // a different line, the flow node spans multiple
1843                    // lines and can't serve as the key (yaml-test-
1844                    // suite C2SP \`[23\\n]: 42\`).
1845                    if self.flow_level == 0 {
1846                        let mut is_flow_close = false;
1847                        let mut close_end_line = 0;
1848                        if let Some(last) = self.tokens.last() {
1849                            if matches!(
1850                                last.token_type,
1851                                TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1852                            ) {
1853                                is_flow_close = true;
1854                                close_end_line = last.end_position.line;
1855                            }
1856                        }
1857                        if is_flow_close {
1858                            let mut depth = 0i32;
1859                            let mut open_idx: Option<usize> = None;
1860                            for (idx, t) in self.tokens.iter().enumerate().rev() {
1861                                match &t.token_type {
1862                                    TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1863                                        depth += 1;
1864                                    }
1865                                    TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1866                                        depth -= 1;
1867                                        if depth == 0 {
1868                                            open_idx = Some(idx);
1869                                            break;
1870                                        }
1871                                    }
1872                                    _ => {}
1873                                }
1874                            }
1875                            if let Some(oi) = open_idx {
1876                                let open_line = self.tokens[oi].start_position.line;
1877                                // If a `?` (Key) token precedes the
1878                                // matching flow open on the same line
1879                                // as the key, the key is explicit and
1880                                // may span lines (yaml-test-suite M5DY
1881                                // \`? [ ...spans... ]: [ ... ]\`).
1882                                let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1883                                    matches!(t.token_type, TokenType::Key)
1884                                        && t.start_position.line == open_line
1885                                });
1886                                if !key_marker_before && open_line != close_end_line {
1887                                    return Err(Error::scan(
1888                                        self.position,
1889                                        "Implicit key in block context: flow collection key spans multiple lines"
1890                                            .to_string(),
1891                                    ));
1892                                }
1893                            }
1894                        }
1895                    }
1896                    let pos = self.position;
1897                    self.advance();
1898                    self.tokens
1899                        .push(Token::new(TokenType::Value, pos, self.position));
1900                }
1901
1902                // §6.2: the explicit-key marker \`?\` must be followed
1903                // by a SPACE (or EOL), not a tab. Tab as separator
1904                // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1905                '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1906                    return Err(Error::scan(
1907                        self.position,
1908                        "Tab cannot follow `?` as block-key separator".to_string(),
1909                    ));
1910                }
1911
1912                // Explicit key marker. An indented `?` at line-start
1913                // (e.g. `mapping:\\n  ? key`) opens an implicit block
1914                // mapping at this column — same as a line-start scalar
1915                // key. Without this, scan_plain_scalar wouldn't see
1916                // the inner mapping's indent and would wrongly fold
1917                // the key content into a multi-line scalar
1918                // (yaml-test-suite S9E8, KK5P).
1919                '?' if self.flow_level == 0
1920                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1921                        || self.peek_char(1).is_none()) =>
1922                {
1923                    if self.position.column == self.current_indent + 1 {
1924                        self.maybe_open_block_mapping_for_key()?;
1925                    }
1926                    let pos = self.position;
1927                    self.advance();
1928                    self.tokens
1929                        .push(Token::new(TokenType::Key, pos, self.position));
1930                }
1931                '?' if self.flow_level > 0
1932                    && (self
1933                        .peek_char(1)
1934                        .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1935                        || self.peek_char(1).is_none()) =>
1936                {
1937                    let pos = self.position;
1938                    self.advance();
1939                    self.tokens
1940                        .push(Token::new(TokenType::Key, pos, self.position));
1941                }
1942
1943                // Block entry
1944                '-' if self.flow_level == 0
1945                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1946                        || self.peek_char(1).is_none()) =>
1947                {
1948                    // A block-entry \`-\` immediately after a flow
1949                    // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1950                    // is invalid — no separator between the closed
1951                    // flow node and the next sibling (yaml-test-suite
1952                    // P2EQ \`- { y: z }- invalid\`). The same-line guard
1953                    // is essential — a \`}\` on a previous line with a
1954                    // new \`-\` on the next line is perfectly valid.
1955                    //
1956                    // Likewise, a block-entry \`-\` immediately after a
1957                    // property (Anchor / Tag) on the same line is
1958                    // invalid — the property must precede a node, and
1959                    // a block sequence's first \`-\` must begin a line
1960                    // (yaml-test-suite SY6V \`&anchor - x\`).
1961                    if let Some(last) = self.tokens.last() {
1962                        if matches!(
1963                            last.token_type,
1964                            TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1965                        ) && last.end_position.line == self.position.line
1966                        {
1967                            return Err(Error::scan(
1968                                self.position,
1969                                "Block-entry `-` immediately after flow collection close"
1970                                    .to_string(),
1971                            ));
1972                        }
1973                        if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1974                            && last.end_position.line == self.position.line
1975                        {
1976                            return Err(Error::scan(
1977                                self.position,
1978                                "Block-entry `-` cannot follow a property on the same line"
1979                                    .to_string(),
1980                            ));
1981                        }
1982                        // §8.22: a block sequence's first \`-\` must
1983                        // begin on a new line. \`key: - a\` (implicit
1984                        // key, then dash on same line) is invalid
1985                        // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1986                        // (explicit value-separator on the same line
1987                        // as the dash) IS valid: the \`?\` key sits
1988                        // on a previous line. We distinguish by
1989                        // walking back from the Value: if the
1990                        // preceding non-property token is a Scalar
1991                        // on the same line as the Value, the key
1992                        // is implicit; otherwise it's after \`?\`.
1993                        if matches!(last.token_type, TokenType::Value)
1994                            && last.end_position.line == self.position.line
1995                        {
1996                            let value_line = last.start_position.line;
1997                            let mut prior_scalar_line = None;
1998                            for t in self.tokens.iter().rev().skip(1) {
1999                                match &t.token_type {
2000                                    TokenType::Anchor(_) | TokenType::Tag(_) => {}
2001                                    TokenType::Scalar(..) => {
2002                                        prior_scalar_line = Some(t.end_position.line);
2003                                        break;
2004                                    }
2005                                    _ => break,
2006                                }
2007                            }
2008                            if prior_scalar_line == Some(value_line) {
2009                                return Err(Error::scan(
2010                                    self.position,
2011                                    "Block sequence value cannot start on the same line as its key"
2012                                        .to_string(),
2013                                ));
2014                            }
2015                        }
2016                    }
2017                    let pos = self.position;
2018                    self.advance();
2019
2020                    // Check if we need to start a new block sequence
2021                    let last_indent = *self.indent_stack.last().unwrap();
2022
2023                    // If a compact sequence (opened from `? - x` or
2024                    // similar) is already active at this dash's column,
2025                    // the dash continues it — don't open a new nested
2026                    // block sequence (yaml-test-suite M5DY).
2027                    let dash_indent = pos.column.saturating_sub(1);
2028                    let compact_active_here = self
2029                        .compact_sequence_indents
2030                        .last()
2031                        .map_or(false, |&si| si == dash_indent);
2032                    if compact_active_here {
2033                        // Continuation of an existing compact sequence.
2034                    } else if self.current_indent > last_indent {
2035                        // Deeper indentation - start new nested sequence
2036                        self.indent_stack.push(self.current_indent);
2037                        self.indent_is_sequence.push(true);
2038                        // Check depth limit
2039                        self.resource_tracker
2040                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2041                        self.tokens
2042                            .push(Token::simple(TokenType::BlockSequenceStart, pos));
2043                    } else if self.current_indent == last_indent
2044                        && *self.indent_is_sequence.last().unwrap_or(&false)
2045                    {
2046                        // Same indent and the top of stack is already a sequence
2047                        // → continuation of that sequence; no new start needed.
2048                    } else if self.current_indent >= last_indent {
2049                        // Same or root level — compact notation.
2050                        // Start a new sequence only if we don't already have one
2051                        // tracked at this exact indent.
2052                        // For a dash that's *not* at line-start (e.g.
2053                        // `? - x` where current_indent is still the
2054                        // line's indent but the dash sits in mid-line),
2055                        // use the dash column - 1 as the sequence's
2056                        // indent so scan_plain_scalar's continuation
2057                        // check correctly sees the deeper context
2058                        // (yaml-test-suite M5DY).
2059                        let dash_indent = pos.column.saturating_sub(1);
2060                        let seq_indent = dash_indent.max(self.current_indent);
2061                        let has_active_compact = self
2062                            .compact_sequence_indents
2063                            .last()
2064                            .map_or(false, |&si| si == seq_indent);
2065
2066                        if !has_active_compact {
2067                            self.compact_sequence_indents.push(seq_indent);
2068                            // Check depth limit
2069                            self.resource_tracker.check_depth(
2070                                &self.limits,
2071                                self.flow_level + self.indent_stack.len(),
2072                            )?;
2073                            self.tokens
2074                                .push(Token::simple(TokenType::BlockSequenceStart, pos));
2075                        }
2076                    }
2077
2078                    self.tokens
2079                        .push(Token::new(TokenType::BlockEntry, pos, self.position));
2080
2081                    // After emitting BlockEntry, check if the next
2082                    // token is another dash (nested sequence). §6.2
2083                    // requires SPACE separation between dashes — a
2084                    // tab between the outer and inner \`-\` is invalid
2085                    // (yaml-test-suite Y79Y/004, /005). Track whether
2086                    // a tab was consumed while skipping the inter-
2087                    // dash whitespace and reject if so.
2088                    let mut saw_tab_between = false;
2089                    while let Some(c) = self.current_char {
2090                        if c == ' ' {
2091                            self.advance();
2092                        } else if c == '\t' {
2093                            saw_tab_between = true;
2094                            self.advance();
2095                        } else {
2096                            break;
2097                        }
2098                    }
2099                    if self.current_char == Some('-')
2100                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2101                        && saw_tab_between
2102                    {
2103                        return Err(Error::scan(
2104                            self.position,
2105                            "Tab between block-entries on same line".to_string(),
2106                        ));
2107                    }
2108                    if self.current_char == Some('-')
2109                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2110                    {
2111                        // We have a nested sequence on the same line!
2112                        // Track this as an inline sequence
2113                        self.inline_sequence_depth += 1;
2114                        // Push the *indent* (column - 1), not the
2115                        // column, so it matches the convention used by
2116                        // maybe_open_block_mapping_for_key. With column
2117                        // here the next-line indent (column - 1) would
2118                        // be strictly less than the stored value and
2119                        // wrongly trigger an early close, breaking
2120                        // multi-line nested sequences (yaml-test-suite
2121                        // 3ALJ, 57H4).
2122                        self.indent_stack
2123                            .push(self.position.column.saturating_sub(1));
2124                        self.indent_is_sequence.push(true);
2125                        // Check depth limit
2126                        self.resource_tracker
2127                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2128                        self.tokens
2129                            .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2130                        // Continue processing - the next iteration will handle the nested dash
2131                    } else if self.current_char.is_some()
2132                        && !matches!(self.current_char, Some('\n' | '\r'))
2133                    {
2134                        // Content follows "- " on the same line.
2135                        // Update current_indent to the content's column position so that
2136                        // any mapping started here will be at a deeper indent level than
2137                        // the sequence. This ensures handle_indentation properly closes
2138                        // the mapping when the next sibling "- " appears.
2139                        self.current_indent = self.position.column - 1;
2140                    }
2141                }
2142
2143                // Quoted strings — same implicit-key mapping detection
2144                // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2145                '"' | '\'' => {
2146                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2147                        self.maybe_open_block_mapping_for_key()?;
2148                    }
2149                    let token = self.scan_quoted_string(ch)?;
2150                    self.tokens.push(token);
2151                }
2152
2153                // Document markers (only if not a block entry).
2154                //
2155                // Reached only when `-` is at column = current_indent + 1 AND
2156                // the next character is non-whitespace — i.e. either the
2157                // `---` document-start marker OR a plain scalar starting
2158                // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2159                // declines, we MUST consume the run as a plain scalar — not
2160                // consulting `is_plain_scalar_start` here, because that helper
2161                // unconditionally rejects `-`, which would leave the outer
2162                // `while let` loop spinning on the same character.
2163                '-' if self.position.column == self.current_indent + 1
2164                    && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2165                {
2166                    if let Some(token) = self.scan_document_start()? {
2167                        self.tokens.push(token);
2168                    } else {
2169                        let token = self.scan_plain_scalar()?;
2170                        self.tokens.push(token);
2171                    }
2172                }
2173                '.' if self.position.column == self.current_indent + 1 => {
2174                    if let Some(token) = self.scan_document_end()? {
2175                        self.tokens.push(token);
2176                    } else if self.is_plain_scalar_start() {
2177                        let token = self.scan_plain_scalar()?;
2178                        self.tokens.push(token);
2179                    }
2180                }
2181
2182                // Numbers or plain scalars starting with -
2183                // Only scan as number if the entire token is numeric (no trailing letters)
2184                _ if (ch.is_ascii_digit()
2185                    || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2186                    && self.is_pure_number() =>
2187                {
2188                    let token = self.scan_number()?;
2189                    self.tokens.push(token);
2190                }
2191
2192                // Anchors and aliases. §6.9: a node's properties
2193                // (anchor/tag) are prefixes of the node. When an `&`,
2194                // `*`, or `!` is at the start of a line (column ==
2195                // current_indent + 1) and a `: ` follows on the same
2196                // line, the property/alias is part of an implicit
2197                // key's leading position. The block mapping that
2198                // contains this key therefore opens at this column,
2199                // *before* the property/alias token is emitted
2200                // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2201                '&' => {
2202                    // Mirror H7J7 check for anchors (yaml-test-suite
2203                    // G9HC \`seq:\\n&anchor\\n- a\`).
2204                    if self.flow_level == 0
2205                        && self.position.column == self.current_indent + 1
2206                        && !self.check_for_mapping_ahead()
2207                        && self.indent_stack.len() > 1
2208                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2209                        && self.most_recent_token_is_value_separator()
2210                    {
2211                        return Err(Error::scan(
2212                            self.position,
2213                            "Anchor at line-start with insufficient indent for value position"
2214                                .to_string(),
2215                        ));
2216                    }
2217                    if self.flow_level == 0
2218                        && self.position.column == self.current_indent + 1
2219                        && self.check_for_mapping_ahead()
2220                    {
2221                        self.maybe_open_block_mapping_for_key()?;
2222                    }
2223                    let token = self.scan_anchor()?;
2224                    self.tokens.push(token);
2225                }
2226                '*' => {
2227                    // §6.9.2: alias/anchor names may contain \`:\` (only
2228                    // flow indicators and whitespace terminate them).
2229                    // So \`*a:\` is an alias named \`a:\`, NOT an alias
2230                    // \`*a\` followed by a key separator. Don't open
2231                    // an implicit block mapping in that case (yaml-
2232                    // test-suite 2SXE).
2233                    if self.flow_level == 0
2234                        && self.position.column == self.current_indent + 1
2235                        && self.check_for_mapping_ahead()
2236                        && !self.colon_belongs_to_alias_anchor_name()
2237                    {
2238                        self.maybe_open_block_mapping_for_key()?;
2239                    }
2240                    let token = self.scan_alias()?;
2241                    self.tokens.push(token);
2242                }
2243
2244                // Block scalars
2245                '|' => {
2246                    let token = self.scan_literal_block_scalar()?;
2247                    self.tokens.push(token);
2248                    // Block scalar collection rewinds the cursor to the
2249                    // start of the next under-indented line. `current_indent`
2250                    // is still set to the inline content's column from the
2251                    // enclosing `- |` / `key: |` site, so the next iteration
2252                    // would mis-dispatch. Break out so the outer loop
2253                    // re-enters `process_line` and reruns indent handling
2254                    // (yaml-test-suite 4QFQ, M6YH, P2AD).
2255                    break;
2256                }
2257                '>' => {
2258                    let token = self.scan_folded_block_scalar()?;
2259                    self.tokens.push(token);
2260                    break;
2261                }
2262
2263                // Tags. Same line-start property-opens-mapping rule
2264                // (yaml-test-suite ZH7C variants).
2265                //
2266                // §6.9: a property at the SAME indent as the
2267                // enclosing mapping/sequence cannot apply to that
2268                // collection's value — the value must be more
2269                // indented. If we're at a line-start \`!\` whose column
2270                // equals the enclosing mapping's indent + 1 AND that
2271                // mapping currently has a key awaiting a value, the
2272                // tag is misplaced (yaml-test-suite H7J7).
2273                '!' => {
2274                    if self.flow_level == 0
2275                        && self.position.column == self.current_indent + 1
2276                        && !self.check_for_mapping_ahead()
2277                        && self.indent_stack.len() > 1
2278                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2279                        && self.most_recent_token_is_value_separator()
2280                    {
2281                        return Err(Error::scan(
2282                            self.position,
2283                            "Tag at line-start with insufficient indent for value position"
2284                                .to_string(),
2285                        ));
2286                    }
2287                    if self.flow_level == 0
2288                        && self.position.column == self.current_indent + 1
2289                        && self.check_for_mapping_ahead()
2290                    {
2291                        self.maybe_open_block_mapping_for_key()?;
2292                    }
2293                    let token = self.scan_tag()?;
2294                    self.tokens.push(token);
2295                }
2296
2297                // Plain scalars
2298                _ if self.is_plain_scalar_start() => {
2299                    // A plain scalar starting on the SAME line as a
2300                    // flow-collection close (\`}\` or \`]\`) means there's
2301                    // no separator between the closed flow node and
2302                    // the new content (yaml-test-suite 62EZ
2303                    // \`x: { y: z }in: valid\`).
2304                    if self.flow_level == 0 {
2305                        if let Some(last) = self.tokens.last() {
2306                            if matches!(
2307                                last.token_type,
2308                                TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2309                            ) && last.end_position.line == self.position.line
2310                            {
2311                                return Err(Error::scan(
2312                                    self.position,
2313                                    "Plain scalar immediately after flow collection close"
2314                                        .to_string(),
2315                                ));
2316                            }
2317                        }
2318                    }
2319                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2320                        self.maybe_open_block_mapping_for_key()?;
2321                    }
2322
2323                    let token = self.scan_plain_scalar()?;
2324                    self.tokens.push(token);
2325                }
2326
2327                _ => {
2328                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2329                        .with_suggestion("Check for valid YAML syntax characters".to_string());
2330                    return Err(Error::invalid_character_with_context(
2331                        self.position,
2332                        ch,
2333                        "YAML document",
2334                        context,
2335                    ));
2336                }
2337            }
2338        }
2339
2340        // Inline sequences (nested \`- -\` on one line) used to be
2341        // closed unconditionally at end-of-line. But a nested sequence
2342        // can span lines (`- - a\n  - b\n- c`) — in that case the inner
2343        // sequence must remain open until handle_indentation sees a
2344        // dedent. Reset the inline-sequence counter (so the next line
2345        // is judged on its own merits) but DO NOT emit BlockEnd —
2346        // handle_indentation's indent_stack pop, the end-of-stream
2347        // close at scan_next_token, and the explicit-dedent close at
2348        // handle_indentation's bottom each provide a correct close.
2349        self.inline_sequence_depth = 0;
2350
2351        Ok(())
2352    }
2353
2354    /// Scan the next token lazily
2355    fn scan_next_token(&mut self) -> Result<()> {
2356        if self.done {
2357            return Ok(());
2358        }
2359
2360        // Add stream start token if this is the beginning
2361        if self.tokens.is_empty() {
2362            self.tokens
2363                .push(Token::simple(TokenType::StreamStart, self.position));
2364            return Ok(());
2365        }
2366
2367        // Check if we're at the end of input
2368        if self.current_char.is_none() {
2369            if !self
2370                .tokens
2371                .iter()
2372                .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2373            {
2374                self.tokens
2375                    .push(Token::simple(TokenType::StreamEnd, self.position));
2376            }
2377            self.done = true;
2378            return Ok(());
2379        }
2380
2381        // For now, fall back to scanning all tokens at once for the lazy scanner
2382        // This is a simplified implementation - a full streaming parser would
2383        // need more sophisticated state management
2384        let tokens_before = self.tokens.len();
2385        self.scan_all_tokens()?;
2386
2387        // Mark as done after scanning all tokens
2388        if self.tokens.len() == tokens_before {
2389            self.done = true;
2390        }
2391
2392        Ok(())
2393    }
2394
2395    /// Pre-scan all tokens (simplified approach for basic implementation)
2396    fn scan_all_tokens(&mut self) -> Result<()> {
2397        // Only add StreamStart if we don't have it yet
2398        if !self
2399            .tokens
2400            .iter()
2401            .any(|t| matches!(t.token_type, TokenType::StreamStart))
2402        {
2403            self.tokens
2404                .push(Token::simple(TokenType::StreamStart, self.position));
2405        }
2406
2407        while self.current_char.is_some() {
2408            self.process_line()?;
2409
2410            // Advance past newlines
2411            while let Some(ch) = self.current_char {
2412                if ch == '\n' || ch == '\r' {
2413                    self.advance();
2414                } else {
2415                    break;
2416                }
2417            }
2418        }
2419
2420        // Close any remaining compact sequences (before their parent mappings)
2421        while self.compact_sequence_indents.pop().is_some() {
2422            self.tokens
2423                .push(Token::simple(TokenType::BlockEnd, self.position));
2424        }
2425
2426        // Close any remaining blocks
2427        while self.indent_stack.len() > 1 {
2428            self.indent_stack.pop();
2429            self.indent_is_sequence.pop();
2430            self.tokens
2431                .push(Token::simple(TokenType::BlockEnd, self.position));
2432        }
2433
2434        self.tokens
2435            .push(Token::simple(TokenType::StreamEnd, self.position));
2436        self.done = true;
2437        Ok(())
2438    }
2439
2440    /// Peek at a character at the given offset (can be negative)
2441    /// Check if the current position starts a pure number (digits/dots/minus only,
2442    /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2443    fn is_pure_number(&self) -> bool {
2444        let mut offset: isize = 0;
2445        let first = self.peek_char(0);
2446        // Skip leading minus
2447        if first == Some('-') {
2448            offset = 1;
2449        }
2450        // Scan digits and at most one dot
2451        let mut has_digit = false;
2452        let mut dot_count = 0;
2453        loop {
2454            match self.peek_char(offset) {
2455                Some(c) if c.is_ascii_digit() => {
2456                    has_digit = true;
2457                    offset += 1;
2458                }
2459                Some('.') => {
2460                    dot_count += 1;
2461                    if dot_count > 1 {
2462                        // Multiple dots (e.g. 0.5.8) — not a number
2463                        return false;
2464                    }
2465                    offset += 1;
2466                }
2467                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2468                    // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2469                    return false;
2470                }
2471                Some(c) => {
2472                    // For a token to be a pure number, what follows
2473                    // the digits must be end-of-token. In flow
2474                    // context that's a flow indicator. In block
2475                    // context the rest of the line must be pure
2476                    // whitespace (possibly trailing a comment) — if
2477                    // there's more non-whitespace content on this
2478                    // line, the digits are part of a larger plain
2479                    // scalar like \`1 - 3\` (yaml-test-suite P76L)
2480                    // or \`20:03:20\` (yaml-test-suite U9NS).
2481                    if self.flow_level > 0 && ",[]{}".contains(c) {
2482                        return has_digit;
2483                    }
2484                    if c == '\n' || c == '\r' {
2485                        return has_digit;
2486                    }
2487                    if c == ' ' || c == '\t' {
2488                        // Look ahead: rest of line must be whitespace
2489                        // or a comment.
2490                        let mut probe = offset + 1;
2491                        loop {
2492                            match self.peek_char(probe) {
2493                                None => return has_digit,
2494                                Some('\n' | '\r') => return has_digit,
2495                                Some('#') => return has_digit,
2496                                Some(' ' | '\t') => probe += 1,
2497                                Some(_) => return false,
2498                            }
2499                        }
2500                    }
2501                    if c == ':' {
2502                        let next = self.peek_char(offset + 1);
2503                        return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2504                    }
2505                    return false;
2506                }
2507                None => return has_digit,
2508            }
2509        }
2510    }
2511
2512    fn peek_char(&self, offset: isize) -> Option<char> {
2513        if offset >= 0 {
2514            let target_index = self.current_char_index + offset as usize;
2515            if target_index < self.char_cache.len() {
2516                Some(self.char_cache[target_index])
2517            } else {
2518                None
2519            }
2520        } else {
2521            let offset_magnitude = (-offset) as usize;
2522            if self.current_char_index >= offset_magnitude {
2523                Some(self.char_cache[self.current_char_index - offset_magnitude])
2524            } else {
2525                None
2526            }
2527        }
2528    }
2529
2530    /// Scan an anchor token (&name)
2531    fn scan_anchor(&mut self) -> Result<Token> {
2532        let start_pos = self.position;
2533        self.advance(); // Skip '&'
2534
2535        let name = self.scan_identifier()?;
2536        if name.is_empty() {
2537            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2538                "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2539            );
2540            return Err(Error::scan_with_context(
2541                self.position,
2542                "Anchor name cannot be empty",
2543                context,
2544            ));
2545        }
2546
2547        // Track anchor for resource limits
2548        self.resource_tracker.add_anchor(&self.limits)?;
2549
2550        Ok(Token::new(
2551            TokenType::Anchor(name),
2552            start_pos,
2553            self.position,
2554        ))
2555    }
2556
2557    /// Scan an alias token (*name)
2558    fn scan_alias(&mut self) -> Result<Token> {
2559        let start_pos = self.position;
2560        self.advance(); // Skip '*'
2561
2562        let name = self.scan_identifier()?;
2563        if name.is_empty() {
2564            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2565                "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2566            );
2567            return Err(Error::scan_with_context(
2568                self.position,
2569                "Alias name cannot be empty",
2570                context,
2571            ));
2572        }
2573
2574        Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2575    }
2576
2577    /// Scan an identifier (used for anchor and alias names)
2578    fn scan_identifier(&mut self) -> Result<String> {
2579        // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2580        // exclusions are whitespace and the flow indicators `,[]{}`. This
2581        // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2582        // codepoints (including emoji), matching the spec exactly.
2583        let mut identifier = String::new();
2584        while let Some(ch) = self.current_char {
2585            if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2586                break;
2587            }
2588            identifier.push(ch);
2589            self.advance();
2590        }
2591        Ok(identifier)
2592    }
2593
2594    /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2595    fn scan_tag(&mut self) -> Result<Token> {
2596        let start_pos = self.position;
2597        self.advance(); // Skip first '!'
2598
2599        let mut tag = String::from("!");
2600
2601        // Check for verbatim tag format: !<tag>
2602        if self.current_char == Some('<') {
2603            tag.push('<');
2604            self.advance(); // Skip '<'
2605
2606            // Scan until closing '>'
2607            while let Some(ch) = self.current_char {
2608                if ch == '>' {
2609                    tag.push(ch);
2610                    self.advance();
2611                    break;
2612                } else if ch.is_control() || ch.is_whitespace() {
2613                    return Err(Error::scan(
2614                        self.position,
2615                        "Invalid character in verbatim tag".to_string(),
2616                    ));
2617                }
2618                tag.push(ch);
2619                self.advance();
2620            }
2621        } else {
2622            // Check for secondary tag handle: !!
2623            if self.current_char == Some('!') {
2624                tag.push('!');
2625                self.advance(); // Skip second '!'
2626            }
2627
2628            // Scan tag name/suffix.
2629            //
2630            // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2631            // contain any URI character (RFC 3986 unreserved + sub-delims +
2632            // a few others) or `%XX` percent-encoded bytes. The handful of
2633            // characters listed below covers the alphanumeric + URI-safe
2634            // punctuation set used by yaml-test-suite. Percent decoding of
2635            // `%XX` happens later in `TagResolver::resolve`.
2636            //
2637            // §5.3: inside a flow collection, the flow indicators
2638            // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2639            // must NOT consume them into the tag suffix even though
2640            // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2641            // YAML 1.2 in practice treats `,` as a flow indicator that
2642            // must be percent-encoded (\`%2C\`) when it appears inside
2643            // a tag suffix — bare \`,\` is not allowed in EITHER block
2644            // or flow context (yaml-test-suite U99R).
2645            while let Some(ch) = self.current_char {
2646                if matches!(ch, ',') {
2647                    break;
2648                }
2649                if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2650                    break;
2651                }
2652                // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2653                // `tag:yaml.org,2002:str` legitimately contains two
2654                // colons inside its URI. But a `:` followed by
2655                // whitespace, EOL or EOF is the YAML mapping-value
2656                // indicator and MUST terminate the tag, otherwise
2657                // `!handle!suffix: value` is mis-scanned as
2658                // `Tag("!handle!suffix:") Scalar("value")` and the
2659                // implicit-key mapping structure is lost. Mirrors the
2660                // `,` carve-out above (a valid URI char that's also a
2661                // YAML flow indicator in some contexts).
2662                if ch == ':' {
2663                    match self.peek_char(1) {
2664                        None => break,
2665                        Some(c) if c.is_whitespace() => break,
2666                        _ => {}
2667                    }
2668                }
2669                if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2670                    tag.push(ch);
2671                    self.advance();
2672                } else {
2673                    break;
2674                }
2675            }
2676        }
2677
2678        Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2679    }
2680
2681    /// Scan a literal block scalar (|)
2682    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2683        let start_pos = self.position;
2684        self.advance(); // Skip '|'
2685
2686        // Parse block scalar header (indicators like +, -, explicit indent)
2687        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2688
2689        // Skip to next line
2690        self.skip_to_next_line()?;
2691
2692        // Determine indentation. `base_indent` is the surrounding
2693        // block's indent — i.e. the indent of the sequence or
2694        // mapping that contains this scalar. `self.current_indent`
2695        // is sometimes set to the inline indicator column (e.g. 2
2696        // for `- |`), which would make `base_indent + explicit`
2697        // wrong; use the top of `indent_stack` instead
2698        // (yaml-test-suite 4QFQ `|1`).
2699        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2700        let content_indent = if let Some(explicit) = explicit_indent {
2701            base_indent + explicit
2702        } else {
2703            // Find the first non-empty content line to determine indentation
2704            self.find_block_scalar_indent(base_indent)?
2705        };
2706
2707        // Collect the literal block content
2708        let content = self.collect_literal_block_content(content_indent, chomping)?;
2709
2710        Ok(Token::new(
2711            TokenType::BlockScalarLiteral(content),
2712            start_pos,
2713            self.position,
2714        ))
2715    }
2716
2717    /// Scan a folded block scalar (>)
2718    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2719        let start_pos = self.position;
2720        self.advance(); // Skip '>'
2721
2722        // Parse block scalar header (indicators like +, -, explicit indent)
2723        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2724
2725        // Skip to next line
2726        self.skip_to_next_line()?;
2727
2728        // See scan_literal_block_scalar for why we read `indent_stack`
2729        // rather than `current_indent`.
2730        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2731        let content_indent = if let Some(explicit) = explicit_indent {
2732            base_indent + explicit
2733        } else {
2734            // Find the first non-empty content line to determine indentation
2735            self.find_block_scalar_indent(base_indent)?
2736        };
2737
2738        // Collect the folded block content
2739        let content = self.collect_folded_block_content(content_indent, chomping)?;
2740
2741        Ok(Token::new(
2742            TokenType::BlockScalarFolded(content),
2743            start_pos,
2744            self.position,
2745        ))
2746    }
2747
2748    /// Parse block scalar header indicators (+, -, and explicit indent)
2749    fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2750        let mut chomping = ChompingMode::Clip;
2751        let mut explicit_indent: Option<usize> = None;
2752        // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2753        // \`>#x\` are invalid (yaml-test-suite X4QW).
2754        let mut seen_separator_ws = false;
2755
2756        // Parse indicators in any order
2757        while let Some(ch) = self.current_char {
2758            match ch {
2759                '+' => {
2760                    chomping = ChompingMode::Keep;
2761                    self.advance();
2762                }
2763                '-' => {
2764                    chomping = ChompingMode::Strip;
2765                    self.advance();
2766                }
2767                '0'..='9' => {
2768                    let digit = ch.to_digit(10).unwrap() as usize;
2769                    if explicit_indent.is_some() {
2770                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2771                            .with_suggestion(
2772                                "Use only one indent indicator digit in block scalar".to_string(),
2773                            );
2774                        return Err(Error::scan_with_context(
2775                            self.position,
2776                            "Multiple indent indicators in block scalar",
2777                            context,
2778                        ));
2779                    }
2780                    // YAML 1.2 §8.1.1.1: explicit indent indicator is
2781                    // 1..=9. `|0` and `>0` are invalid
2782                    // (yaml-test-suite 2G84/00).
2783                    if digit == 0 {
2784                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2785                            .with_suggestion(
2786                                "Block-scalar indent indicator must be 1-9".to_string(),
2787                            );
2788                        return Err(Error::scan_with_context(
2789                            self.position,
2790                            "Block-scalar indent indicator `0` is invalid",
2791                            context,
2792                        ));
2793                    }
2794                    explicit_indent = Some(digit);
2795                    self.advance();
2796                }
2797                ' ' | '\t' => {
2798                    seen_separator_ws = true;
2799                    self.advance(); // Skip whitespace
2800                }
2801                '#' => {
2802                    if !seen_separator_ws {
2803                        return Err(Error::scan(
2804                            self.position,
2805                            "Comment in block-scalar header must be preceded by whitespace"
2806                                .to_string(),
2807                        ));
2808                    }
2809                    // Skip comment to end of line
2810                    while let Some(ch) = self.current_char {
2811                        self.advance();
2812                        if ch == '\n' || ch == '\r' {
2813                            break;
2814                        }
2815                    }
2816                    break;
2817                }
2818                '\n' | '\r' => break,
2819                _ => {
2820                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2821                        .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2822                    return Err(Error::invalid_character_with_context(
2823                        self.position,
2824                        ch,
2825                        "block scalar header",
2826                        context,
2827                    ));
2828                }
2829            }
2830        }
2831
2832        Ok((chomping, explicit_indent))
2833    }
2834
2835    /// Advance the cursor PAST the next line break, but do not consume
2836    /// any leading whitespace on the line that follows. The block-
2837    /// scalar header parser uses this to step from the indicator line
2838    /// to the start of the content line — the next line's leading
2839    /// spaces are part of its content_indent, not header whitespace.
2840    fn skip_to_next_line(&mut self) -> Result<()> {
2841        // If we're already at column 1 (the comment handler in
2842        // scan_block_scalar_header may have already advanced past a
2843        // newline), do nothing — the next line's leading whitespace
2844        // belongs to its content_indent.
2845        if self.position.column == 1 {
2846            return Ok(());
2847        }
2848        while let Some(ch) = self.current_char {
2849            match ch {
2850                '\n' | '\r' => {
2851                    self.advance();
2852                    return Ok(());
2853                }
2854                ' ' | '\t' => {
2855                    self.advance();
2856                }
2857                _ => return Ok(()),
2858            }
2859        }
2860        Ok(())
2861    }
2862
2863    /// Find the content indentation for a block scalar.
2864    ///
2865    /// Per spec §8.1.1.1, indent is the leading-space count of the first
2866    /// non-empty content line (or the longest blank-line indent if no
2867    /// non-empty line exists). A non-empty line whose indent is not
2868    /// strictly deeper than `base_indent` is outside the scalar's
2869    /// scope — that line is a sibling structure, not content
2870    /// (yaml-test-suite K858).
2871    fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
2872        let saved_position = self.position;
2873        let saved_char = self.current_char;
2874        let saved_char_index = self.current_char_index;
2875
2876        let mut max_blank_indent: usize = 0;
2877        let mut found = false;
2878        let mut content_indent: usize = 1;
2879
2880        loop {
2881            let mut line_indent = 0;
2882            while self.current_char == Some(' ') {
2883                line_indent += 1;
2884                self.advance();
2885            }
2886            // §6.1 + §8.1: tabs cannot serve as block-scalar
2887            // indentation. A line that BEGINS with a tab (no leading
2888            // spaces) inside the block scalar's indent search is
2889            // invalid (yaml-test-suite Y79Y/000 \`foo: |\\n\\tbar\`).
2890            // Tabs that appear AFTER one or more spaces are content,
2891            // not indentation, and remain valid (yaml-test-suite
2892            // 96NN/00 \`foo: |-\\n \\tbar\`).
2893            if line_indent == 0 && self.current_char == Some('\t') {
2894                return Err(Error::scan(
2895                    self.position,
2896                    "Tab cannot serve as block-scalar indentation".to_string(),
2897                ));
2898            }
2899
2900            match self.current_char {
2901                None => {
2902                    if line_indent > max_blank_indent {
2903                        max_blank_indent = line_indent;
2904                    }
2905                    break;
2906                }
2907                Some('\n' | '\r') => {
2908                    if line_indent > max_blank_indent {
2909                        max_blank_indent = line_indent;
2910                    }
2911                    self.advance();
2912                    // fall through to next iteration
2913                }
2914                Some(_) => {
2915                    // If we're nested inside another block — either
2916                    // via the `indent_stack` (normal mapping/sequence
2917                    // open) or `compact_sequence_indents` (a
2918                    // compact block sequence at the same indent as
2919                    // its parent) — and this candidate line is not
2920                    // strictly deeper than base_indent, it's a
2921                    // sibling outside the scalar's scope (yaml-test-
2922                    // suite K858, P2AD).
2923                    let inside_block =
2924                        self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
2925                    if inside_block && line_indent <= base_indent {
2926                        content_indent = max_blank_indent.max(base_indent + 1);
2927                    } else {
2928                        content_indent = line_indent;
2929                    }
2930                    // §8.1.2.1: leading blank lines may not exceed the
2931                    // detected content indent — that ambiguity is
2932                    // invalid (yaml-test-suite W9L4, S98Z).
2933                    if max_blank_indent > content_indent {
2934                        self.position = saved_position;
2935                        self.current_char = saved_char;
2936                        self.current_char_index = saved_char_index;
2937                        return Err(Error::scan(
2938                            self.position,
2939                            "Block scalar leading blank-line indent exceeds content indent"
2940                                .to_string(),
2941                        ));
2942                    }
2943                    found = true;
2944                    break;
2945                }
2946            }
2947        }
2948
2949        if !found {
2950            content_indent = max_blank_indent;
2951        }
2952
2953        self.position = saved_position;
2954        self.current_char = saved_char;
2955        self.current_char_index = saved_char_index;
2956
2957        Ok(content_indent)
2958    }
2959
2960    /// Count indentation at start of current line
2961    fn count_line_indent(&mut self) -> usize {
2962        let mut indent = 0;
2963        let saved_position = self.position;
2964        let saved_char = self.current_char;
2965        let saved_char_index = self.current_char_index;
2966
2967        while let Some(ch) = self.current_char {
2968            if ch == ' ' {
2969                indent += 1;
2970                self.advance();
2971            } else if ch == '\t' {
2972                indent += 8; // Tab counts as 8 spaces
2973                self.advance();
2974            } else {
2975                break;
2976            }
2977        }
2978
2979        // Restore position
2980        self.position = saved_position;
2981        self.current_char = saved_char;
2982        self.current_char_index = saved_char_index;
2983
2984        indent
2985    }
2986
2987    /// Collect content for a literal block scalar.
2988    ///
2989    /// Each line is preserved with its terminating newline. After collection
2990    /// we apply the chomping mode per spec §8.1.1.2.
2991    fn collect_literal_block_content(
2992        &mut self,
2993        content_indent: usize,
2994        chomping: ChompingMode,
2995    ) -> Result<String> {
2996        let mut content = String::new();
2997
2998        loop {
2999            // Count current line's leading-space indent.
3000            let mut line_indent = 0;
3001            let save_pos = self.position;
3002            let save_ch = self.current_char;
3003            let save_idx = self.current_char_index;
3004            while self.current_char == Some(' ') {
3005                line_indent += 1;
3006                self.advance();
3007            }
3008
3009            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3010
3011            if !line_is_blank && line_indent < content_indent {
3012                // Non-empty line with less indent ends the scalar; rewind.
3013                self.position = save_pos;
3014                self.current_char = save_ch;
3015                self.current_char_index = save_idx;
3016                break;
3017            }
3018
3019            // Document marker at line start always ends the scalar,
3020            // regardless of content_indent (allows zero-indented
3021            // block scalars per yaml-test-suite FP8R).
3022            if line_indent == 0 && self.is_doc_marker_here() {
3023                self.position = save_pos;
3024                self.current_char = save_ch;
3025                self.current_char_index = save_idx;
3026                break;
3027            }
3028
3029            if line_is_blank {
3030                // A blank line counts when there's an actual line break
3031                // to consume. EOF after we've consumed some whitespace
3032                // on the trailing line ALSO counts as one final blank
3033                // line (yaml-test-suite JEF9/02: `- |+\n        `).
3034                if matches!(self.current_char, Some('\n' | '\r')) {
3035                    // Whitespace beyond content_indent is literal content
3036                    // even on blank lines (yaml-test-suite 6FWR).
3037                    for _ in content_indent..line_indent {
3038                        content.push(' ');
3039                    }
3040                    content.push('\n');
3041                    self.advance();
3042                    continue;
3043                }
3044                if line_indent > 0 {
3045                    for _ in content_indent..line_indent {
3046                        content.push(' ');
3047                    }
3048                    content.push('\n');
3049                }
3050                break;
3051            }
3052
3053            // Content line: we already consumed `line_indent` spaces, but
3054            // only `content_indent` of them belong to indentation. Any
3055            // extra leading spaces are literal content.
3056            let mut line = String::new();
3057            for _ in content_indent..line_indent {
3058                line.push(' ');
3059            }
3060            while let Some(ch) = self.current_char {
3061                if ch == '\n' || ch == '\r' {
3062                    self.advance();
3063                    break;
3064                }
3065                line.push(ch);
3066                self.advance();
3067            }
3068            content.push_str(&line);
3069            content.push('\n');
3070
3071            if self.current_char.is_none() {
3072                break;
3073            }
3074        }
3075
3076        Ok(apply_chomping(content, chomping))
3077    }
3078
3079    /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3080    fn is_doc_marker_here(&self) -> bool {
3081        let c0 = self.current_char;
3082        let c1 = self.peek_char(1);
3083        let c2 = self.peek_char(2);
3084        let c3 = self.peek_char(3);
3085        let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3086        (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3087            || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3088    }
3089
3090    /// Collect content for a folded block scalar.
3091    ///
3092    /// Folding rules (§8.1.3): a sequence of single blank lines between
3093    /// equally-indented non-empty content lines collapses into a single
3094    /// space; runs of blank lines emit `n-1` newlines; more-indented
3095    /// lines preserve their newline boundaries. After collection, apply
3096    /// chomping (§8.1.1.2).
3097    fn collect_folded_block_content(
3098        &mut self,
3099        content_indent: usize,
3100        chomping: ChompingMode,
3101    ) -> Result<String> {
3102        #[derive(Clone, Copy, PartialEq, Eq)]
3103        enum LineKind {
3104            Normal,
3105            MoreIndented,
3106            Empty,
3107        }
3108        struct Line {
3109            text: String,
3110            kind: LineKind,
3111        }
3112
3113        let mut lines: Vec<Line> = Vec::new();
3114
3115        loop {
3116            let mut line_indent = 0;
3117            let save_pos = self.position;
3118            let save_ch = self.current_char;
3119            let save_idx = self.current_char_index;
3120            while self.current_char == Some(' ') {
3121                line_indent += 1;
3122                self.advance();
3123            }
3124
3125            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3126
3127            if !line_is_blank && line_indent < content_indent {
3128                self.position = save_pos;
3129                self.current_char = save_ch;
3130                self.current_char_index = save_idx;
3131                break;
3132            }
3133
3134            if line_indent == 0 && self.is_doc_marker_here() {
3135                self.position = save_pos;
3136                self.current_char = save_ch;
3137                self.current_char_index = save_idx;
3138                break;
3139            }
3140
3141            if line_is_blank {
3142                if matches!(self.current_char, Some('\n' | '\r')) {
3143                    lines.push(Line {
3144                        text: String::new(),
3145                        kind: LineKind::Empty,
3146                    });
3147                    self.advance();
3148                    continue;
3149                }
3150                break;
3151            }
3152
3153            // Capture extra-indent leading spaces as part of content.
3154            let mut text = String::new();
3155            for _ in content_indent..line_indent {
3156                text.push(' ');
3157            }
3158            while let Some(ch) = self.current_char {
3159                if ch == '\n' || ch == '\r' {
3160                    self.advance();
3161                    break;
3162                }
3163                text.push(ch);
3164                self.advance();
3165            }
3166            // §8.1.3.2: "more indented" means the content (after the
3167            // common indent strip) begins with extra whitespace —
3168            // either spaces or tabs (yaml-test-suite MJS9).
3169            let kind = if text.starts_with(' ') || text.starts_with('\t') {
3170                LineKind::MoreIndented
3171            } else {
3172                LineKind::Normal
3173            };
3174            lines.push(Line { text, kind });
3175
3176            if self.current_char.is_none() {
3177                break;
3178            }
3179        }
3180
3181        // Build the folded output.
3182        let mut content = String::new();
3183        let mut idx = 0;
3184        while idx < lines.len() {
3185            let line = &lines[idx];
3186            match line.kind {
3187                LineKind::Normal | LineKind::MoreIndented => {
3188                    content.push_str(&line.text);
3189                    // Lookahead: count immediately-following empty lines.
3190                    let mut j = idx + 1;
3191                    let mut empties = 0;
3192                    while j < lines.len() && lines[j].kind == LineKind::Empty {
3193                        empties += 1;
3194                        j += 1;
3195                    }
3196                    if j < lines.len() {
3197                        // Spec §8.1.3.2: folding behaviour depends on
3198                        // whether either surrounding content line is
3199                        // "more indented" than the content indent.
3200                        // - both Normal, 0 empties → fold to space.
3201                        // - both Normal, k empties → k newlines (one
3202                        //   break folded out).
3203                        // - any MoreIndented, 0 empties → 1 newline.
3204                        // - any MoreIndented, k empties → k+1 newlines
3205                        //   (every break preserved).
3206                        let mi_adjacent = line.kind == LineKind::MoreIndented
3207                            || lines[j].kind == LineKind::MoreIndented;
3208                        if empties == 0 {
3209                            if mi_adjacent {
3210                                content.push('\n');
3211                            } else {
3212                                content.push(' ');
3213                            }
3214                        } else {
3215                            let breaks = if mi_adjacent { empties + 1 } else { empties };
3216                            for _ in 0..breaks {
3217                                content.push('\n');
3218                            }
3219                        }
3220                        idx = j;
3221                    } else {
3222                        // End of stream after content (possibly trailing empties).
3223                        // Always emit final `\n` for the last content line; extra
3224                        // trailing empties contribute additional `\n`s, and chomping
3225                        // will trim them later if needed.
3226                        content.push('\n');
3227                        for _ in 0..empties {
3228                            content.push('\n');
3229                        }
3230                        break;
3231                    }
3232                }
3233                LineKind::Empty => {
3234                    // Leading empty lines (no preceding content): emit as `\n`s.
3235                    content.push('\n');
3236                    idx += 1;
3237                }
3238            }
3239        }
3240
3241        Ok(apply_chomping(content, chomping))
3242    }
3243
3244    /// Emit a `BlockMappingStart` token if the current position is the
3245    /// start of an implicit key and no mapping is yet active at this
3246    /// indent level. Shared by plain and quoted scalar dispatch.
3247    fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3248        let last_indent = *self.indent_stack.last().unwrap();
3249        let should_start_new_mapping = if self.current_indent > last_indent {
3250            true
3251        } else if self.current_indent == last_indent {
3252            !self.check_active_mapping_at_level(self.current_indent)
3253        } else {
3254            false
3255        };
3256        if should_start_new_mapping {
3257            // §6.1 + §8.22: opening a NEW block mapping at deeper
3258            // indent than the parent only makes sense if the parent
3259            // has a key WITHOUT a value (the new mapping IS that
3260            // value). If the parent's last content is a complete
3261            // (key, value) pair — i.e. the most recent meaningful
3262            // token is a value-position scalar/alias/close — then
3263            // there's no node to host the deeper mapping (yaml-test-
3264            // suite U44R: \`map:\\n  key1: q\\n   key2: bad\` — key2
3265            // is deeper than key1 but key1's value is already \`q\`).
3266            if self.current_indent > last_indent && last_indent > 0 {
3267                let mut depth = 0i32;
3268                let mut last_meaningful = None;
3269                for t in self.tokens.iter().rev() {
3270                    match &t.token_type {
3271                        TokenType::BlockEnd => depth += 1,
3272                        TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3273                            if depth == 0 {
3274                                break;
3275                            }
3276                            depth -= 1;
3277                        }
3278                        TokenType::Anchor(_) | TokenType::Tag(_) => {}
3279                        other => {
3280                            if depth == 0 {
3281                                last_meaningful = Some(other.clone());
3282                                break;
3283                            }
3284                        }
3285                    }
3286                }
3287                if matches!(
3288                    last_meaningful,
3289                    Some(
3290                        TokenType::Scalar(..)
3291                            | TokenType::Alias(_)
3292                            | TokenType::FlowSequenceEnd
3293                            | TokenType::FlowMappingEnd
3294                            | TokenType::BlockScalarLiteral(..)
3295                            | TokenType::BlockScalarFolded(..)
3296                    )
3297                ) {
3298                    return Err(Error::scan(
3299                        self.position,
3300                        "Indentation increase has no parent in current mapping/sequence"
3301                            .to_string(),
3302                    ));
3303                }
3304            }
3305            self.indent_stack.push(self.current_indent);
3306            self.indent_is_sequence.push(false);
3307            self.resource_tracker
3308                .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3309            self.tokens
3310                .push(Token::simple(TokenType::BlockMappingStart, self.position));
3311        }
3312        Ok(())
3313    }
3314
3315    /// Look ahead on the current line for a `:` that marks a mapping key.
3316    ///
3317    /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3318    /// followed by whitespace. Only `: ` terminates the scalar. If the
3319    /// line begins with `"` or `'`, the leading quoted scalar's contents
3320    /// are scanned past (including `''` and `\"` escapes) before looking
3321    /// for the `: ` that would make this scalar a key. This handles
3322    /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3323    /// For an alias/anchor at the current position, scan past
3324    /// the `&`/`*` and the name characters; if the FIRST char that
3325    /// would terminate the name is `:`, the colon is PART of the
3326    /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3327    /// that case so the caller can skip the implicit-key fast-path.
3328    fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3329        // Start after the `&` / `*` introducer.
3330        let mut i = self.current_char_index + 1;
3331        let n = self.char_cache.len();
3332        // Per scan_identifier rules: stop at whitespace or flow indicator.
3333        while i < n {
3334            let c = self.char_cache[i];
3335            if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3336                break;
3337            }
3338            i += 1;
3339        }
3340        // If the next char (or last consumed?) at termination is `:`,
3341        // then the name ended with `:`. Look at the LAST consumed
3342        // char. Actually our scan_identifier accepts `:` as part of
3343        // name — so the colon is already in the name. There's no
3344        // separate "value indicator" colon after.
3345        //
3346        // For the implicit-key fast path to be wrong, we need the
3347        // name to END with `:` (last char of name is `:`).
3348        if i > self.current_char_index + 1 {
3349            let last_name_char = self.char_cache[i - 1];
3350            if last_name_char == ':' {
3351                return true;
3352            }
3353        }
3354        false
3355    }
3356
3357    /// Scan ahead on the current line (the rest of the post-indent
3358    /// content) to determine whether it looks like an implicit
3359    /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3360    /// before any newline.
3361    fn line_after_indent_is_implicit_key(&self) -> bool {
3362        let mut i = self.current_char_index;
3363        let n = self.char_cache.len();
3364        while i < n {
3365            let ch = self.char_cache[i];
3366            if ch == '\n' || ch == '\r' {
3367                return false;
3368            }
3369            if ch == ':' {
3370                let next = self.char_cache.get(i + 1).copied();
3371                if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3372                    return true;
3373                }
3374            }
3375            i += 1;
3376        }
3377        false
3378    }
3379
3380    /// Walk back through recent tokens; if the last non-property
3381    /// token was `Value` (`:`), the parser is in value-expectation
3382    /// mode (key not yet matched with a value).
3383    fn most_recent_token_is_value_separator(&self) -> bool {
3384        for t in self.tokens.iter().rev() {
3385            match t.token_type {
3386                TokenType::Anchor(_) | TokenType::Tag(_) => {}
3387                TokenType::Value => return true,
3388                _ => return false,
3389            }
3390        }
3391        false
3392    }
3393
3394    fn check_for_mapping_ahead(&self) -> bool {
3395        let mut i = self.current_char_index;
3396        let n = self.char_cache.len();
3397        if i < n {
3398            let first = self.char_cache[i];
3399            if first == '\'' || first == '"' {
3400                let quote = first;
3401                i += 1;
3402                while i < n {
3403                    let c = self.char_cache[i];
3404                    if c == '\n' || c == '\r' {
3405                        return false; // unterminated quote on line
3406                    }
3407                    if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3408                        // `''` is the in-string single-quote escape.
3409                        i += 2;
3410                        continue;
3411                    }
3412                    if quote == '"' && c == '\\' {
3413                        // Skip the escaped char.
3414                        i += 2;
3415                        continue;
3416                    }
3417                    if c == quote {
3418                        i += 1;
3419                        break;
3420                    }
3421                    i += 1;
3422                }
3423            }
3424        }
3425        // Skip balanced flow collections — a `:` *inside* `[...]` or
3426        // `{...}` does NOT make the line a block-mapping key (the flow
3427        // collection itself can BE the key, but its inner colons are
3428        // part of its own structure). yaml-test-suite: `{key: v}` is
3429        // a standalone flow mapping; `[a]: outer` is a block-map key.
3430        let mut flow_depth: i32 = 0;
3431        while i < n {
3432            let ch = self.char_cache[i];
3433            match ch {
3434                '\n' | '\r' => return false,
3435                '[' | '{' => flow_depth += 1,
3436                ']' | '}' => flow_depth -= 1,
3437                ':' if flow_depth <= 0 => {
3438                    let next = self.char_cache.get(i + 1).copied();
3439                    match next {
3440                        None => return true,
3441                        Some(c) if c.is_whitespace() => return true,
3442                        _ => {}
3443                    }
3444                }
3445                _ => {}
3446            }
3447            i += 1;
3448        }
3449        false
3450    }
3451
3452    /// Check if there's an active mapping at the specified indentation level
3453    /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3454    fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3455        let mut depth = 0;
3456
3457        // Walk backwards through tokens to find the innermost unmatched block start.
3458        // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3459        // decrement it (both open blocks that need a matching BlockEnd).
3460        // When depth == 0 we have found the block start that is still "open".
3461        for token in self.tokens.iter().rev() {
3462            match &token.token_type {
3463                TokenType::BlockMappingStart => {
3464                    if depth == 0 {
3465                        // The innermost open block is a mapping — active at this level.
3466                        return true;
3467                    }
3468                    depth -= 1;
3469                }
3470                TokenType::BlockSequenceStart => {
3471                    if depth == 0 {
3472                        // The innermost open block is a sequence, not a mapping.
3473                        return false;
3474                    }
3475                    depth -= 1;
3476                }
3477                TokenType::BlockEnd => {
3478                    depth += 1;
3479                }
3480                TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3481                    // Stop at document boundaries
3482                    break;
3483                }
3484                _ => {}
3485            }
3486        }
3487
3488        false
3489    }
3490}
3491
3492impl Scanner for BasicScanner {
3493    fn check_token(&self) -> bool {
3494        // For lazy scanning: check if we have cached tokens or can generate more
3495        self.token_index < self.tokens.len() || !self.done
3496    }
3497
3498    fn peek_token(&self) -> Result<Option<&Token>> {
3499        // This is a bit tricky with lazy scanning since peek shouldn't mutate
3500        // For now, return cached token if available
3501        Ok(self.tokens.get(self.token_index))
3502    }
3503
3504    fn get_token(&mut self) -> Result<Option<Token>> {
3505        // If we need more tokens and haven't finished, scan next token
3506        if self.token_index >= self.tokens.len() && !self.done {
3507            self.scan_next_token()?;
3508        }
3509
3510        if self.token_index < self.tokens.len() {
3511            let token = self.tokens[self.token_index].clone();
3512            self.token_index += 1;
3513            Ok(Some(token))
3514        } else {
3515            Ok(None)
3516        }
3517    }
3518
3519    fn reset(&mut self) {
3520        self.token_index = 0;
3521        self.position = Position::start();
3522        self.tokens.clear();
3523        self.done = false;
3524        self.current_char = self.input.chars().next();
3525        self.indent_stack = vec![0];
3526        self.current_indent = 0;
3527        self.flow_level = 0;
3528        self.detected_indent_style = None;
3529        self.indent_samples.clear();
3530        self.previous_indent_level = 0;
3531        self.current_char_index = 0;
3532        self.current_char = self.char_cache.first().copied();
3533    }
3534
3535    fn position(&self) -> Position {
3536        self.position
3537    }
3538
3539    fn input(&self) -> &str {
3540        &self.input
3541    }
3542}
3543
3544#[cfg(test)]
3545mod tests {
3546    use super::*;
3547
3548    /// Drive the parser pipeline on `input` in a dedicated thread, returning
3549    /// `None` if it doesn't finish within `Duration::from_secs(2)`. Used by
3550    /// regression tests for parser hangs so a still-broken parser doesn't
3551    /// block the whole `cargo test` run.
3552    fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
3553        use crate::parser::{BasicParser, Parser as ParserTrait};
3554        use std::sync::mpsc;
3555        use std::thread;
3556        use std::time::Duration;
3557
3558        let owned = input.to_string();
3559        let (tx, rx) = mpsc::channel();
3560        thread::spawn(move || {
3561            let mut p = BasicParser::new_eager(owned);
3562            let _ = p.take_scanning_error();
3563            let mut events = Vec::new();
3564            loop {
3565                match p.get_event() {
3566                    Ok(Some(ev)) => events.push(ev),
3567                    Ok(None) => break,
3568                    Err(_) => break,
3569                }
3570            }
3571            let _ = tx.send(events);
3572        });
3573        rx.recv_timeout(Duration::from_secs(2)).ok()
3574    }
3575
3576    /// Regression: `---` directly followed by non-space text used to spin the
3577    /// scanner forever because the `-` match arm at line-start dispatched to
3578    /// `scan_document_start` (which correctly returned None) and then to
3579    /// `is_plain_scalar_start` (which returns false for `-`, so no consumption
3580    /// occurred — outer `while let` re-entered with the same char). Fix:
3581    /// fall through to `scan_plain_scalar` unconditionally when not a doc
3582    /// marker — the guard already ensures the char is non-whitespace.
3583    /// See yaml-test-suite tests 82AN / EXG3.
3584    #[test]
3585    fn three_dashes_directly_followed_by_text_does_not_hang() {
3586        let events = parse_with_timeout("---word1\nword2\n")
3587            .expect("parser hung — `---word1` should not produce an infinite loop");
3588        // We must produce at least one scalar whose value starts with `---`,
3589        // proving that the dashes were consumed as part of a plain scalar
3590        // (not interpreted as a document marker, which would consume them
3591        // separately).
3592        let starts_with_dashes = events.iter().any(|e| {
3593            matches!(&e.event_type,
3594                crate::parser::EventType::Scalar { value, .. } if value.starts_with("---")
3595            )
3596        });
3597        assert!(
3598            starts_with_dashes,
3599            "expected a plain scalar starting with `---`, got events: {events:?}"
3600        );
3601    }
3602
3603    /// YAML 1.2 §7.3.3: `?`, `:`, and `-` may start a plain scalar provided
3604    /// the next character is non-space (and, in flow context, not a flow
3605    /// indicator). The previous `is_plain_scalar_start` unconditionally
3606    /// rejected those three characters, so plain scalars like `?foo`,
3607    /// `:foo`, `-foo` were reported as `Invalid character`.
3608    /// Tracked by yaml-test-suite 2EBW.
3609    #[test]
3610    fn question_mark_followed_by_text_starts_plain_scalar() {
3611        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3612        let mut p = BasicParser::new_eager("?foo: bar\n".to_string());
3613        assert!(p.take_scanning_error().is_none());
3614        let mut keys = Vec::new();
3615        while let Ok(Some(ev)) = p.get_event() {
3616            if let EventType::Scalar { value, .. } = ev.event_type {
3617                keys.push(value);
3618            }
3619        }
3620        assert_eq!(keys, vec!["?foo", "bar"]);
3621    }
3622
3623    #[test]
3624    fn colon_followed_by_text_starts_plain_scalar() {
3625        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3626        let mut p = BasicParser::new_eager(":foo: bar\n".to_string());
3627        assert!(p.take_scanning_error().is_none());
3628        let mut keys = Vec::new();
3629        while let Ok(Some(ev)) = p.get_event() {
3630            if let EventType::Scalar { value, .. } = ev.event_type {
3631                keys.push(value);
3632            }
3633        }
3634        assert_eq!(keys, vec![":foo", "bar"]);
3635    }
3636
3637    /// YAML 1.2: every started document must be closed with a DocumentEnd
3638    /// event before StreamEnd. The previous `TokenType::StreamEnd` handler
3639    /// only emitted `-DOC` for `DocumentContent` / `BlockNode` states —
3640    /// the `DocumentStart` state (entered after `---` and a single scalar
3641    /// like `"foo"`) was skipped, dropping the `-DOC` event. Affected by
3642    /// yaml-test-suite 27NA, 2G84/*, 2LFX and several others.
3643    #[test]
3644    fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
3645        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3646        let mut p = BasicParser::new_eager("---\n\"foo\"\n".to_string());
3647        assert!(p.take_scanning_error().is_none());
3648        let mut kinds = Vec::new();
3649        while let Ok(Some(ev)) = p.get_event() {
3650            kinds.push(match ev.event_type {
3651                EventType::StreamStart => "+STR",
3652                EventType::StreamEnd => "-STR",
3653                EventType::DocumentStart { .. } => "+DOC",
3654                EventType::DocumentEnd { .. } => "-DOC",
3655                EventType::Scalar { .. } => "=VAL",
3656                _ => "?",
3657            });
3658        }
3659        // Critical: -DOC must come before -STR.
3660        let doc_end_idx = kinds.iter().position(|s| *s == "-DOC");
3661        let str_end_idx = kinds.iter().position(|s| *s == "-STR");
3662        assert!(
3663            doc_end_idx.is_some(),
3664            "missing -DOC in event stream: {kinds:?}"
3665        );
3666        assert!(
3667            doc_end_idx < str_end_idx,
3668            "expected -DOC before -STR, got {kinds:?}"
3669        );
3670    }
3671
3672    /// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings.
3673    #[test]
3674    fn double_quoted_hex_escapes_decode_to_codepoint() {
3675        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3676        for (input, expected) in [
3677            (r#""\x41""#, "A"),
3678            (r#""é""#, "é"),
3679            (r#""\U0001F600""#, "\u{1f600}"),
3680        ] {
3681            let mut p = BasicParser::new_eager(input.to_string());
3682            assert!(
3683                p.take_scanning_error().is_none(),
3684                "no scan error for {input}"
3685            );
3686            let mut found = None;
3687            while let Ok(Some(ev)) = p.get_event() {
3688                if let EventType::Scalar { value, .. } = ev.event_type {
3689                    found = Some(value);
3690                    break;
3691                }
3692            }
3693            assert_eq!(found.as_deref(), Some(expected), "input {input}");
3694        }
3695    }
3696
3697    #[test]
3698    fn truncated_hex_escape_is_a_scan_error() {
3699        use crate::parser::BasicParser;
3700        let mut p = BasicParser::new_eager(r#""\x4""#.to_string());
3701        assert!(
3702            p.take_scanning_error().is_some(),
3703            "truncated \\x escape must error"
3704        );
3705    }
3706
3707    /// YAML 1.2 §5.7: double-quoted strings have a strict allowlist of escape
3708    /// sequences. `\.` (and any other unknown escape) must be reported as a
3709    /// scan error. Tracked by yaml-test-suite 55WF.
3710    #[test]
3711    fn invalid_double_quoted_escape_is_a_scan_error() {
3712        use crate::parser::{BasicParser, Parser as ParserTrait};
3713        let mut p = BasicParser::new_eager("---\n\"\\.\"\n".to_string());
3714        let scan_err = p.take_scanning_error();
3715        let mut parse_err = false;
3716        if scan_err.is_none() {
3717            loop {
3718                match p.get_event() {
3719                    Ok(Some(_)) => {}
3720                    Ok(None) => break,
3721                    Err(_) => {
3722                        parse_err = true;
3723                        break;
3724                    }
3725                }
3726            }
3727        }
3728        assert!(
3729            scan_err.is_some() || parse_err,
3730            "`\\.` is not a valid double-quoted escape and must error"
3731        );
3732    }
3733
3734    /// YAML 1.2: a complex-key marker (`?`) is the first content after an
3735    /// explicit document start (`---`) — it should open an implicit block
3736    /// mapping. The previous parser handled `?` only in
3737    /// `ImplicitDocumentStart` / `DocumentContent` / already-in-mapping
3738    /// states and errored out for `DocumentStart`, breaking inputs like
3739    /// `--- !!set\n? Mark McGwire\n...`. Tracked by yaml-test-suite 2XXW.
3740    #[test]
3741    fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
3742        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3743        let mut p = BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
3744        assert!(p.take_scanning_error().is_none());
3745        let mut saw_map_start = false;
3746        let mut saw_error = false;
3747        loop {
3748            match p.get_event() {
3749                Ok(Some(ev)) => {
3750                    if matches!(ev.event_type, EventType::MappingStart { .. }) {
3751                        saw_map_start = true;
3752                    }
3753                }
3754                Ok(None) => break,
3755                Err(_) => {
3756                    saw_error = true;
3757                    break;
3758                }
3759            }
3760        }
3761        assert!(!saw_error, "complex key after `--- !!set` must not error");
3762        assert!(saw_map_start, "expected a MappingStart event");
3763    }
3764
3765    /// YAML 1.2 §6.9.2: anchor / alias names exclude only whitespace and
3766    /// the flow indicators `,[]{}`. Earlier implementations restricted
3767    /// `scan_identifier` to ASCII alphanumeric / `_` / `-`, which rejected
3768    /// valid unicode anchors like `&😁`. Tracked by yaml-test-suite 8XYN.
3769    #[test]
3770    fn anchor_name_may_contain_unicode_symbols() {
3771        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3772        let mut p = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
3773        assert!(
3774            p.take_scanning_error().is_none(),
3775            "unicode anchor must not error"
3776        );
3777        let mut anchors = Vec::new();
3778        while let Ok(Some(ev)) = p.get_event() {
3779            if let EventType::Scalar {
3780                anchor: Some(a), ..
3781            } = ev.event_type
3782            {
3783                anchors.push(a);
3784            }
3785        }
3786        assert_eq!(anchors, vec!["😁"]);
3787    }
3788
3789    /// YAML 1.2 §5.6 / RFC 3986 percent-encoding: tag suffixes may contain
3790    /// `%XX` percent-escaped characters, which must be URI-decoded when
3791    /// resolved. The scanner used to reject `%` in tag suffixes as
3792    /// "Invalid character", so e.g. `!e!tag%21 baz` failed before the
3793    /// resolver got a chance to decode it. Tracked by yaml-test-suite 6CK3.
3794    #[test]
3795    fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
3796        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3797        let mut p = BasicParser::new_eager(
3798            "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n".to_string(),
3799        );
3800        assert!(
3801            p.take_scanning_error().is_none(),
3802            "tag percent-escapes must not error"
3803        );
3804        let mut tags = Vec::new();
3805        while let Ok(Some(ev)) = p.get_event() {
3806            if let EventType::Scalar { tag: Some(t), .. } = ev.event_type {
3807                tags.push(t);
3808            }
3809        }
3810        assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
3811    }
3812
3813    /// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it
3814    /// does not recognize." A `%FOO` reserved directive must NOT be treated
3815    /// as a scan error — the directive line is silently skipped and parsing
3816    /// continues. Tracked by yaml-test-suite test 2LFX.
3817    #[test]
3818    fn reserved_directive_is_ignored_not_an_error() {
3819        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
3820        let mut p = BasicParser::new_eager(
3821            "%FOO  bar baz # Should be ignored\n              # with a warning.\n---\n\"foo\"\n"
3822                .to_string(),
3823        );
3824        assert!(
3825            p.take_scanning_error().is_none(),
3826            "unknown directives must NOT produce a scan error"
3827        );
3828        let mut scalars = Vec::new();
3829        while let Ok(Some(ev)) = p.get_event() {
3830            if let EventType::Scalar { value, .. } = ev.event_type {
3831                scalars.push(value);
3832            }
3833        }
3834        assert_eq!(scalars, vec!["foo"]);
3835    }
3836
3837    /// Spec requires the two physical lines of `---word1\nword2` to fold into
3838    /// a single plain scalar `"---word1 word2"`. Tracked by yaml-test-suite 82AN.
3839    #[test]
3840    fn three_dashes_followed_by_text_folds_continuation_line() {
3841        let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");
3842        let scalars: Vec<&str> = events
3843            .iter()
3844            .filter_map(|e| match &e.event_type {
3845                crate::parser::EventType::Scalar { value, .. } => Some(value.as_str()),
3846                _ => None,
3847            })
3848            .collect();
3849        assert_eq!(scalars, vec!["---word1 word2"]);
3850    }
3851
3852    /// Regression: tab between block-entry marker and a `-N` value used to
3853    /// hang the scanner via the same `-` match arm. See yaml-test-suite
3854    /// Y79Y/010.
3855    #[test]
3856    fn dash_tab_negative_number_does_not_hang() {
3857        let events = parse_with_timeout("-\t-1\n")
3858            .expect("parser hung — `-\\t-1` should not produce an infinite loop");
3859        assert!(!events.is_empty(), "expected event stream, got none");
3860    }
3861
3862    #[test]
3863    fn test_basic_tokenization() {
3864        let mut scanner = BasicScanner::new("42".to_string());
3865
3866        assert!(scanner.check_token());
3867
3868        // StreamStart
3869        let token = scanner.get_token().unwrap().unwrap();
3870        assert!(matches!(token.token_type, TokenType::StreamStart));
3871
3872        // Number
3873        let token = scanner.get_token().unwrap().unwrap();
3874        if let TokenType::Scalar(value, _) = token.token_type {
3875            assert_eq!(value, "42");
3876        } else {
3877            panic!("Expected scalar token");
3878        }
3879
3880        // StreamEnd
3881        let token = scanner.get_token().unwrap().unwrap();
3882        assert!(matches!(token.token_type, TokenType::StreamEnd));
3883    }
3884
3885    #[test]
3886    fn test_flow_sequence() {
3887        let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());
3888
3889        // StreamStart
3890        scanner.get_token().unwrap();
3891
3892        // [
3893        let token = scanner.get_token().unwrap().unwrap();
3894        assert!(matches!(token.token_type, TokenType::FlowSequenceStart));
3895
3896        // 1
3897        let token = scanner.get_token().unwrap().unwrap();
3898        if let TokenType::Scalar(value, _) = token.token_type {
3899            assert_eq!(value, "1");
3900        }
3901
3902        // ,
3903        let token = scanner.get_token().unwrap().unwrap();
3904        assert!(matches!(token.token_type, TokenType::FlowEntry));
3905    }
3906
3907    #[test]
3908    fn test_quoted_strings() {
3909        let mut scanner = BasicScanner::new(r#""hello world""#.to_string());
3910
3911        // StreamStart
3912        scanner.get_token().unwrap();
3913
3914        // Quoted string
3915        let token = scanner.get_token().unwrap().unwrap();
3916        if let TokenType::Scalar(value, _) = token.token_type {
3917            assert_eq!(value, "hello world");
3918        } else {
3919            panic!("Expected scalar token");
3920        }
3921    }
3922
3923    #[test]
3924    fn test_comment_handling() {
3925        let input = r"
3926# Full line comment
3927key: value  # End of line comment
3928# Another comment
3929data: test
3930";
3931        let mut scanner = BasicScanner::new(input.to_string());
3932
3933        let mut tokens = Vec::new();
3934        while let Ok(Some(token)) = scanner.get_token() {
3935            tokens.push(token);
3936        }
3937
3938        // Should only contain YAML structure tokens, no comment tokens
3939        let scalar_values: Vec<String> = tokens
3940            .iter()
3941            .filter_map(|t| match &t.token_type {
3942                TokenType::Scalar(s, _) => Some(s.clone()),
3943                _ => None,
3944            })
3945            .collect();
3946
3947        assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);
3948
3949        // Should not contain any comment tokens
3950        assert!(
3951            !tokens
3952                .iter()
3953                .any(|t| matches!(t.token_type, TokenType::Comment(_)))
3954        );
3955    }
3956
3957    #[test]
3958    fn test_hash_in_strings() {
3959        let input = r#"
3960string1: "This has a # character"
3961string2: 'Also has # character'
3962normal: value # This is a comment
3963"#;
3964        let mut scanner = BasicScanner::new(input.to_string());
3965
3966        let mut scalar_values = Vec::new();
3967        while let Ok(Some(token)) = scanner.get_token() {
3968            if let TokenType::Scalar(value, _) = token.token_type {
3969                scalar_values.push(value);
3970            }
3971        }
3972
3973        assert!(scalar_values.contains(&"This has a # character".to_string()));
3974        assert!(scalar_values.contains(&"Also has # character".to_string()));
3975        assert!(scalar_values.contains(&"value".to_string()));
3976        assert!(
3977            !scalar_values
3978                .iter()
3979                .any(|s| s.contains("This is a comment"))
3980        );
3981    }
3982
3983    #[test]
3984    fn test_escape_sequences() {
3985        // YAML 1.2 §5.7 double-quoted escape sequences. Single-quoted strings
3986        // have NO backslash escapes — `''` is the only escape — so this set
3987        // is restricted to the double-quoted cases.
3988        let test_cases = vec![
3989            (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
3990            (r#""Col1\tCol2""#, "Col1\tCol2"),
3991            (r#""First\rSecond""#, "First\rSecond"),
3992            (r#""Path\\to\\file""#, "Path\\to\\file"),
3993            (r#""He said \"Hello\"""#, "He said \"Hello\""),
3994        ];
3995
3996        for (input, expected) in test_cases {
3997            let mut scanner = BasicScanner::new(input.to_string());
3998            scanner.get_token().unwrap(); // Skip StreamStart
3999
4000            if let Ok(Some(token)) = scanner.get_token() {
4001                if let TokenType::Scalar(value, _) = token.token_type {
4002                    assert_eq!(value, expected, "Failed for input: {}", input);
4003                } else {
4004                    panic!("Expected scalar token for input: {}", input);
4005                }
4006            } else {
4007                panic!("Failed to get token for input: {}", input);
4008            }
4009        }
4010    }
4011
4012    #[test]
4013    fn test_extended_yaml_escapes() {
4014        // Test additional YAML escape sequences
4015        let test_cases = vec![
4016            (r#""\0""#, "\0"),   // null character
4017            (r#""\a""#, "\x07"), // bell
4018            (r#""\b""#, "\x08"), // backspace
4019            (r#""\f""#, "\x0C"), // form feed
4020            (r#""\v""#, "\x0B"), // vertical tab
4021            (r#""\e""#, "\x1B"), // escape
4022            (r#""\ ""#, " "),    // literal space
4023            (r#""\/""#, "/"),    // literal forward slash
4024        ];
4025
4026        for (input, expected) in test_cases {
4027            let mut scanner = BasicScanner::new(input.to_string());
4028            scanner.get_token().unwrap(); // Skip StreamStart
4029
4030            if let Ok(Some(token)) = scanner.get_token() {
4031                if let TokenType::Scalar(value, _) = token.token_type {
4032                    assert_eq!(value, expected, "Failed for input: {}", input);
4033                } else {
4034                    panic!("Expected scalar token for input: {}", input);
4035                }
4036            } else {
4037                panic!("Failed to get token for input: {}", input);
4038            }
4039        }
4040    }
4041
4042    #[test]
4043    fn test_unknown_escape_sequences() {
4044        // YAML 1.2 §5.7: unknown double-quoted escapes are scan errors, not
4045        // preserved literals. (Earlier versions of this scanner kept the
4046        // backslash + char verbatim — see commit history.)
4047        for input in [r#""\z""#, r#""\q""#, r#""\8""#] {
4048            let mut scanner = BasicScanner::new(input.to_string());
4049            scanner.get_token().unwrap(); // StreamStart
4050            assert!(
4051                scanner.get_token().is_err(),
4052                "expected scan error for invalid escape in {input}"
4053            );
4054        }
4055    }
4056}